Skip to main content

coding_agent_search/storage/
sqlite.rs

1//! `SQLite` backend: schema, pragmas, and migrations.
2
3use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole, Snippet};
4use crate::sources::provenance::{LOCAL_SOURCE_ID, Source, SourceKind};
5use anyhow::{Context, Result, anyhow, bail};
6use frankensqlite::{
7    Connection as FrankenConnection, Row as FrankenRow, SqliteValue,
8    compat::{
9        ConnectionExt as FrankenConnectionExt, OpenFlags as FrankenOpenFlags,
10        OptionalExtension as FrankenOptionalExtension, ParamValue, RowExt as FrankenRowExt,
11        Transaction as FrankenTransaction, TransactionExt as FrankenTransactionExt,
12        open_with_flags as open_franken_with_flags, param_slice_to_values, params_from_iter,
13    },
14    migrate::MigrationRunner,
15};
16use serde::{Deserialize, Serialize};
17use smallvec::SmallVec;
18use std::borrow::Cow;
19use std::collections::{HashMap, HashSet};
20use std::fs;
21use std::io::{BufRead, BufReader, Write};
22use std::process::{Command, Stdio};
23use std::sync::{
24    Arc,
25    atomic::{AtomicBool, AtomicI8, AtomicI64, AtomicU64, AtomicUsize, Ordering},
26};
27
/// Frankensqlite parameter list builder.
///
/// Expands to a `&[ParamValue]` slice.  The empty form yields an empty slice;
/// the variadic form converts every argument with `ParamValue::from`, and a
/// trailing comma is accepted.
macro_rules! fparams {
    // No arguments: an empty, explicitly typed parameter slice.
    () => {
        &[] as &[ParamValue]
    };
    // One or more expressions, each converted via `ParamValue::from`.
    ($($val:expr),+ $(,)?) => {
        &[$(ParamValue::from($val)),+] as &[ParamValue]
    };
}
37use std::path::{Path, PathBuf};
38use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH};
39use thiserror::Error;
40use tracing::info;
41
/// How long `LazyFrankenDb::get` waits for the doctor mutation lock before
/// giving up on opening the database.
const DOCTOR_MUTATION_DB_OPEN_LOCK_TIMEOUT: Duration = Duration::from_secs(30);
/// Upper bound (in bytes) on how much of the doctor lock file is read when
/// looking for the owning `pid=` entry.
const DOCTOR_MUTATION_LOCK_MAX_METADATA_READ: u64 = 64 * 1024;
44
45// -------------------------------------------------------------------------
46// Lazy FrankenSQLite Connection (bd-1ueu)
47// -------------------------------------------------------------------------
48// Defers opening the database until first use, cutting startup cost for
49// commands that may not need the DB at all.  Thread-safe via parking_lot
50// Mutex; logs the reason and duration of the open on first access.
51
/// Error from lazy database initialization.
#[derive(Debug, Error)]
pub enum LazyDbError {
    /// The database file did not exist when the open was attempted.
    #[error("Database not found at {0}")]
    NotFound(PathBuf),
    /// The connection could not be opened.  The lazy open paths also use this
    /// variant (with a `FrankenError::Internal` source) when the doctor
    /// mutation lock cannot be acquired or when a timed open does not finish.
    #[error("Failed to open FrankenSQLite database at {path}: {source}")]
    FrankenOpenFailed {
        /// Path of the database that failed to open.
        path: PathBuf,
        /// Underlying frankensqlite error.
        source: frankensqlite::FrankenError,
    },
}
63
64// -------------------------------------------------------------------------
65// LazyFrankenDb — lazy wrapper around FrankenConnection
66// -------------------------------------------------------------------------
67
/// Wrapper around `FrankenConnection` that implements `Send`.
///
/// `FrankenConnection` is `!Send` because it uses `Rc` internally.
/// However, the `Rc` values are entirely self-contained within the Connection
/// and are not shared externally.  When wrapped in a `Mutex`,
/// exclusive access is guaranteed, making cross-thread transfer safe.
///
/// Tuple fields, in order: the connection, the index-writer checkpoint page
/// setting (`i64`), and the index-writer busy timeout in milliseconds (`u64`).
/// The two numeric fields hold the `UNSET_INDEX_WRITER_*` constants when the
/// wrapper is built with `new`.
pub struct SendFrankenConnection(FrankenConnection, i64, u64);

// SAFETY: Rc fields inside FrankenConnection are not cloned or shared externally.
// The Mutex<Option<SendFrankenConnection>> ensures exclusive access.
unsafe impl Send for SendFrankenConnection {}
79
80impl SendFrankenConnection {
81    pub(crate) fn new(conn: FrankenConnection) -> Self {
82        Self(
83            conn,
84            UNSET_INDEX_WRITER_CHECKPOINT_PAGES,
85            UNSET_INDEX_WRITER_BUSY_TIMEOUT_MS,
86        )
87    }
88
89    pub(crate) fn new_with_index_writer_state(
90        conn: FrankenConnection,
91        checkpoint_pages: i64,
92        busy_timeout_ms: u64,
93    ) -> Self {
94        Self(conn, checkpoint_pages, busy_timeout_ms)
95    }
96
97    pub(crate) fn into_parts(self) -> (FrankenConnection, i64, u64) {
98        (self.0, self.1, self.2)
99    }
100}
101
102impl std::ops::Deref for SendFrankenConnection {
103    type Target = FrankenConnection;
104    fn deref(&self) -> &FrankenConnection {
105        &self.0
106    }
107}
108
/// Lazy-opening wrapper for `FrankenConnection` (frankensqlite).
///
/// Constructing a `LazyFrankenDb` is cheap (no I/O).  The underlying
/// `FrankenConnection` is opened on the first call to [`LazyFrankenDb::get`]
/// (or [`LazyFrankenDb::get_with_timeout`]).
/// Subsequent calls return the cached connection.
pub struct LazyFrankenDb {
    /// Location of the database file; fixed at construction time.
    path: PathBuf,
    /// `None` until the first successful open, then the cached connection.
    conn: parking_lot::Mutex<Option<SendFrankenConnection>>,
}
118
/// RAII guard that dereferences to the inner `FrankenConnection`.
///
/// Wraps the `parking_lot` mutex guard of the owning `LazyFrankenDb`, so the
/// connection stays locked for as long as this value is alive.
pub struct LazyFrankenDbGuard<'a>(parking_lot::MutexGuard<'a, Option<SendFrankenConnection>>);
121
122impl std::fmt::Debug for LazyFrankenDbGuard<'_> {
123    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
124        f.debug_tuple("LazyFrankenDbGuard")
125            .field(&self.0.is_some())
126            .finish()
127    }
128}
129
130impl std::ops::Deref for LazyFrankenDbGuard<'_> {
131    type Target = FrankenConnection;
132    fn deref(&self) -> &FrankenConnection {
133        self.0
134            .as_ref()
135            .expect("LazyFrankenDb connection must be initialized before access")
136    }
137}
138
139impl LazyFrankenDb {
140    /// Create a lazy handle pointing at `path`.  No I/O is performed.
141    pub fn new(path: PathBuf) -> Self {
142        Self {
143            path,
144            conn: parking_lot::Mutex::new(None),
145        }
146    }
147
148    /// Resolve path from optional CLI overrides.
149    ///
150    /// Uses `data_dir / agent_search.db` as fallback.
151    pub fn from_overrides(data_dir: &Option<PathBuf>, db_override: Option<PathBuf>) -> Self {
152        let data_dir = data_dir.clone().unwrap_or_else(crate::default_data_dir);
153        let path = db_override.unwrap_or_else(|| data_dir.join("agent_search.db"));
154        Self::new(path)
155    }
156
157    /// Get the connection, opening the database on first access.
158    ///
159    /// `reason` is logged alongside the open duration so callers can
160    /// identify which command triggered the open.
161    pub fn get(&self, reason: &str) -> std::result::Result<LazyFrankenDbGuard<'_>, LazyDbError> {
162        let mut guard = self.conn.lock();
163        if guard.is_none() {
164            if !self.path.exists() {
165                return Err(LazyDbError::NotFound(self.path.clone()));
166            }
167            let start = Instant::now();
168            let _doctor_guard = acquire_doctor_mutation_db_open_guard(
169                &self.path,
170                DOCTOR_MUTATION_DB_OPEN_LOCK_TIMEOUT,
171            )
172            .map_err(|err| LazyDbError::FrankenOpenFailed {
173                path: self.path.clone(),
174                source: frankensqlite::FrankenError::Internal(err.to_string()),
175            })?;
176            let conn =
177                FrankenConnection::open(self.path.to_string_lossy().into_owned()).map_err(|e| {
178                    LazyDbError::FrankenOpenFailed {
179                        path: self.path.clone(),
180                        source: e,
181                    }
182                })?;
183            let elapsed_ms = start.elapsed().as_millis();
184            info!(
185                path = %self.path.display(),
186                elapsed_ms = elapsed_ms,
187                reason = reason,
188                "lazily opened FrankenSQLite database"
189            );
190            *guard = Some(SendFrankenConnection::new(conn));
191        }
192        Ok(LazyFrankenDbGuard(guard))
193    }
194
195    /// Get the connection with a timeout, opening the database on first access.
196    ///
197    /// Like [`get`] but spawns the open in a background thread and waits up to
198    /// `timeout` for it to complete. Returns `LazyDbError::FrankenOpenFailed`
199    /// with a descriptive message if the timeout elapses. Fix for #128.
200    pub fn get_with_timeout(
201        &self,
202        reason: &str,
203        timeout: Duration,
204    ) -> std::result::Result<LazyFrankenDbGuard<'_>, LazyDbError> {
205        let mut guard = self.conn.lock();
206        if guard.is_none() {
207            if !self.path.exists() {
208                return Err(LazyDbError::NotFound(self.path.clone()));
209            }
210            let start = Instant::now();
211            let path_owned = self.path.to_string_lossy().into_owned();
212            let path_for_guard = self.path.clone();
213            let (tx, rx) = std::sync::mpsc::channel();
214            std::thread::spawn(move || {
215                let _doctor_guard =
216                    match acquire_doctor_mutation_db_open_guard(&path_for_guard, timeout) {
217                        Ok(guard) => guard,
218                        Err(err) => {
219                            let _ = tx
220                                .send(Err(frankensqlite::FrankenError::Internal(err.to_string())));
221                            return;
222                        }
223                    };
224                let _ =
225                    tx.send(FrankenConnection::open(path_owned).map(SendFrankenConnection::new));
226            });
227            let conn = rx
228                .recv_timeout(timeout)
229                .map_err(|_| LazyDbError::FrankenOpenFailed {
230                    path: self.path.clone(),
231                    source: frankensqlite::FrankenError::Internal(format!(
232                        "database open timed out after {}s (possible corruption or lock contention)",
233                        timeout.as_secs()
234                    )),
235                })?
236                .map_err(|e| LazyDbError::FrankenOpenFailed {
237                    path: self.path.clone(),
238                    source: e,
239                })?;
240            let elapsed_ms = start.elapsed().as_millis();
241            info!(
242                path = %self.path.display(),
243                elapsed_ms = elapsed_ms,
244                reason = reason,
245                "lazily opened FrankenSQLite database (with timeout)"
246            );
247            *guard = Some(conn);
248        }
249        Ok(LazyFrankenDbGuard(guard))
250    }
251
252    /// Path to the database file (even if not yet opened).
253    pub fn path(&self) -> &Path {
254        &self.path
255    }
256
257    /// Whether the connection has been opened.
258    pub fn is_open(&self) -> bool {
259        self.conn.lock().is_some()
260    }
261}
262
/// State word advanced by `next_franken_retry_jitter_ms` on every call.
static FRANKEN_RETRY_JITTER_STATE: AtomicU64 = AtomicU64::new(0x9e37_79b9_7f4a_7c15);
/// Number of live `DoctorMutationDbOpenBypassGuard`s; while non-zero, database
/// opens skip the doctor mutation lock.
static DOCTOR_MUTATION_DB_OPEN_BYPASS_DEPTH: AtomicUsize = AtomicUsize::new(0);
/// Gates the `record_message_lookup_*` helpers; counters only move while set.
static MESSAGE_LOOKUP_TRACE_ENABLED: AtomicBool = AtomicBool::new(false);
/// Reported as `exact_idx_probes` by `message_lookup_trace_snapshot`.
static MESSAGE_LOOKUP_EXACT_IDX_PROBES: AtomicU64 = AtomicU64::new(0);
/// Reported as `bounded_lookup_queries` by `message_lookup_trace_snapshot`.
static MESSAGE_LOOKUP_BOUNDED_QUERIES: AtomicU64 = AtomicU64::new(0);
/// Reported as `full_scan_queries` by `message_lookup_trace_snapshot`.
static MESSAGE_LOOKUP_FULL_SCAN_QUERIES: AtomicU64 = AtomicU64::new(0);
/// Reported as `rows_materialized` by `message_lookup_trace_snapshot`.
static MESSAGE_LOOKUP_ROWS_MATERIALIZED: AtomicU64 = AtomicU64::new(0);
/// Process-wide flag swapped by `default_defer_analytics_updates_guard` and
/// restored when the returned guard is dropped.
static DEFAULT_DEFER_ANALYTICS_UPDATES: AtomicBool = AtomicBool::new(false);
271
/// Point-in-time copy of the message-lookup trace counters, as produced by
/// `message_lookup_trace_snapshot`.
#[derive(Debug, Clone, Copy, Default, Serialize)]
pub(crate) struct MessageLookupTraceCounters {
    /// Value of the exact-index probe counter.
    pub exact_idx_probes: u64,
    /// Value of the bounded lookup query counter.
    pub bounded_lookup_queries: u64,
    /// Value of the full-scan query counter.
    pub full_scan_queries: u64,
    /// Value of the materialized-row counter (fed by both bounded and
    /// full-scan recordings).
    pub rows_materialized: u64,
}
279
280impl MessageLookupTraceCounters {
281    pub(crate) fn saturating_sub(self, before: Self) -> Self {
282        Self {
283            exact_idx_probes: self
284                .exact_idx_probes
285                .saturating_sub(before.exact_idx_probes),
286            bounded_lookup_queries: self
287                .bounded_lookup_queries
288                .saturating_sub(before.bounded_lookup_queries),
289            full_scan_queries: self
290                .full_scan_queries
291                .saturating_sub(before.full_scan_queries),
292            rows_materialized: self
293                .rows_materialized
294                .saturating_sub(before.rows_materialized),
295        }
296    }
297
298    pub(crate) fn lookups_against_global(self) -> u64 {
299        self.exact_idx_probes.saturating_add(self.rows_materialized)
300    }
301}
302
303pub(crate) fn set_message_lookup_trace_enabled(enabled: bool) -> bool {
304    MESSAGE_LOOKUP_TRACE_ENABLED.swap(enabled, Ordering::Relaxed)
305}
306
307pub(crate) fn message_lookup_trace_snapshot() -> MessageLookupTraceCounters {
308    MessageLookupTraceCounters {
309        exact_idx_probes: MESSAGE_LOOKUP_EXACT_IDX_PROBES.load(Ordering::Relaxed),
310        bounded_lookup_queries: MESSAGE_LOOKUP_BOUNDED_QUERIES.load(Ordering::Relaxed),
311        full_scan_queries: MESSAGE_LOOKUP_FULL_SCAN_QUERIES.load(Ordering::Relaxed),
312        rows_materialized: MESSAGE_LOOKUP_ROWS_MATERIALIZED.load(Ordering::Relaxed),
313    }
314}
315
/// Guard returned by `default_defer_analytics_updates_guard`; on drop it
/// writes `previous` back into `DEFAULT_DEFER_ANALYTICS_UPDATES`.
pub(crate) struct DefaultDeferAnalyticsUpdatesGuard {
    /// Flag value that was in effect before the guard was created.
    previous: bool,
}
319
320impl Drop for DefaultDeferAnalyticsUpdatesGuard {
321    fn drop(&mut self) {
322        DEFAULT_DEFER_ANALYTICS_UPDATES.store(self.previous, Ordering::Relaxed);
323    }
324}
325
326pub(crate) fn default_defer_analytics_updates_guard(
327    enabled: bool,
328) -> DefaultDeferAnalyticsUpdatesGuard {
329    let previous = DEFAULT_DEFER_ANALYTICS_UPDATES.swap(enabled, Ordering::Relaxed);
330    DefaultDeferAnalyticsUpdatesGuard { previous }
331}
332
333fn record_message_lookup_bounded_queries(query_count: u64, rows: usize) {
334    if MESSAGE_LOOKUP_TRACE_ENABLED.load(Ordering::Relaxed) {
335        MESSAGE_LOOKUP_BOUNDED_QUERIES.fetch_add(query_count, Ordering::Relaxed);
336        MESSAGE_LOOKUP_ROWS_MATERIALIZED.fetch_add(rows as u64, Ordering::Relaxed);
337    }
338}
339
340fn record_message_lookup_full_scan_query(rows: usize) {
341    if MESSAGE_LOOKUP_TRACE_ENABLED.load(Ordering::Relaxed) {
342        MESSAGE_LOOKUP_FULL_SCAN_QUERIES.fetch_add(1, Ordering::Relaxed);
343        MESSAGE_LOOKUP_ROWS_MATERIALIZED.fetch_add(rows as u64, Ordering::Relaxed);
344    }
345}
346
/// Marker guard handed out by `enter_doctor_mutation_db_open_bypass`; dropping
/// it decrements `DOCTOR_MUTATION_DB_OPEN_BYPASS_DEPTH`.
pub(crate) struct DoctorMutationDbOpenBypassGuard;
348
349impl Drop for DoctorMutationDbOpenBypassGuard {
350    fn drop(&mut self) {
351        DOCTOR_MUTATION_DB_OPEN_BYPASS_DEPTH.fetch_sub(1, Ordering::SeqCst);
352    }
353}
354
355pub(crate) fn enter_doctor_mutation_db_open_bypass() -> DoctorMutationDbOpenBypassGuard {
356    DOCTOR_MUTATION_DB_OPEN_BYPASS_DEPTH.fetch_add(1, Ordering::SeqCst);
357    DoctorMutationDbOpenBypassGuard
358}
359
360fn doctor_mutation_db_open_bypass_active() -> bool {
361    DOCTOR_MUTATION_DB_OPEN_BYPASS_DEPTH.load(Ordering::SeqCst) > 0
362}
363
364fn next_franken_retry_jitter_ms(max_inclusive: u64) -> u64 {
365    let mut value = FRANKEN_RETRY_JITTER_STATE.fetch_add(0x9e37_79b9_7f4a_7c15, Ordering::Relaxed);
366    value ^= value >> 30;
367    value = value.wrapping_mul(0xbf58_476d_1ce4_e5b9);
368    value ^= value >> 27;
369    value = value.wrapping_mul(0x94d0_49bb_1331_11eb);
370    value ^= value >> 31;
371    value % max_inclusive.saturating_add(1)
372}
373
374/// Sleep with jittered exponential backoff to avoid lock-step retry storms
375/// when many threads hit the same transient SQLite/frankensqlite contention.
376pub(crate) fn sleep_with_franken_retry_backoff(
377    backoff: &mut Duration,
378    remaining: Duration,
379    max_backoff: Duration,
380) {
381    let capped = (*backoff).min(remaining);
382    let extra_budget = remaining.saturating_sub(capped).min(capped);
383    let extra_ms = extra_budget.as_millis().min(u128::from(u64::MAX)) as u64;
384    let sleep_for = if extra_ms == 0 {
385        capped
386    } else {
387        capped
388            .saturating_add(Duration::from_millis(next_franken_retry_jitter_ms(
389                extra_ms,
390            )))
391            .min(remaining)
392    };
393    std::thread::sleep(sleep_for);
394    *backoff = backoff.saturating_mul(2).min(max_backoff);
395}
396
/// Guard produced by `acquire_doctor_mutation_db_open_guard`.
///
/// `Some(file)` holds the lock file on which a shared lock was taken (unlocked
/// again on drop); `None` means no lock was taken.
struct DoctorMutationDbOpenGuard(Option<fs::File>);
398
399impl Drop for DoctorMutationDbOpenGuard {
400    fn drop(&mut self) {
401        if let Some(file) = self.0.as_ref() {
402            let _ = fs2::FileExt::unlock(file);
403        }
404    }
405}
406
/// Map a database path to the doctor repair lock that protects it.
///
/// Only a file named exactly `agent_search.db` is covered; for it the lock is
/// `<parent>/doctor/locks/doctor-repair.lock`.  Any other file name (or a path
/// without a parent) yields `None`.
fn doctor_mutation_lock_path_for_db_open(db_path: &Path) -> Option<PathBuf> {
    let is_protected_db = db_path
        .file_name()
        .and_then(|name| name.to_str())
        .is_some_and(|name| name == "agent_search.db");
    if !is_protected_db {
        return None;
    }

    let mut lock_path = db_path.parent()?.to_path_buf();
    lock_path.extend(["doctor", "locks", "doctor-repair.lock"]);
    Some(lock_path)
}
420
/// Check whether lock-file metadata names this process as its owner.
///
/// `raw` is scanned line by line for `key=value` pairs; the result is `true`
/// when any line has key `pid` (after trimming) whose trimmed value parses as
/// a `u32` equal to the current process id.
fn doctor_lock_metadata_pid_is_current_process(raw: &str) -> bool {
    let current_pid = std::process::id();
    raw.lines()
        .filter_map(|line| line.split_once('='))
        .filter(|(key, _)| key.trim() == "pid")
        .any(|(_, value)| value.trim().parse::<u32>().ok() == Some(current_pid))
}
433
434fn doctor_lock_file_pid_is_current_process(file: &fs::File) -> bool {
435    use std::io::Read as _;
436
437    let Ok(mut file) = file.try_clone() else {
438        return false;
439    };
440    let mut raw = String::new();
441    let _ = std::io::Read::take(&mut file, DOCTOR_MUTATION_LOCK_MAX_METADATA_READ)
442        .read_to_string(&mut raw);
443    doctor_lock_metadata_pid_is_current_process(&raw)
444}
445
/// Decide whether a lock attempt failed because the lock is currently held.
///
/// `WouldBlock` always counts.  On Windows, raw OS error 33 counts as well
/// (presumably `ERROR_LOCK_VIOLATION` — confirm against the Windows error
/// code list).
fn doctor_mutation_lock_error_is_active(err: &std::io::Error) -> bool {
    let would_block = err.kind() == std::io::ErrorKind::WouldBlock;
    let windows_lock_violation = cfg!(windows) && err.raw_os_error() == Some(33);
    would_block || windows_lock_violation
}
460
461fn acquire_doctor_mutation_db_open_guard(
462    db_path: &Path,
463    timeout: Duration,
464) -> Result<DoctorMutationDbOpenGuard> {
465    let Some(lock_path) = doctor_mutation_lock_path_for_db_open(db_path) else {
466        return Ok(DoctorMutationDbOpenGuard(None));
467    };
468    if doctor_mutation_db_open_bypass_active() {
469        return Ok(DoctorMutationDbOpenGuard(None));
470    }
471
472    if let Some(parent) = lock_path.parent() {
473        fs::create_dir_all(parent).with_context(|| {
474            format!(
475                "creating doctor mutation lock directory {} before opening {}",
476                parent.display(),
477                db_path.display()
478            )
479        })?;
480    }
481
482    let deadline = Instant::now() + timeout;
483    let mut backoff = Duration::from_millis(4);
484    loop {
485        let file = fs::OpenOptions::new()
486            .create(true)
487            .truncate(false)
488            .read(true)
489            .write(true)
490            .open(&lock_path)
491            .with_context(|| {
492                format!(
493                    "opening doctor mutation lock {} before opening {}",
494                    lock_path.display(),
495                    db_path.display()
496                )
497            })?;
498
499        if doctor_lock_file_pid_is_current_process(&file) {
500            return Ok(DoctorMutationDbOpenGuard(None));
501        }
502
503        match fs2::FileExt::try_lock_shared(&file) {
504            Ok(()) => return Ok(DoctorMutationDbOpenGuard(Some(file))),
505            Err(err) if doctor_mutation_lock_error_is_active(&err) => {
506                let now = Instant::now();
507                if now >= deadline {
508                    return Err(anyhow!(
509                        "doctor mutation lock {} is active while opening {}; refusing to open during repair after waiting {}ms",
510                        lock_path.display(),
511                        db_path.display(),
512                        timeout.as_millis()
513                    ));
514                }
515                let remaining = deadline.saturating_duration_since(now);
516                sleep_with_franken_retry_backoff(
517                    &mut backoff,
518                    remaining,
519                    Duration::from_millis(128),
520                );
521            }
522            Err(err) => {
523                return Err(anyhow!(
524                    "failed to acquire shared doctor mutation lock {} before opening {}: {}",
525                    lock_path.display(),
526                    db_path.display(),
527                    err
528                ));
529            }
530        }
531    }
532}
533
534pub(crate) fn open_franken_storage_with_timeout(
535    path: &Path,
536    timeout: Duration,
537) -> Result<FrankenStorage> {
538    if !path.exists() {
539        return Err(anyhow!("Database not found at {}", path.display()));
540    }
541
542    let deadline = Instant::now() + timeout;
543    let mut backoff = Duration::from_millis(4);
544    loop {
545        match FrankenStorage::open(path) {
546            Ok(storage) => return Ok(storage),
547            Err(err) if retryable_franken_anyhow(&err) => {
548                let now = Instant::now();
549                if now >= deadline {
550                    return Err(err);
551                }
552                let remaining = deadline.saturating_duration_since(now);
553                sleep_with_franken_retry_backoff(
554                    &mut backoff,
555                    remaining,
556                    Duration::from_millis(128),
557                );
558            }
559            Err(err) => return Err(err),
560        }
561    }
562}
563
564pub(crate) fn open_current_schema_storage_with_timeout(
565    path: &Path,
566    timeout: Duration,
567) -> Result<Option<FrankenStorage>> {
568    if !path.exists() {
569        return Ok(None);
570    }
571
572    let mut storage = FrankenStorage::new(
573        open_franken_raw_connection_with_timeout(path, timeout)?,
574        path.to_path_buf(),
575    );
576    storage.apply_open_stage_busy_timeout();
577
578    let version = storage
579        .raw()
580        .query("SELECT value FROM meta WHERE key = 'schema_version';")
581        .ok()
582        .and_then(|rows| rows.first().cloned())
583        .and_then(|row| row.get_typed::<String>(0).ok())
584        .and_then(|raw| raw.parse::<i64>().ok());
585
586    if version != Some(CURRENT_SCHEMA_VERSION) {
587        if let Err(close_err) = storage.close_without_checkpoint_in_place() {
588            tracing::debug!(
589                error = %close_err,
590                db_path = %path.display(),
591                "open_current_schema_storage_with_timeout: close_without_checkpoint_in_place failed; falling back to best-effort close"
592            );
593            storage.close_best_effort_in_place();
594        }
595        return Ok(None);
596    }
597
598    transition_from_meta_version(&storage.conn)?;
599    storage.repair_missing_current_schema_objects()?;
600    storage.apply_config()?;
601    Ok(Some(storage))
602}
603
604pub(crate) fn open_franken_readonly_storage_with_timeout(
605    path: &Path,
606    timeout: Duration,
607) -> Result<FrankenStorage> {
608    if !path.exists() {
609        return Err(anyhow!("Database not found at {}", path.display()));
610    }
611
612    let deadline = Instant::now() + timeout;
613    let mut backoff = Duration::from_millis(4);
614    loop {
615        match FrankenStorage::open_readonly(path) {
616            Ok(storage) => return Ok(storage),
617            Err(err) if retryable_franken_anyhow(&err) => {
618                let now = Instant::now();
619                if now >= deadline {
620                    return Err(err);
621                }
622                let remaining = deadline.saturating_duration_since(now);
623                sleep_with_franken_retry_backoff(
624                    &mut backoff,
625                    remaining,
626                    Duration::from_millis(128),
627                );
628            }
629            Err(err) => return Err(err),
630        }
631    }
632}
633
634pub(crate) fn open_franken_raw_connection_with_timeout(
635    path: &Path,
636    timeout: Duration,
637) -> Result<FrankenConnection> {
638    if !path.exists() {
639        return Err(anyhow!("Database not found at {}", path.display()));
640    }
641
642    let path_str = path.to_string_lossy().to_string();
643    let deadline = Instant::now() + timeout;
644    let mut backoff = Duration::from_millis(4);
645    loop {
646        let _doctor_guard = acquire_doctor_mutation_db_open_guard(path, timeout)?;
647        match FrankenConnection::open(&path_str)
648            .with_context(|| format!("opening raw frankensqlite db at {}", path.display()))
649        {
650            Ok(conn) => return Ok(conn),
651            Err(err) if retryable_franken_anyhow(&err) => {
652                let now = Instant::now();
653                if now >= deadline {
654                    return Err(err);
655                }
656                let remaining = deadline.saturating_duration_since(now);
657                sleep_with_franken_retry_backoff(
658                    &mut backoff,
659                    remaining,
660                    Duration::from_millis(128),
661                );
662            }
663            Err(err) => return Err(err),
664        }
665    }
666}
667
668pub(crate) fn open_franken_raw_readonly_connection_with_timeout(
669    path: &Path,
670    timeout: Duration,
671) -> Result<FrankenConnection> {
672    if !path.exists() {
673        return Err(anyhow!("Database not found at {}", path.display()));
674    }
675
676    let path_str = path.to_string_lossy().to_string();
677    let deadline = Instant::now() + timeout;
678    let mut backoff = Duration::from_millis(4);
679    loop {
680        let _doctor_guard = acquire_doctor_mutation_db_open_guard(path, timeout)?;
681        match open_franken_with_flags(&path_str, FrankenOpenFlags::SQLITE_OPEN_READ_ONLY)
682            .with_context(|| {
683                format!(
684                    "opening raw frankensqlite db readonly at {}",
685                    path.display()
686                )
687            }) {
688            Ok(conn) => return Ok(conn),
689            Err(err) if retryable_franken_anyhow(&err) => {
690                let now = Instant::now();
691                if now >= deadline {
692                    return Err(err);
693                }
694                let remaining = deadline.saturating_duration_since(now);
695                sleep_with_franken_retry_backoff(
696                    &mut backoff,
697                    remaining,
698                    Duration::from_millis(128),
699                );
700            }
701            Err(err) => return Err(err),
702        }
703    }
704}
705
706pub(crate) fn retryable_franken_error(err: &frankensqlite::FrankenError) -> bool {
707    matches!(
708        err,
709        frankensqlite::FrankenError::Busy
710            | frankensqlite::FrankenError::BusyRecovery
711            | frankensqlite::FrankenError::BusySnapshot { .. }
712            | frankensqlite::FrankenError::DatabaseLocked { .. }
713            | frankensqlite::FrankenError::LockFailed { .. }
714            | frankensqlite::FrankenError::WriteConflict { .. }
715            | frankensqlite::FrankenError::SerializationFailure { .. }
716    ) || retryable_storage_error_message(&err.to_string())
717}
718
/// Heuristic: does an error message describe transient lock contention?
///
/// Matching is ASCII-case-insensitive and purely substring based.
pub(crate) fn retryable_storage_error_message(message: &str) -> bool {
    const CONTENTION_MARKERS: [&str; 6] = [
        "busy",
        "locked",
        "locking",
        "contention",
        "temporarily unavailable",
        "would block",
    ];

    let lowered = message.to_ascii_lowercase();
    CONTENTION_MARKERS
        .iter()
        .any(|marker| lowered.contains(marker))
}
728
729pub(crate) fn retryable_franken_anyhow(err: &anyhow::Error) -> bool {
730    err.chain().any(|cause| {
731        cause
732            .downcast_ref::<frankensqlite::FrankenError>()
733            .is_some_and(retryable_franken_error)
734            || retryable_storage_error_message(&cause.to_string())
735    })
736}
737
738impl Drop for LazyFrankenDb {
739    fn drop(&mut self) {
740        let Some(mut conn) = self.conn.get_mut().take() else {
741            return;
742        };
743        conn.0.close_best_effort_in_place();
744    }
745}
746
747// -------------------------------------------------------------------------
748// FrankenSQLite Connection Manager (bead 3rlf8)
749// -------------------------------------------------------------------------
750// Multi-connection management: reader pool + concurrent writer connections.
751// Replaces the LazyFrankenDb single-connection bottleneck for high-throughput
752// scenarios (indexer parallel writes, concurrent TUI reads + indexer writes).
753
/// Configuration for the [`FrankenConnectionManager`].
///
/// `FrankenConnectionManager::new` raises both values to at least 1.
#[derive(Debug, Clone)]
pub struct ConnectionManagerConfig {
    /// Number of pre-opened reader connections (default: 4).
    pub reader_count: usize,
    /// Maximum concurrent writer connections (default: available parallelism,
    /// or 4 when that cannot be determined).
    pub max_writers: usize,
}
762
763impl Default for ConnectionManagerConfig {
764    fn default() -> Self {
765        let cpus = std::thread::available_parallelism()
766            .map(|n| n.get())
767            .unwrap_or(4);
768        Self {
769            reader_count: 4,
770            max_writers: cpus,
771        }
772    }
773}
774
/// Multi-connection manager for frankensqlite.
///
/// Provides:
/// - A pool of pre-opened reader connections (round-robin, Mutex-protected)
/// - Controlled creation of writer connections with token-based limits
/// - RAII guards that auto-rollback uncommitted transactions on drop
///
/// Thread-safe: reader connections are wrapped in Mutex (FrankenConnection is !Sync).
/// Writer connections are created per-request (each thread gets its own).
pub struct FrankenConnectionManager {
    /// Database path used for every reader and writer connection.
    db_path: PathBuf,
    /// Reader pool; each connection sits behind its own mutex.
    readers: Vec<parking_lot::Mutex<SendFrankenConnection>>,
    /// Ever-increasing counter; taken modulo the pool size to pick a reader.
    reader_idx: std::sync::atomic::AtomicUsize,
    /// Token-based writer limit: channel pre-filled with `max_writers` tokens.
    /// `recv()` = acquire slot, `send()` = release slot.
    writer_tokens: (
        crossbeam_channel::Sender<()>,
        crossbeam_channel::Receiver<()>,
    ),
    /// Effective configuration (both counts already raised to at least 1).
    config: ConnectionManagerConfig,
}

// SAFETY: FrankenConnectionManager is Send+Sync because:
// - readers wrapped in Mutex<SendFrankenConnection> (exclusive access)
// - writer_tokens uses crossbeam (Send+Sync)
// - db_path is PathBuf (Send+Sync)
unsafe impl Send for FrankenConnectionManager {}
unsafe impl Sync for FrankenConnectionManager {}
803
804impl FrankenConnectionManager {
805    /// Create a new connection manager.
806    ///
807    /// Opens `config.reader_count` reader connections immediately.
808    /// Writer connections are created on demand (up to `config.max_writers`).
809    pub fn new(db_path: impl Into<PathBuf>, config: ConnectionManagerConfig) -> Result<Self> {
810        let db_path = db_path.into();
811        let path_str = db_path.to_string_lossy().to_string();
812
813        let reader_count = config.reader_count.max(1);
814        let mut readers = Vec::with_capacity(reader_count);
815        for _ in 0..reader_count {
816            let conn = FrankenConnection::open(&path_str)
817                .with_context(|| format!("opening reader connection at {}", db_path.display()))?;
818            // Apply read-tuned config (no migration, no write PRAGMAs)
819            let _ = conn.execute("PRAGMA busy_timeout = 5000;"); // match writer config
820            let _ = conn.execute("PRAGMA cache_size = -16384;"); // 16MB reader cache
821            readers.push(parking_lot::Mutex::new(SendFrankenConnection::new(conn)));
822        }
823
824        let max_writers = config.max_writers.max(1);
825
826        // Pre-fill bounded channel with tokens (acts as counting semaphore).
827        // A zero-capacity channel with no initial tokens would make the first
828        // writer acquisition block forever.
829        let (tx, rx) = crossbeam_channel::bounded(max_writers);
830        for _ in 0..max_writers {
831            tx.send(())
832                .map_err(|_| anyhow!("writer token channel closed during initialization"))?;
833        }
834
835        Ok(Self {
836            db_path,
837            readers,
838            reader_idx: std::sync::atomic::AtomicUsize::new(0),
839            writer_tokens: (tx, rx),
840            config: ConnectionManagerConfig {
841                reader_count,
842                max_writers,
843            },
844        })
845    }
846
847    /// Get a reader connection (round-robin from the pool).
848    ///
849    /// Returns a mutex guard wrapping the connection. The guard prevents
850    /// concurrent access to the same connection (FrankenConnection is !Sync).
851    pub fn reader(&self) -> parking_lot::MutexGuard<'_, SendFrankenConnection> {
852        let idx = self
853            .reader_idx
854            .fetch_add(1, std::sync::atomic::Ordering::Relaxed);
855        self.readers[idx % self.readers.len()].lock()
856    }
857
858    /// Acquire a writer connection.
859    ///
860    /// Opens a new frankensqlite connection with full config (no migration).
861    /// Blocks if `max_writers` connections are already in use.
862    /// The returned [`WriterGuard`] auto-rolls back on drop.
863    pub fn writer(&self) -> Result<WriterGuard<'_>> {
864        self.writer_tokens
865            .1
866            .recv()
867            .map_err(|_| anyhow!("writer token channel closed"))?;
868        let path_str = self.db_path.to_string_lossy().to_string();
869        let conn = match FrankenConnection::open(&path_str) {
870            Ok(c) => c,
871            Err(e) => {
872                let _ = self.writer_tokens.0.send(());
873                return Err(anyhow::Error::from(e).context(format!(
874                    "opening writer connection at {}",
875                    self.db_path.display()
876                )));
877            }
878        };
879        let storage = FrankenStorage::new(conn, self.db_path.clone());
880        if let Err(e) = storage.apply_config() {
881            let _ = self.writer_tokens.0.send(());
882            return Err(e);
883        }
884        Ok(WriterGuard {
885            storage,
886            mgr: self,
887            committed: false,
888        })
889    }
890
891    /// Acquire a concurrent writer connection (BEGIN CONCURRENT via MVCC).
892    ///
893    /// Similar to [`writer`] but tuned for the parallel indexer write pool.
894    /// Uses reduced cache size and is designed for short-lived batch inserts.
895    pub fn concurrent_writer(&self) -> Result<WriterGuard<'_>> {
896        self.writer_tokens
897            .1
898            .recv()
899            .map_err(|_| anyhow!("writer token channel closed"))?;
900        let path_str = self.db_path.to_string_lossy().to_string();
901        let conn = match FrankenConnection::open(&path_str) {
902            Ok(c) => c,
903            Err(e) => {
904                let _ = self.writer_tokens.0.send(());
905                return Err(anyhow::Error::from(e).context(format!(
906                    "opening concurrent writer at {}",
907                    self.db_path.display()
908                )));
909            }
910        };
911        let storage = FrankenStorage::new(conn, self.db_path.clone());
912        if let Err(e) = storage.apply_config() {
913            let _ = self.writer_tokens.0.send(());
914            return Err(e);
915        }
916        // Reduced cache for concurrent writers (they're short-lived)
917        let _ = storage.raw().execute("PRAGMA cache_size = -4096;");
918        Ok(WriterGuard {
919            storage,
920            mgr: self,
921            committed: false,
922        })
923    }
924
925    /// Database path managed by this pool.
926    pub fn db_path(&self) -> &Path {
927        &self.db_path
928    }
929
930    /// Number of reader connections in the pool.
931    pub fn reader_count(&self) -> usize {
932        self.readers.len()
933    }
934
935    /// Maximum concurrent writers allowed.
936    pub fn max_writers(&self) -> usize {
937        self.config.max_writers
938    }
939}
940
941impl Drop for FrankenConnectionManager {
942    fn drop(&mut self) {
943        for reader in &mut self.readers {
944            reader.get_mut().0.close_best_effort_in_place();
945        }
946    }
947}
948
/// RAII guard for a writer connection.
///
/// Provides access to a [`FrankenStorage`] for write operations.
/// Releases the writer semaphore slot when dropped.
///
/// Unless [`WriterGuard::mark_committed`] was called, dropping the guard also
/// issues a best-effort `ROLLBACK` before closing the connection.
pub struct WriterGuard<'a> {
    // The writer connection owned by this guard; closed on drop.
    storage: FrankenStorage,
    // Manager whose token channel receives the writer token back on drop.
    mgr: &'a FrankenConnectionManager,
    // Set by `mark_committed`; when false, drop attempts a rollback.
    committed: bool,
}

impl<'a> WriterGuard<'a> {
    /// Access the underlying storage for read/write operations.
    pub fn storage(&self) -> &FrankenStorage {
        &self.storage
    }

    /// Mark this writer as successfully committed.
    ///
    /// Call after your transaction's `commit()` succeeds. Prevents the drop
    /// guard from attempting a rollback.
    pub fn mark_committed(&mut self) {
        self.committed = true;
    }
}

impl Drop for WriterGuard<'_> {
    /// Rolls back (if not committed), closes the connection, then returns the
    /// writer token. The token is released last so the slot only frees up
    /// after this connection has been closed.
    fn drop(&mut self) {
        if !self.committed {
            // Best-effort rollback — connection may already be in autocommit
            let _ = self.storage.raw().execute("ROLLBACK;");
        }
        self.storage.close_best_effort_in_place();
        // Release writer token; a send failure is ignored because nothing
        // useful can be done about it during drop.
        let _ = self.mgr.writer_tokens.0.send(());
    }
}
985
986// -------------------------------------------------------------------------
987// Binary Metadata Serialization (Opt 3.1)
988// -------------------------------------------------------------------------
989// MessagePack provides 50-70% storage reduction vs JSON and faster parsing.
990// New rows use binary columns; existing JSON is read on fallback.
991
992/// Serialize a JSON value to MessagePack bytes.
993/// Returns None for null/empty values to save storage.
994fn serialize_json_to_msgpack(value: &serde_json::Value) -> Option<Vec<u8>> {
995    if value.is_null() || value.as_object().is_some_and(|o| o.is_empty()) {
996        return None;
997    }
998    rmp_serde::to_vec(value).ok()
999}
1000
1001/// Deserialize MessagePack bytes to a JSON value.
1002/// Returns default Value::Object({}) on error or empty input.
1003fn deserialize_msgpack_to_json(bytes: &[u8]) -> serde_json::Value {
1004    if bytes.is_empty() {
1005        return serde_json::Value::Object(serde_json::Map::new());
1006    }
1007    rmp_serde::from_slice(bytes).unwrap_or_else(|e| {
1008        tracing::debug!(
1009            error = %e,
1010            bytes_len = bytes.len(),
1011            "Failed to deserialize metadata - returning empty object"
1012        );
1013        serde_json::Value::Object(serde_json::Map::new())
1014    })
1015}
1016
1017/// Read metadata from a frankensqlite Row, preferring binary (msgpack) over JSON.
1018fn franken_read_metadata_compat(
1019    row: &FrankenRow,
1020    json_idx: usize,
1021    bin_idx: usize,
1022) -> serde_json::Value {
1023    // Try binary column first (new format)
1024    if let Ok(Some(bytes)) = row.get_typed::<Option<Vec<u8>>>(bin_idx)
1025        && !bytes.is_empty()
1026    {
1027        return deserialize_msgpack_to_json(&bytes);
1028    }
1029
1030    // Fall back to JSON column (old format or migration in progress)
1031    if let Ok(Some(json_str)) = row.get_typed::<Option<String>>(json_idx) {
1032        return serde_json::from_str(&json_str)
1033            .unwrap_or_else(|_| serde_json::Value::Object(serde_json::Map::new()));
1034    }
1035
1036    serde_json::Value::Object(serde_json::Map::new())
1037}
1038
1039fn franken_read_message_extra_compat(
1040    row: &FrankenRow,
1041    json_idx: usize,
1042    bin_idx: usize,
1043) -> serde_json::Value {
1044    if let Ok(Some(bytes)) = row.get_typed::<Option<Vec<u8>>>(bin_idx)
1045        && !bytes.is_empty()
1046    {
1047        return deserialize_msgpack_to_json(&bytes);
1048    }
1049
1050    if let Ok(Some(json_str)) = row.get_typed::<Option<String>>(json_idx) {
1051        return serde_json::from_str(&json_str).unwrap_or(serde_json::Value::Null);
1052    }
1053
1054    serde_json::Value::Null
1055}
1056
1057// -------------------------------------------------------------------------
1058// Migration Error Types (P1.5)
1059// -------------------------------------------------------------------------
1060
/// Error type for schema migration operations.
///
/// `frankensqlite::FrankenError` and `std::io::Error` convert into this type
/// automatically through the `#[from]` attributes, so `?` works on both.
#[derive(Debug, Error)]
pub enum MigrationError {
    /// The schema requires a full rebuild. The database has been backed up.
    #[error("Rebuild required: {reason}")]
    RebuildRequired {
        /// Human-readable explanation; this is the only field shown by `Display`.
        reason: String,
        /// Location of the backup, when one is available.
        backup_path: Option<std::path::PathBuf>,
    },

    /// A database error occurred during migration.
    #[error("Database error: {0}")]
    Database(#[from] frankensqlite::FrankenError),

    /// An I/O error occurred during backup.
    #[error("I/O error: {0}")]
    Io(#[from] std::io::Error),

    /// Other migration error.
    #[error("{0}")]
    Other(String),
}
1083
1084impl From<anyhow::Error> for MigrationError {
1085    fn from(e: anyhow::Error) -> Self {
1086        MigrationError::Other(e.to_string())
1087    }
1088}
1089
/// Maximum number of backup files to retain.
const MAX_BACKUPS: usize = 3;
/// Statement executed on the backup connection before `VACUUM INTO` runs;
/// it sets the connection's busy timeout to 30000 (milliseconds in SQLite).
const BACKUP_VACUUM_BUSY_TIMEOUT_PRAGMA: &str = "PRAGMA busy_timeout = 30000;";
1093
1094/// Files that contain user-authored state and must NEVER be deleted during rebuild.
1095const USER_DATA_FILES: &[&str] = &["bookmarks.db", "tui_state.json", "sources.toml", ".env"];
1096
1097/// Check if a file is user-authored data that must be preserved during rebuild.
1098pub fn is_user_data_file(path: &Path) -> bool {
1099    path.file_name()
1100        .and_then(|n| n.to_str())
1101        .map(|name| USER_DATA_FILES.contains(&name))
1102        .unwrap_or(false)
1103}
1104
1105/// SQL to register the FTS5 virtual table on a frankensqlite connection.
1106///
1107/// FrankenSQLite skips virtual-table entries (rootpage=0) when loading
1108/// `sqlite_master` from a stock-SQLite database.  Executing this CREATE
1109/// triggers the legacy FTS5 fallback path and materialises the table so
1110/// subsequent FTS queries work.
1111pub const FTS5_REGISTER_SQL: &str = "\
1112    CREATE VIRTUAL TABLE IF NOT EXISTS fts_messages USING fts5(\
1113        content, title, agent, workspace, source_path, \
1114        created_at UNINDEXED, \
1115        content='', tokenize='porter'\
1116    )";
1117
1118const FTS_FRANKEN_REBUILD_META_KEY: &str = "fts_frankensqlite_rebuild_generation";
1119const FTS_FRANKEN_REBUILD_FINGERPRINT_META_KEY: &str = "fts_frankensqlite_archive_fingerprint";
1120const FTS_FRANKEN_REBUILD_GENERATION: i64 = 1;
1121const DAILY_STATS_HEALTH_META_KEY: &str = "daily_stats_archive_fingerprint";
1122const DAILY_STATS_HEALTH_GENERATION_META_KEY: &str = "daily_stats_health_generation";
1123const DAILY_STATS_HEALTH_GENERATION: i64 = 1;
1124
1125/// SQL to clear all rows from the contentless `fts_messages` table.
1126///
1127/// Contentless FTS5 tables reject ordinary `DELETE FROM ...` statements.
1128pub const FTS5_DELETE_ALL_SQL: &str =
1129    "INSERT INTO fts_messages(fts_messages) VALUES('delete-all');";
1130
/// Names of the shadow tables that must accompany `fts_messages`.
///
/// Integrity validation reports any of these that are absent from
/// `sqlite_master` as missing.
pub const FTS_MESSAGES_REQUIRED_SHADOW_TABLES: [&str; 5] = [
    "fts_messages_config",
    "fts_messages_content",
    "fts_messages_data",
    "fts_messages_docsize",
    "fts_messages_idx",
];

/// Query used to probe whether `fts_messages` can be read at all.
/// `LIMIT 0` means no rows are returned; only success or failure matters.
pub const FTS_MESSAGES_INTEGRITY_PROBE_SQL: &str = "SELECT * FROM fts_messages LIMIT 0";

/// Recovery advice appended to every `FtsMessagesIntegrityError` message.
pub const FTS_MESSAGES_CORRUPTION_RECOVERY_HINT: &str = "Stop all cass index/watch processes, back up the current database, then run \
     'cass doctor check --json' for a read-only diagnosis before using a supported \
     repair/rebuild path.";
1144
1145#[derive(Debug, Clone, PartialEq, Eq)]
1146pub struct FtsMessagesIntegrityError {
1147    missing_shadow_tables: Vec<&'static str>,
1148    failed_sql: Option<&'static str>,
1149    source_error: Option<String>,
1150}
1151
1152impl FtsMessagesIntegrityError {
1153    fn new(
1154        missing_shadow_tables: Vec<&'static str>,
1155        failed_sql: Option<&'static str>,
1156        source_error: Option<String>,
1157    ) -> Self {
1158        Self {
1159            missing_shadow_tables,
1160            failed_sql,
1161            source_error,
1162        }
1163    }
1164
1165    pub fn missing_shadow_tables(&self) -> &[&'static str] {
1166        &self.missing_shadow_tables
1167    }
1168}
1169
1170impl std::fmt::Display for FtsMessagesIntegrityError {
1171    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
1172        write!(
1173            f,
1174            "CASS database FTS5 index is corrupt: fts_messages exists, but required FTS5 shadow tables are missing or unreadable"
1175        )?;
1176        if !self.missing_shadow_tables.is_empty() {
1177            write!(
1178                f,
1179                "; missing shadow tables: {}",
1180                self.missing_shadow_tables.join(", ")
1181            )?;
1182        }
1183        if let Some(sql) = self.failed_sql {
1184            write!(f, "; failed SQL: {sql}")?;
1185        }
1186        if let Some(source_error) = &self.source_error {
1187            write!(f, "; error: {source_error}")?;
1188        }
1189        write!(
1190            f,
1191            ". Suggested recovery: {FTS_MESSAGES_CORRUPTION_RECOVERY_HINT}"
1192        )
1193    }
1194}
1195
1196impl std::error::Error for FtsMessagesIntegrityError {}
1197
1198pub fn fts_messages_integrity_error_from_message(
1199    source_error: impl Into<String>,
1200) -> Option<FtsMessagesIntegrityError> {
1201    let source_error = source_error.into();
1202    let lower = source_error.to_ascii_lowercase();
1203    if !lower.contains("fts_messages") {
1204        return None;
1205    }
1206
1207    let mentions_structural_fts_failure = lower.contains("shadow table")
1208        || lower.contains("vtable constructor failed")
1209        || lower.contains("sqlite_corrupt")
1210        || lower.contains("databasecorrupt")
1211        || lower.contains("database corrupt")
1212        || lower.contains("missing required");
1213    if !mentions_structural_fts_failure {
1214        return None;
1215    }
1216
1217    let missing_shadow_tables = FTS_MESSAGES_REQUIRED_SHADOW_TABLES
1218        .iter()
1219        .copied()
1220        .filter(|table| lower.contains(&table.to_ascii_lowercase()))
1221        .collect::<Vec<_>>();
1222
1223    Some(FtsMessagesIntegrityError::new(
1224        missing_shadow_tables,
1225        Some(FTS_MESSAGES_INTEGRITY_PROBE_SQL),
1226        Some(source_error),
1227    ))
1228}
1229
/// Decide whether an `fts_messages` schema may lack shadow-table metadata.
///
/// The SQL is compared with all whitespace removed and ASCII letters
/// lowercased. It qualifies when it declares an FTS5 table (`usingfts5(`),
/// is contentless (`content=''`), and does not mention `message_id`.
fn fts_schema_tolerates_missing_shadow_metadata(sql: &str) -> bool {
    let mut compact = String::with_capacity(sql.len());
    for ch in sql.chars() {
        if !ch.is_whitespace() {
            compact.push(ch.to_ascii_lowercase());
        }
    }

    let declares_fts5 = compact.contains("usingfts5(");
    let is_contentless = compact.contains("content=''");
    let mentions_message_id = compact.contains("message_id");
    declares_fts5 && is_contentless && !mentions_message_id
}
1240
1241pub fn validate_fts_messages_integrity_for_connection(conn: &FrankenConnection) -> Result<()> {
1242    let fts_schema_sql: Vec<String> = conn
1243        .query_map_collect(
1244            "SELECT sql FROM sqlite_master WHERE type = 'table' AND name = 'fts_messages'",
1245            fparams![],
1246            |row: &FrankenRow| row.get_typed::<String>(0),
1247        )
1248        .with_context(|| "checking for fts_messages in sqlite_master")?;
1249    if fts_schema_sql.is_empty() {
1250        return Ok(());
1251    }
1252
1253    let probe_error = conn.query(FTS_MESSAGES_INTEGRITY_PROBE_SQL).err();
1254    if probe_error.is_none()
1255        && fts_schema_sql
1256            .iter()
1257            .all(|sql| fts_schema_tolerates_missing_shadow_metadata(sql))
1258    {
1259        return Ok(());
1260    }
1261
1262    let present_shadow_tables: HashSet<String> = conn
1263        .query_map_collect(
1264            "SELECT name FROM sqlite_master
1265             WHERE type = 'table'
1266               AND name IN (
1267                 'fts_messages_config',
1268                 'fts_messages_content',
1269                 'fts_messages_data',
1270                 'fts_messages_docsize',
1271                 'fts_messages_idx'
1272               )",
1273            fparams![],
1274            |row: &FrankenRow| row.get_typed::<String>(0),
1275        )
1276        .map(|rows| rows.into_iter().collect())
1277        .map_err(|err| {
1278            FtsMessagesIntegrityError::new(
1279                Vec::new(),
1280                Some(
1281                    "SELECT name FROM sqlite_master WHERE name IN \
1282                     ('fts_messages_config','fts_messages_content','fts_messages_data','fts_messages_docsize','fts_messages_idx')",
1283                ),
1284                Some(err.to_string()),
1285            )
1286        })?;
1287    let missing_shadow_tables = FTS_MESSAGES_REQUIRED_SHADOW_TABLES
1288        .iter()
1289        .copied()
1290        .filter(|table| !present_shadow_tables.contains(*table))
1291        .collect::<Vec<_>>();
1292
1293    // If every required shadow table is present, the FTS5 schema is
1294    // structurally sound. A probe-SQL failure here typically reflects an
1295    // incomplete FTS5 runtime emulation (e.g. frankensqlite's vtable path)
1296    // rather than fixture corruption — and conflating the two would
1297    // wrongly reject every database with the new message_id schema that
1298    // frankensqlite happens to serve via a different code path. Returning
1299    // Ok here keeps the false-positive surface narrow; the truly-missing-
1300    // shadow case below still surfaces as before.
1301    if missing_shadow_tables.is_empty() {
1302        return Ok(());
1303    }
1304
1305    Err(FtsMessagesIntegrityError::new(
1306        missing_shadow_tables,
1307        probe_error
1308            .as_ref()
1309            .map(|_| FTS_MESSAGES_INTEGRITY_PROBE_SQL),
1310        probe_error.map(|err| err.to_string()),
1311    )
1312    .into())
1313}
1314
#[cfg(test)]
pub(crate) fn materialize_fresh_fts_schema_via_rusqlite(db_path: &Path) -> Result<()> {
    // Despite the name, everything runs through FrankenStorage. The FTS
    // rebuild it performs also populates rows, which matches the historical
    // meaning of "fresh FTS": the schema exists and is consistent with the
    // message rows. The rebuild's row count is not needed here.
    let storage = FrankenStorage::open(db_path).with_context(|| {
        format!(
            "opening frankensqlite db at {} for FTS materialization",
            db_path.display()
        )
    })?;
    storage.rebuild_fts_via_frankensqlite()?;
    Ok(())
}
1330
/// Test helper: rebuild the FTS index for the database at `db_path` and
/// record the rebuild generation. Returns the count reported by the rebuild.
#[cfg(test)]
pub(crate) fn rebuild_fts_via_rusqlite(db_path: &Path) -> Result<usize> {
    let open_context = || {
        format!(
            "opening frankensqlite db at {} for FTS rebuild",
            db_path.display()
        )
    };
    let storage = FrankenStorage::open(db_path).with_context(open_context)?;

    let rebuilt = storage.rebuild_fts_via_frankensqlite()?;
    // The generation marker is written only after the rebuild succeeded.
    storage.record_fts_franken_rebuild_generation()?;
    Ok(rebuilt)
}
1343
1344pub(crate) fn ensure_fts_consistency_via_rusqlite(db_path: &Path) -> Result<FtsConsistencyRepair> {
1345    // Delegates to the FrankenStorage-native path. The function name retains
1346    // the `_via_rusqlite` suffix only for backwards compatibility with the
1347    // few test-site callers; all operations now run through frankensqlite.
1348    let storage = FrankenStorage::open(db_path).with_context(|| {
1349        format!(
1350            "opening frankensqlite db at {} for FTS consistency check",
1351            db_path.display()
1352        )
1353    })?;
1354    storage.ensure_search_fallback_fts_consistency()
1355}
1356
1357/// Create a uniquely named backup of the database file.
1358///
1359/// Returns the path to the backup file, or None if the source doesn't exist.
1360pub fn create_backup(db_path: &Path) -> Result<Option<std::path::PathBuf>, MigrationError> {
1361    if !bundle_path_exists(db_path)? {
1362        return Ok(None);
1363    }
1364
1365    if !copyable_bundle_file_exists(db_path)? {
1366        return Ok(None);
1367    }
1368    let _ = copyable_bundle_sidecar_sources(db_path)?;
1369
1370    let backup_path = unique_backup_path(db_path);
1371    let vacuum_stage_path = vacuum_stage_backup_path(&backup_path);
1372
1373    // Try to use SQLite's VACUUM INTO command first, which safely handles WAL files
1374    // and produces a clean, minimized backup.
1375    match vacuum_into_backup_stage(db_path, &vacuum_stage_path) {
1376        Ok(()) => {
1377            fs::rename(&vacuum_stage_path, &backup_path)?;
1378        }
1379        Err(err) if backup_vacuum_error_requires_consistent_retry(&err) => {
1380            tracing::warn!(
1381                db_path = %db_path.display(),
1382                error = %err,
1383                "create_backup: VACUUM INTO hit transient contention; refusing raw WAL bundle copy"
1384            );
1385            return Err(MigrationError::Database(err));
1386        }
1387        Err(err) => {
1388            tracing::warn!(
1389                db_path = %db_path.display(),
1390                error = %err,
1391                "create_backup: VACUUM INTO failed; falling back to raw evidence copy"
1392            );
1393        }
1394    }
1395
1396    if backup_path.exists() {
1397        sync_file_if_exists(&backup_path)?;
1398        if let Some(parent) = backup_path.parent() {
1399            sync_parent_directory(parent)?;
1400        }
1401        return Ok(Some(backup_path));
1402    }
1403
1404    // Fallback to a raw evidence copy if VACUUM INTO failed (e.g., older SQLite
1405    // or corruption). Keep this on the same symlink-safe bundle path as
1406    // historical seeding so a malformed archive root cannot make us copy an
1407    // arbitrary symlink target or publish a partial sidecar backup.
1408    copy_database_bundle(db_path, &backup_path)?;
1409
1410    Ok(Some(backup_path))
1411}
1412
/// Run `VACUUM INTO` from the database at `db_path` into `stage_path`.
///
/// The source is opened read-only and the backup busy-timeout pragma is
/// applied before the VACUUM statement. The connection is closed whether or
/// not the VACUUM succeeded; a close failure is only logged and does not
/// change the returned result.
///
/// # Errors
///
/// Returns the error from opening the connection, from the pragma, or from
/// the `VACUUM INTO` statement itself.
fn vacuum_into_backup_stage(
    db_path: &Path,
    stage_path: &Path,
) -> std::result::Result<(), frankensqlite::FrankenError> {
    let mut conn = open_franken_with_flags(
        &db_path.to_string_lossy(),
        FrankenOpenFlags::SQLITE_OPEN_READ_ONLY,
    )?;
    // Run the fallible steps inside a closure so that an early `?` exit still
    // falls through to the close logic below.
    let result = (|| {
        conn.execute(BACKUP_VACUUM_BUSY_TIMEOUT_PRAGMA)?;
        let path_str = stage_path.to_string_lossy();
        // The destination path is passed as a bound parameter, not spliced
        // into the SQL text.
        conn.execute_compat("VACUUM INTO ?", fparams![path_str.as_ref()])?;
        Ok(())
    })();
    if let Err(close_err) = conn.close_in_place() {
        tracing::warn!(
            error = %close_err,
            db_path = %db_path.display(),
            "create_backup: close_in_place failed after VACUUM INTO; falling back to best-effort close"
        );
        conn.close_best_effort_in_place();
    }
    result
}
1437
/// Whether a `VACUUM INTO` failure must be surfaced to the caller instead of
/// falling back to a raw bundle copy.
///
/// Currently this is exactly the set of errors `retryable_franken_error`
/// classifies as retryable.
fn backup_vacuum_error_requires_consistent_retry(err: &frankensqlite::FrankenError) -> bool {
    retryable_franken_error(err)
}
1441
/// Records which members of a database bundle were moved.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub struct DatabaseBundleMoveResult {
    /// The main database file was moved.
    pub database: bool,
    /// The `-wal` sidecar was moved.
    pub wal: bool,
    /// The `-shm` sidecar was moved.
    pub shm: bool,
}

impl DatabaseBundleMoveResult {
    /// True when at least one bundle member was moved.
    pub fn moved_any(&self) -> bool {
        [self.database, self.wal, self.shm].contains(&true)
    }
}
1454
/// Build the path of a database sidecar by appending `suffix` (for example
/// `"-wal"` or `"-shm"`) to the full text of `path`.
///
/// The suffix is appended at the `OsString` level, so paths that are not valid
/// UTF-8 keep their exact bytes. A lossy string conversion would replace such
/// bytes with U+FFFD and yield a sidecar path that points at a different file.
fn database_sidecar_path(path: &Path, suffix: &str) -> PathBuf {
    let mut sidecar = path.as_os_str().to_os_string();
    sidecar.push(suffix);
    PathBuf::from(sidecar)
}
1458
1459/// Move a database file and its WAL/SHM sidecars to a new basename.
1460///
1461/// This is used for non-destructive quarantine of a corrupted bundle before a
1462/// rebuild. If the main database file is already missing but orphaned sidecars
1463/// remain, those sidecars are still moved so a fresh database can be created
1464/// without inheriting stale WAL state.
1465pub(crate) fn move_database_bundle(
1466    source_root: &Path,
1467    destination_root: &Path,
1468) -> std::io::Result<DatabaseBundleMoveResult> {
1469    let mut moved = DatabaseBundleMoveResult::default();
1470    if let Some(parent) = destination_root.parent() {
1471        fs::create_dir_all(parent)?;
1472        sync_parent_directory(parent)?;
1473    }
1474
1475    if bundle_path_exists(source_root)? {
1476        fs::rename(source_root, destination_root)?;
1477        moved.database = true;
1478    }
1479
1480    let wal_source = database_sidecar_path(source_root, "-wal");
1481    if bundle_path_exists(&wal_source)? {
1482        fs::rename(&wal_source, database_sidecar_path(destination_root, "-wal"))?;
1483        moved.wal = true;
1484    }
1485
1486    let shm_source = database_sidecar_path(source_root, "-shm");
1487    if bundle_path_exists(&shm_source)? {
1488        fs::rename(&shm_source, database_sidecar_path(destination_root, "-shm"))?;
1489        moved.shm = true;
1490    }
1491
1492    if moved.moved_any() {
1493        if let Some(parent) = source_root.parent() {
1494            sync_parent_directory(parent)?;
1495        }
1496        if let Some(parent) = destination_root.parent() {
1497            sync_parent_directory(parent)?;
1498        }
1499    }
1500
1501    Ok(moved)
1502}
1503
/// Report whether anything exists at `path`, without following symlinks.
///
/// A dangling symlink therefore counts as existing. "Not found" maps to
/// `Ok(false)`; any other metadata error is returned to the caller.
fn bundle_path_exists(path: &Path) -> std::io::Result<bool> {
    if let Err(err) = fs::symlink_metadata(path) {
        return if err.kind() == std::io::ErrorKind::NotFound {
            Ok(false)
        } else {
            Err(err)
        };
    }
    Ok(true)
}
1511
1512fn copy_database_bundle(source_root: &Path, destination_root: &Path) -> Result<()> {
1513    if let Some(parent) = destination_root.parent() {
1514        fs::create_dir_all(parent).with_context(|| {
1515            format!(
1516                "creating destination directory for database bundle copy: {}",
1517                parent.display()
1518            )
1519        })?;
1520        sync_parent_directory(parent)
1521            .with_context(|| format!("syncing destination directory {}", parent.display()))?;
1522    }
1523
1524    if !copyable_bundle_file_exists(source_root)? {
1525        bail!(
1526            "database bundle root is missing before copy: {}",
1527            source_root.display()
1528        );
1529    }
1530
1531    let sidecars = copyable_bundle_sidecar_sources(source_root)?;
1532
1533    fs::copy(source_root, destination_root).with_context(|| {
1534        format!(
1535            "copying database bundle {} -> {}",
1536            source_root.display(),
1537            destination_root.display()
1538        )
1539    })?;
1540    sync_file_if_exists(destination_root).with_context(|| {
1541        format!(
1542            "syncing copied database bundle {}",
1543            destination_root.display()
1544        )
1545    })?;
1546
1547    for (source_sidecar, suffix) in sidecars {
1548        let destination_sidecar = database_sidecar_path(destination_root, suffix);
1549        fs::copy(&source_sidecar, &destination_sidecar).with_context(|| {
1550            format!(
1551                "copying database bundle sidecar {} -> {}",
1552                source_sidecar.display(),
1553                destination_sidecar.display()
1554            )
1555        })?;
1556        sync_file_if_exists(&destination_sidecar).with_context(|| {
1557            format!(
1558                "syncing copied database bundle sidecar {}",
1559                destination_sidecar.display()
1560            )
1561        })?;
1562    }
1563
1564    if let Some(parent) = destination_root.parent() {
1565        sync_parent_directory(parent)
1566            .with_context(|| format!("syncing destination directory {}", parent.display()))?;
1567    }
1568
1569    Ok(())
1570}
1571
1572fn copyable_bundle_sidecar_sources(source_root: &Path) -> Result<Vec<(PathBuf, &'static str)>> {
1573    let mut sidecars = Vec::new();
1574    for suffix in ["-wal", "-shm"] {
1575        let source_sidecar = database_sidecar_path(source_root, suffix);
1576        if copyable_bundle_file_exists(&source_sidecar)? {
1577            sidecars.push((source_sidecar, suffix));
1578        }
1579    }
1580    Ok(sidecars)
1581}
1582
1583fn copyable_bundle_file_exists(path: &Path) -> Result<bool> {
1584    match fs::symlink_metadata(path) {
1585        Ok(metadata) => {
1586            let file_type = metadata.file_type();
1587            if file_type.is_symlink() {
1588                bail!(
1589                    "refusing to copy database bundle symlink: {}",
1590                    path.display()
1591                );
1592            }
1593            if !file_type.is_file() {
1594                bail!(
1595                    "refusing to copy non-file database bundle path: {}",
1596                    path.display()
1597                );
1598            }
1599            Ok(true)
1600        }
1601        Err(err) if err.kind() == std::io::ErrorKind::NotFound => Ok(false),
1602        Err(err) => Err(err).with_context(|| {
1603            format!(
1604                "checking database bundle path before copy: {}",
1605                path.display()
1606            )
1607        }),
1608    }
1609}
1610
1611/// Helper to safely remove a database file and its potential WAL/SHM sidecars.
1612pub(crate) fn remove_database_files(path: &Path) -> std::io::Result<()> {
1613    let mut removed_any = false;
1614
1615    match fs::remove_file(path) {
1616        Ok(()) => removed_any = true,
1617        Err(err) if err.kind() == std::io::ErrorKind::NotFound => {}
1618        Err(err) => return Err(err),
1619    }
1620
1621    // Best-effort removal of sidecar files (ignore errors if they don't exist)
1622    for suffix in ["-wal", "-shm"] {
1623        match fs::remove_file(database_sidecar_path(path, suffix)) {
1624            Ok(()) => removed_any = true,
1625            Err(err) if err.kind() == std::io::ErrorKind::NotFound => {}
1626            Err(err) => return Err(err),
1627        }
1628    }
1629
1630    if removed_any && let Some(parent) = path.parent() {
1631        sync_parent_directory(parent)?;
1632    }
1633
1634    Ok(())
1635}
1636
/// Open the directory at `path` and sync it, so that directory-entry changes
/// (creates, renames, removals) are flushed.
#[cfg(not(windows))]
fn sync_parent_directory(path: &Path) -> std::io::Result<()> {
    let directory = fs::File::open(path)?;
    directory.sync_all()
}

/// Windows variant: a no-op that always reports success.
#[cfg(windows)]
fn sync_parent_directory(_path: &Path) -> std::io::Result<()> {
    Ok(())
}
1646
/// Sync the file at `path` to disk; a path that does not exist is a no-op.
#[cfg(not(windows))]
fn sync_file_if_exists(path: &Path) -> std::io::Result<()> {
    if !path.exists() {
        return Ok(());
    }
    let file = fs::File::open(path)?;
    file.sync_all()?;
    Ok(())
}

/// Windows variant: identical contract, but the file is opened read+write
/// before syncing.
#[cfg(windows)]
fn sync_file_if_exists(path: &Path) -> std::io::Result<()> {
    if !path.exists() {
        return Ok(());
    }
    let file = fs::OpenOptions::new().read(true).write(true).open(path)?;
    file.sync_all()?;
    Ok(())
}
1666
1667/// Remove old backup files, keeping only the most recent `keep_count`.
1668pub fn cleanup_old_backups(db_path: &Path, keep_count: usize) -> Result<(), std::io::Error> {
1669    let parent = match db_path.parent() {
1670        Some(p) => p,
1671        None => return Ok(()),
1672    };
1673
1674    let db_name = db_path.file_name().and_then(|n| n.to_str()).unwrap_or("db");
1675
1676    let prefix = format!("{}.backup.", db_name);
1677
1678    // Collect backup files matching the pattern
1679    let mut backups: Vec<(std::path::PathBuf, SystemTime)> = Vec::new();
1680
1681    if let Ok(entries) = fs::read_dir(parent) {
1682        for entry in entries.flatten() {
1683            let path = entry.path();
1684            if let Some(name) = path.file_name().and_then(|n| n.to_str())
1685                && is_backup_root_name(name, &prefix)
1686                && let Ok(meta) = fs::metadata(&path)
1687                && meta.is_file()
1688                && let Ok(mtime) = meta.modified()
1689            {
1690                backups.push((path, mtime));
1691            }
1692        }
1693    }
1694
1695    // Sort by modification time, newest first
1696    backups.sort_by_key(|entry| std::cmp::Reverse(entry.1));
1697
1698    // Delete oldest backups beyond keep_count
1699    for (path, _) in backups.into_iter().skip(keep_count) {
1700        let _ = fs::remove_file(&path);
1701
1702        // Also try to cleanup potential sidecars from fs::copy fallback
1703        let _ = fs::remove_file(database_sidecar_path(&path, "-wal"));
1704        let _ = fs::remove_file(database_sidecar_path(&path, "-shm"));
1705    }
1706
1707    Ok(())
1708}
1709
/// A historical copy of the database (backup, corrupt copy, snapshot, ...)
/// found next to the canonical database, with the facts used to rank it.
#[derive(Debug, Clone)]
pub(crate) struct HistoricalDatabaseBundle {
    /// Path of the bundle's main database file.
    root_path: PathBuf,
    /// Size of the main file plus its `-wal` and `-shm` sidecars, in bytes.
    total_bytes: u64,
    /// Modification time of the main file in milliseconds since the Unix
    /// epoch (0 when it cannot be read).
    modified_at_ms: i64,
    /// Whether the bundle opened read-only and both `conversations` and
    /// `messages` could be read.
    supports_direct_readonly: bool,
    /// Schema / FTS / message-id facts gathered when the bundle was discovered.
    probe: HistoricalBundleProbe,
}
1718
/// Lightweight facts read from a historical bundle; every field falls back to
/// its default when it cannot be determined.
#[derive(Debug, Clone, Copy, Default)]
struct HistoricalBundleProbe {
    /// Parsed `schema_version` value from the `meta` table, if readable.
    schema_version: Option<i64>,
    /// Number of `sqlite_master` rows named `fts_messages`, if readable.
    fts_schema_rows: Option<i64>,
    /// Whether `SELECT COUNT(*) FROM fts_messages` succeeded; only attempted
    /// when `fts_schema_rows` is exactly 1, and always `false` when the probe
    /// came from the `sqlite3` CLI fallback.
    fts_queryable: bool,
    /// `MAX(id)` of the `messages` table, or 0 when empty or unreadable.
    max_message_id: i64,
}
1726
/// Test-only summary of a database health probe.
// NOTE(review): the code that fills this struct is outside this section; the
// field meanings below are read from the names — confirm against the producer.
#[cfg(test)]
#[allow(dead_code)]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(crate) struct SqliteDatabaseHealthProbe {
    /// Schema version, if one could be read.
    pub schema_version: Option<i64>,
    /// Presumably whether a quick integrity check passed.
    pub quick_check_ok: bool,
    /// Presumably the number of schema rows for the FTS table.
    pub fts_schema_rows: i64,
    /// Presumably whether the FTS table could be queried.
    pub fts_queryable: bool,
    /// Presumably the number of rows in `messages`.
    pub message_count: i64,
    /// Presumably the largest `messages.id`.
    pub max_message_id: i64,
}
1738
/// Outcome of an FTS consistency check / repair pass.
// NOTE(review): the producer is outside this section; variant meanings are
// inferred from the names — confirm.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(crate) enum FtsConsistencyRepair {
    /// Presumably: nothing needed fixing; `rows` rows were already indexed.
    AlreadyHealthy {
        rows: usize,
    },
    /// Presumably: only missing rows were added (`inserted_rows`), bringing
    /// the index to `total_rows`.
    IncrementalCatchUp {
        inserted_rows: usize,
        total_rows: usize,
    },
    /// Presumably: the index was rebuilt from scratch with `inserted_rows` rows.
    Rebuilt {
        inserted_rows: usize,
    },
}
1752
/// Running totals for a historical salvage pass.
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
pub struct HistoricalSalvageOutcome {
    /// Bundles that were looked at.
    pub bundles_considered: usize,
    /// Bundles that were imported.
    pub bundles_imported: usize,
    /// Conversations imported across all bundles.
    pub conversations_imported: usize,
    /// Messages imported across all bundles.
    pub messages_imported: usize,
}

impl HistoricalSalvageOutcome {
    /// Add every counter of `other` onto the matching counter of `self`.
    pub(crate) fn accumulate(&mut self, other: Self) {
        // Destructuring makes the compiler flag any counter added later
        // that is not folded in here.
        let Self {
            bundles_considered,
            bundles_imported,
            conversations_imported,
            messages_imported,
        } = other;

        self.bundles_considered += bundles_considered;
        self.bundles_imported += bundles_imported;
        self.conversations_imported += conversations_imported;
        self.messages_imported += messages_imported;
    }
}
1769
/// An open read connection onto a historical bundle, plus how it was obtained.
#[derive(Debug)]
struct HistoricalReadConnection {
    /// Connection to read rows from.
    conn: FrankenConnection,
    /// How the data was made readable: `"direct-readonly"` or
    /// `"sqlite3-recover"` in the constructors in this file.
    method: &'static str,
    /// Path the connection is opened on: the bundle itself for direct reads,
    /// the recovered temporary database for `sqlite3 .recover`.
    root_path: PathBuf,
    /// Temporary directory holding the recovered database, when one was
    /// created; kept here so it is owned alongside `conn`.
    _tempdir: Option<tempfile::TempDir>,
}
1777
/// Schema created in the temporary database that receives the `INSERT`
/// statements emitted by `sqlite3 .recover`.
///
/// It declares only the six core tables (`sources`, `agents`, `workspaces`,
/// `conversations`, `messages`, `snippets`) with their primary keys and no
/// other constraints or indexes.
const HISTORICAL_RECOVERY_CORE_SCHEMA: &str = r"
CREATE TABLE sources (
    id TEXT PRIMARY KEY,
    kind TEXT,
    host_label TEXT,
    machine_id TEXT,
    platform TEXT,
    config_json TEXT,
    created_at INTEGER,
    updated_at INTEGER
);
CREATE TABLE agents (
    id INTEGER PRIMARY KEY,
    slug TEXT,
    name TEXT,
    version TEXT,
    kind TEXT,
    created_at INTEGER,
    updated_at INTEGER
);
CREATE TABLE workspaces (
    id INTEGER PRIMARY KEY,
    path TEXT,
    display_name TEXT
);
CREATE TABLE conversations (
    id INTEGER PRIMARY KEY,
    agent_id INTEGER,
    workspace_id INTEGER,
    source_id TEXT,
    external_id TEXT,
    title TEXT,
    source_path TEXT,
    started_at INTEGER,
    ended_at INTEGER,
    approx_tokens INTEGER,
    metadata_json TEXT,
    origin_host TEXT,
    metadata_bin BLOB,
    total_input_tokens INTEGER,
    total_output_tokens INTEGER,
    total_cache_read_tokens INTEGER,
    total_cache_creation_tokens INTEGER,
    grand_total_tokens INTEGER,
    estimated_cost_usd REAL,
    primary_model TEXT,
    api_call_count INTEGER,
    tool_call_count INTEGER,
    user_message_count INTEGER,
    assistant_message_count INTEGER,
    last_message_idx INTEGER,
    last_message_created_at INTEGER
);
CREATE TABLE messages (
    id INTEGER PRIMARY KEY,
    conversation_id INTEGER,
    idx INTEGER,
    role TEXT,
    author TEXT,
    created_at INTEGER,
    content TEXT,
    extra_json TEXT,
    extra_bin BLOB
);
CREATE TABLE snippets (
    id INTEGER PRIMARY KEY,
    message_id INTEGER,
    file_path TEXT,
    start_line INTEGER,
    end_line INTEGER,
    language TEXT,
    snippet_text TEXT
);
";
/// Written as `salvage_version` into the ledger entry recorded for each
/// imported historical bundle.
const HISTORICAL_SALVAGE_LEDGER_VERSION: u32 = 2;
/// NOTE(review): presumably the value stored in
/// `HistoricalBundleProgress::progress_version` — usage is outside this section.
const HISTORICAL_SALVAGE_PROGRESS_VERSION: u32 = 1;
/// Five minutes, in milliseconds.
/// NOTE(review): presumably the allowed start-time difference when merging
/// conversations by source path — usage is outside this section.
const SOURCE_PATH_MERGE_START_TOLERANCE_MS: i64 = 5 * 60 * 1000;
1855
/// Serializable progress record for a historical bundle import.
// NOTE(review): readers and writers of this record are outside this section;
// field notes marked "presumably" are inferred from names — confirm.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct HistoricalBundleProgress {
    /// Format version of this record.
    progress_version: u32,
    /// Presumably the bundle's path rendered as text.
    path: String,
    /// Presumably the bundle's size in bytes.
    bytes: u64,
    /// Presumably the bundle's modification time (ms since the Unix epoch).
    modified_at_ms: i64,
    /// Presumably the read method used for the bundle.
    method: String,
    /// Presumably the highest source row id fully imported so far.
    last_completed_source_row_id: i64,
    /// Conversations imported so far.
    conversations_imported: usize,
    /// Messages imported so far.
    messages_imported: usize,
    /// Presumably when this record was last written (ms since the Unix epoch).
    updated_at_ms: i64,
}
1868
/// One conversation queued for a batched historical import.
// NOTE(review): the batching code is outside this section; field notes are
// inferred from names — confirm.
#[derive(Debug, Clone)]
struct HistoricalBatchEntry {
    /// Presumably the row id of the conversation in the source bundle.
    source_row_id: i64,
    /// Presumably the agent id to attach the conversation to.
    agent_id: i64,
    /// Presumably the workspace id, when the conversation has one.
    workspace_id: Option<i64>,
    /// The conversation to import.
    conversation: Conversation,
}
1876
/// Counters for one batched historical import; both start at zero via `Default`.
// NOTE(review): the code that updates these counters is outside this section.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
struct HistoricalBatchImportTotals {
    /// Presumably the number of source conversation rows inserted.
    inserted_source_rows: usize,
    /// Number of messages inserted.
    inserted_messages: usize,
}
1882
1883fn historical_bundle_root_paths(db_path: &Path) -> Vec<PathBuf> {
1884    let mut roots = Vec::new();
1885    let Some(parent) = db_path.parent() else {
1886        return roots;
1887    };
1888    let db_name = db_path
1889        .file_name()
1890        .and_then(|n| n.to_str())
1891        .unwrap_or("agent_search.db");
1892    let db_stem = db_path
1893        .file_stem()
1894        .and_then(|n| n.to_str())
1895        .unwrap_or("agent_search");
1896
1897    let mut push_root = |path: PathBuf| {
1898        if path == db_path {
1899            return;
1900        }
1901        if !roots.iter().any(|existing| existing == &path) {
1902            roots.push(path);
1903        }
1904    };
1905
1906    if let Ok(entries) = fs::read_dir(parent) {
1907        for entry in entries.flatten() {
1908            let path = entry.path();
1909            let Some(name) = path.file_name().and_then(|n| n.to_str()) else {
1910                continue;
1911            };
1912            if has_db_sidecar_suffix(name) {
1913                continue;
1914            }
1915            if name.starts_with(&format!("{db_name}.backup."))
1916                || name.starts_with(&format!("{db_stem}.corrupt."))
1917            {
1918                push_root(path);
1919            }
1920        }
1921    }
1922
1923    let backups_dir = parent.join("backups");
1924    if let Ok(entries) = fs::read_dir(backups_dir) {
1925        for entry in entries.flatten() {
1926            let path = entry.path();
1927            let Some(name) = path.file_name().and_then(|n| n.to_str()) else {
1928                continue;
1929            };
1930            if has_db_sidecar_suffix(name) {
1931                continue;
1932            }
1933            if name.starts_with(&format!("{db_name}.")) && name.ends_with(".bak") {
1934                push_root(path);
1935            }
1936        }
1937    }
1938
1939    push_named_database_children(&mut roots, db_path, &parent.join("repair-lab"), db_name);
1940    push_named_database_children(&mut roots, db_path, &parent.join("snapshots"), db_name);
1941
1942    roots
1943}
1944
/// Append `<child>/<db_name>` to `roots` for every entry `<child>` of `dir`.
///
/// A candidate is added only when it exists on disk, is not
/// `canonical_db_path`, and is not already in `roots`. An unreadable or
/// missing `dir` leaves `roots` untouched.
fn push_named_database_children(
    roots: &mut Vec<PathBuf>,
    canonical_db_path: &Path,
    dir: &Path,
    db_name: &str,
) {
    let Ok(children) = fs::read_dir(dir) else {
        return;
    };

    for child in children.flatten() {
        let candidate = child.path().join(db_name);
        let is_canonical = candidate == canonical_db_path;
        if is_canonical || !candidate.exists() || roots.contains(&candidate) {
            continue;
        }
        roots.push(candidate);
    }
}
1963
/// Modification time of `path` in milliseconds since the Unix epoch.
///
/// Returns 0 when the metadata or modification time cannot be read, or when
/// the timestamp lies before the epoch. A value too large for `i64`
/// saturates at `i64::MAX` instead of wrapping.
fn file_mtime_ms(path: &Path) -> i64 {
    fs::metadata(path)
        .and_then(|meta| meta.modified())
        .ok()
        .and_then(|ts| ts.duration_since(UNIX_EPOCH).ok())
        .map_or(0, |elapsed| {
            i64::try_from(elapsed.as_millis()).unwrap_or(i64::MAX)
        })
}
1972
1973fn bundle_total_bytes(root_path: &Path) -> u64 {
1974    let mut total = fs::metadata(root_path).map(|meta| meta.len()).unwrap_or(0);
1975    for suffix in ["-wal", "-shm"] {
1976        let sidecar = database_sidecar_path(root_path, suffix);
1977        total = total.saturating_add(fs::metadata(sidecar).map(|meta| meta.len()).unwrap_or(0));
1978    }
1979    total
1980}
1981
1982pub(crate) fn discover_historical_database_bundles(
1983    db_path: &Path,
1984) -> Vec<HistoricalDatabaseBundle> {
1985    let mut bundles: Vec<_> = historical_bundle_root_paths(db_path)
1986        .into_iter()
1987        .filter(|root| root.exists())
1988        .map(|root_path| {
1989            let modified_at_ms = file_mtime_ms(&root_path);
1990            let total_bytes = bundle_total_bytes(&root_path);
1991            let supports_direct_readonly = historical_bundle_supports_direct_readonly(&root_path);
1992            let probe = probe_historical_bundle(&root_path);
1993            HistoricalDatabaseBundle {
1994                modified_at_ms,
1995                total_bytes,
1996                supports_direct_readonly,
1997                root_path,
1998                probe,
1999            }
2000        })
2001        .filter(|bundle| bundle.total_bytes > 0)
2002        .collect();
2003
2004    fn bundle_priority(path: &Path) -> i32 {
2005        let path_str = path.to_string_lossy();
2006        if path_str.contains("/repair-lab/replay-") {
2007            return 5;
2008        }
2009        if path_str.contains("/repair-lab/") {
2010            return 4;
2011        }
2012        if path_str.contains("/snapshots/") {
2013            return 3;
2014        }
2015        if path_str.contains(".corrupt.") || path_str.contains("failed-baseline-seed") {
2016            return 0;
2017        }
2018        1
2019    }
2020
2021    fn bundle_health_rank(bundle: &HistoricalDatabaseBundle) -> i32 {
2022        // Classify FTS health. The probe only sets `fts_queryable = true`
2023        // when `fts_schema_rows == Some(1)` (see
2024        // `historical_bundle_fts_queryable_via_frankensqlite`), so we have
2025        // two legitimate "clean" shapes for a bundle:
2026        //
2027        //   * `fts_schema_rows == Some(1) && fts_queryable` — a pre-V14
2028        //     bundle where the FTS virtual table was eagerly created by
2029        //     migration and is queryable right now.
2030        //
2031        //   * `fts_schema_rows == Some(0) && schema_version == Some(V14+)` —
2032        //     a modern bundle where `MIGRATION_V14` dropped fts_messages on
2033        //     purpose and cass recreates it lazily via
2034        //     `ensure_search_fallback_fts_consistency` on the first open.
2035        //     Gating on `schema_version == CURRENT_SCHEMA_VERSION` is critical
2036        //     so an incomplete pre-V14 bundle with 0 fts rows is not promoted
2037        //     alongside real lazy-V14+ bundles. A `None` schema_version
2038        //     (schema marker unreadable) is excluded for the same reason.
2039        //
2040        // Everything else — `Some(1)` without queryability, `Some(n)` for
2041        // n >= 2 (duplicated CREATE VIRTUAL TABLE rows from a broken legacy
2042        // rebuild), `None` entirely, or `Some(0)` on a non-current schema —
2043        // is not "fts clean".
2044        let fts_clean = match bundle.probe.fts_schema_rows {
2045            Some(1) => bundle.probe.fts_queryable,
2046            Some(0) => bundle.probe.schema_version == Some(CURRENT_SCHEMA_VERSION),
2047            _ => false,
2048        };
2049
2050        let clean_schema14_fts =
2051            bundle.probe.schema_version == Some(CURRENT_SCHEMA_VERSION) && fts_clean;
2052        if clean_schema14_fts {
2053            return 5;
2054        }
2055
2056        if fts_clean {
2057            return 4;
2058        }
2059
2060        if bundle.probe.schema_version == Some(CURRENT_SCHEMA_VERSION)
2061            && bundle.supports_direct_readonly
2062        {
2063            return 3;
2064        }
2065
2066        if bundle.supports_direct_readonly {
2067            return 2;
2068        }
2069
2070        1
2071    }
2072
2073    bundles.sort_by(|left, right| {
2074        bundle_health_rank(right)
2075            .cmp(&bundle_health_rank(left))
2076            .then_with(|| right.probe.max_message_id.cmp(&left.probe.max_message_id))
2077            .then_with(|| bundle_priority(&right.root_path).cmp(&bundle_priority(&left.root_path)))
2078            .then_with(|| {
2079                right
2080                    .supports_direct_readonly
2081                    .cmp(&left.supports_direct_readonly)
2082            })
2083            .then_with(|| right.total_bytes.cmp(&left.total_bytes))
2084            .then_with(|| right.modified_at_ms.cmp(&left.modified_at_ms))
2085            .then_with(|| right.root_path.cmp(&left.root_path))
2086    });
2087    bundles
2088}
2089
2090fn probe_historical_bundle(root_path: &Path) -> HistoricalBundleProbe {
2091    let Ok(conn) = open_historical_bundle_readonly(root_path) else {
2092        return probe_historical_bundle_via_sqlite3_metadata(root_path).unwrap_or_default();
2093    };
2094
2095    let schema_version = read_meta_schema_version(&conn).ok().flatten();
2096    let fts_schema_rows: Option<i64> = conn
2097        .query_row_map(
2098            "SELECT COUNT(*) FROM sqlite_master WHERE name = 'fts_messages'",
2099            fparams![],
2100            |row| row.get_typed(0),
2101        )
2102        .ok();
2103    let fts_queryable =
2104        historical_bundle_fts_queryable_via_frankensqlite(root_path, fts_schema_rows);
2105    let max_message_id: i64 = conn
2106        .query_row_map(
2107            "SELECT COALESCE(MAX(id), 0) FROM messages",
2108            fparams![],
2109            |row| row.get_typed(0),
2110        )
2111        .unwrap_or(0);
2112
2113    let probe = HistoricalBundleProbe {
2114        schema_version,
2115        fts_schema_rows,
2116        fts_queryable,
2117        max_message_id,
2118    };
2119
2120    if probe.schema_version.is_none()
2121        && probe.fts_schema_rows.is_none()
2122        && probe.max_message_id == 0
2123    {
2124        return probe_historical_bundle_via_sqlite3_metadata(root_path).unwrap_or(probe);
2125    }
2126
2127    probe
2128}
2129
2130fn probe_historical_bundle_via_sqlite3_metadata(root_path: &Path) -> Option<HistoricalBundleProbe> {
2131    let bundle_uri = format!("file:{}?immutable=1", root_path.to_string_lossy());
2132    let output = Command::new("sqlite3")
2133        .arg("-batch")
2134        .arg("-noheader")
2135        .arg(&bundle_uri)
2136        .arg(
2137            "PRAGMA writable_schema=ON;
2138             SELECT COALESCE((SELECT value FROM meta WHERE key = 'schema_version'), '');
2139             SELECT COUNT(*) FROM sqlite_master WHERE name = 'fts_messages';
2140             SELECT COALESCE(MAX(id), 0) FROM messages;",
2141        )
2142        .output()
2143        .ok()?;
2144    if !output.status.success() {
2145        return None;
2146    }
2147
2148    let stdout = String::from_utf8(output.stdout).ok()?;
2149    let mut lines = stdout.lines();
2150    let schema_version = lines.next().and_then(|raw| raw.trim().parse::<i64>().ok());
2151    let fts_schema_rows = lines.next().and_then(|raw| raw.trim().parse::<i64>().ok());
2152    let max_message_id = lines
2153        .next()
2154        .and_then(|raw| raw.trim().parse::<i64>().ok())
2155        .unwrap_or(0);
2156
2157    Some(HistoricalBundleProbe {
2158        schema_version,
2159        fts_schema_rows,
2160        fts_queryable: false,
2161        max_message_id,
2162    })
2163}
2164
2165fn historical_bundle_fts_queryable_via_frankensqlite(
2166    root_path: &Path,
2167    fts_schema_rows: Option<i64>,
2168) -> bool {
2169    matches!(fts_schema_rows, Some(1))
2170        && FrankenStorage::open_readonly(root_path)
2171            .map(|storage| {
2172                storage
2173                    .raw()
2174                    .query("SELECT COUNT(*) FROM fts_messages")
2175                    .is_ok()
2176            })
2177            .unwrap_or(false)
2178}
2179
2180fn historical_bundle_supports_direct_readonly(root_path: &Path) -> bool {
2181    open_historical_bundle_readonly(root_path)
2182        .and_then(|conn| historical_bundle_has_queryable_core_tables(&conn))
2183        .is_ok()
2184}
2185
2186fn historical_table_exists(conn: &FrankenConnection, table: &str) -> Result<bool> {
2187    let found: Option<i64> = conn
2188        .query_row_map(
2189            "SELECT 1 FROM sqlite_master WHERE type = 'table' AND name = ?1 LIMIT 1",
2190            fparams![table],
2191            |row| row.get_typed(0),
2192        )
2193        .optional()
2194        .with_context(|| format!("checking for historical table {table}"))?;
2195    Ok(found.is_some())
2196}
2197
2198fn probe_historical_table_reads(conn: &FrankenConnection, table: &str) -> Result<()> {
2199    if !historical_table_exists(conn, table)? {
2200        return Err(anyhow!(
2201            "historical database missing required table {table}"
2202        ));
2203    }
2204
2205    let sql = format!("SELECT rowid FROM {table} LIMIT 1");
2206    let _: Option<i64> = conn
2207        .query_row_map(&sql, fparams![], |row| row.get_typed(0))
2208        .optional()
2209        .with_context(|| format!("probing rows from historical table {table}"))?;
2210    Ok(())
2211}
2212
2213fn historical_bundle_has_queryable_core_tables(conn: &FrankenConnection) -> Result<()> {
2214    probe_historical_table_reads(conn, "conversations")?;
2215    probe_historical_table_reads(conn, "messages")?;
2216    Ok(())
2217}
2218
2219fn open_historical_bundle_readonly(root_path: &Path) -> Result<FrankenConnection> {
2220    let path_str = root_path.to_string_lossy();
2221    let flags = FrankenOpenFlags::SQLITE_OPEN_READ_ONLY;
2222    let conn = open_franken_with_flags(&path_str, flags)
2223        .with_context(|| format!("opening historical database {}", root_path.display()))?;
2224    Ok(conn)
2225}
2226
/// Decide whether a line of `sqlite3 .recover` output should be replayed.
///
/// Accepts lines that start with `INSERT INTO` or `INSERT OR IGNORE INTO`,
/// followed by one of the core table names wrapped in matching single or
/// double quotes. Everything else (other tables, schema statements, unquoted
/// names, leading whitespace) is rejected.
///
/// This runs once per line of recover output, so it matches by slicing the
/// input rather than formatting candidate prefixes.
fn is_recoverable_insert_line(line: &str) -> bool {
    const CORE_TABLES: [&str; 6] = [
        "sources",
        "agents",
        "workspaces",
        "conversations",
        "messages",
        "snippets",
    ];

    // The two accepted verbs diverge after "INSERT ", so at most one matches.
    let Some(after_verb) = line
        .strip_prefix("INSERT OR IGNORE INTO ")
        .or_else(|| line.strip_prefix("INSERT INTO "))
    else {
        return false;
    };

    let mut chars = after_verb.chars();
    let quote = match chars.next() {
        Some(q @ ('\'' | '"')) => q,
        _ => return false,
    };
    let after_quote = chars.as_str();

    // The name must be closed by the same quote character that opened it.
    CORE_TABLES.iter().any(|table| {
        after_quote
            .strip_prefix(table)
            .is_some_and(|tail| tail.starts_with(quote))
    })
}
2244
2245fn recover_historical_bundle_via_sqlite3(
2246    bundle: &HistoricalDatabaseBundle,
2247) -> Result<HistoricalReadConnection> {
2248    let tempdir = tempfile::TempDir::new().context("creating temporary salvage directory")?;
2249    let recovered_db = tempdir.path().join("historical-recovered.db");
2250    let temp_conn = FrankenConnection::open(recovered_db.to_string_lossy().as_ref())
2251        .with_context(|| format!("creating recovered database {}", recovered_db.display()))?;
2252    temp_conn
2253        .execute_batch(HISTORICAL_RECOVERY_CORE_SCHEMA)
2254        .with_context(|| format!("initializing recovered schema {}", recovered_db.display()))?;
2255    drop(temp_conn);
2256
2257    let bundle_uri = format!("file:{}?immutable=1", bundle.root_path.to_string_lossy());
2258    let mut recover = Command::new("sqlite3")
2259        .arg(&bundle_uri)
2260        .arg(".recover")
2261        .stdout(Stdio::piped())
2262        .spawn()
2263        .with_context(|| {
2264            format!(
2265                "launching sqlite3 .recover for historical bundle {}",
2266                bundle.root_path.display()
2267            )
2268        })?;
2269    let recover_stdout = recover
2270        .stdout
2271        .take()
2272        .context("capturing sqlite3 .recover stdout")?;
2273
2274    let mut importer = Command::new("sqlite3")
2275        .arg(&recovered_db)
2276        .stdin(Stdio::piped())
2277        .spawn()
2278        .with_context(|| {
2279            format!(
2280                "launching sqlite3 importer for recovered bundle {}",
2281                recovered_db.display()
2282            )
2283        })?;
2284
2285    {
2286        let importer_stdin = importer
2287            .stdin
2288            .as_mut()
2289            .context("opening sqlite3 importer stdin")?;
2290        importer_stdin
2291            .write_all(b"BEGIN;\n")
2292            .context("starting recovery import transaction")?;
2293
2294        let reader = BufReader::new(recover_stdout);
2295        for line in reader.lines() {
2296            let line = line.context("reading sqlite3 .recover output")?;
2297            if is_recoverable_insert_line(&line) {
2298                importer_stdin
2299                    .write_all(line.as_bytes())
2300                    .context("writing recovered INSERT")?;
2301                importer_stdin
2302                    .write_all(b"\n")
2303                    .context("writing recovered INSERT newline")?;
2304            }
2305        }
2306
2307        importer_stdin
2308            .write_all(b"COMMIT;\n")
2309            .context("committing recovery import transaction")?;
2310    }
2311
2312    let importer_status = importer
2313        .wait()
2314        .context("waiting for sqlite3 recovery importer")?;
2315    let recover_status = recover
2316        .wait()
2317        .context("waiting for sqlite3 .recover process")?;
2318    if !importer_status.success() {
2319        anyhow::bail!(
2320            "sqlite3 recovery importer exited with status {} for {} after sqlite3 .recover exited with status {}",
2321            importer_status,
2322            recovered_db.display(),
2323            recover_status
2324        );
2325    }
2326
2327    let conn = open_historical_bundle_readonly(&recovered_db)?;
2328    historical_bundle_has_queryable_core_tables(&conn)?;
2329    if !recover_status.success() {
2330        let (conversations, messages) = historical_bundle_counts(&conn)?;
2331        if conversations == 0 && messages == 0 {
2332            anyhow::bail!(
2333                "sqlite3 .recover exited with status {} for {} and recovered no core rows",
2334                recover_status,
2335                bundle.root_path.display()
2336            );
2337        }
2338        tracing::warn!(
2339            path = %bundle.root_path.display(),
2340            status = %recover_status,
2341            conversations,
2342            messages,
2343            "sqlite3 .recover exited nonzero after emitting recoverable core rows; continuing with recovered subset"
2344        );
2345    }
2346    Ok(HistoricalReadConnection {
2347        conn,
2348        method: "sqlite3-recover",
2349        root_path: recovered_db,
2350        _tempdir: Some(tempdir),
2351    })
2352}
2353
2354fn open_historical_bundle_for_salvage(
2355    bundle: &HistoricalDatabaseBundle,
2356) -> Result<HistoricalReadConnection> {
2357    match open_historical_bundle_readonly(&bundle.root_path) {
2358        Ok(conn) => {
2359            if historical_bundle_has_queryable_core_tables(&conn).is_ok() {
2360                return Ok(HistoricalReadConnection {
2361                    conn,
2362                    method: "direct-readonly",
2363                    root_path: bundle.root_path.clone(),
2364                    _tempdir: None,
2365                });
2366            }
2367        }
2368        Err(err) => {
2369            tracing::warn!(
2370                path = %bundle.root_path.display(),
2371                error = %err,
2372                "historical bundle direct open failed; falling back to sqlite3 .recover"
2373            );
2374        }
2375    }
2376
2377    recover_historical_bundle_via_sqlite3(bundle)
2378}
2379
2380fn historical_bundle_counts(conn: &FrankenConnection) -> Result<(usize, usize)> {
2381    let conversations: i64 =
2382        conn.query_row_map("SELECT COUNT(*) FROM conversations", fparams![], |row| {
2383            row.get_typed(0)
2384        })?;
2385    let messages: i64 = conn.query_row_map("SELECT COUNT(*) FROM messages", fparams![], |row| {
2386        row.get_typed(0)
2387    })?;
2388    Ok((
2389        usize::try_from(conversations.max(0)).unwrap_or(usize::MAX),
2390        usize::try_from(messages.max(0)).unwrap_or(usize::MAX),
2391    ))
2392}
2393
2394fn clear_seeded_runtime_meta(conn: &FrankenConnection) -> Result<()> {
2395    conn.execute(
2396        "DELETE FROM meta
2397         WHERE key LIKE 'historical_bundle_salvaged:%'
2398            OR key IN ('last_scan_ts', 'last_indexed_at', 'last_embedded_message_id')",
2399    )?;
2400    Ok(())
2401}
2402
2403fn record_historical_bundle_import(
2404    conn: &FrankenConnection,
2405    bundle: &HistoricalDatabaseBundle,
2406    method: &str,
2407    conversations_imported: usize,
2408    messages_imported: usize,
2409) -> Result<()> {
2410    let key = FrankenStorage::historical_bundle_meta_key(bundle);
2411    let value = serde_json::json!({
2412        "salvage_version": HISTORICAL_SALVAGE_LEDGER_VERSION,
2413        "path": bundle.root_path.display().to_string(),
2414        "bytes": bundle.total_bytes,
2415        "modified_at_ms": bundle.modified_at_ms,
2416        "method": method,
2417        "conversations_imported": conversations_imported,
2418        "messages_imported": messages_imported,
2419        "recorded_at_ms": FrankenStorage::now_millis(),
2420    });
2421    let value_str = serde_json::to_string(&value)?;
2422    conn.execute_compat(
2423        "INSERT OR REPLACE INTO meta(key, value) VALUES(?1, ?2)",
2424        fparams![key, value_str],
2425    )?;
2426    Ok(())
2427}
2428
2429fn scrub_staged_derived_fts_metadata_via_sqlite3(staged_db_path: &Path) -> Result<()> {
2430    let scrub_sql = "PRAGMA writable_schema = ON;
2431         DELETE FROM sqlite_master
2432          WHERE name = 'fts_messages'
2433             OR tbl_name = 'fts_messages'
2434             OR name IN (
2435                'fts_messages_config',
2436                'fts_messages_content',
2437                'fts_messages_data',
2438                'fts_messages_docsize',
2439                'fts_messages_idx'
2440             )
2441             OR tbl_name IN (
2442                'fts_messages_config',
2443                'fts_messages_content',
2444                'fts_messages_data',
2445                'fts_messages_docsize',
2446                'fts_messages_idx'
2447             );
2448         PRAGMA writable_schema = OFF;";
2449
2450    let run_scrub = |disable_defensive: bool| -> Result<std::process::Output> {
2451        let mut command = Command::new("sqlite3");
2452        command.arg("-batch").arg(staged_db_path);
2453        if disable_defensive {
2454            command.arg(".dbconfig defensive off");
2455        }
2456        command.arg(scrub_sql).output().with_context(|| {
2457            format!(
2458                "running sqlite3 staged FTS metadata scrub for {}",
2459                staged_db_path.display()
2460            )
2461        })
2462    };
2463    let render_output = |output: &std::process::Output| -> String {
2464        format!(
2465            "status {}; stdout: {}; stderr: {}",
2466            output.status,
2467            String::from_utf8_lossy(&output.stdout).trim(),
2468            String::from_utf8_lossy(&output.stderr).trim()
2469        )
2470    };
2471
2472    let defensive_off_output = run_scrub(true)?;
2473    if defensive_off_output.status.success() {
2474        return Ok(());
2475    }
2476
2477    let fallback_output = run_scrub(false)?;
2478    if !fallback_output.status.success() {
2479        anyhow::bail!(
2480            "sqlite3 staged FTS metadata scrub failed for {}; defensive-off attempt {}; fallback without .dbconfig {}",
2481            staged_db_path.display(),
2482            render_output(&defensive_off_output),
2483            render_output(&fallback_output)
2484        );
2485    }
2486    Ok(())
2487}
2488
2489fn ensure_seeded_canonical_fts_consistency(staged_db_path: &Path) -> Result<FtsConsistencyRepair> {
2490    match ensure_fts_consistency_via_rusqlite(staged_db_path) {
2491        Ok(repair) => Ok(repair),
2492        Err(err) => {
2493            if fts_messages_integrity_error_from_message(format!("{err:#}")).is_none() {
2494                return Err(err).with_context(|| {
2495                    format!(
2496                        "repairing staged canonical FTS consistency before finalization: {}",
2497                        staged_db_path.display()
2498                    )
2499                });
2500            }
2501
2502            tracing::warn!(
2503                path = %staged_db_path.display(),
2504                error = %err,
2505                "staged historical seed has malformed derived FTS metadata; scrubbing and rebuilding FTS on staged copy"
2506            );
2507            scrub_staged_derived_fts_metadata_via_sqlite3(staged_db_path).with_context(|| {
2508                format!(
2509                    "scrubbing malformed staged FTS metadata before finalization: {}",
2510                    staged_db_path.display()
2511                )
2512            })?;
2513            ensure_fts_consistency_via_rusqlite(staged_db_path).with_context(|| {
2514                format!(
2515                    "repairing staged canonical FTS consistency after metadata scrub: {}",
2516                    staged_db_path.display()
2517                )
2518            })
2519        }
2520    }
2521}
2522
2523fn finalize_seeded_canonical_bundle_via_rusqlite(
2524    canonical_db_path: &Path,
2525    bundle: &HistoricalDatabaseBundle,
2526) -> Result<(usize, usize)> {
2527    let _fts_repair = ensure_seeded_canonical_fts_consistency(canonical_db_path)?;
2528
2529    let path_str = canonical_db_path.to_string_lossy();
2530    let conn = FrankenConnection::open(path_str.as_ref()).with_context(|| {
2531        format!(
2532            "opening seeded canonical database for post-seed finalization: {}",
2533            canonical_db_path.display()
2534        )
2535    })?;
2536    conn.execute("PRAGMA busy_timeout = 30000;")
2537        .with_context(|| {
2538            format!(
2539                "configuring busy timeout for seeded canonical database {}",
2540                canonical_db_path.display()
2541            )
2542        })?;
2543    let schema_version = read_meta_schema_version(&conn)?;
2544
2545    if let Some(version) = schema_version
2546        && version < CURRENT_SCHEMA_VERSION
2547        && version != 13
2548    {
2549        anyhow::bail!(
2550            "seeded canonical bundle schema_version {version} is too old for baseline import and cannot be finalized automatically"
2551        );
2552    }
2553
2554    clear_seeded_runtime_meta(&conn)?;
2555    let (conversations_imported, messages_imported) = historical_bundle_counts(&conn)?;
2556
2557    conn.execute_compat(
2558        "INSERT OR REPLACE INTO meta(key, value) VALUES('schema_version', ?1)",
2559        fparams![CURRENT_SCHEMA_VERSION.to_string()],
2560    )?;
2561
2562    conn.execute_compat(
2563        "INSERT OR IGNORE INTO _schema_migrations(version, name) VALUES(?1, 'fts_contentless')",
2564        fparams![CURRENT_SCHEMA_VERSION],
2565    )?;
2566    record_historical_bundle_import(
2567        &conn,
2568        bundle,
2569        "baseline-bulk-sql-copy",
2570        conversations_imported,
2571        messages_imported,
2572    )?;
2573    Ok((conversations_imported, messages_imported))
2574}
2575
2576fn read_meta_schema_version(conn: &FrankenConnection) -> Result<Option<i64>> {
2577    let version: Option<String> = conn
2578        .query_row_map(
2579            "SELECT value FROM meta WHERE key = 'schema_version'",
2580            fparams![],
2581            |row| row.get_typed(0),
2582        )
2583        .optional()?;
2584    Ok(version.and_then(|raw| raw.parse::<i64>().ok()))
2585}
2586
/// Test-only: counts the `sqlite_master` rows whose name is `fts_messages`.
#[cfg(test)]
fn franken_fts_schema_rows(conn: &FrankenConnection) -> Result<i64> {
    let row_count = conn.query_row_map(
        "SELECT COUNT(*) FROM sqlite_master WHERE name = 'fts_messages'",
        fparams![],
        |row| row.get_typed::<i64>(0),
    );
    row_count.context("counting sqlite_master rows for fts_messages via frankensqlite")
}
2596
/// Test-only: reports whether a `COUNT(*)` query against `fts_messages`
/// succeeds on this connection. The query result itself is discarded.
#[cfg(test)]
fn franken_fts_limit_probe(conn: &FrankenConnection) -> bool {
    let probe = conn.query("SELECT COUNT(*) FROM fts_messages");
    probe.is_ok()
}
2601
/// Test-only health probe that inspects a database through frankensqlite.
///
/// Collects the recorded schema version, the outcome of
/// `PRAGMA quick_check(1)`, how many `sqlite_master` rows are named
/// `fts_messages`, and whether that table can be queried. The message count
/// and maximum message id are only queried when the quick check reports
/// `ok`; otherwise both are reported as `0`.
///
/// # Errors
/// Fails if the database cannot be opened or configured, or if any of the
/// executed queries (other than the FTS queryability probe) fails.
#[cfg(test)]
#[allow(dead_code)]
pub(crate) fn probe_database_health_via_frankensqlite(
    db_path: &Path,
) -> Result<SqliteDatabaseHealthProbe> {
    let db_location = db_path.to_string_lossy();
    let conn = FrankenConnection::open(db_location.as_ref()).with_context(|| {
        format!(
            "opening frankensqlite db at {} for database health probe",
            db_path.display()
        )
    })?;
    conn.execute_batch("PRAGMA busy_timeout = 30000;")
        .with_context(|| {
            format!(
                "configuring busy timeout for database health probe at {}",
                db_path.display()
            )
        })?;

    let schema_version = read_meta_schema_version(&conn)?;

    let quick_check_status: String = conn
        .query_row_map("PRAGMA quick_check(1)", fparams![], |row| row.get_typed(0))
        .with_context(|| format!("running PRAGMA quick_check(1) for {}", db_path.display()))?;
    let quick_check_ok = quick_check_status.trim().eq_ignore_ascii_case("ok");

    // The FTS table is only probed when exactly one schema row exists for it.
    let fts_schema_rows = franken_fts_schema_rows(&conn)?;
    let fts_queryable = if fts_schema_rows == 1 {
        franken_fts_limit_probe(&conn)
    } else {
        false
    };

    // Row statistics are skipped entirely when the quick check failed.
    let (message_count, max_message_id): (i64, i64) = if quick_check_ok {
        let total: i64 = conn
            .query_row_map("SELECT COUNT(*) FROM messages", fparams![], |row| {
                row.get_typed(0)
            })
            .context("counting messages during frankensqlite database health probe")?;
        let highest_id: i64 = conn
            .query_row_map(
                "SELECT COALESCE(MAX(id), 0) FROM messages",
                fparams![],
                |row| row.get_typed(0),
            )
            .context("reading max message id during frankensqlite database health probe")?;
        (total, highest_id)
    } else {
        (0, 0)
    };

    Ok(SqliteDatabaseHealthProbe {
        schema_version,
        quick_check_ok,
        fts_schema_rows,
        fts_queryable,
        message_count,
        max_message_id,
    })
}
2663
/// A historical database bundle that has been copied into a temporary staging
/// directory, ready to be finalized and promoted to the canonical path.
struct StagedHistoricalSeed {
    /// Staging directory that holds the copied bundle. It is also used by
    /// `promote_staged_historical_seed` to park a backup of the previous
    /// canonical database.
    // NOTE(review): `tempfile::TempDir` removes the directory on drop per the
    // tempfile crate docs, so this value must outlive any use of `db_path`.
    tempdir: tempfile::TempDir,
    /// Path of the staged database file (`baseline-seed-output.db`) inside
    /// `tempdir`.
    db_path: PathBuf,
}
2668
2669fn stage_historical_bundle_for_seed(
2670    canonical_db_path: &Path,
2671    source_root_path: &Path,
2672) -> Result<StagedHistoricalSeed> {
2673    let canonical_parent = canonical_db_path.parent().unwrap_or_else(|| Path::new("."));
2674    fs::create_dir_all(canonical_parent).with_context(|| {
2675        format!(
2676            "creating canonical database directory before bulk historical seed import: {}",
2677            canonical_parent.display()
2678        )
2679    })?;
2680    let tempdir = tempfile::TempDir::new_in(canonical_parent)
2681        .context("creating temporary baseline seed directory")?;
2682    let staged_seed_db = tempdir.path().join("baseline-seed-output.db");
2683    copy_database_bundle(source_root_path, &staged_seed_db)?;
2684
2685    Ok(StagedHistoricalSeed {
2686        tempdir,
2687        db_path: staged_seed_db,
2688    })
2689}
2690
2691fn stage_and_finalize_historical_seed(
2692    canonical_db_path: &Path,
2693    bundle: &HistoricalDatabaseBundle,
2694    source_root_path: &Path,
2695) -> Result<(StagedHistoricalSeed, usize, usize)> {
2696    let staged_seed = stage_historical_bundle_for_seed(canonical_db_path, source_root_path)?;
2697    let (conversations_imported, messages_imported) =
2698        finalize_seeded_canonical_bundle_via_rusqlite(&staged_seed.db_path, bundle)?;
2699    Ok((staged_seed, conversations_imported, messages_imported))
2700}
2701
2702fn promote_staged_historical_seed(
2703    canonical_db_path: &Path,
2704    staged_seed: &StagedHistoricalSeed,
2705) -> Result<()> {
2706    let canonical_backup = staged_seed
2707        .tempdir
2708        .path()
2709        .join("pre-seed-canonical-backup.db");
2710    let had_canonical = canonical_db_path.exists()
2711        || database_sidecar_path(canonical_db_path, "-wal").exists()
2712        || database_sidecar_path(canonical_db_path, "-shm").exists();
2713
2714    if had_canonical {
2715        move_database_bundle(canonical_db_path, &canonical_backup).with_context(|| {
2716            format!(
2717                "backing up canonical database before promoting staged historical seed import: {}",
2718                canonical_db_path.display()
2719            )
2720        })?;
2721    }
2722
2723    if let Err(err) =
2724        move_database_bundle(&staged_seed.db_path, canonical_db_path).with_context(|| {
2725            format!(
2726                "promoting staged historical seed database bundle {} into canonical path {}",
2727                staged_seed.db_path.display(),
2728                canonical_db_path.display()
2729            )
2730        })
2731    {
2732        if had_canonical {
2733            let _ = move_database_bundle(&canonical_backup, canonical_db_path);
2734        }
2735        return Err(err);
2736    }
2737
2738    Ok(())
2739}
2740
2741pub(crate) fn seed_canonical_from_best_historical_bundle(
2742    canonical_db_path: &Path,
2743) -> Result<Option<HistoricalSalvageOutcome>> {
2744    let ordered_bundles = discover_historical_database_bundles(canonical_db_path);
2745    let mut last_seed_error: Option<anyhow::Error> = None;
2746    for bundle in ordered_bundles {
2747        if let Some(version) = bundle.probe.schema_version
2748            && version < 13
2749        {
2750            let err = anyhow!(
2751                "historical bundle {} schema_version {version} is too old for baseline import",
2752                bundle.root_path.display()
2753            );
2754            tracing::warn!(
2755                path = %bundle.root_path.display(),
2756                schema_version = version,
2757                "historical bundle is too old for baseline seed import"
2758            );
2759            last_seed_error = Some(err);
2760            continue;
2761        }
2762
2763        let (staged_seed, conversations_imported, messages_imported) =
2764            match stage_and_finalize_historical_seed(canonical_db_path, &bundle, &bundle.root_path)
2765            {
2766                Ok(result) => result,
2767                Err(primary_err) => {
2768                    tracing::warn!(
2769                        path = %bundle.root_path.display(),
2770                        error = %primary_err,
2771                        "direct bulk baseline seed from historical bundle failed; trying sqlite3 salvage copy"
2772                    );
2773                    let source = match open_historical_bundle_for_salvage(&bundle).with_context(
2774                        || {
2775                            format!(
2776                                "opening historical seed bundle {} for baseline import",
2777                                bundle.root_path.display()
2778                            )
2779                        },
2780                    ) {
2781                        Ok(source) => source,
2782                        Err(salvage_err) => {
2783                            last_seed_error = Some(anyhow!(
2784                                "direct baseline seed from {} failed: {primary_err:#}; sqlite3 salvage open also failed: {salvage_err:#}",
2785                                bundle.root_path.display()
2786                            ));
2787                            continue;
2788                        }
2789                    };
2790                    match stage_and_finalize_historical_seed(
2791                        canonical_db_path,
2792                        &bundle,
2793                        &source.root_path,
2794                    ) {
2795                        Ok(result) => result,
2796                        Err(err) => {
2797                            tracing::warn!(
2798                                path = %bundle.root_path.display(),
2799                                source_path = %source.root_path.display(),
2800                                error = %err,
2801                                "bulk baseline seed staging from sqlite3-salvaged historical bundle failed; trying next candidate"
2802                            );
2803                            last_seed_error = Some(err);
2804                            continue;
2805                        }
2806                    }
2807                }
2808            };
2809
2810        if conversations_imported == 0 && messages_imported == 0 {
2811            let err = anyhow!(
2812                "historical bundle {} has no core rows for baseline import",
2813                bundle.root_path.display()
2814            );
2815            tracing::warn!(
2816                path = %bundle.root_path.display(),
2817                "historical bundle has no core rows for baseline seed import"
2818            );
2819            last_seed_error = Some(err);
2820            continue;
2821        }
2822
2823        if let Err(err) = promote_staged_historical_seed(canonical_db_path, &staged_seed) {
2824            tracing::warn!(
2825                path = %bundle.root_path.display(),
2826                error = %err,
2827                "promoting staged historical seed import failed; trying next candidate"
2828            );
2829            last_seed_error = Some(err);
2830            continue;
2831        }
2832
2833        tracing::info!(
2834            path = %bundle.root_path.display(),
2835            conversations_imported,
2836            messages_imported,
2837            "seeded empty canonical database from largest healthy historical bundle"
2838        );
2839
2840        return Ok(Some(HistoricalSalvageOutcome {
2841            bundles_considered: 0,
2842            bundles_imported: 1,
2843            conversations_imported,
2844            messages_imported,
2845        }));
2846    }
2847    if let Some(err) = last_seed_error {
2848        return Err(err);
2849    }
2850    Ok(None)
2851}
2852
2853fn parse_json_column(value: Option<String>) -> serde_json::Value {
2854    value
2855        .and_then(|raw| serde_json::from_str(&raw).ok())
2856        .unwrap_or(serde_json::Value::Null)
2857}
2858
/// Object key used by `wrap_historical_raw_json` to carry unparsed JSON text
/// inside a single-entry JSON object; `historical_raw_json` looks for it.
const HISTORICAL_RAW_JSON_SENTINEL_KEY: &str = "__cass_historical_raw_json__";
2860
2861fn wrap_historical_raw_json(raw: String) -> serde_json::Value {
2862    serde_json::json!({ HISTORICAL_RAW_JSON_SENTINEL_KEY: raw })
2863}
2864
2865fn historical_raw_json(value: &serde_json::Value) -> Option<&str> {
2866    match value {
2867        serde_json::Value::Object(map) if map.len() == 1 => map
2868            .get(HISTORICAL_RAW_JSON_SENTINEL_KEY)
2869            .and_then(serde_json::Value::as_str),
2870        _ => None,
2871    }
2872}
2873
2874fn parse_historical_json_column(value: Option<String>) -> serde_json::Value {
2875    match value {
2876        Some(raw) if raw.trim().is_empty() => serde_json::Value::Null,
2877        Some(raw) => wrap_historical_raw_json(raw),
2878        None => serde_json::Value::Null,
2879    }
2880}
2881
/// Reports whether the `CASS_DEBUG_HISTORICAL_SALVAGE` environment variable is
/// set. Only presence matters; the value (even an empty one) is ignored.
fn historical_salvage_debug_enabled() -> bool {
    let flag = std::env::var_os("CASS_DEBUG_HISTORICAL_SALVAGE");
    flag.is_some()
}
2885
/// Limits produced by `historical_import_batch_limits`.
///
/// Each field has a CPU-count-dependent default and can be overridden by a
/// `CASS_HISTORICAL_IMPORT_BATCH_*` environment variable.
// NOTE(review): presumably these cap the size of one historical import batch;
// the consumer is outside this part of the file — confirm there.
#[derive(Debug, Clone, Copy)]
struct HistoricalImportBatchLimits {
    /// Conversation limit (`CASS_HISTORICAL_IMPORT_BATCH_CONVERSATIONS`).
    conversations: usize,
    /// Message limit (`CASS_HISTORICAL_IMPORT_BATCH_MESSAGES`).
    messages: usize,
    /// Payload size limit (`CASS_HISTORICAL_IMPORT_BATCH_CHARS`).
    payload_chars: usize,
}
2892
2893fn env_positive_usize(key: &str) -> Option<usize> {
2894    dotenvy::var(key)
2895        .ok()
2896        .and_then(|value| value.parse::<usize>().ok())
2897        .filter(|value| *value > 0)
2898}
2899
2900fn historical_import_batch_limits() -> HistoricalImportBatchLimits {
2901    let cpu_count = std::thread::available_parallelism()
2902        .map(std::num::NonZeroUsize::get)
2903        .unwrap_or(1);
2904
2905    let default_limits = if cpu_count >= 32 {
2906        HistoricalImportBatchLimits {
2907            conversations: 128,
2908            messages: 16_384,
2909            payload_chars: 12_000_000,
2910        }
2911    } else {
2912        HistoricalImportBatchLimits {
2913            conversations: 32,
2914            messages: 4_096,
2915            payload_chars: 3_000_000,
2916        }
2917    };
2918
2919    HistoricalImportBatchLimits {
2920        conversations: env_positive_usize("CASS_HISTORICAL_IMPORT_BATCH_CONVERSATIONS")
2921            .unwrap_or(default_limits.conversations),
2922        messages: env_positive_usize("CASS_HISTORICAL_IMPORT_BATCH_MESSAGES")
2923            .unwrap_or(default_limits.messages),
2924        payload_chars: env_positive_usize("CASS_HISTORICAL_IMPORT_BATCH_CHARS")
2925            .unwrap_or(default_limits.payload_chars),
2926    }
2927}
2928
2929fn json_value_size_hint(value: &serde_json::Value) -> usize {
2930    if let Some(raw) = historical_raw_json(value) {
2931        return raw.len();
2932    }
2933    match value {
2934        serde_json::Value::Null => 0,
2935        other => serde_json::to_string(other)
2936            .map(|raw| raw.len())
2937            .unwrap_or(0),
2938    }
2939}
2940
2941fn message_payload_size_hint(message: &Message) -> usize {
2942    message
2943        .content
2944        .len()
2945        .saturating_add(json_value_size_hint(&message.extra_json))
2946}
2947
/// Returns `true` when `name` starts with `prefix` and is not a `-wal` or
/// `-shm` sidecar file name.
fn is_backup_root_name(name: &str, prefix: &str) -> bool {
    if !name.starts_with(prefix) {
        return false;
    }
    let is_wal_or_shm = ["-wal", "-shm"]
        .iter()
        .any(|sidecar| name.ends_with(sidecar));
    !is_wal_or_shm
}
2951
// Detects sqlite sidecar files, which must never be re-opened as a DB root.
// Covers the standard -wal/-shm pair as well as frankensqlite's Windows
// advisory-lock sidecars (-lock-shared/-lock-reserved/-lock-pending). The
// directory enumeration paths in `historical_bundle_root_paths` rely on this;
// `is_backup_root_name` deliberately does NOT, because the existing
// backup-rotation cleanup must keep sweeping up any pre-existing orphan lock
// sidecars.
fn has_db_sidecar_suffix(name: &str) -> bool {
    const SIDECAR_SUFFIXES: &[&str] = &[
        "-wal",
        "-shm",
        "-lock-shared",
        "-lock-reserved",
        "-lock-pending",
    ];
    for suffix in SIDECAR_SUFFIXES {
        if name.ends_with(suffix) {
            return true;
        }
    }
    false
}
2968
/// Public schema version constant for external checks.
pub const CURRENT_SCHEMA_VERSION: i64 = 20;
/// Oldest schema version that `check_schema_compatibility` reports as
/// `SchemaCheck::NeedsMigration`; positive versions below it are reported as
/// `SchemaCheck::NeedsRebuild`.
const MIN_IN_PLACE_MIGRATION_SCHEMA_VERSION: i64 = 13;
2972
/// Result of checking schema compatibility.
///
/// Produced by `check_schema_compatibility`, which inspects the database
/// through a read-only connection.
#[derive(Debug, Clone)]
pub enum SchemaCheck {
    /// Schema is up to date, no migration needed.
    Compatible,
    /// Schema needs migration but can be done incrementally. Also reported for
    /// a database file that contains no tables at all.
    NeedsMigration,
    /// Schema is incompatible and needs a full rebuild (with reason).
    NeedsRebuild(String),
}
2983
2984fn schema_check_error_requires_rebuild(err: &frankensqlite::FrankenError) -> bool {
2985    // Only on-disk corruption classes justify destructive rebuild.
2986    // Locking, open, and generic I/O failures are often transient and must
2987    // surface as errors rather than deleting the database under the caller.
2988    matches!(
2989        err,
2990        frankensqlite::FrankenError::DatabaseCorrupt { .. }
2991            | frankensqlite::FrankenError::WalCorrupt { .. }
2992            | frankensqlite::FrankenError::NotADatabase { .. }
2993            | frankensqlite::FrankenError::ShortRead { .. }
2994    )
2995}
2996
/// Builds a sibling path of `path` suitable for a uniquely named backup.
///
/// The file name has the form `<name>.backup.<pid>.<nanos>.<nonce>`, where
/// `<name>` is the original file name (`db` when it is missing or not valid
/// UTF-8), `<nanos>` is the time since the Unix epoch in nanoseconds (`0` if
/// the clock is before the epoch) and `<nonce>` is a process-wide counter.
/// Nothing is created on disk.
fn unique_backup_path(path: &Path) -> PathBuf {
    static NEXT_NONCE: std::sync::atomic::AtomicU64 = std::sync::atomic::AtomicU64::new(0);

    let nanos_since_epoch = match SystemTime::now().duration_since(UNIX_EPOCH) {
        Ok(elapsed) => elapsed.as_nanos(),
        Err(_) => 0,
    };
    let sequence = NEXT_NONCE.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
    let base_name = match path.file_name() {
        Some(os_name) => os_name.to_str().unwrap_or("db"),
        None => "db",
    };
    let pid = std::process::id();

    path.with_file_name(format!(
        "{base_name}.backup.{pid}.{nanos_since_epoch}.{sequence}"
    ))
}
3014
/// Derives the hidden staging path used while a backup is being vacuumed.
///
/// The result sits next to `backup_path` and is named
/// `.<file name>.vacuum-in-progress`; `db.backup` stands in for a file name
/// that is missing or not valid UTF-8.
fn vacuum_stage_backup_path(backup_path: &Path) -> PathBuf {
    let visible_name = match backup_path.file_name() {
        Some(os_name) => os_name.to_str().unwrap_or("db.backup"),
        None => "db.backup",
    };
    let hidden_name = format!(".{visible_name}.vacuum-in-progress");
    backup_path.with_file_name(hidden_name)
}
3022
3023/// Check schema compatibility without modifying the database.
3024///
3025/// Opens the database read-only and checks the schema version.
3026fn check_schema_compatibility(
3027    path: &Path,
3028) -> std::result::Result<SchemaCheck, frankensqlite::FrankenError> {
3029    let mut conn = open_franken_with_flags(
3030        &path.to_string_lossy(),
3031        FrankenOpenFlags::SQLITE_OPEN_READ_ONLY,
3032    )?;
3033
3034    let result = (|| {
3035        // Check if meta table exists
3036        let meta_exists: i32 = conn.query_row_map(
3037            "SELECT COUNT(*) FROM sqlite_master WHERE type='table' AND name='meta'",
3038            fparams![],
3039            |row| row.get_typed(0),
3040        )?;
3041
3042        if meta_exists == 0 {
3043            // No meta table - could be empty or very old schema, needs rebuild
3044            // But first check if there are any tables at all
3045            let table_count: i32 = conn.query_row_map(
3046                "SELECT COUNT(*) FROM sqlite_master WHERE type='table'",
3047                fparams![],
3048                |row| row.get_typed(0),
3049            )?;
3050
3051            if table_count == 0 {
3052                // Empty database, will be initialized fresh
3053                return Ok(SchemaCheck::NeedsMigration);
3054            }
3055
3056            // Has tables but no meta - very old or corrupted
3057            return Ok(SchemaCheck::NeedsRebuild(
3058                "Database missing schema version metadata".to_string(),
3059            ));
3060        }
3061
3062        // Get the schema version
3063        let version: Option<i64> = conn
3064            .query_row_map(
3065                "SELECT value FROM meta WHERE key = 'schema_version'",
3066                fparams![],
3067                |row| Ok(row.get_typed::<String>(0)?.parse().ok()),
3068            )
3069            .ok()
3070            .flatten();
3071
3072        match version {
3073            Some(v) if v == SCHEMA_VERSION => Ok(SchemaCheck::Compatible),
3074            Some(v) if (MIN_IN_PLACE_MIGRATION_SCHEMA_VERSION..SCHEMA_VERSION).contains(&v) => {
3075                Ok(SchemaCheck::NeedsMigration)
3076            }
3077            Some(v) if v > 0 && v < MIN_IN_PLACE_MIGRATION_SCHEMA_VERSION => {
3078                Ok(SchemaCheck::NeedsRebuild(format!(
3079                    "Schema version {} is too old for in-place migration; supported upgrade path starts at version {}",
3080                    v, MIN_IN_PLACE_MIGRATION_SCHEMA_VERSION
3081                )))
3082            }
3083            Some(v) => {
3084                // v > SCHEMA_VERSION - database is from a newer version
3085                Ok(SchemaCheck::NeedsRebuild(format!(
3086                    "Schema version {} is newer than supported version {}",
3087                    v, SCHEMA_VERSION
3088                )))
3089            }
3090            None => Ok(SchemaCheck::NeedsRebuild(
3091                "Schema version not found or invalid".to_string(),
3092            )),
3093        }
3094    })();
3095
3096    if let Err(close_err) = conn.close_in_place() {
3097        tracing::warn!(
3098            error = %close_err,
3099            db_path = %path.display(),
3100            "check_schema_compatibility: close_in_place failed; falling back to best-effort close"
3101        );
3102        conn.close_best_effort_in_place();
3103    }
3104
3105    result
3106}
3107
/// Module-private alias of [`CURRENT_SCHEMA_VERSION`].
const SCHEMA_VERSION: i64 = CURRENT_SCHEMA_VERSION;
3109
/// Test-only SQL script for migration V1: enables foreign keys and creates the
/// base tables (`meta`, `agents`, `workspaces`, `conversations`, `messages`,
/// `snippets`, `tags`, `conversation_tags`) plus two indexes.
#[cfg(test)]
const MIGRATION_V1: &str = r"
PRAGMA foreign_keys = ON;

CREATE TABLE IF NOT EXISTS meta (
    key TEXT PRIMARY KEY,
    value TEXT NOT NULL
);

CREATE TABLE IF NOT EXISTS agents (
    id INTEGER PRIMARY KEY,
    slug TEXT NOT NULL UNIQUE,
    name TEXT NOT NULL,
    version TEXT,
    kind TEXT NOT NULL,
    created_at INTEGER NOT NULL,
    updated_at INTEGER NOT NULL
);

CREATE TABLE IF NOT EXISTS workspaces (
    id INTEGER PRIMARY KEY,
    path TEXT NOT NULL UNIQUE,
    display_name TEXT
);

CREATE TABLE IF NOT EXISTS conversations (
    id INTEGER PRIMARY KEY,
    agent_id INTEGER NOT NULL REFERENCES agents(id),
    workspace_id INTEGER REFERENCES workspaces(id),
    external_id TEXT,
    title TEXT,
    source_path TEXT NOT NULL,
    started_at INTEGER,
    ended_at INTEGER,
    approx_tokens INTEGER,
    metadata_json TEXT,
    UNIQUE(agent_id, external_id)
);

CREATE TABLE IF NOT EXISTS messages (
    id INTEGER PRIMARY KEY,
    conversation_id INTEGER NOT NULL REFERENCES conversations(id) ON DELETE CASCADE,
    idx INTEGER NOT NULL,
    role TEXT NOT NULL,
    author TEXT,
    created_at INTEGER,
    content TEXT NOT NULL,
    extra_json TEXT,
    UNIQUE(conversation_id, idx)
);

CREATE TABLE IF NOT EXISTS snippets (
    id INTEGER PRIMARY KEY,
    message_id INTEGER NOT NULL REFERENCES messages(id) ON DELETE CASCADE,
    file_path TEXT,
    start_line INTEGER,
    end_line INTEGER,
    language TEXT,
    snippet_text TEXT
);

CREATE TABLE IF NOT EXISTS tags (
    id INTEGER PRIMARY KEY,
    name TEXT NOT NULL UNIQUE
);

CREATE TABLE IF NOT EXISTS conversation_tags (
    conversation_id INTEGER NOT NULL REFERENCES conversations(id) ON DELETE CASCADE,
    tag_id INTEGER NOT NULL REFERENCES tags(id) ON DELETE CASCADE,
    PRIMARY KEY (conversation_id, tag_id)
);

CREATE INDEX IF NOT EXISTS idx_conversations_agent_started
    ON conversations(agent_id, started_at DESC);

CREATE INDEX IF NOT EXISTS idx_messages_conv_idx
    ON messages(conversation_id, idx);

";
3189
/// Test-only SQL script for migration V2: creates the `fts_messages` FTS5
/// virtual table (porter tokenizer) if it does not exist and fills it from
/// `messages` joined with `conversations`, `agents` and `workspaces`.
#[cfg(test)]
const MIGRATION_V2: &str = r"
CREATE VIRTUAL TABLE IF NOT EXISTS fts_messages USING fts5(
    content,
    title,
    agent,
    workspace,
    source_path,
    created_at UNINDEXED,
    message_id UNINDEXED,
    tokenize='porter'
);
INSERT INTO fts_messages(content, title, agent, workspace, source_path, created_at, message_id)
SELECT
    m.content,
    c.title,
    a.slug,
    w.path,
    c.source_path,
    m.created_at,
    m.id
FROM messages m
JOIN conversations c ON m.conversation_id = c.id
JOIN agents a ON c.agent_id = a.id
LEFT JOIN workspaces w ON c.workspace_id = w.id;
";
3216
/// Test-only SQL script for migration V3: drops `fts_messages`, recreates it
/// with the same column layout as V2, and refills it from the core tables.
#[cfg(test)]
#[allow(dead_code)]
const MIGRATION_V3: &str = r"
DROP TABLE IF EXISTS fts_messages;
CREATE VIRTUAL TABLE fts_messages USING fts5(
    content,
    title,
    agent,
    workspace,
    source_path,
    created_at UNINDEXED,
    message_id UNINDEXED,
    tokenize='porter'
);
INSERT INTO fts_messages(content, title, agent, workspace, source_path, created_at, message_id)
SELECT
    m.content,
    c.title,
    a.slug,
    w.path,
    c.source_path,
    m.created_at,
    m.id
FROM messages m
JOIN conversations c ON m.conversation_id = c.id
JOIN agents a ON c.agent_id = a.id
LEFT JOIN workspaces w ON c.workspace_id = w.id;
";
3245
/// Test-only SQL script for migration V4: creates the `sources` table and
/// inserts the default `'local'` source row if it is not already present.
#[cfg(test)]
const MIGRATION_V4: &str = r"
-- Sources table for tracking where conversations come from
CREATE TABLE IF NOT EXISTS sources (
    id TEXT PRIMARY KEY,           -- source_id (e.g., 'local', 'work-laptop')
    kind TEXT NOT NULL,            -- 'local', 'ssh', etc.
    host_label TEXT,               -- display label
    machine_id TEXT,               -- optional stable machine id
    platform TEXT,                 -- 'macos', 'linux', 'windows'
    config_json TEXT,              -- JSON blob for extra config (SSH params, path rewrites)
    created_at INTEGER NOT NULL,
    updated_at INTEGER NOT NULL
);

-- Bootstrap: Insert the default 'local' source
INSERT OR IGNORE INTO sources (id, kind, host_label, created_at, updated_at)
VALUES ('local', 'local', NULL, strftime('%s','now')*1000, strftime('%s','now')*1000);
";
3264
/// Test-only SQL script for migration V5: rebuilds `conversations` with the
/// `source_id` and `origin_host` columns and a
/// `UNIQUE(source_id, agent_id, external_id)` constraint, copying existing
/// rows with `source_id = 'local'`, then recreates the indexes.
#[cfg(test)]
const MIGRATION_V5: &str = r"
-- Add provenance columns to conversations table
-- SQLite cannot alter unique constraints, so we need to recreate the table

-- Create new table with provenance columns and updated unique constraint
CREATE TABLE conversations_new (
    id INTEGER PRIMARY KEY,
    agent_id INTEGER NOT NULL REFERENCES agents(id),
    workspace_id INTEGER REFERENCES workspaces(id),
    source_id TEXT NOT NULL DEFAULT 'local' REFERENCES sources(id),
    external_id TEXT,
    title TEXT,
    source_path TEXT NOT NULL,
    started_at INTEGER,
    ended_at INTEGER,
    approx_tokens INTEGER,
    metadata_json TEXT,
    origin_host TEXT,
    UNIQUE(source_id, agent_id, external_id)
);

-- Copy data from old table (all existing conversations get source_id='local')
INSERT INTO conversations_new (id, agent_id, workspace_id, source_id, external_id, title,
                               source_path, started_at, ended_at, approx_tokens, metadata_json, origin_host)
SELECT id, agent_id, workspace_id, 'local', external_id, title,
       source_path, started_at, ended_at, approx_tokens, metadata_json, NULL
FROM conversations;

-- Drop old table and rename new
DROP TABLE conversations;
ALTER TABLE conversations_new RENAME TO conversations;

-- Recreate indexes
CREATE INDEX IF NOT EXISTS idx_conversations_agent_started ON conversations(agent_id, started_at DESC);
CREATE INDEX IF NOT EXISTS idx_conversations_source_id ON conversations(source_id);
";
3302
/// Test-only SQL script for migration V6: adds an index on
/// `conversations(source_path)`.
#[cfg(test)]
const MIGRATION_V6: &str = r"
-- Optimize lookup by source_path (used by TUI detail view)
CREATE INDEX IF NOT EXISTS idx_conversations_source_path ON conversations(source_path);
";
3308
/// Test-only SQL script for migration V7: adds the BLOB columns
/// `conversations.metadata_bin` and `messages.extra_bin`.
#[cfg(test)]
const MIGRATION_V7: &str = r"
-- Add binary columns for MessagePack serialization (Opt 3.1)
-- Binary format is 50-70% smaller than JSON and faster to parse
ALTER TABLE conversations ADD COLUMN metadata_bin BLOB;
ALTER TABLE messages ADD COLUMN extra_bin BLOB;
";
3316
/// Test-only SQL script for migration V8: creates the `daily_stats` rollup
/// table keyed by `(day_id, agent_slug, source_id)` and two indexes on it.
#[cfg(test)]
const MIGRATION_V8: &str = r"
-- Opt 3.2: Daily stats materialized table for O(1) time-range histograms
-- Provides fast aggregated queries for stats/dashboard without full table scans

CREATE TABLE IF NOT EXISTS daily_stats (
    day_id INTEGER NOT NULL,              -- Days since 2020-01-01 (Unix epoch + offset)
    agent_slug TEXT NOT NULL,             -- 'all' for totals, or specific agent slug
    source_id TEXT NOT NULL DEFAULT 'all', -- 'all' for totals, or specific source
    session_count INTEGER NOT NULL DEFAULT 0,
    message_count INTEGER NOT NULL DEFAULT 0,
    total_chars INTEGER NOT NULL DEFAULT 0,
    last_updated INTEGER NOT NULL,
    PRIMARY KEY (day_id, agent_slug, source_id)
);

CREATE INDEX IF NOT EXISTS idx_daily_stats_agent ON daily_stats(agent_slug, day_id);
CREATE INDEX IF NOT EXISTS idx_daily_stats_source ON daily_stats(source_id, day_id);
";
3336
/// Test-only SQL script for migration V9: creates the `embedding_jobs` table
/// and a partial unique index that allows at most one `pending` or `running`
/// job per `(db_path, model_id)`.
#[cfg(test)]
const MIGRATION_V9: &str = r"
-- Background embedding jobs tracking table
CREATE TABLE IF NOT EXISTS embedding_jobs (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    db_path TEXT NOT NULL,
    model_id TEXT NOT NULL,
    status TEXT NOT NULL DEFAULT 'pending',
    total_docs INTEGER NOT NULL DEFAULT 0,
    completed_docs INTEGER NOT NULL DEFAULT 0,
    error_message TEXT,
    created_at TEXT NOT NULL DEFAULT (datetime('now')),
    started_at TEXT,
    completed_at TEXT
);

-- Only one pending or running job per (db_path, model_id) at a time.
-- Multiple completed/failed/cancelled jobs are allowed for history.
CREATE UNIQUE INDEX IF NOT EXISTS idx_embedding_jobs_active
ON embedding_jobs(db_path, model_id)
WHERE status IN ('pending', 'running');
";
3359
/// Test-only copy of the V10 (token analytics) migration script.
///
/// Creates the per-message `token_usage` ledger, the pre-aggregated
/// `token_daily_stats` rollup table, and the `model_pricing` lookup table
/// (seeded with ten pricing rows), then adds token-summary columns to
/// `conversations`. The `CREATE` statements use `IF NOT EXISTS` and the seed
/// uses `INSERT OR IGNORE`; the trailing `ALTER TABLE ... ADD COLUMN`
/// statements carry no such guard.
#[cfg(test)]
const MIGRATION_V10: &str = r"
-- Token analytics: per-message token usage ledger
CREATE TABLE IF NOT EXISTS token_usage (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    message_id INTEGER NOT NULL REFERENCES messages(id) ON DELETE CASCADE,
    conversation_id INTEGER NOT NULL,
    agent_id INTEGER NOT NULL,
    workspace_id INTEGER,
    source_id TEXT NOT NULL DEFAULT 'local',

    -- Timing
    timestamp_ms INTEGER NOT NULL,
    day_id INTEGER NOT NULL,

    -- Model identification
    model_name TEXT,
    model_family TEXT,
    model_tier TEXT,
    service_tier TEXT,
    provider TEXT,

    -- Token counts (nullable — not all agents provide all fields)
    input_tokens INTEGER,
    output_tokens INTEGER,
    cache_read_tokens INTEGER,
    cache_creation_tokens INTEGER,
    thinking_tokens INTEGER,
    total_tokens INTEGER,

    -- Cost estimation
    estimated_cost_usd REAL,

    -- Message context
    role TEXT NOT NULL,
    content_chars INTEGER NOT NULL,
    has_tool_calls INTEGER NOT NULL DEFAULT 0,
    tool_call_count INTEGER NOT NULL DEFAULT 0,

    -- Data quality
    data_source TEXT NOT NULL DEFAULT 'api',

    UNIQUE(message_id)
);

CREATE INDEX IF NOT EXISTS idx_token_usage_day ON token_usage(day_id, agent_id);
CREATE INDEX IF NOT EXISTS idx_token_usage_conv ON token_usage(conversation_id);
CREATE INDEX IF NOT EXISTS idx_token_usage_model ON token_usage(model_family, day_id);
CREATE INDEX IF NOT EXISTS idx_token_usage_workspace ON token_usage(workspace_id, day_id);
CREATE INDEX IF NOT EXISTS idx_token_usage_timestamp ON token_usage(timestamp_ms);

-- Token analytics: pre-aggregated daily rollups
CREATE TABLE IF NOT EXISTS token_daily_stats (
    day_id INTEGER NOT NULL,
    agent_slug TEXT NOT NULL,
    source_id TEXT NOT NULL DEFAULT 'all',
    model_family TEXT NOT NULL DEFAULT 'all',

    api_call_count INTEGER NOT NULL DEFAULT 0,
    user_message_count INTEGER NOT NULL DEFAULT 0,
    assistant_message_count INTEGER NOT NULL DEFAULT 0,
    tool_message_count INTEGER NOT NULL DEFAULT 0,

    total_input_tokens INTEGER NOT NULL DEFAULT 0,
    total_output_tokens INTEGER NOT NULL DEFAULT 0,
    total_cache_read_tokens INTEGER NOT NULL DEFAULT 0,
    total_cache_creation_tokens INTEGER NOT NULL DEFAULT 0,
    total_thinking_tokens INTEGER NOT NULL DEFAULT 0,
    grand_total_tokens INTEGER NOT NULL DEFAULT 0,

    total_content_chars INTEGER NOT NULL DEFAULT 0,
    total_tool_calls INTEGER NOT NULL DEFAULT 0,

    estimated_cost_usd REAL NOT NULL DEFAULT 0.0,

    session_count INTEGER NOT NULL DEFAULT 0,

    last_updated INTEGER NOT NULL,

    PRIMARY KEY (day_id, agent_slug, source_id, model_family)
);

CREATE INDEX IF NOT EXISTS idx_token_daily_stats_agent ON token_daily_stats(agent_slug, day_id);
CREATE INDEX IF NOT EXISTS idx_token_daily_stats_model ON token_daily_stats(model_family, day_id);

-- Model pricing lookup table
CREATE TABLE IF NOT EXISTS model_pricing (
    model_pattern TEXT NOT NULL,
    provider TEXT NOT NULL,
    input_cost_per_mtok REAL NOT NULL,
    output_cost_per_mtok REAL NOT NULL,
    cache_read_cost_per_mtok REAL,
    cache_creation_cost_per_mtok REAL,
    effective_date TEXT NOT NULL,
    PRIMARY KEY (model_pattern, effective_date)
);

-- Seed with current pricing (as of 2026-02)
INSERT OR IGNORE INTO model_pricing VALUES
    ('claude-opus-4%', 'anthropic', 15.0, 75.0, 1.5, 18.75, '2025-10-01'),
    ('claude-sonnet-4%', 'anthropic', 3.0, 15.0, 0.3, 3.75, '2025-10-01'),
    ('claude-haiku-4%', 'anthropic', 0.80, 4.0, 0.08, 1.0, '2025-10-01'),
    ('gpt-4o%', 'openai', 2.50, 10.0, NULL, NULL, '2025-01-01'),
    ('gpt-4-turbo%', 'openai', 10.0, 30.0, NULL, NULL, '2024-04-01'),
    ('gpt-4.1%', 'openai', 2.0, 8.0, NULL, NULL, '2025-04-01'),
    ('o3%', 'openai', 2.0, 8.0, NULL, NULL, '2025-04-01'),
    ('o4-mini%', 'openai', 1.10, 4.40, NULL, NULL, '2025-04-01'),
    ('gemini-2%flash%', 'google', 0.075, 0.30, NULL, NULL, '2025-01-01'),
    ('gemini-2%pro%', 'google', 1.25, 10.0, NULL, NULL, '2025-01-01');

-- Extend conversations table with token summary columns
ALTER TABLE conversations ADD COLUMN total_input_tokens INTEGER;
ALTER TABLE conversations ADD COLUMN total_output_tokens INTEGER;
ALTER TABLE conversations ADD COLUMN total_cache_read_tokens INTEGER;
ALTER TABLE conversations ADD COLUMN total_cache_creation_tokens INTEGER;
ALTER TABLE conversations ADD COLUMN grand_total_tokens INTEGER;
ALTER TABLE conversations ADD COLUMN estimated_cost_usd REAL;
ALTER TABLE conversations ADD COLUMN primary_model TEXT;
ALTER TABLE conversations ADD COLUMN api_call_count INTEGER;
ALTER TABLE conversations ADD COLUMN tool_call_count INTEGER;
ALTER TABLE conversations ADD COLUMN user_message_count INTEGER;
ALTER TABLE conversations ADD COLUMN assistant_message_count INTEGER;
";
3483
/// V14 migration script: drops the `fts_messages` table (if present).
///
/// The script only removes the old table; per the embedded SQL comment the
/// contentless replacement is recreated lazily after `open()`, not here.
const MIGRATION_V14: &str = r"
-- Switch FTS5 from internal-content to contentless mode (CASS #163).
-- Drop the old V13 internal-content fts_messages first so that
-- sqlite_schema does not contain two conflicting CREATE VIRTUAL TABLE
-- entries, which makes the database completely unreadable.
-- The current contentless table is recreated lazily after open() only when the
-- frankensqlite FTS consistency check finds it missing or malformed.
DROP TABLE IF EXISTS fts_messages;
";
3493
/// DDL that creates the `conversation_tail_state` table when it is absent.
///
/// The table is keyed by `conversation_id` and holds three nullable integer
/// tail columns. NOTE(review): `MIGRATION_V18` contains the same
/// `CREATE TABLE IF NOT EXISTS`; how this constant is wired into the
/// migration sequence is not visible in this section.
const MIGRATION_V15_TAIL_STATE_TABLE: &str = r"
CREATE TABLE IF NOT EXISTS conversation_tail_state (
    -- Deliberately no FOREIGN KEY: this hot row is maintained by insert/append
    -- paths, and FK metadata keeps frankensqlite off the direct rowid update path.
    conversation_id INTEGER PRIMARY KEY,
    ended_at INTEGER,
    last_message_idx INTEGER,
    last_message_created_at INTEGER
);
";
3504
/// V16 migration script: drops the `idx_messages_conv_idx` index (if present).
///
/// The embedded SQL comment explains the rationale: the index duplicates the
/// automatic index behind `UNIQUE(conversation_id, idx)`.
const MIGRATION_V16: &str = r"
-- UNIQUE(conversation_id, idx) already creates sqlite_autoindex_messages_1,
-- which covers the same lookup/order key as idx_messages_conv_idx. Keeping both
-- doubles message insert index maintenance on the hot indexing path.
DROP INDEX IF EXISTS idx_messages_conv_idx;
";
3511
/// V17 migration script: drops the `idx_messages_created` index (if present).
///
/// See the embedded SQL comment for why the `messages(created_at)` index is
/// removed from the ingest path.
const MIGRATION_V17: &str = r"
-- Drop the global messages(created_at) secondary index from the ingest hot
-- path. Search/time filters are served by the derived search layer and
-- conversation/analytics indexes, while this index is maintained on every
-- message insert.
DROP INDEX IF EXISTS idx_messages_created;
";
3519
/// V18 migration script: creates `conversation_tail_state` and backfills it.
///
/// The backfill copies `ended_at`, `last_message_idx`, and
/// `last_message_created_at` from every `conversations` row where at least
/// one of those columns is non-NULL. It uses `INSERT OR REPLACE`, so an
/// existing tail row for the same conversation is overwritten.
const MIGRATION_V18: &str = r"
-- Move append-tail state out of the wide, indexed conversations row. The hot
-- append path updates this cache for every appended conversation; keeping it in
-- a tiny rowid table avoids rewriting the large conversation record.
CREATE TABLE IF NOT EXISTS conversation_tail_state (
    -- Deliberately no FOREIGN KEY: this hot row is maintained by insert/append
    -- paths, and FK metadata keeps frankensqlite off the direct rowid update path.
    conversation_id INTEGER PRIMARY KEY,
    ended_at INTEGER,
    last_message_idx INTEGER,
    last_message_created_at INTEGER
);

INSERT OR REPLACE INTO conversation_tail_state (
    conversation_id, ended_at, last_message_idx, last_message_created_at
)
SELECT id, ended_at, last_message_idx, last_message_created_at
FROM conversations
WHERE ended_at IS NOT NULL
   OR last_message_idx IS NOT NULL
   OR last_message_created_at IS NOT NULL;
";
3542
/// V19 migration script: creates `conversation_external_lookup` and backfills it.
///
/// Each conversation with a non-NULL `external_id` gets one row whose
/// `lookup_key` is the length-prefixed concatenation
/// `<length(source_id)>:<source_id>:<agent_id>:<length(external_id)>:<external_id>`.
///
/// NOTE(review): SQL `length()` on TEXT counts characters, not bytes; any
/// Rust-side code that builds the same key must agree — confirm against the
/// key builder, which is outside this section.
const MIGRATION_V19: &str = r"
-- Materialize external conversation provenance into one compact lookup key.
-- This keeps the hot append/new-conversation probe on a single primary-key
-- lookup instead of a composite conversations-table predicate.
CREATE TABLE IF NOT EXISTS conversation_external_lookup (
    lookup_key TEXT PRIMARY KEY,
    conversation_id INTEGER NOT NULL
);

INSERT OR REPLACE INTO conversation_external_lookup (lookup_key, conversation_id)
SELECT
    CAST(length(source_id) AS TEXT) || ':' || source_id || ':' ||
    CAST(agent_id AS TEXT) || ':' ||
    CAST(length(external_id) AS TEXT) || ':' || external_id,
    id
FROM conversations
WHERE external_id IS NOT NULL;
";
3561
/// V20 migration script: creates `conversation_external_tail_lookup` and
/// backfills it.
///
/// Rows use the same length-prefixed `lookup_key` expression as
/// `MIGRATION_V19` and additionally carry the three tail columns, read from
/// `conversation_tail_state` through correlated subqueries (so they are NULL
/// when a conversation has no tail-state row). Only conversations with a
/// non-NULL `external_id` are backfilled.
const MIGRATION_V20: &str = r"
-- Fuse external conversation lookup with append-tail state. Append-heavy
-- workloads can resolve both the conversation id and tail plan from one
-- primary-key probe.
CREATE TABLE IF NOT EXISTS conversation_external_tail_lookup (
    lookup_key TEXT PRIMARY KEY,
    conversation_id INTEGER NOT NULL,
    ended_at INTEGER,
    last_message_idx INTEGER,
    last_message_created_at INTEGER
);

INSERT OR REPLACE INTO conversation_external_tail_lookup (
    lookup_key,
    conversation_id,
    ended_at,
    last_message_idx,
    last_message_created_at
)
SELECT
    CAST(length(c.source_id) AS TEXT) || ':' || c.source_id || ':' ||
    CAST(c.agent_id AS TEXT) || ':' ||
    CAST(length(c.external_id) AS TEXT) || ':' || c.external_id,
    c.id,
    (SELECT ts.ended_at
     FROM conversation_tail_state ts
     WHERE ts.conversation_id = c.id),
    (SELECT ts.last_message_idx
     FROM conversation_tail_state ts
     WHERE ts.conversation_id = c.id),
    (SELECT ts.last_message_created_at
     FROM conversation_tail_state ts
     WHERE ts.conversation_id = c.id)
FROM conversations c
WHERE c.external_id IS NOT NULL;
";
3598
/// Row from the `embedding_jobs` table.
///
/// The field names match the column list of the embedding-jobs schema script
/// in this file; nullable columns are modelled as `Option`.
#[derive(Debug, Clone)]
pub struct EmbeddingJobRow {
    /// Auto-incremented primary key.
    pub id: i64,
    /// `db_path` column; together with `model_id` it forms the key of the
    /// partial unique index over active jobs.
    pub db_path: String,
    /// `model_id` column.
    pub model_id: String,
    /// `status` column (schema default `'pending'`). The partial unique index
    /// treats `'pending'` and `'running'` as the active states.
    pub status: String,
    /// `total_docs` column (schema default 0).
    pub total_docs: i64,
    /// `completed_docs` column (schema default 0).
    pub completed_docs: i64,
    /// `error_message` column; nullable.
    pub error_message: Option<String>,
    /// `created_at` column, stored as TEXT (schema default `datetime('now')`).
    pub created_at: String,
    /// `started_at` column; nullable TEXT.
    pub started_at: Option<String>,
    /// `completed_at` column; nullable TEXT.
    pub completed_at: Option<String>,
}
3613
/// Lightweight conversation projection used while rebuilding the lexical index.
///
/// This intentionally omits `metadata_json` / `metadata_bin` and other bulky
/// fields because Tantivy only needs the stable envelope plus provenance
/// identifiers. Reading full metadata here can force frankensqlite to traverse
/// large overflow chains before the first lexical checkpoint is committed.
#[derive(Debug, Clone)]
pub struct LexicalRebuildConversationRow {
    /// Conversation row id, when known.
    pub id: Option<i64>,
    /// Slug of the agent the conversation belongs to.
    pub agent_slug: String,
    /// Workspace path, when the conversation has one.
    pub workspace: Option<PathBuf>,
    /// External identifier of the conversation, when present.
    pub external_id: Option<String>,
    /// Conversation title, when present.
    pub title: Option<String>,
    /// Path of the source the conversation was read from.
    pub source_path: PathBuf,
    /// Start timestamp as a raw integer (unit not established in this section).
    pub started_at: Option<i64>,
    /// End timestamp as a raw integer (unit not established in this section).
    pub ended_at: Option<i64>,
    /// Provenance source identifier.
    pub source_id: String,
    /// Originating host, when recorded.
    pub origin_host: Option<String>,
}
3633
/// Lightweight per-conversation footprint used to pre-plan lexical rebuild
/// shard boundaries without re-reading full message bodies in the hot path.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct LexicalRebuildConversationFootprintRow {
    /// Id of the conversation this footprint describes.
    pub conversation_id: i64,
    /// Number of messages attributed to the conversation.
    pub message_count: usize,
    /// Size of the conversation's messages in bytes. This can be an estimate:
    /// `lexical_rebuild_conversation_footprint_from_count` fills it with
    /// `message_count * LEXICAL_REBUILD_PLANNER_ESTIMATED_BYTES_PER_MESSAGE`.
    pub message_bytes: usize,
}
3642
/// Per-message byte estimate (4 KiB) used when a footprint is derived from a
/// message count alone instead of measured message sizes.
pub(crate) const LEXICAL_REBUILD_PLANNER_ESTIMATED_BYTES_PER_MESSAGE: usize = 4 * 1024;
/// Largest number of conversations that may lack tail metadata while coverage
/// is still reported as sufficient by
/// `lexical_rebuild_tail_metadata_coverage_is_sufficient`.
const LEXICAL_REBUILD_FOOTPRINT_POINT_TAIL_FALLBACK_LIMIT: usize = 64;
3645
3646fn lexical_rebuild_tail_metadata_coverage_is_sufficient(
3647    total_conversations: usize,
3648    covered_conversations: usize,
3649) -> bool {
3650    total_conversations == 0
3651        || total_conversations.saturating_sub(covered_conversations.min(total_conversations))
3652            <= LEXICAL_REBUILD_FOOTPRINT_POINT_TAIL_FALLBACK_LIMIT
3653}
3654
/// Converts a stored last-message index into a message count (`idx + 1`).
///
/// Returns `None` when the index is absent, negative, or when the resulting
/// count cannot be represented as `usize`.
fn lexical_rebuild_message_count_from_tail_idx(last_message_idx: Option<i64>) -> Option<usize> {
    last_message_idx
        .and_then(|idx| u64::try_from(idx).ok())
        .and_then(|idx| idx.checked_add(1))
        .and_then(|count| usize::try_from(count).ok())
}
3660
3661fn lexical_rebuild_conversation_footprint_from_count(
3662    conversation_id: i64,
3663    message_count: usize,
3664) -> LexicalRebuildConversationFootprintRow {
3665    LexicalRebuildConversationFootprintRow {
3666        conversation_id,
3667        message_count,
3668        message_bytes: message_count
3669            .saturating_mul(LEXICAL_REBUILD_PLANNER_ESTIMATED_BYTES_PER_MESSAGE),
3670    }
3671}
3672
/// Lightweight message projection used by the streaming lexical rebuild path.
#[derive(Debug, Clone)]
pub struct LexicalRebuildMessageRow {
    /// Id of the conversation the message belongs to.
    pub conversation_id: i64,
    /// Message row id.
    pub id: i64,
    /// Index of the message within its conversation.
    pub idx: i64,
    /// Message role, as stored text.
    pub role: String,
    /// Message author, when recorded.
    pub author: Option<String>,
    /// Creation timestamp as a raw integer (unit not established in this section).
    pub created_at: Option<i64>,
    /// Message body.
    pub content: String,
}
3684
/// Even lighter message projection used only by the grouped lexical rebuild
/// stream hot path. It keeps just the per-message fields the rebuild consumes
/// and tracks the final message id at conversation scope instead.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct LexicalRebuildGroupedMessageRow {
    /// Index of the message within its conversation.
    pub idx: i64,
    /// Whether the message's role is the tool role; replaces the role string
    /// carried by `LexicalRebuildMessageRow`.
    pub is_tool_role: bool,
    /// Creation timestamp as a raw integer (unit not established in this section).
    pub created_at: Option<i64>,
    /// Message body.
    pub content: String,
}
3695
/// Grouped message rows for one conversation, stored in a `SmallVec` with
/// inline capacity for 32 rows.
pub type LexicalRebuildGroupedMessageRows = SmallVec<[LexicalRebuildGroupedMessageRow; 32]>;
3697
/// Compatibility alias retained while call sites finish converging on
/// [`FrankenStorage`]; the two names refer to the same type.
pub type SqliteStorage = FrankenStorage;
3700
/// Primary frankensqlite-backed storage backend.
///
/// Owns one frankensqlite connection plus per-handle bookkeeping. The
/// `ensured_*` caches sit behind `Arc`s so that writer handles opened from an
/// existing storage can share them.
pub struct FrankenStorage {
    /// Underlying frankensqlite connection owned by this handle.
    conn: FrankenConnection,
    /// Path this handle was opened with; reused when opening additional
    /// writer connections to the same database.
    db_path: PathBuf,
    /// Starts `false`. NOTE(review): the code that sets it is outside this
    /// section.
    ephemeral_writer_preflight_verified: AtomicBool,
    /// Starts at `UNSET_INDEX_WRITER_CHECKPOINT_PAGES`; the value is saved and
    /// restored when a cached ephemeral writer is released and re-acquired.
    index_writer_checkpoint_pages: AtomicI64,
    /// Starts at `UNSET_INDEX_WRITER_BUSY_TIMEOUT_MS`; the value is saved and
    /// restored when a cached ephemeral writer is released and re-acquired.
    index_writer_busy_timeout_ms: AtomicU64,
    /// Slot holding at most one parked writer connection for reuse.
    cached_ephemeral_writer: parking_lot::Mutex<CachedEphemeralWriter>,
    /// Agent key -> row id for agents already ensured.
    ensured_agents: Arc<parking_lot::Mutex<HashMap<EnsuredAgentKey, i64>>>,
    /// Workspace key -> row id for workspaces already ensured.
    ensured_workspaces: Arc<parking_lot::Mutex<HashMap<EnsuredWorkspaceKey, i64>>>,
    /// Conversation sources already ensured.
    ensured_conversation_sources: Arc<parking_lot::Mutex<HashSet<EnsuredConversationSourceKey>>>,
    /// Daily-stats keys already ensured.
    ensured_daily_stats_keys: Arc<parking_lot::Mutex<HashSet<EnsuredDailyStatsKey>>>,
    /// Tri-state cache (`FTS_MESSAGES_PRESENT_*`) of whether the
    /// `fts_messages` table exists.
    fts_messages_present_cache: AtomicI8,
}
3715
/// Keep ordinary storage commits from tripping over frequent auto-checkpoints
/// while still bounding WAL growth. Bulk index paths may override this through
/// their explicit checkpoint policy.
const DEFAULT_WAL_AUTOCHECKPOINT_PAGES: i64 = 4096;
/// Initial value of `FrankenStorage::index_writer_checkpoint_pages` on a
/// freshly constructed handle.
const UNSET_INDEX_WRITER_CHECKPOINT_PAGES: i64 = i64::MIN;
/// Initial value of `FrankenStorage::index_writer_busy_timeout_ms` on a
/// freshly constructed handle.
const UNSET_INDEX_WRITER_BUSY_TIMEOUT_MS: u64 = 0;
/// `fts_messages_present_cache` state: presence not determined yet, so the
/// next lookup probes the schema.
const FTS_MESSAGES_PRESENT_UNKNOWN: i8 = 0;
/// `fts_messages_present_cache` state: the table is recorded as absent.
const FTS_MESSAGES_PRESENT_ABSENT: i8 = 1;
/// `fts_messages_present_cache` state: the table is recorded as present.
const FTS_MESSAGES_PRESENT_PRESENT: i8 = 2;
3725
/// State of the single-slot writer cache held by `FrankenStorage`.
enum CachedEphemeralWriter {
    /// No writer is parked and none is checked out; this is the initial state.
    Uninitialized,
    /// An idle writer connection is parked here, ready to be reused.
    Cached(Box<SendFrankenConnection>),
    /// The slot has been claimed by an acquire call and not yet released or
    /// discarded.
    InUse,
}
3731
3732#[derive(Debug, Clone, PartialEq, Eq, Hash)]
3733struct EnsuredAgentKey {
3734    slug: String,
3735    name: String,
3736    version: Option<String>,
3737    kind: String,
3738}
3739
3740impl EnsuredAgentKey {
3741    fn from_agent(agent: &Agent) -> Self {
3742        Self {
3743            slug: agent.slug.clone(),
3744            name: agent.name.clone(),
3745            version: agent.version.clone(),
3746            kind: agent_kind_str(agent.kind.clone()),
3747        }
3748    }
3749}
3750
/// Key of the `ensured_workspaces` cache: a workspace path together with its
/// optional display name.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
struct EnsuredWorkspaceKey {
    path: String,
    display_name: Option<String>,
}

impl EnsuredWorkspaceKey {
    /// Builds a key from an owned path and a borrowed, optional display name
    /// (copied into an owned `String` when present).
    fn new(path: String, display_name: Option<&str>) -> Self {
        let display_name = display_name.map(|name| name.to_owned());
        Self { path, display_name }
    }
}
3765
3766#[derive(Debug, Clone, PartialEq, Eq, Hash)]
3767struct EnsuredConversationSourceKey {
3768    id: String,
3769    kind: SourceKind,
3770    host_label: Option<String>,
3771}
3772
3773impl EnsuredConversationSourceKey {
3774    fn from_source(source: &Source) -> Self {
3775        Self {
3776            id: source.id.clone(),
3777            kind: source.kind,
3778            host_label: source.host_label.clone(),
3779        }
3780    }
3781}
3782
/// Key of the `ensured_daily_stats_keys` cache: one (day, agent, source)
/// combination.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
struct EnsuredDailyStatsKey {
    day_id: i64,
    agent_slug: String,
    source_id: String,
}

impl EnsuredDailyStatsKey {
    /// Builds a key, copying the borrowed slug and source id into owned
    /// strings.
    fn new(day_id: i64, agent_slug: &str, source_id: &str) -> Self {
        let agent_slug = String::from(agent_slug);
        let source_id = String::from(source_id);
        Self {
            day_id,
            agent_slug,
            source_id,
        }
    }
}
3799
/// PRAGMA spellings that turn `autocommit_retain` off, in the order
/// `disable_autocommit_retain` tries them: the `fsqlite.`-prefixed form first,
/// then the unprefixed form.
const AUTOCOMMIT_RETAIN_OFF_PRAGMAS: [&str; 2] = [
    "PRAGMA fsqlite.autocommit_retain = OFF;",
    "PRAGMA autocommit_retain = OFF;",
];
3804
3805fn disable_autocommit_retain<E>(
3806    mut execute: impl FnMut(&'static str) -> std::result::Result<(), E>,
3807) -> Result<&'static str>
3808where
3809    E: std::fmt::Display,
3810{
3811    let mut failures = Vec::new();
3812    for pragma in AUTOCOMMIT_RETAIN_OFF_PRAGMAS {
3813        match execute(pragma) {
3814            Ok(()) => return Ok(pragma),
3815            Err(err) => {
3816                let error = err.to_string();
3817                tracing::debug!(
3818                    %pragma,
3819                    error = %error,
3820                    "autocommit_retain PRAGMA variant not supported"
3821                );
3822                failures.push(format!("{pragma}: {error}"));
3823            }
3824        }
3825    }
3826
3827    Err(anyhow!(
3828        "failed to disable autocommit_retain on frankensqlite connection; \
3829         refusing to keep a long-lived MVCC connection that may accumulate \
3830         unbounded write snapshots. Upgrade frankensqlite to a version that \
3831         supports one of these PRAGMAs or use a short-lived connection path. \
3832         attempts: {}",
3833        failures.join("; ")
3834    ))
3835}
3836
/// Recognizes the error text produced when a populated `WITHOUT ROWID`
/// `fts_messages_*` shadow table cannot be (re)loaded.
///
/// Matching is ASCII-case-insensitive and requires all three of:
/// a "(re)loading populated without rowid table" phrase, a reference to a
/// table whose name starts with `fts_messages_` (with or without a leading
/// backtick), and the phrase "not yet supported".
pub(crate) fn error_message_indicates_populated_fts_shadow_without_rowid_reload(
    message: &str,
) -> bool {
    let normalized = message.to_ascii_lowercase();
    let contains_any =
        |needles: &[&str]| needles.iter().any(|needle| normalized.contains(*needle));

    contains_any(&[
        "loading populated without rowid table",
        "reloading populated without rowid table",
    ]) && contains_any(&["table `fts_messages_", "table fts_messages_"])
        && normalized.contains("not yet supported")
}
3848
3849impl FrankenStorage {
3850    fn new(conn: FrankenConnection, db_path: PathBuf) -> Self {
3851        Self::new_with_shared_caches(
3852            conn,
3853            db_path,
3854            Arc::new(parking_lot::Mutex::new(HashMap::new())),
3855            Arc::new(parking_lot::Mutex::new(HashMap::new())),
3856            Arc::new(parking_lot::Mutex::new(HashSet::new())),
3857            Arc::new(parking_lot::Mutex::new(HashSet::new())),
3858        )
3859    }
3860
3861    fn new_with_shared_caches(
3862        conn: FrankenConnection,
3863        db_path: PathBuf,
3864        ensured_agents: Arc<parking_lot::Mutex<HashMap<EnsuredAgentKey, i64>>>,
3865        ensured_workspaces: Arc<parking_lot::Mutex<HashMap<EnsuredWorkspaceKey, i64>>>,
3866        ensured_conversation_sources: Arc<
3867            parking_lot::Mutex<HashSet<EnsuredConversationSourceKey>>,
3868        >,
3869        ensured_daily_stats_keys: Arc<parking_lot::Mutex<HashSet<EnsuredDailyStatsKey>>>,
3870    ) -> Self {
3871        Self {
3872            conn,
3873            db_path,
3874            ephemeral_writer_preflight_verified: AtomicBool::new(false),
3875            index_writer_checkpoint_pages: AtomicI64::new(UNSET_INDEX_WRITER_CHECKPOINT_PAGES),
3876            index_writer_busy_timeout_ms: AtomicU64::new(UNSET_INDEX_WRITER_BUSY_TIMEOUT_MS),
3877            cached_ephemeral_writer: parking_lot::Mutex::new(CachedEphemeralWriter::Uninitialized),
3878            ensured_agents,
3879            ensured_workspaces,
3880            ensured_conversation_sources,
3881            ensured_daily_stats_keys,
3882            fts_messages_present_cache: AtomicI8::new(FTS_MESSAGES_PRESENT_UNKNOWN),
3883        }
3884    }
3885
3886    fn apply_open_stage_busy_timeout(&self) {
3887        if let Err(err) = self.conn.execute("PRAGMA busy_timeout = 5000;") {
3888            tracing::debug!(
3889                error = %err,
3890                "failed to apply open-stage busy_timeout before migrations"
3891            );
3892        }
3893    }
3894
3895    /// Open a frankensqlite connection, run migrations, and apply config.
3896    ///
3897    /// This initializes canonical schema state only. Derived fallback search
3898    /// structures like the in-database `fts_messages` table are repaired
3899    /// separately so ordinary opens never block on heavyweight maintenance.
3900    pub fn open(path: &Path) -> Result<Self> {
3901        if let Some(parent) = path.parent() {
3902            fs::create_dir_all(parent)
3903                .with_context(|| format!("creating db directory {}", parent.display()))?;
3904        }
3905
3906        let path_str = path.to_string_lossy().to_string();
3907        let _doctor_guard =
3908            acquire_doctor_mutation_db_open_guard(path, DOCTOR_MUTATION_DB_OPEN_LOCK_TIMEOUT)?;
3909        let conn = FrankenConnection::open(&path_str)
3910            .with_context(|| format!("opening frankensqlite db at {}", path.display()))?;
3911        let storage = Self::new(conn, path.to_path_buf());
3912        storage.apply_open_stage_busy_timeout();
3913        storage.run_migrations()?;
3914        storage.repair_missing_current_schema_objects()?;
3915        storage.apply_config()?;
3916        storage.set_fts_messages_present_cache(true);
3917        Ok(storage)
3918    }
3919
3920    /// Open a writer connection that skips migration (assumes DB already migrated).
3921    ///
3922    /// Used by the BEGIN CONCURRENT parallel writer pool: each writer needs its
3923    /// own connection with config applied, but migrations have already been run
3924    /// by the primary connection.
3925    pub fn open_writer(path: &Path) -> Result<Self> {
3926        Self::open_writer_with_shared_caches(
3927            path,
3928            Arc::new(parking_lot::Mutex::new(HashMap::new())),
3929            Arc::new(parking_lot::Mutex::new(HashMap::new())),
3930            Arc::new(parking_lot::Mutex::new(HashSet::new())),
3931            Arc::new(parking_lot::Mutex::new(HashSet::new())),
3932        )
3933    }
3934
3935    fn open_writer_with_shared_caches(
3936        path: &Path,
3937        ensured_agents: Arc<parking_lot::Mutex<HashMap<EnsuredAgentKey, i64>>>,
3938        ensured_workspaces: Arc<parking_lot::Mutex<HashMap<EnsuredWorkspaceKey, i64>>>,
3939        ensured_conversation_sources: Arc<
3940            parking_lot::Mutex<HashSet<EnsuredConversationSourceKey>>,
3941        >,
3942        ensured_daily_stats_keys: Arc<parking_lot::Mutex<HashSet<EnsuredDailyStatsKey>>>,
3943    ) -> Result<Self> {
3944        let path_str = path.to_string_lossy().to_string();
3945        let _doctor_guard =
3946            acquire_doctor_mutation_db_open_guard(path, DOCTOR_MUTATION_DB_OPEN_LOCK_TIMEOUT)?;
3947        let conn = FrankenConnection::open(&path_str)
3948            .with_context(|| format!("opening frankensqlite writer at {}", path.display()))?;
3949        let storage = Self::new_with_shared_caches(
3950            conn,
3951            path.to_path_buf(),
3952            ensured_agents,
3953            ensured_workspaces,
3954            ensured_conversation_sources,
3955            ensured_daily_stats_keys,
3956        );
3957        storage.apply_config()?;
3958        storage.set_fts_messages_present_cache(true);
3959        Ok(storage)
3960    }
3961
3962    pub(crate) fn acquire_cached_ephemeral_writer(&self) -> Result<(Self, bool)> {
3963        let mut cached = self.cached_ephemeral_writer.lock();
3964        match std::mem::replace(&mut *cached, CachedEphemeralWriter::InUse) {
3965            CachedEphemeralWriter::Cached(conn) => {
3966                let (conn, checkpoint_pages, busy_timeout_ms) = (*conn).into_parts();
3967                let writer = Self::new_with_shared_caches(
3968                    conn,
3969                    self.db_path.clone(),
3970                    Arc::clone(&self.ensured_agents),
3971                    Arc::clone(&self.ensured_workspaces),
3972                    Arc::clone(&self.ensured_conversation_sources),
3973                    Arc::clone(&self.ensured_daily_stats_keys),
3974                );
3975                writer
3976                    .index_writer_checkpoint_pages
3977                    .store(checkpoint_pages, Ordering::Relaxed);
3978                writer
3979                    .index_writer_busy_timeout_ms
3980                    .store(busy_timeout_ms, Ordering::Relaxed);
3981                writer.set_fts_messages_present_cache(true);
3982                Ok((writer, true))
3983            }
3984            CachedEphemeralWriter::Uninitialized => {
3985                drop(cached);
3986                match Self::open_writer_with_shared_caches(
3987                    &self.db_path,
3988                    Arc::clone(&self.ensured_agents),
3989                    Arc::clone(&self.ensured_workspaces),
3990                    Arc::clone(&self.ensured_conversation_sources),
3991                    Arc::clone(&self.ensured_daily_stats_keys),
3992                ) {
3993                    Ok(writer) => Ok((writer, true)),
3994                    Err(err) => {
3995                        let mut cached = self.cached_ephemeral_writer.lock();
3996                        if matches!(&*cached, CachedEphemeralWriter::InUse) {
3997                            *cached = CachedEphemeralWriter::Uninitialized;
3998                        }
3999                        Err(err)
4000                    }
4001                }
4002            }
4003            CachedEphemeralWriter::InUse => {
4004                *cached = CachedEphemeralWriter::InUse;
4005                drop(cached);
4006                Ok((
4007                    Self::open_writer_with_shared_caches(
4008                        &self.db_path,
4009                        Arc::clone(&self.ensured_agents),
4010                        Arc::clone(&self.ensured_workspaces),
4011                        Arc::clone(&self.ensured_conversation_sources),
4012                        Arc::clone(&self.ensured_daily_stats_keys),
4013                    )?,
4014                    false,
4015                ))
4016            }
4017        }
4018    }
4019
4020    pub(crate) fn release_cached_ephemeral_writer(&self, writer: Self) {
4021        let checkpoint_pages = writer.index_writer_checkpoint_pages.load(Ordering::Relaxed);
4022        let busy_timeout_ms = writer.index_writer_busy_timeout_ms.load(Ordering::Relaxed);
4023        let conn = writer.into_raw();
4024        let mut cached = self.cached_ephemeral_writer.lock();
4025        debug_assert!(
4026            matches!(&*cached, CachedEphemeralWriter::InUse),
4027            "cached ephemeral writer state should be in-use when releasing"
4028        );
4029        *cached = CachedEphemeralWriter::Cached(Box::new(
4030            SendFrankenConnection::new_with_index_writer_state(
4031                conn,
4032                checkpoint_pages,
4033                busy_timeout_ms,
4034            ),
4035        ));
4036    }
4037
4038    pub(crate) fn discard_cached_ephemeral_writer(&self, mut writer: Self) {
4039        writer.close_best_effort_in_place();
4040        let mut cached = self.cached_ephemeral_writer.lock();
4041        if matches!(&*cached, CachedEphemeralWriter::InUse) {
4042            *cached = CachedEphemeralWriter::Uninitialized;
4043        }
4044    }
4045
4046    fn cached_agent_id(&self, key: &EnsuredAgentKey) -> Option<i64> {
4047        self.ensured_agents.lock().get(key).copied()
4048    }
4049
4050    fn mark_agent_ensured(&self, key: EnsuredAgentKey, id: i64) {
4051        self.ensured_agents.lock().insert(key, id);
4052    }
4053
4054    fn cached_workspace_id(&self, key: &EnsuredWorkspaceKey) -> Option<i64> {
4055        self.ensured_workspaces.lock().get(key).copied()
4056    }
4057
4058    fn mark_workspace_ensured(&self, key: EnsuredWorkspaceKey, id: i64) {
4059        self.ensured_workspaces.lock().insert(key, id);
4060    }
4061
4062    fn conversation_source_already_ensured(&self, key: &EnsuredConversationSourceKey) -> bool {
4063        self.ensured_conversation_sources.lock().contains(key)
4064    }
4065
4066    fn mark_conversation_source_ensured(&self, key: EnsuredConversationSourceKey) {
4067        self.ensured_conversation_sources.lock().insert(key);
4068    }
4069
4070    fn daily_stats_key_already_ensured(&self, key: &EnsuredDailyStatsKey) -> bool {
4071        self.ensured_daily_stats_keys.lock().contains(key)
4072    }
4073
4074    fn daily_stats_keys_already_ensured(&self, keys: &[EnsuredDailyStatsKey; 4]) -> bool {
4075        let ensured = self.ensured_daily_stats_keys.lock();
4076        keys.iter().all(|key| ensured.contains(key))
4077    }
4078
4079    fn mark_daily_stats_key_ensured(&self, key: EnsuredDailyStatsKey) {
4080        self.ensured_daily_stats_keys.lock().insert(key);
4081    }
4082
4083    fn fts_messages_present_cached(&self, tx: &FrankenTransaction<'_>) -> bool {
4084        match self.fts_messages_present_cache.load(Ordering::Acquire) {
4085            FTS_MESSAGES_PRESENT_PRESENT => return true,
4086            FTS_MESSAGES_PRESENT_ABSENT => return false,
4087            _ => {}
4088        }
4089
4090        let present = tx
4091            .query_row_map(
4092                "SELECT COUNT(*) FROM sqlite_master
4093                 WHERE name = 'fts_messages'
4094                   AND rootpage > 0",
4095                fparams![],
4096                |row| row.get_typed::<i64>(0),
4097            )
4098            .map(|count| count > 0)
4099            .unwrap_or_else(|err| {
4100                tracing::debug!(
4101                    error = %err,
4102                    "failed to probe fts_messages presence; skipping db-resident FTS maintenance"
4103                );
4104                false
4105            });
4106        self.set_fts_messages_present_cache(present);
4107        present
4108    }
4109
4110    fn set_fts_messages_present_cache(&self, present: bool) {
4111        self.fts_messages_present_cache.store(
4112            if present {
4113                FTS_MESSAGES_PRESENT_PRESENT
4114            } else {
4115                FTS_MESSAGES_PRESENT_ABSENT
4116            },
4117            Ordering::Release,
4118        );
4119    }
4120
    /// Resets the `fts_messages` presence cache to
    /// `FTS_MESSAGES_PRESENT_UNKNOWN`, so the next
    /// `fts_messages_present_cached` call probes the schema again.
    fn invalidate_fts_messages_present_cache(&self) {
        self.fts_messages_present_cache
            .store(FTS_MESSAGES_PRESENT_UNKNOWN, Ordering::Release);
    }
4125
4126    fn invalidate_conversation_source_cache(&self, source_id: &str) {
4127        self.ensured_conversation_sources
4128            .lock()
4129            .retain(|key| key.id != source_id);
4130    }
4131
4132    fn close_cached_ephemeral_writer_best_effort_in_place(&mut self) {
4133        let cached = self.cached_ephemeral_writer.get_mut();
4134        if let CachedEphemeralWriter::Cached(conn) =
4135            std::mem::replace(cached, CachedEphemeralWriter::Uninitialized)
4136        {
4137            let mut conn = conn;
4138            conn.0.close_best_effort_in_place();
4139        }
4140    }
4141
4142    fn close_cached_ephemeral_writer_without_checkpoint_in_place(&mut self) -> Result<()> {
4143        let cached = self.cached_ephemeral_writer.get_mut();
4144        match std::mem::replace(cached, CachedEphemeralWriter::Uninitialized) {
4145            CachedEphemeralWriter::Cached(mut conn) => conn
4146                .0
4147                .close_without_checkpoint_in_place()
4148                .with_context(|| "closing cached frankensqlite writer without final checkpoint"),
4149            CachedEphemeralWriter::Uninitialized | CachedEphemeralWriter::InUse => Ok(()),
4150        }
4151    }
4152
    /// Open in read-only mode using frankensqlite compat flags.
    ///
    /// Delegates to [`Self::open_readonly_with_doctor_lock_timeout`] with the
    /// default `DOCTOR_MUTATION_DB_OPEN_LOCK_TIMEOUT`.
    pub fn open_readonly(path: &Path) -> Result<Self> {
        Self::open_readonly_with_doctor_lock_timeout(path, DOCTOR_MUTATION_DB_OPEN_LOCK_TIMEOUT)
    }
4157
4158    /// Open in read-only mode with an explicit doctor mutation-lock timeout.
4159    ///
4160    /// This is primarily useful for probes that need to prove a reader would
4161    /// not enter the archive while `cass doctor --fix` owns the repair lock.
4162    pub fn open_readonly_with_doctor_lock_timeout(path: &Path, timeout: Duration) -> Result<Self> {
4163        let path_str = path.to_string_lossy().to_string();
4164        let _doctor_guard = acquire_doctor_mutation_db_open_guard(path, timeout)?;
4165        let conn = open_franken_with_flags(&path_str, FrankenOpenFlags::SQLITE_OPEN_READ_ONLY)
4166            .with_context(|| format!("opening frankensqlite db readonly at {}", path.display()))?;
4167        let storage = Self::new(conn, path.to_path_buf());
4168        storage.apply_readonly_config()?;
4169        Ok(storage)
4170    }
4171
4172    pub fn close(self) -> Result<()> {
4173        let mut this = self;
4174        this.close_cached_ephemeral_writer_best_effort_in_place();
4175        this.conn
4176            .close()
4177            .with_context(|| "closing frankensqlite connection")
4178    }
4179
4180    pub fn close_without_checkpoint(self) -> Result<()> {
4181        let mut this = self;
4182        this.close_cached_ephemeral_writer_without_checkpoint_in_place()?;
4183        this.conn
4184            .close_without_checkpoint()
4185            .with_context(|| "closing frankensqlite connection without final checkpoint")
4186    }
4187
4188    pub fn close_best_effort_in_place(&mut self) {
4189        self.close_cached_ephemeral_writer_best_effort_in_place();
4190        self.conn.close_best_effort_in_place();
4191    }
4192
4193    pub fn close_without_checkpoint_in_place(&mut self) -> Result<()> {
4194        self.close_cached_ephemeral_writer_without_checkpoint_in_place()?;
4195        self.conn
4196            .close_without_checkpoint_in_place()
4197            .with_context(|| "closing frankensqlite connection without final checkpoint")
4198    }
4199
4200    /// Access the raw frankensqlite connection.
4201    pub fn raw(&self) -> &FrankenConnection {
4202        &self.conn
4203    }
4204
4205    /// Consume the storage wrapper and return the underlying frankensqlite
4206    /// connection after migrations/repair have already been applied.
4207    pub fn into_raw(self) -> FrankenConnection {
4208        let mut this = self;
4209        this.close_cached_ephemeral_writer_best_effort_in_place();
4210        this.conn
4211    }
4212
4213    /// Apply connection PRAGMAs for parity with SqliteStorage's `apply_pragmas()`.
4214    ///
4215    /// Frankensqlite supports all PRAGMAs cass uses (journal_mode, synchronous,
4216    /// cache_size, foreign_keys, busy_timeout). Its default journal_mode is already
4217    /// WAL and default synchronous is NORMAL, matching cass's requirements.
4218    ///
4219    pub fn apply_config(&self) -> Result<()> {
4220        // journal_mode: frankensqlite defaults to WAL, same as cass.
4221        // synchronous: frankensqlite defaults to NORMAL, same as cass.
4222        // Both are set explicitly for clarity.
4223        self.conn
4224            .execute("PRAGMA journal_mode = WAL;")
4225            .with_context(|| "setting journal_mode")?;
4226        self.conn
4227            .execute("PRAGMA synchronous = NORMAL;")
4228            .with_context(|| "setting synchronous")?;
4229
4230        // cache_size: 64MB (negative value = KiB).
4231        self.conn
4232            .execute("PRAGMA cache_size = -65536;")
4233            .with_context(|| "setting cache_size")?;
4234
4235        // foreign_keys: enable constraint enforcement.
4236        self.conn
4237            .execute("PRAGMA foreign_keys = ON;")
4238            .with_context(|| "setting foreign_keys")?;
4239
4240        // busy_timeout: 5 seconds (in milliseconds).
4241        self.conn
4242            .execute("PRAGMA busy_timeout = 5000;")
4243            .with_context(|| "setting busy_timeout")?;
4244
4245        // temp_store = MEMORY and mmap_size are C SQLite performance knobs.
4246        // In frankensqlite's architecture (in-memory MVCC engine with pager
4247        // backend), temp_store is always memory-resident and mmap_size does not
4248        // apply. Skipped intentionally — these are no-ops or errors.
4249
4250        // wal_autocheckpoint: use a bounded cadence that avoids checkpointing
4251        // inside common append batches without deferring checkpoints forever.
4252        let checkpoint_pragma =
4253            format!("PRAGMA wal_autocheckpoint = {DEFAULT_WAL_AUTOCHECKPOINT_PAGES};");
4254        let _ = self.conn.execute(&checkpoint_pragma);
4255        self.index_writer_checkpoint_pages
4256            .store(DEFAULT_WAL_AUTOCHECKPOINT_PAGES, Ordering::Relaxed);
4257        // Explicitly enable concurrent writer mode for BEGIN/transaction paths.
4258        // Try both namespace variants for compatibility across fsqlite builds.
4259        let _ = self.conn.execute("PRAGMA fsqlite.concurrent_mode = ON;");
4260        let _ = self.conn.execute("PRAGMA concurrent_mode = ON;");
4261        // Frankensqlite retained autocommit currently mis-serves same-connection
4262        // read-after-write queries on cass's storage paths; keep it off here
4263        // until the upstream visibility bug is fixed.
4264        //
4265        // CASS #163 item 3: If neither PRAGMA variant succeeds, the MVCC engine
4266        // will accumulate write snapshots for the lifetime of the connection,
4267        // causing unbounded memory growth on long-lived watch-mode handles.
4268        // Log at warn level so the failure is visible instead of silently
4269        // swallowed, and set a flag for callers that need to periodically
4270        // recycle the connection.
4271        match disable_autocommit_retain(|pragma| self.conn.execute(pragma).map(|_| ())) {
4272            Ok(autocommit_pragma) => {
4273                tracing::debug!(
4274                    pragma = autocommit_pragma,
4275                    "disabled frankensqlite autocommit_retain for storage connection"
4276                );
4277            }
4278            Err(err) => {
4279                let detail = format!("{err:#}");
4280                if error_message_indicates_populated_fts_shadow_without_rowid_reload(&detail) {
4281                    tracing::warn!(
4282                        error = %detail,
4283                        "frankensqlite could not disable autocommit_retain because a populated derived FTS shadow table cannot yet be reloaded; continuing so canonical indexing can proceed"
4284                    );
4285                } else {
4286                    return Err(err);
4287                }
4288            }
4289        }
4290
4291        Ok(())
4292    }
4293
4294    fn apply_readonly_config(&self) -> Result<()> {
4295        self.conn
4296            .execute("PRAGMA query_only = 1;")
4297            .with_context(|| "setting query_only")?;
4298        self.conn
4299            .execute("PRAGMA busy_timeout = 5000;")
4300            .with_context(|| "setting busy_timeout")?;
4301        self.conn
4302            .execute("PRAGMA cache_size = -65536;")
4303            .with_context(|| "setting cache_size")?;
4304        self.conn
4305            .execute("PRAGMA foreign_keys = ON;")
4306            .with_context(|| "setting foreign_keys")?;
4307        Ok(())
4308    }
4309
4310    /// Run all schema migrations, handling transition from meta table versioning.
4311    ///
4312    /// The existing `SqliteStorage` tracks schema version in a `meta` table entry.
4313    /// The new `MigrationRunner` uses a `_schema_migrations` table. This method:
4314    /// 1. Transitions existing databases from meta table → `_schema_migrations`
4315    /// 2. Runs pending migrations via `MigrationRunner`
4316    /// 3. Syncs `meta.schema_version` for backward compatibility
4317    ///
4318    /// # Fresh vs existing databases
4319    ///
4320    /// Fresh databases use a single combined migration (`MIGRATION_FRESH_SCHEMA`)
4321    /// that creates the complete V13 schema directly. This avoids the incremental
4322    /// V5 migration which uses `DROP TABLE` — an operation that triggers a known
4323    /// frankensqlite autoindex limitation.
4324    ///
4325    /// Existing databases (transitioned from SqliteStorage) are typically at
4326    /// V13 or newer already; additive post-V13 migrations are applied normally.
4327    pub fn run_migrations(&self) -> Result<()> {
4328        transition_from_meta_version(&self.conn)?;
4329
4330        let base_result = build_cass_migrations_before_tail_cache()
4331            .run(&self.conn)
4332            .with_context(|| "running base schema migrations")?;
4333
4334        let mut applied = base_result.applied;
4335        if apply_conversation_tail_state_cache_migration(&self.conn)
4336            .with_context(|| "running conversation tail-state cache migration")?
4337        {
4338            applied.push(15);
4339        }
4340
4341        let post_result = build_cass_migrations_after_tail_cache()
4342            .run(&self.conn)
4343            .with_context(|| "running post-tail-cache schema migrations")?;
4344        applied.extend(post_result.applied);
4345
4346        let current = self.schema_version()?;
4347        if !applied.is_empty() {
4348            info!(
4349                applied = ?applied,
4350                current,
4351                was_fresh = base_result.was_fresh,
4352                "frankensqlite schema migrations applied"
4353            );
4354        }
4355
4356        // Keep meta.schema_version in sync for backward compatibility.
4357        self.sync_meta_schema_version(current)?;
4358
4359        Ok(())
4360    }
4361
4362    /// Some historical canonical rebuild paths produced databases whose
4363    /// version markers claim the current schema while post-V10 analytics
4364    /// tables were never materialized. Detect that drift and backfill the
4365    /// idempotent table/index set from the combined schema migration.
4366    fn repair_missing_current_schema_objects(&self) -> Result<()> {
4367        let mut missing_tables = Vec::new();
4368        for &(table_name, probe_sql) in REQUIRED_CURRENT_SCHEMA_TABLE_PROBES {
4369            if let Err(err) = self.conn.query(probe_sql) {
4370                if error_indicates_missing_table(&err) {
4371                    missing_tables.push(table_name);
4372                    continue;
4373                }
4374                return Err(err).with_context(|| {
4375                    format!("probing required schema table {table_name} for completeness")
4376                });
4377            }
4378        }
4379
4380        if !missing_tables.is_empty() {
4381            info!(
4382                missing_tables = ?missing_tables,
4383                "repairing missing current-schema tables on an already-versioned cass database"
4384            );
4385
4386            for batch in current_schema_repair_batches_for_missing_tables(&missing_tables)? {
4387                self.conn
4388                    .execute_batch(batch.sql)
4389                    .with_context(|| format!("repairing current-schema batch {}", batch.name))?;
4390            }
4391
4392            for &(table_name, probe_sql) in REQUIRED_CURRENT_SCHEMA_TABLE_PROBES {
4393                if !missing_tables.contains(&table_name) {
4394                    continue;
4395                }
4396                self.conn
4397                    .query(probe_sql)
4398                    .with_context(|| format!("verifying repaired schema table {table_name}"))?;
4399            }
4400        }
4401        self.repair_missing_conversation_token_columns()?;
4402        Ok(())
4403    }
4404
4405    fn repair_missing_conversation_token_columns(&self) -> Result<()> {
4406        let columns = franken_table_column_names(&self.conn, "conversations")
4407            .with_context(|| "inspecting conversations columns for token-summary repair")?;
4408        let mut missing_columns = Vec::new();
4409        for &(column_name, column_type) in REQUIRED_CONVERSATION_TOKEN_COLUMNS {
4410            if columns.contains(column_name) {
4411                continue;
4412            }
4413            let sql = format!("ALTER TABLE conversations ADD COLUMN {column_name} {column_type};");
4414            self.conn.execute(&sql).with_context(|| {
4415                format!("adding missing conversations.{column_name} token-summary column")
4416            })?;
4417            missing_columns.push(column_name);
4418        }
4419        if !missing_columns.is_empty() {
4420            tracing::warn!(
4421                target: "cass::schema_repair",
4422                db_path = %self.db_path.display(),
4423                missing_columns = ?missing_columns,
4424                "cass#222: repaired missing conversations token-summary columns"
4425            );
4426        }
4427        Ok(())
4428    }
4429
4430    /// Detect and remove orphan rows whose FK parent has gone missing.
4431    ///
4432    /// A `Connection` dropped mid-transaction (the `drop_close` warning emitted
4433    /// by frankensqlite's `Drop` impl) can leave child rows persisted without a
4434    /// matching parent — `messages` referencing a `conversation_id` that does
4435    /// not exist, `message_metrics`/`token_usage`/`snippets` referencing a
4436    /// `message_id` that does not exist, etc. With `PRAGMA foreign_keys = ON`,
4437    /// every subsequent indexer pass then trips `FOREIGN KEY constraint failed`
4438    /// on the next write, the session never gets marked indexed, and the
4439    /// pending backlog grows without bound (issue #202).
4440    ///
4441    /// This pass runs at indexer startup as defense in depth: it scans each
4442    /// child table for rows whose parent row has gone missing and removes them
4443    /// in bounded committed chunks, breaking the failure cycle even when the
4444    /// underlying transaction-discipline bug has not been fully root-caused.
4445    /// The pass is idempotent (a clean database is a no-op), and emits a
4446    /// `WARN` after successful cleanup so the upstream `drop_close` condition
4447    /// stays visible.
4448    pub(crate) fn cleanup_orphan_fk_rows(&self) -> Result<OrphanFkCleanupReport> {
4449        let mut report = OrphanFkCleanupReport::default();
4450        let orphan_message_ids = match collect_orphan_message_ids(&self.conn) {
4451            Ok(ids) => ids,
4452            Err(err) if error_indicates_missing_table(&err) => {
4453                tracing::debug!(
4454                    target: "cass::fk_repair",
4455                    child_table = "messages",
4456                    error = %err,
4457                    "skipping orphan-message probe (table or column unavailable)"
4458                );
4459                Vec::new()
4460            }
4461            Err(err) => return Err(err),
4462        };
4463        if !orphan_message_ids.is_empty() {
4464            report.record("messages", orphan_message_ids.len() as i64);
4465        }
4466
4467        if !orphan_message_ids.is_empty() {
4468            delete_orphan_message_ids_bisecting_oom(&self.conn, &orphan_message_ids)
4469                .context("deleting orphan message rows and dependent children")?;
4470        }
4471
4472        for entry in ORPHAN_DIRECT_CHILD_TABLES {
4473            loop {
4474                let ids = match collect_direct_orphan_id_page(&self.conn, entry) {
4475                    Ok(ids) => ids,
4476                    Err(err)
4477                        if error_indicates_missing_table(&err)
4478                            || error_indicates_missing_column(&err) =>
4479                    {
4480                        // Tolerant probe: a missing child/parent table or FK
4481                        // column on older schemas means there is nothing to
4482                        // clean up for this table.
4483                        tracing::debug!(
4484                            target: "cass::fk_repair",
4485                            child_table = entry.child_table,
4486                            error = %err,
4487                            "skipping orphan probe (table or column unavailable)"
4488                        );
4489                        break;
4490                    }
4491                    Err(err) => {
4492                        return Err(err).with_context(|| {
4493                            format!("probing orphan rows in {}", entry.child_table)
4494                        });
4495                    }
4496                };
4497                if ids.is_empty() {
4498                    break;
4499                }
4500
4501                let deleted = delete_direct_orphan_ids_bisecting_oom(&self.conn, entry, &ids)
4502                    .with_context(|| format!("deleting orphan rows from {}", entry.child_table))?;
4503                if deleted == 0 {
4504                    break;
4505                }
4506                report.record(
4507                    entry.child_table,
4508                    i64::try_from(deleted).unwrap_or(i64::MAX),
4509                );
4510            }
4511        }
4512
4513        if report.total == 0 {
4514            return Ok(report);
4515        }
4516
4517        // WARN only fires after a successful commit so the message accurately
4518        // reflects what actually happened on disk. db_path is included so logs
4519        // from concurrent indexers against different databases stay
4520        // disambiguated.
4521        tracing::warn!(
4522            target: "cass::fk_repair",
4523            db_path = %self.db_path.display(),
4524            total_orphans = report.total,
4525            per_table = ?report.per_table,
4526            "cass#202: removed orphan rows left behind by interrupted index transactions"
4527        );
4528
4529        Ok(report)
4530    }
4531
4532    /// Return the current schema version from `_schema_migrations`.
4533    pub fn schema_version(&self) -> Result<i64> {
4534        let rows = self
4535            .conn
4536            .query("SELECT MAX(version) FROM _schema_migrations;")
4537            .with_context(|| "reading schema version from _schema_migrations")?;
4538
4539        if let Some(row) = rows.first()
4540            && let Ok(v) = row.get_typed::<Option<i64>>(0)
4541        {
4542            return Ok(v.unwrap_or(0));
4543        }
4544        Ok(0)
4545    }
4546
4547    /// Keep `meta.schema_version` in sync for backward compatibility with `SqliteStorage`.
4548    fn sync_meta_schema_version(&self, version: i64) -> Result<()> {
4549        // The meta table is created by V1 migration. If it doesn't exist yet,
4550        // there's nothing to sync.
4551        if self.conn.query("SELECT key FROM meta LIMIT 1;").is_err() {
4552            return Ok(());
4553        }
4554
4555        // Only write if the version needs updating to avoid write lock contention
4556        if let Ok(rows) = self
4557            .conn
4558            .query("SELECT value FROM meta WHERE key = 'schema_version';")
4559            && let Some(row) = rows.first()
4560            && let Ok(val) = row.get_typed::<String>(0)
4561            && val == version.to_string()
4562        {
4563            return Ok(()); // Already up to date
4564        }
4565
4566        self.conn
4567            .execute_compat(
4568                "INSERT OR REPLACE INTO meta(key, value) VALUES('schema_version', ?1);",
4569                &[ParamValue::from(version.to_string())],
4570            )
4571            .with_context(|| "syncing meta schema_version")?;
4572
4573        Ok(())
4574    }
4575
4576    /// Resolve the database file path for this connection.
4577    pub fn database_path(&self) -> Result<PathBuf> {
4578        Ok(self.db_path.clone())
4579    }
4580
4581    pub(crate) fn ephemeral_writer_preflight_verified(&self) -> bool {
4582        self.ephemeral_writer_preflight_verified
4583            .load(Ordering::Relaxed)
4584    }
4585
4586    pub(crate) fn mark_ephemeral_writer_preflight_verified(&self) {
4587        self.ephemeral_writer_preflight_verified
4588            .store(true, Ordering::Relaxed);
4589    }
4590
4591    pub(crate) fn index_writer_checkpoint_pages(&self) -> Option<i64> {
4592        let pages = self.index_writer_checkpoint_pages.load(Ordering::Relaxed);
4593        (pages != UNSET_INDEX_WRITER_CHECKPOINT_PAGES).then_some(pages)
4594    }
4595
4596    pub(crate) fn mark_index_writer_checkpoint_pages(&self, pages: i64) {
4597        self.index_writer_checkpoint_pages
4598            .store(pages, Ordering::Relaxed);
4599    }
4600
4601    pub(crate) fn index_writer_busy_timeout_ms(&self) -> Option<u64> {
4602        let timeout_ms = self.index_writer_busy_timeout_ms.load(Ordering::Relaxed);
4603        (timeout_ms != UNSET_INDEX_WRITER_BUSY_TIMEOUT_MS).then_some(timeout_ms)
4604    }
4605
4606    pub(crate) fn mark_index_writer_busy_timeout_ms(&self, timeout_ms: u64) {
4607        self.index_writer_busy_timeout_ms
4608            .store(timeout_ms, Ordering::Relaxed);
4609    }
4610
4611    /// Open database with migration, backing up if schema is incompatible.
4612    pub fn open_or_rebuild(path: &Path) -> std::result::Result<Self, MigrationError> {
4613        if let Some(parent) = path.parent() {
4614            fs::create_dir_all(parent)?;
4615        }
4616
4617        if path.exists() {
4618            let check_result = check_schema_compatibility(path);
4619            match check_result {
4620                Ok(SchemaCheck::Compatible) | Ok(SchemaCheck::NeedsMigration) => {
4621                    // Continue with normal open
4622                }
4623                Ok(SchemaCheck::NeedsRebuild(reason)) => {
4624                    let backup_path = create_backup(path)?;
4625                    cleanup_old_backups(path, MAX_BACKUPS)?;
4626                    remove_database_files(path)?;
4627                    return Err(MigrationError::RebuildRequired {
4628                        reason,
4629                        backup_path,
4630                    });
4631                }
4632                Err(err) if schema_check_error_requires_rebuild(&err) => {
4633                    let backup_path = create_backup(path)?;
4634                    cleanup_old_backups(path, MAX_BACKUPS)?;
4635                    remove_database_files(path)?;
4636                    return Err(MigrationError::RebuildRequired {
4637                        reason: format!("Database appears corrupted: {err}"),
4638                        backup_path,
4639                    });
4640                }
4641                Err(err) => return Err(MigrationError::Database(err)),
4642            }
4643        }
4644
4645        let storage = Self::open(path).map_err(|e| MigrationError::Other(e.to_string()))?;
4646        Ok(storage)
4647    }
4648}
4649
4650// -------------------------------------------------------------------------
4651// Frankensqlite migration helpers
4652// -------------------------------------------------------------------------
4653
4654/// Build the `MigrationRunner` for the frankensqlite migration path.
4655///
4656/// Uses a single combined migration (version 13) that creates the complete
4657/// final schema in one step. This avoids the V5 `DROP TABLE conversations`
4658/// operation which triggers a known frankensqlite limitation: autoindex entries
4659/// in sqlite_master are not properly cleaned up during DROP TABLE, causing
4660/// "sqlite_master entry not found" errors.
4661///
4662/// For existing databases transitioned from SqliteStorage, the transition
4663/// function backfills `_schema_migrations`; post-V13 additive migrations then
4664/// run normally.
4665fn build_cass_migrations_before_tail_cache() -> MigrationRunner {
4666    MigrationRunner::new()
4667        .add(13, "full_schema_v13", MIGRATION_FRESH_SCHEMA)
4668        .add(14, "fts_contentless", MIGRATION_V14)
4669}
4670
4671fn build_cass_migrations_after_tail_cache() -> MigrationRunner {
4672    MigrationRunner::new()
4673        .add(16, "drop_redundant_message_conv_idx", MIGRATION_V16)
4674        .add(17, "drop_message_created_idx", MIGRATION_V17)
4675        .add(18, "conversation_tail_state_hot_table", MIGRATION_V18)
4676        .add(19, "conversation_external_lookup", MIGRATION_V19)
4677        .add(20, "conversation_external_tail_lookup", MIGRATION_V20)
4678}
4679
4680fn schema_migration_is_applied(conn: &FrankenConnection, version: i64) -> Result<bool> {
4681    let rows = conn
4682        .query_with_params(
4683            "SELECT 1 FROM _schema_migrations WHERE version = ?1 LIMIT 1;",
4684            &[SqliteValue::from(version)],
4685        )
4686        .with_context(|| format!("checking schema migration version {version}"))?;
4687    Ok(!rows.is_empty())
4688}
4689
4690fn apply_conversation_tail_state_cache_migration(conn: &FrankenConnection) -> Result<bool> {
4691    conn.execute("BEGIN IMMEDIATE;")
4692        .with_context(|| "starting v15 conversation tail-state migration transaction")?;
4693
4694    let result = (|| -> Result<bool> {
4695        if schema_migration_is_applied(conn, 15)? {
4696            conn.execute("COMMIT;")
4697                .with_context(|| "committing already-applied v15 migration transaction")?;
4698            return Ok(false);
4699        }
4700
4701        let started = Instant::now();
4702        let conversation_columns = franken_table_column_names(conn, "conversations")
4703            .with_context(|| "inspecting conversations columns before v15 migration")?;
4704        if !conversation_columns.contains("last_message_idx") {
4705            conn.execute("ALTER TABLE conversations ADD COLUMN last_message_idx INTEGER;")
4706                .with_context(|| "adding v15 conversations.last_message_idx column")?;
4707        }
4708        if !conversation_columns.contains("last_message_created_at") {
4709            conn.execute("ALTER TABLE conversations ADD COLUMN last_message_created_at INTEGER;")
4710                .with_context(|| "adding v15 conversations.last_message_created_at column")?;
4711        }
4712        conn.execute_batch(MIGRATION_V15_TAIL_STATE_TABLE)
4713            .with_context(|| "applying v15 conversation tail-state table schema")?;
4714        conn.execute_compat(
4715            "INSERT INTO _schema_migrations (version, name) VALUES (?1, ?2);",
4716            fparams![15_i64, "conversation_tail_state_cache"],
4717        )
4718        .with_context(|| "recording v15 conversation tail-state migration")?;
4719        conn.execute("COMMIT;")
4720            .with_context(|| "committing v15 conversation tail-state migration")?;
4721        info!(
4722            elapsed_ms = started.elapsed().as_millis(),
4723            "applied v15 conversation tail-state cache migration"
4724        );
4725        Ok(true)
4726    })();
4727
4728    if result.is_err() {
4729        let _ = conn.execute("ROLLBACK;");
4730    }
4731
4732    result
4733}
4734
4735fn franken_table_column_names(
4736    conn: &FrankenConnection,
4737    table_name: &str,
4738) -> Result<HashSet<String>> {
4739    if !table_name
4740        .chars()
4741        .all(|c| c.is_ascii_alphanumeric() || c == '_')
4742    {
4743        return Err(anyhow!(
4744            "unsafe table name for PRAGMA table_info: {table_name}"
4745        ));
4746    }
4747
4748    conn.query_map_collect(
4749        &format!("PRAGMA table_info({table_name})"),
4750        fparams![],
4751        |row: &FrankenRow| row.get_typed::<String>(1),
4752    )
4753    .with_context(|| format!("reading PRAGMA table_info({table_name})"))
4754    .map(|columns| columns.into_iter().collect())
4755}
4756
4757/// Combined V13 schema for fresh databases.
4758///
4759/// Creates the complete final schema in a single migration, avoiding the
4760/// incremental V5 `DROP TABLE conversations` which triggers a frankensqlite
4761/// autoindex limitation. All columns from V1-V13 are included in their
4762/// respective CREATE TABLE statements.
4763///
4764/// Table creation order respects foreign key references:
4765/// sources → agents/workspaces → conversations → messages → snippets, etc.
4766const MIGRATION_FRESH_SCHEMA: &str = r"
4767-- Core tables (V1)
4768CREATE TABLE IF NOT EXISTS meta (
4769    key TEXT PRIMARY KEY,
4770    value TEXT NOT NULL
4771);
4772
4773CREATE TABLE IF NOT EXISTS agents (
4774    id INTEGER PRIMARY KEY,
4775    slug TEXT NOT NULL UNIQUE,
4776    name TEXT NOT NULL,
4777    version TEXT,
4778    kind TEXT NOT NULL,
4779    created_at INTEGER NOT NULL,
4780    updated_at INTEGER NOT NULL
4781);
4782
4783CREATE TABLE IF NOT EXISTS workspaces (
4784    id INTEGER PRIMARY KEY,
4785    path TEXT NOT NULL UNIQUE,
4786    display_name TEXT
4787);
4788
4789-- Sources (V4)
4790CREATE TABLE IF NOT EXISTS sources (
4791    id TEXT PRIMARY KEY,
4792    kind TEXT NOT NULL,
4793    host_label TEXT,
4794    machine_id TEXT,
4795    platform TEXT,
4796    config_json TEXT,
4797    created_at INTEGER NOT NULL,
4798    updated_at INTEGER NOT NULL
4799);
4800
4801INSERT OR IGNORE INTO sources (id, kind, host_label, created_at, updated_at)
4802VALUES ('local', 'local', NULL, strftime('%s','now')*1000, strftime('%s','now')*1000);
4803
4804-- Conversations: V1 base + V5 provenance + V7 metadata_bin + V10 token summary
4805CREATE TABLE IF NOT EXISTS conversations (
4806    id INTEGER PRIMARY KEY,
4807    agent_id INTEGER NOT NULL REFERENCES agents(id),
4808    workspace_id INTEGER REFERENCES workspaces(id),
4809    source_id TEXT NOT NULL DEFAULT 'local' REFERENCES sources(id),
4810    external_id TEXT,
4811    title TEXT,
4812    source_path TEXT NOT NULL,
4813    started_at INTEGER,
4814    ended_at INTEGER,
4815    approx_tokens INTEGER,
4816    metadata_json TEXT,
4817    origin_host TEXT,
4818    metadata_bin BLOB,
4819    total_input_tokens INTEGER,
4820    total_output_tokens INTEGER,
4821    total_cache_read_tokens INTEGER,
4822    total_cache_creation_tokens INTEGER,
4823    grand_total_tokens INTEGER,
4824    estimated_cost_usd REAL,
4825    primary_model TEXT,
4826    api_call_count INTEGER,
4827    tool_call_count INTEGER,
4828    user_message_count INTEGER,
4829    assistant_message_count INTEGER,
4830    -- V15 columns are included in the fresh schema so fresh DB creation does
4831    -- not need ALTER TABLE on conversations. That ALTER path can duplicate
4832    -- provenance autoindex state in frankensqlite when the named unique
4833    -- provenance index already exists.
4834    last_message_idx INTEGER,
4835    last_message_created_at INTEGER
4836);
4837
4838-- Named unique index avoids autoindex issues if table is ever recreated
4839CREATE UNIQUE INDEX IF NOT EXISTS idx_conversations_provenance
4840    ON conversations(source_id, agent_id, external_id);
4841
4842-- Messages: V1 base + V7 extra_bin
4843CREATE TABLE IF NOT EXISTS messages (
4844    id INTEGER PRIMARY KEY,
4845    conversation_id INTEGER NOT NULL REFERENCES conversations(id) ON DELETE CASCADE,
4846    idx INTEGER NOT NULL,
4847    role TEXT NOT NULL,
4848    author TEXT,
4849    created_at INTEGER,
4850    content TEXT NOT NULL,
4851    extra_json TEXT,
4852    extra_bin BLOB,
4853    UNIQUE(conversation_id, idx)
4854);
4855
4856CREATE TABLE IF NOT EXISTS snippets (
4857    id INTEGER PRIMARY KEY,
4858    message_id INTEGER NOT NULL REFERENCES messages(id) ON DELETE CASCADE,
4859    file_path TEXT,
4860    start_line INTEGER,
4861    end_line INTEGER,
4862    language TEXT,
4863    snippet_text TEXT
4864);
4865
4866CREATE TABLE IF NOT EXISTS tags (
4867    id INTEGER PRIMARY KEY,
4868    name TEXT NOT NULL UNIQUE
4869);
4870
4871CREATE TABLE IF NOT EXISTS conversation_tags (
4872    conversation_id INTEGER NOT NULL REFERENCES conversations(id) ON DELETE CASCADE,
4873    tag_id INTEGER NOT NULL REFERENCES tags(id) ON DELETE CASCADE,
4874    PRIMARY KEY (conversation_id, tag_id)
4875);
4876
4877-- Daily stats (V8)
4878CREATE TABLE IF NOT EXISTS daily_stats (
4879    day_id INTEGER NOT NULL,
4880    agent_slug TEXT NOT NULL,
4881    source_id TEXT NOT NULL DEFAULT 'all',
4882    session_count INTEGER NOT NULL DEFAULT 0,
4883    message_count INTEGER NOT NULL DEFAULT 0,
4884    total_chars INTEGER NOT NULL DEFAULT 0,
4885    last_updated INTEGER NOT NULL,
4886    PRIMARY KEY (day_id, agent_slug, source_id)
4887);
4888
4889-- Embedding jobs (V9)
4890CREATE TABLE IF NOT EXISTS embedding_jobs (
4891    id INTEGER PRIMARY KEY AUTOINCREMENT,
4892    db_path TEXT NOT NULL,
4893    model_id TEXT NOT NULL,
4894    status TEXT NOT NULL DEFAULT 'pending',
4895    total_docs INTEGER NOT NULL DEFAULT 0,
4896    completed_docs INTEGER NOT NULL DEFAULT 0,
4897    error_message TEXT,
4898    created_at TEXT NOT NULL DEFAULT (datetime('now')),
4899    started_at TEXT,
4900    completed_at TEXT
4901);
4902
4903CREATE UNIQUE INDEX IF NOT EXISTS idx_embedding_jobs_active
4904ON embedding_jobs(db_path, model_id)
4905WHERE status IN ('pending', 'running');
4906
4907-- Token usage ledger (V10)
4908CREATE TABLE IF NOT EXISTS token_usage (
4909    id INTEGER PRIMARY KEY AUTOINCREMENT,
4910    message_id INTEGER NOT NULL REFERENCES messages(id) ON DELETE CASCADE,
4911    conversation_id INTEGER NOT NULL,
4912    agent_id INTEGER NOT NULL,
4913    workspace_id INTEGER,
4914    source_id TEXT NOT NULL DEFAULT 'local',
4915    timestamp_ms INTEGER NOT NULL,
4916    day_id INTEGER NOT NULL,
4917    model_name TEXT,
4918    model_family TEXT,
4919    model_tier TEXT,
4920    service_tier TEXT,
4921    provider TEXT,
4922    input_tokens INTEGER,
4923    output_tokens INTEGER,
4924    cache_read_tokens INTEGER,
4925    cache_creation_tokens INTEGER,
4926    thinking_tokens INTEGER,
4927    total_tokens INTEGER,
4928    estimated_cost_usd REAL,
4929    role TEXT NOT NULL,
4930    content_chars INTEGER NOT NULL,
4931    has_tool_calls INTEGER NOT NULL DEFAULT 0,
4932    tool_call_count INTEGER NOT NULL DEFAULT 0,
4933    data_source TEXT NOT NULL DEFAULT 'api',
4934    UNIQUE(message_id)
4935);
4936
4937-- Token daily stats (V10)
4938CREATE TABLE IF NOT EXISTS token_daily_stats (
4939    day_id INTEGER NOT NULL,
4940    agent_slug TEXT NOT NULL,
4941    source_id TEXT NOT NULL DEFAULT 'all',
4942    model_family TEXT NOT NULL DEFAULT 'all',
4943    api_call_count INTEGER NOT NULL DEFAULT 0,
4944    user_message_count INTEGER NOT NULL DEFAULT 0,
4945    assistant_message_count INTEGER NOT NULL DEFAULT 0,
4946    tool_message_count INTEGER NOT NULL DEFAULT 0,
4947    total_input_tokens INTEGER NOT NULL DEFAULT 0,
4948    total_output_tokens INTEGER NOT NULL DEFAULT 0,
4949    total_cache_read_tokens INTEGER NOT NULL DEFAULT 0,
4950    total_cache_creation_tokens INTEGER NOT NULL DEFAULT 0,
4951    total_thinking_tokens INTEGER NOT NULL DEFAULT 0,
4952    grand_total_tokens INTEGER NOT NULL DEFAULT 0,
4953    total_content_chars INTEGER NOT NULL DEFAULT 0,
4954    total_tool_calls INTEGER NOT NULL DEFAULT 0,
4955    estimated_cost_usd REAL NOT NULL DEFAULT 0.0,
4956    session_count INTEGER NOT NULL DEFAULT 0,
4957    last_updated INTEGER NOT NULL,
4958    PRIMARY KEY (day_id, agent_slug, source_id, model_family)
4959);
4960
4961-- Model pricing (V10)
4962CREATE TABLE IF NOT EXISTS model_pricing (
4963    model_pattern TEXT NOT NULL,
4964    provider TEXT NOT NULL,
4965    input_cost_per_mtok REAL NOT NULL,
4966    output_cost_per_mtok REAL NOT NULL,
4967    cache_read_cost_per_mtok REAL,
4968    cache_creation_cost_per_mtok REAL,
4969    effective_date TEXT NOT NULL,
4970    PRIMARY KEY (model_pattern, effective_date)
4971);
4972
4973INSERT OR IGNORE INTO model_pricing VALUES
4974    ('claude-opus-4%', 'anthropic', 15.0, 75.0, 1.5, 18.75, '2025-10-01'),
4975    ('claude-sonnet-4%', 'anthropic', 3.0, 15.0, 0.3, 3.75, '2025-10-01'),
4976    ('claude-haiku-4%', 'anthropic', 0.80, 4.0, 0.08, 1.0, '2025-10-01'),
4977    ('gpt-4o%', 'openai', 2.50, 10.0, NULL, NULL, '2025-01-01'),
4978    ('gpt-4-turbo%', 'openai', 10.0, 30.0, NULL, NULL, '2024-04-01'),
4979    ('gpt-4.1%', 'openai', 2.0, 8.0, NULL, NULL, '2025-04-01'),
4980    ('o3%', 'openai', 2.0, 8.0, NULL, NULL, '2025-04-01'),
4981    ('o4-mini%', 'openai', 1.10, 4.40, NULL, NULL, '2025-04-01'),
4982    ('gemini-2%flash%', 'google', 0.075, 0.30, NULL, NULL, '2025-01-01'),
4983    ('gemini-2%pro%', 'google', 1.25, 10.0, NULL, NULL, '2025-01-01');
4984
4985-- Message metrics: V11 base + V12 model dimensions
4986CREATE TABLE IF NOT EXISTS message_metrics (
4987    message_id INTEGER PRIMARY KEY REFERENCES messages(id) ON DELETE CASCADE,
4988    created_at_ms INTEGER NOT NULL,
4989    hour_id INTEGER NOT NULL,
4990    day_id INTEGER NOT NULL,
4991    agent_slug TEXT NOT NULL,
4992    workspace_id INTEGER NOT NULL DEFAULT 0,
4993    source_id TEXT NOT NULL DEFAULT 'local',
4994    role TEXT NOT NULL,
4995    content_chars INTEGER NOT NULL,
4996    content_tokens_est INTEGER NOT NULL,
4997    api_input_tokens INTEGER,
4998    api_output_tokens INTEGER,
4999    api_cache_read_tokens INTEGER,
5000    api_cache_creation_tokens INTEGER,
5001    api_thinking_tokens INTEGER,
5002    api_service_tier TEXT,
5003    api_data_source TEXT NOT NULL DEFAULT 'estimated',
5004    tool_call_count INTEGER NOT NULL DEFAULT 0,
5005    has_tool_calls INTEGER NOT NULL DEFAULT 0,
5006    has_plan INTEGER NOT NULL DEFAULT 0,
5007    model_name TEXT,
5008    model_family TEXT NOT NULL DEFAULT 'unknown',
5009    model_tier TEXT NOT NULL DEFAULT 'unknown',
5010    provider TEXT NOT NULL DEFAULT 'unknown'
5011);
5012
5013-- Hourly rollups: V11 base + V13 plan columns
5014CREATE TABLE IF NOT EXISTS usage_hourly (
5015    hour_id INTEGER NOT NULL,
5016    agent_slug TEXT NOT NULL,
5017    workspace_id INTEGER NOT NULL DEFAULT 0,
5018    source_id TEXT NOT NULL DEFAULT 'local',
5019    message_count INTEGER NOT NULL DEFAULT 0,
5020    user_message_count INTEGER NOT NULL DEFAULT 0,
5021    assistant_message_count INTEGER NOT NULL DEFAULT 0,
5022    tool_call_count INTEGER NOT NULL DEFAULT 0,
5023    plan_message_count INTEGER NOT NULL DEFAULT 0,
5024    api_coverage_message_count INTEGER NOT NULL DEFAULT 0,
5025    content_tokens_est_total INTEGER NOT NULL DEFAULT 0,
5026    content_tokens_est_user INTEGER NOT NULL DEFAULT 0,
5027    content_tokens_est_assistant INTEGER NOT NULL DEFAULT 0,
5028    api_tokens_total INTEGER NOT NULL DEFAULT 0,
5029    api_input_tokens_total INTEGER NOT NULL DEFAULT 0,
5030    api_output_tokens_total INTEGER NOT NULL DEFAULT 0,
5031    api_cache_read_tokens_total INTEGER NOT NULL DEFAULT 0,
5032    api_cache_creation_tokens_total INTEGER NOT NULL DEFAULT 0,
5033    api_thinking_tokens_total INTEGER NOT NULL DEFAULT 0,
5034    last_updated INTEGER NOT NULL DEFAULT 0,
5035    plan_content_tokens_est_total INTEGER NOT NULL DEFAULT 0,
5036    plan_api_tokens_total INTEGER NOT NULL DEFAULT 0,
5037    PRIMARY KEY (hour_id, agent_slug, workspace_id, source_id)
5038);
5039
5040-- Daily rollups: V11 base + V13 plan columns
5041CREATE TABLE IF NOT EXISTS usage_daily (
5042    day_id INTEGER NOT NULL,
5043    agent_slug TEXT NOT NULL,
5044    workspace_id INTEGER NOT NULL DEFAULT 0,
5045    source_id TEXT NOT NULL DEFAULT 'local',
5046    message_count INTEGER NOT NULL DEFAULT 0,
5047    user_message_count INTEGER NOT NULL DEFAULT 0,
5048    assistant_message_count INTEGER NOT NULL DEFAULT 0,
5049    tool_call_count INTEGER NOT NULL DEFAULT 0,
5050    plan_message_count INTEGER NOT NULL DEFAULT 0,
5051    api_coverage_message_count INTEGER NOT NULL DEFAULT 0,
5052    content_tokens_est_total INTEGER NOT NULL DEFAULT 0,
5053    content_tokens_est_user INTEGER NOT NULL DEFAULT 0,
5054    content_tokens_est_assistant INTEGER NOT NULL DEFAULT 0,
5055    api_tokens_total INTEGER NOT NULL DEFAULT 0,
5056    api_input_tokens_total INTEGER NOT NULL DEFAULT 0,
5057    api_output_tokens_total INTEGER NOT NULL DEFAULT 0,
5058    api_cache_read_tokens_total INTEGER NOT NULL DEFAULT 0,
5059    api_cache_creation_tokens_total INTEGER NOT NULL DEFAULT 0,
5060    api_thinking_tokens_total INTEGER NOT NULL DEFAULT 0,
5061    last_updated INTEGER NOT NULL DEFAULT 0,
5062    plan_content_tokens_est_total INTEGER NOT NULL DEFAULT 0,
5063    plan_api_tokens_total INTEGER NOT NULL DEFAULT 0,
5064    PRIMARY KEY (day_id, agent_slug, workspace_id, source_id)
5065);
5066
5067-- Model daily rollups (V12)
5068CREATE TABLE IF NOT EXISTS usage_models_daily (
5069    day_id INTEGER NOT NULL,
5070    agent_slug TEXT NOT NULL,
5071    workspace_id INTEGER NOT NULL DEFAULT 0,
5072    source_id TEXT NOT NULL DEFAULT 'local',
5073    model_family TEXT NOT NULL DEFAULT 'unknown',
5074    model_tier TEXT NOT NULL DEFAULT 'unknown',
5075    message_count INTEGER NOT NULL DEFAULT 0,
5076    user_message_count INTEGER NOT NULL DEFAULT 0,
5077    assistant_message_count INTEGER NOT NULL DEFAULT 0,
5078    tool_call_count INTEGER NOT NULL DEFAULT 0,
5079    plan_message_count INTEGER NOT NULL DEFAULT 0,
5080    api_coverage_message_count INTEGER NOT NULL DEFAULT 0,
5081    content_tokens_est_total INTEGER NOT NULL DEFAULT 0,
5082    content_tokens_est_user INTEGER NOT NULL DEFAULT 0,
5083    content_tokens_est_assistant INTEGER NOT NULL DEFAULT 0,
5084    api_tokens_total INTEGER NOT NULL DEFAULT 0,
5085    api_input_tokens_total INTEGER NOT NULL DEFAULT 0,
5086    api_output_tokens_total INTEGER NOT NULL DEFAULT 0,
5087    api_cache_read_tokens_total INTEGER NOT NULL DEFAULT 0,
5088    api_cache_creation_tokens_total INTEGER NOT NULL DEFAULT 0,
5089    api_thinking_tokens_total INTEGER NOT NULL DEFAULT 0,
5090    last_updated INTEGER NOT NULL DEFAULT 0,
5091    PRIMARY KEY (day_id, agent_slug, workspace_id, source_id, model_family, model_tier)
5092);
5093
5094-- All indexes
5095CREATE INDEX IF NOT EXISTS idx_conversations_agent_started ON conversations(agent_id, started_at DESC);
5096CREATE INDEX IF NOT EXISTS idx_conversations_source_id ON conversations(source_id);
5097CREATE INDEX IF NOT EXISTS idx_conversations_source_path ON conversations(source_path);
5098CREATE INDEX IF NOT EXISTS idx_daily_stats_agent ON daily_stats(agent_slug, day_id);
5099CREATE INDEX IF NOT EXISTS idx_daily_stats_source ON daily_stats(source_id, day_id);
5100CREATE INDEX IF NOT EXISTS idx_token_usage_day ON token_usage(day_id, agent_id);
5101CREATE INDEX IF NOT EXISTS idx_token_usage_conv ON token_usage(conversation_id);
5102CREATE INDEX IF NOT EXISTS idx_token_usage_model ON token_usage(model_family, day_id);
5103CREATE INDEX IF NOT EXISTS idx_token_usage_workspace ON token_usage(workspace_id, day_id);
5104CREATE INDEX IF NOT EXISTS idx_token_usage_timestamp ON token_usage(timestamp_ms);
5105CREATE INDEX IF NOT EXISTS idx_token_daily_stats_agent ON token_daily_stats(agent_slug, day_id);
5106CREATE INDEX IF NOT EXISTS idx_token_daily_stats_model ON token_daily_stats(model_family, day_id);
5107CREATE INDEX IF NOT EXISTS idx_mm_hour ON message_metrics(hour_id);
5108CREATE INDEX IF NOT EXISTS idx_mm_day ON message_metrics(day_id);
5109CREATE INDEX IF NOT EXISTS idx_mm_agent_hour ON message_metrics(agent_slug, hour_id);
5110CREATE INDEX IF NOT EXISTS idx_mm_agent_day ON message_metrics(agent_slug, day_id);
5111CREATE INDEX IF NOT EXISTS idx_mm_workspace_hour ON message_metrics(workspace_id, hour_id);
5112CREATE INDEX IF NOT EXISTS idx_mm_source_hour ON message_metrics(source_id, hour_id);
5113CREATE INDEX IF NOT EXISTS idx_mm_model_family_day ON message_metrics(model_family, day_id);
5114CREATE INDEX IF NOT EXISTS idx_mm_provider_day ON message_metrics(provider, day_id);
5115CREATE INDEX IF NOT EXISTS idx_uh_agent ON usage_hourly(agent_slug, hour_id);
5116CREATE INDEX IF NOT EXISTS idx_uh_workspace ON usage_hourly(workspace_id, hour_id);
5117CREATE INDEX IF NOT EXISTS idx_uh_source ON usage_hourly(source_id, hour_id);
5118CREATE INDEX IF NOT EXISTS idx_ud_agent ON usage_daily(agent_slug, day_id);
5119CREATE INDEX IF NOT EXISTS idx_ud_workspace ON usage_daily(workspace_id, day_id);
5120CREATE INDEX IF NOT EXISTS idx_ud_source ON usage_daily(source_id, day_id);
5121CREATE INDEX IF NOT EXISTS idx_umd_model_day ON usage_models_daily(model_family, day_id);
5122CREATE INDEX IF NOT EXISTS idx_umd_agent_day ON usage_models_daily(agent_slug, day_id);
5123CREATE INDEX IF NOT EXISTS idx_umd_workspace_day ON usage_models_daily(workspace_id, day_id);
5124CREATE INDEX IF NOT EXISTS idx_umd_source_day ON usage_models_daily(source_id, day_id);
5125";
5126
/// One unit of current-schema repair: a group of tables together with the SQL
/// script registered for them.
///
/// Instances live in `CURRENT_SCHEMA_REPAIR_BATCHES` and are selected by
/// `current_schema_repair_batches_for_missing_tables`.
5127#[derive(Clone, Copy)]
5128struct SchemaRepairBatch {
    /// Short label for the batch.
    /// NOTE(review): consumers of this field are outside this section —
    /// presumably used for logging/diagnostics; confirm.
5129    name: &'static str,
    /// Tables this batch is registered for. The batch is selected as soon as
    /// any one of them is reported missing.
5130    tables: &'static [&'static str],
    /// SQL script text associated with the batch.
5131    sql: &'static str,
5132}
5133
/// Repair script for the `sources` table.
///
/// Creates the table when it does not exist, then inserts the `'local'`
/// source row with `INSERT OR IGNORE`, so an existing row with that id is
/// left untouched. Both timestamps are `strftime('%s','now') * 1000`, i.e.
/// the current Unix time in seconds multiplied by 1000.
///
/// NOTE(review): the literal `'local'` presumably mirrors `LOCAL_SOURCE_ID`;
/// verify the two stay in sync.
5134const CURRENT_SCHEMA_REPAIR_SOURCES_SQL: &str = r"
5135CREATE TABLE IF NOT EXISTS sources (
5136    id TEXT PRIMARY KEY,
5137    kind TEXT NOT NULL,
5138    host_label TEXT,
5139    machine_id TEXT,
5140    platform TEXT,
5141    config_json TEXT,
5142    created_at INTEGER NOT NULL,
5143    updated_at INTEGER NOT NULL
5144);
5145
5146INSERT OR IGNORE INTO sources (id, kind, host_label, created_at, updated_at)
5147VALUES ('local', 'local', NULL, strftime('%s','now')*1000, strftime('%s','now')*1000);
5148";
5149
/// Repair script for the `daily_stats` table.
///
/// Creates the table (primary key `(day_id, agent_slug, source_id)`) and its
/// two secondary indexes, each guarded by `IF NOT EXISTS`. No rows are
/// inserted.
///
/// The statements duplicate the `daily_stats` definitions in the fresh-schema
/// SQL constant above; keep the two copies in sync when the schema changes.
5150const CURRENT_SCHEMA_REPAIR_DAILY_STATS_SQL: &str = r"
5151CREATE TABLE IF NOT EXISTS daily_stats (
5152    day_id INTEGER NOT NULL,
5153    agent_slug TEXT NOT NULL,
5154    source_id TEXT NOT NULL DEFAULT 'all',
5155    session_count INTEGER NOT NULL DEFAULT 0,
5156    message_count INTEGER NOT NULL DEFAULT 0,
5157    total_chars INTEGER NOT NULL DEFAULT 0,
5158    last_updated INTEGER NOT NULL,
5159    PRIMARY KEY (day_id, agent_slug, source_id)
5160);
5161
5162CREATE INDEX IF NOT EXISTS idx_daily_stats_agent ON daily_stats(agent_slug, day_id);
5163CREATE INDEX IF NOT EXISTS idx_daily_stats_source ON daily_stats(source_id, day_id);
5164";
5165
/// Repair script for the `conversation_external_lookup` table.
///
/// Creates the table when absent and then (re)populates it with
/// `INSERT OR REPLACE` from every `conversations` row whose `external_id` is
/// not NULL. The lookup key is built as
/// `<length(source_id)>:<source_id>:<agent_id>:<length(external_id)>:<external_id>`.
///
/// NOTE(review): in stock SQLite, `length()` on a TEXT value counts
/// characters, not bytes. Confirm that the Rust code producing lookup keys
/// uses the same measure (and that frankensqlite matches this behavior),
/// otherwise keys for non-ASCII ids would not line up.
5166const CURRENT_SCHEMA_REPAIR_CONVERSATION_EXTERNAL_LOOKUP_SQL: &str = r"
5167CREATE TABLE IF NOT EXISTS conversation_external_lookup (
5168    lookup_key TEXT PRIMARY KEY,
5169    conversation_id INTEGER NOT NULL
5170);
5171
5172INSERT OR REPLACE INTO conversation_external_lookup (lookup_key, conversation_id)
5173SELECT
5174    CAST(length(source_id) AS TEXT) || ':' || source_id || ':' ||
5175    CAST(agent_id AS TEXT) || ':' ||
5176    CAST(length(external_id) AS TEXT) || ':' || external_id,
5177    id
5178FROM conversations
5179WHERE external_id IS NOT NULL;
5180";
5181
/// Repair script for `conversation_tail_state` and
/// `conversation_external_tail_lookup`.
///
/// Both tables are created when absent. The lookup table is then
/// (re)populated with `INSERT OR REPLACE` from every `conversations` row
/// whose `external_id` is not NULL, using the key layout
/// `<length(source_id)>:<source_id>:<agent_id>:<length(external_id)>:<external_id>`.
///
/// Tail columns come from a `LEFT JOIN` on `conversation_tail_state`, so a
/// conversation without a tail-state row gets NULL for `ended_at`,
/// `last_message_idx` and `last_message_created_at`. In particular, if
/// `conversation_tail_state` was only just created by this script (and is
/// therefore empty), every tail column is written as NULL.
5182const CURRENT_SCHEMA_REPAIR_CONVERSATION_EXTERNAL_TAIL_LOOKUP_SQL: &str = r"
5183CREATE TABLE IF NOT EXISTS conversation_tail_state (
5184    conversation_id INTEGER PRIMARY KEY,
5185    ended_at INTEGER,
5186    last_message_idx INTEGER,
5187    last_message_created_at INTEGER
5188);
5189
5190CREATE TABLE IF NOT EXISTS conversation_external_tail_lookup (
5191    lookup_key TEXT PRIMARY KEY,
5192    conversation_id INTEGER NOT NULL,
5193    ended_at INTEGER,
5194    last_message_idx INTEGER,
5195    last_message_created_at INTEGER
5196);
5197
5198INSERT OR REPLACE INTO conversation_external_tail_lookup (
5199    lookup_key,
5200    conversation_id,
5201    ended_at,
5202    last_message_idx,
5203    last_message_created_at
5204)
5205SELECT
5206    CAST(length(c.source_id) AS TEXT) || ':' || c.source_id || ':' ||
5207    CAST(c.agent_id AS TEXT) || ':' ||
5208    CAST(length(c.external_id) AS TEXT) || ':' || c.external_id,
5209    c.id,
5210    ts.ended_at,
5211    ts.last_message_idx,
5212    ts.last_message_created_at
5213FROM conversations c
5214LEFT JOIN conversation_tail_state ts ON ts.conversation_id = c.id
5215WHERE c.external_id IS NOT NULL;
5216";
5217
/// Repair script for the `embedding_jobs` table.
///
/// Creates the table when absent, plus the partial unique index
/// `idx_embedding_jobs_active`, which allows at most one row per
/// `(db_path, model_id)` while `status` is `'pending'` or `'running'`.
/// No rows are inserted.
///
/// The statements duplicate the `embedding_jobs` definitions in the
/// fresh-schema SQL constant above; keep the two copies in sync.
5218const CURRENT_SCHEMA_REPAIR_EMBEDDING_JOBS_SQL: &str = r"
5219CREATE TABLE IF NOT EXISTS embedding_jobs (
5220    id INTEGER PRIMARY KEY AUTOINCREMENT,
5221    db_path TEXT NOT NULL,
5222    model_id TEXT NOT NULL,
5223    status TEXT NOT NULL DEFAULT 'pending',
5224    total_docs INTEGER NOT NULL DEFAULT 0,
5225    completed_docs INTEGER NOT NULL DEFAULT 0,
5226    error_message TEXT,
5227    created_at TEXT NOT NULL DEFAULT (datetime('now')),
5228    started_at TEXT,
5229    completed_at TEXT
5230);
5231
5232CREATE UNIQUE INDEX IF NOT EXISTS idx_embedding_jobs_active
5233ON embedding_jobs(db_path, model_id)
5234WHERE status IN ('pending', 'running');
5235";
5236
/// Repair script for the token analytics tables: `token_usage`,
/// `token_daily_stats` and `model_pricing`.
///
/// Each table and its secondary indexes are created with `IF NOT EXISTS`.
/// `model_pricing` is then seeded with `INSERT OR IGNORE`; because its primary
/// key is `(model_pattern, effective_date)`, rows that already exist for a
/// given pattern/date pair are left as they are.
///
/// The statements duplicate the corresponding definitions and seed rows in
/// the fresh-schema SQL constant above; keep the two copies in sync.
5237const CURRENT_SCHEMA_REPAIR_TOKEN_ANALYTICS_SQL: &str = r"
5238CREATE TABLE IF NOT EXISTS token_usage (
5239    id INTEGER PRIMARY KEY AUTOINCREMENT,
5240    message_id INTEGER NOT NULL REFERENCES messages(id) ON DELETE CASCADE,
5241    conversation_id INTEGER NOT NULL,
5242    agent_id INTEGER NOT NULL,
5243    workspace_id INTEGER,
5244    source_id TEXT NOT NULL DEFAULT 'local',
5245    timestamp_ms INTEGER NOT NULL,
5246    day_id INTEGER NOT NULL,
5247    model_name TEXT,
5248    model_family TEXT,
5249    model_tier TEXT,
5250    service_tier TEXT,
5251    provider TEXT,
5252    input_tokens INTEGER,
5253    output_tokens INTEGER,
5254    cache_read_tokens INTEGER,
5255    cache_creation_tokens INTEGER,
5256    thinking_tokens INTEGER,
5257    total_tokens INTEGER,
5258    estimated_cost_usd REAL,
5259    role TEXT NOT NULL,
5260    content_chars INTEGER NOT NULL,
5261    has_tool_calls INTEGER NOT NULL DEFAULT 0,
5262    tool_call_count INTEGER NOT NULL DEFAULT 0,
5263    data_source TEXT NOT NULL DEFAULT 'api',
5264    UNIQUE(message_id)
5265);
5266
5267CREATE INDEX IF NOT EXISTS idx_token_usage_day ON token_usage(day_id, agent_id);
5268CREATE INDEX IF NOT EXISTS idx_token_usage_conv ON token_usage(conversation_id);
5269CREATE INDEX IF NOT EXISTS idx_token_usage_model ON token_usage(model_family, day_id);
5270CREATE INDEX IF NOT EXISTS idx_token_usage_workspace ON token_usage(workspace_id, day_id);
5271CREATE INDEX IF NOT EXISTS idx_token_usage_timestamp ON token_usage(timestamp_ms);
5272
5273CREATE TABLE IF NOT EXISTS token_daily_stats (
5274    day_id INTEGER NOT NULL,
5275    agent_slug TEXT NOT NULL,
5276    source_id TEXT NOT NULL DEFAULT 'all',
5277    model_family TEXT NOT NULL DEFAULT 'all',
5278    api_call_count INTEGER NOT NULL DEFAULT 0,
5279    user_message_count INTEGER NOT NULL DEFAULT 0,
5280    assistant_message_count INTEGER NOT NULL DEFAULT 0,
5281    tool_message_count INTEGER NOT NULL DEFAULT 0,
5282    total_input_tokens INTEGER NOT NULL DEFAULT 0,
5283    total_output_tokens INTEGER NOT NULL DEFAULT 0,
5284    total_cache_read_tokens INTEGER NOT NULL DEFAULT 0,
5285    total_cache_creation_tokens INTEGER NOT NULL DEFAULT 0,
5286    total_thinking_tokens INTEGER NOT NULL DEFAULT 0,
5287    grand_total_tokens INTEGER NOT NULL DEFAULT 0,
5288    total_content_chars INTEGER NOT NULL DEFAULT 0,
5289    total_tool_calls INTEGER NOT NULL DEFAULT 0,
5290    estimated_cost_usd REAL NOT NULL DEFAULT 0.0,
5291    session_count INTEGER NOT NULL DEFAULT 0,
5292    last_updated INTEGER NOT NULL,
5293    PRIMARY KEY (day_id, agent_slug, source_id, model_family)
5294);
5295
5296CREATE INDEX IF NOT EXISTS idx_token_daily_stats_agent ON token_daily_stats(agent_slug, day_id);
5297CREATE INDEX IF NOT EXISTS idx_token_daily_stats_model ON token_daily_stats(model_family, day_id);
5298
5299CREATE TABLE IF NOT EXISTS model_pricing (
5300    model_pattern TEXT NOT NULL,
5301    provider TEXT NOT NULL,
5302    input_cost_per_mtok REAL NOT NULL,
5303    output_cost_per_mtok REAL NOT NULL,
5304    cache_read_cost_per_mtok REAL,
5305    cache_creation_cost_per_mtok REAL,
5306    effective_date TEXT NOT NULL,
5307    PRIMARY KEY (model_pattern, effective_date)
5308);
5309
5310INSERT OR IGNORE INTO model_pricing VALUES
5311    ('claude-opus-4%', 'anthropic', 15.0, 75.0, 1.5, 18.75, '2025-10-01'),
5312    ('claude-sonnet-4%', 'anthropic', 3.0, 15.0, 0.3, 3.75, '2025-10-01'),
5313    ('claude-haiku-4%', 'anthropic', 0.80, 4.0, 0.08, 1.0, '2025-10-01'),
5314    ('gpt-4o%', 'openai', 2.50, 10.0, NULL, NULL, '2025-01-01'),
5315    ('gpt-4-turbo%', 'openai', 10.0, 30.0, NULL, NULL, '2024-04-01'),
5316    ('gpt-4.1%', 'openai', 2.0, 8.0, NULL, NULL, '2025-04-01'),
5317    ('o3%', 'openai', 2.0, 8.0, NULL, NULL, '2025-04-01'),
5318    ('o4-mini%', 'openai', 1.10, 4.40, NULL, NULL, '2025-04-01'),
5319    ('gemini-2%flash%', 'google', 0.075, 0.30, NULL, NULL, '2025-01-01'),
5320    ('gemini-2%pro%', 'google', 1.25, 10.0, NULL, NULL, '2025-01-01');
5321";
5322
/// Repair script for the per-message metrics table and its rollups:
/// `message_metrics`, `usage_hourly`, `usage_daily` and `usage_models_daily`.
///
/// All four tables are created first, followed by every secondary index on
/// them; each statement is guarded by `IF NOT EXISTS`. No rows are inserted.
/// This constant is registered under the batch named `"message_rollups"` in
/// `CURRENT_SCHEMA_REPAIR_BATCHES`.
///
/// The statements duplicate the corresponding definitions in the fresh-schema
/// SQL constant above; keep the two copies in sync.
5323const CURRENT_SCHEMA_REPAIR_MESSAGE_METRICS_SQL: &str = r"
5324CREATE TABLE IF NOT EXISTS message_metrics (
5325    message_id INTEGER PRIMARY KEY REFERENCES messages(id) ON DELETE CASCADE,
5326    created_at_ms INTEGER NOT NULL,
5327    hour_id INTEGER NOT NULL,
5328    day_id INTEGER NOT NULL,
5329    agent_slug TEXT NOT NULL,
5330    workspace_id INTEGER NOT NULL DEFAULT 0,
5331    source_id TEXT NOT NULL DEFAULT 'local',
5332    role TEXT NOT NULL,
5333    content_chars INTEGER NOT NULL,
5334    content_tokens_est INTEGER NOT NULL,
5335    api_input_tokens INTEGER,
5336    api_output_tokens INTEGER,
5337    api_cache_read_tokens INTEGER,
5338    api_cache_creation_tokens INTEGER,
5339    api_thinking_tokens INTEGER,
5340    api_service_tier TEXT,
5341    api_data_source TEXT NOT NULL DEFAULT 'estimated',
5342    tool_call_count INTEGER NOT NULL DEFAULT 0,
5343    has_tool_calls INTEGER NOT NULL DEFAULT 0,
5344    has_plan INTEGER NOT NULL DEFAULT 0,
5345    model_name TEXT,
5346    model_family TEXT NOT NULL DEFAULT 'unknown',
5347    model_tier TEXT NOT NULL DEFAULT 'unknown',
5348    provider TEXT NOT NULL DEFAULT 'unknown'
5349);
5350
5351CREATE TABLE IF NOT EXISTS usage_hourly (
5352    hour_id INTEGER NOT NULL,
5353    agent_slug TEXT NOT NULL,
5354    workspace_id INTEGER NOT NULL DEFAULT 0,
5355    source_id TEXT NOT NULL DEFAULT 'local',
5356    message_count INTEGER NOT NULL DEFAULT 0,
5357    user_message_count INTEGER NOT NULL DEFAULT 0,
5358    assistant_message_count INTEGER NOT NULL DEFAULT 0,
5359    tool_call_count INTEGER NOT NULL DEFAULT 0,
5360    plan_message_count INTEGER NOT NULL DEFAULT 0,
5361    api_coverage_message_count INTEGER NOT NULL DEFAULT 0,
5362    content_tokens_est_total INTEGER NOT NULL DEFAULT 0,
5363    content_tokens_est_user INTEGER NOT NULL DEFAULT 0,
5364    content_tokens_est_assistant INTEGER NOT NULL DEFAULT 0,
5365    api_tokens_total INTEGER NOT NULL DEFAULT 0,
5366    api_input_tokens_total INTEGER NOT NULL DEFAULT 0,
5367    api_output_tokens_total INTEGER NOT NULL DEFAULT 0,
5368    api_cache_read_tokens_total INTEGER NOT NULL DEFAULT 0,
5369    api_cache_creation_tokens_total INTEGER NOT NULL DEFAULT 0,
5370    api_thinking_tokens_total INTEGER NOT NULL DEFAULT 0,
5371    last_updated INTEGER NOT NULL DEFAULT 0,
5372    plan_content_tokens_est_total INTEGER NOT NULL DEFAULT 0,
5373    plan_api_tokens_total INTEGER NOT NULL DEFAULT 0,
5374    PRIMARY KEY (hour_id, agent_slug, workspace_id, source_id)
5375);
5376
5377CREATE TABLE IF NOT EXISTS usage_daily (
5378    day_id INTEGER NOT NULL,
5379    agent_slug TEXT NOT NULL,
5380    workspace_id INTEGER NOT NULL DEFAULT 0,
5381    source_id TEXT NOT NULL DEFAULT 'local',
5382    message_count INTEGER NOT NULL DEFAULT 0,
5383    user_message_count INTEGER NOT NULL DEFAULT 0,
5384    assistant_message_count INTEGER NOT NULL DEFAULT 0,
5385    tool_call_count INTEGER NOT NULL DEFAULT 0,
5386    plan_message_count INTEGER NOT NULL DEFAULT 0,
5387    api_coverage_message_count INTEGER NOT NULL DEFAULT 0,
5388    content_tokens_est_total INTEGER NOT NULL DEFAULT 0,
5389    content_tokens_est_user INTEGER NOT NULL DEFAULT 0,
5390    content_tokens_est_assistant INTEGER NOT NULL DEFAULT 0,
5391    api_tokens_total INTEGER NOT NULL DEFAULT 0,
5392    api_input_tokens_total INTEGER NOT NULL DEFAULT 0,
5393    api_output_tokens_total INTEGER NOT NULL DEFAULT 0,
5394    api_cache_read_tokens_total INTEGER NOT NULL DEFAULT 0,
5395    api_cache_creation_tokens_total INTEGER NOT NULL DEFAULT 0,
5396    api_thinking_tokens_total INTEGER NOT NULL DEFAULT 0,
5397    last_updated INTEGER NOT NULL DEFAULT 0,
5398    plan_content_tokens_est_total INTEGER NOT NULL DEFAULT 0,
5399    plan_api_tokens_total INTEGER NOT NULL DEFAULT 0,
5400    PRIMARY KEY (day_id, agent_slug, workspace_id, source_id)
5401);
5402
5403CREATE TABLE IF NOT EXISTS usage_models_daily (
5404    day_id INTEGER NOT NULL,
5405    agent_slug TEXT NOT NULL,
5406    workspace_id INTEGER NOT NULL DEFAULT 0,
5407    source_id TEXT NOT NULL DEFAULT 'local',
5408    model_family TEXT NOT NULL DEFAULT 'unknown',
5409    model_tier TEXT NOT NULL DEFAULT 'unknown',
5410    message_count INTEGER NOT NULL DEFAULT 0,
5411    user_message_count INTEGER NOT NULL DEFAULT 0,
5412    assistant_message_count INTEGER NOT NULL DEFAULT 0,
5413    tool_call_count INTEGER NOT NULL DEFAULT 0,
5414    plan_message_count INTEGER NOT NULL DEFAULT 0,
5415    api_coverage_message_count INTEGER NOT NULL DEFAULT 0,
5416    content_tokens_est_total INTEGER NOT NULL DEFAULT 0,
5417    content_tokens_est_user INTEGER NOT NULL DEFAULT 0,
5418    content_tokens_est_assistant INTEGER NOT NULL DEFAULT 0,
5419    api_tokens_total INTEGER NOT NULL DEFAULT 0,
5420    api_input_tokens_total INTEGER NOT NULL DEFAULT 0,
5421    api_output_tokens_total INTEGER NOT NULL DEFAULT 0,
5422    api_cache_read_tokens_total INTEGER NOT NULL DEFAULT 0,
5423    api_cache_creation_tokens_total INTEGER NOT NULL DEFAULT 0,
5424    api_thinking_tokens_total INTEGER NOT NULL DEFAULT 0,
5425    last_updated INTEGER NOT NULL DEFAULT 0,
5426    PRIMARY KEY (day_id, agent_slug, workspace_id, source_id, model_family, model_tier)
5427);
5428
5429CREATE INDEX IF NOT EXISTS idx_mm_hour ON message_metrics(hour_id);
5430CREATE INDEX IF NOT EXISTS idx_mm_day ON message_metrics(day_id);
5431CREATE INDEX IF NOT EXISTS idx_mm_agent_hour ON message_metrics(agent_slug, hour_id);
5432CREATE INDEX IF NOT EXISTS idx_mm_agent_day ON message_metrics(agent_slug, day_id);
5433CREATE INDEX IF NOT EXISTS idx_mm_workspace_hour ON message_metrics(workspace_id, hour_id);
5434CREATE INDEX IF NOT EXISTS idx_mm_source_hour ON message_metrics(source_id, hour_id);
5435CREATE INDEX IF NOT EXISTS idx_mm_model_family_day ON message_metrics(model_family, day_id);
5436CREATE INDEX IF NOT EXISTS idx_mm_provider_day ON message_metrics(provider, day_id);
5437CREATE INDEX IF NOT EXISTS idx_uh_agent ON usage_hourly(agent_slug, hour_id);
5438CREATE INDEX IF NOT EXISTS idx_uh_workspace ON usage_hourly(workspace_id, hour_id);
5439CREATE INDEX IF NOT EXISTS idx_uh_source ON usage_hourly(source_id, hour_id);
5440CREATE INDEX IF NOT EXISTS idx_ud_agent ON usage_daily(agent_slug, day_id);
5441CREATE INDEX IF NOT EXISTS idx_ud_workspace ON usage_daily(workspace_id, day_id);
5442CREATE INDEX IF NOT EXISTS idx_ud_source ON usage_daily(source_id, day_id);
5443CREATE INDEX IF NOT EXISTS idx_umd_model_day ON usage_models_daily(model_family, day_id);
5444CREATE INDEX IF NOT EXISTS idx_umd_agent_day ON usage_models_daily(agent_slug, day_id);
5445CREATE INDEX IF NOT EXISTS idx_umd_workspace_day ON usage_models_daily(workspace_id, day_id);
5446CREATE INDEX IF NOT EXISTS idx_umd_source_day ON usage_models_daily(source_id, day_id);
5447";
5448
/// Registry of all current-schema repair batches.
///
/// Each entry ties a set of table names to the SQL constant registered for
/// them. `current_schema_repair_batches_for_missing_tables` walks this slice
/// in order and returns the entries that mention at least one missing table,
/// so the order here is also the order of the returned batches.
///
/// A table that should be repairable must appear in exactly the `tables` list
/// of the batch whose SQL creates it; a missing table that no entry lists
/// makes the selector return an error.
5449const CURRENT_SCHEMA_REPAIR_BATCHES: &[SchemaRepairBatch] = &[
5450    SchemaRepairBatch {
5451        name: "sources",
5452        tables: &["sources"],
5453        sql: CURRENT_SCHEMA_REPAIR_SOURCES_SQL,
5454    },
5455    SchemaRepairBatch {
5456        name: "daily_stats",
5457        tables: &["daily_stats"],
5458        sql: CURRENT_SCHEMA_REPAIR_DAILY_STATS_SQL,
5459    },
5460    SchemaRepairBatch {
5461        name: "conversation_external_lookup",
5462        tables: &["conversation_external_lookup"],
5463        sql: CURRENT_SCHEMA_REPAIR_CONVERSATION_EXTERNAL_LOOKUP_SQL,
5464    },
    // One script covers both tail tables: the lookup table is populated from
    // a join against `conversation_tail_state`.
5465    SchemaRepairBatch {
5466        name: "conversation_external_tail_lookup",
5467        tables: &[
5468            "conversation_tail_state",
5469            "conversation_external_tail_lookup",
5470        ],
5471        sql: CURRENT_SCHEMA_REPAIR_CONVERSATION_EXTERNAL_TAIL_LOOKUP_SQL,
5472    },
5473    SchemaRepairBatch {
5474        name: "embedding_jobs",
5475        tables: &["embedding_jobs"],
5476        sql: CURRENT_SCHEMA_REPAIR_EMBEDDING_JOBS_SQL,
5477    },
5478    SchemaRepairBatch {
5479        name: "token_analytics",
5480        tables: &["token_usage", "token_daily_stats", "model_pricing"],
5481        sql: CURRENT_SCHEMA_REPAIR_TOKEN_ANALYTICS_SQL,
5482    },
    // Batch name differs from the SQL constant's name
    // (`..._MESSAGE_METRICS_SQL`); the script covers the metrics table and
    // all three rollup tables.
5483    SchemaRepairBatch {
5484        name: "message_rollups",
5485        tables: &[
5486            "message_metrics",
5487            "usage_hourly",
5488            "usage_daily",
5489            "usage_models_daily",
5490        ],
5491        sql: CURRENT_SCHEMA_REPAIR_MESSAGE_METRICS_SQL,
5492    },
5493];
5494
5495fn current_schema_repair_batches_for_missing_tables(
5496    missing_tables: &[&'static str],
5497) -> Result<Vec<&'static SchemaRepairBatch>> {
5498    let missing_set: HashSet<&'static str> = missing_tables.iter().copied().collect();
5499    let mut selected_batches = Vec::new();
5500    let mut covered_tables = HashSet::new();
5501
5502    for batch in CURRENT_SCHEMA_REPAIR_BATCHES {
5503        if !batch
5504            .tables
5505            .iter()
5506            .any(|table_name| missing_set.contains(table_name))
5507        {
5508            continue;
5509        }
5510        selected_batches.push(batch);
5511        covered_tables.extend(batch.tables.iter().copied());
5512    }
5513
5514    for &table_name in missing_tables {
5515        if !covered_tables.contains(table_name) {
5516            return Err(anyhow!(
5517                "no current-schema repair batch registered for missing table {table_name}"
5518            ));
5519        }
5520    }
5521
5522    Ok(selected_batches)
5523}
5524
5525/// Migration name lookup for backfilling `_schema_migrations` during transition.
///
/// Holds one `(version, name)` pair per schema version, listed in ascending
/// order from 1 through 20. The array length in the type must be updated
/// together with the entries when a migration is added.
5526const MIGRATION_NAMES: [(i64, &str); 20] = [
5527    (1, "core_tables"),
5528    (2, "fts_messages"),
5529    (3, "fts_messages_rebuild"),
5530    (4, "sources"),
5531    (5, "provenance_columns"),
5532    (6, "source_path_index"),
5533    (7, "msgpack_columns"),
5534    (8, "daily_stats"),
5535    (9, "embedding_jobs"),
5536    (10, "token_analytics"),
5537    (11, "message_metrics"),
5538    (12, "model_dimensions"),
5539    (13, "plan_token_rollups"),
5540    (14, "fts_contentless"),
5541    (15, "conversation_tail_state_cache"),
5542    (16, "drop_redundant_message_conv_idx"),
5543    (17, "drop_message_created_idx"),
5544    (18, "conversation_tail_state_hot_table"),
5545    (19, "conversation_external_lookup"),
5546    (20, "conversation_external_tail_lookup"),
5547];
5548
5549/// Transitions an existing database from `meta` table schema versioning to the
5550/// `_schema_migrations` table used by `MigrationRunner`.
5551///
5552/// The existing `SqliteStorage` tracks schema version as a string value in
5553/// `meta WHERE key = 'schema_version'`. The bead spec references
5554/// `PRAGMA user_version`, but the actual cass code uses the `meta` table.
5555/// This function handles the real code path.
5556///
5557/// Behavior:
5558/// - If `_schema_migrations` already exists → skip (already transitioned)
5559/// - If `meta` table has `schema_version > 0` → create `_schema_migrations`
5560///   and backfill entries for versions `1..=current_version`
5561/// - Legacy V10-V12 databases are represented as V13 in `_schema_migrations`
5562///   because frankensqlite uses one combined V13 base migration instead of
5563///   replaying the old incremental V11-V13 steps.
5564/// - If `meta` table missing or `schema_version = 0` with no tables → fresh DB,
5565///   let `MigrationRunner` handle it
5566/// - If `schema_version = 0` but tables exist → corrupted state, log warning
5567fn transition_from_meta_version(conn: &FrankenConnection) -> Result<()> {
5568    // Avoid sqlite_master enumeration here. Databases with FTS virtual tables
5569    // can trigger frankensqlite parse-recovery on sqlite_master reads, which is
5570    // enough to break the transition on otherwise-healthy legacy cass DBs.
5571    if conn
5572        .query("SELECT version FROM \"_schema_migrations\";")
5573        .is_ok()
5574    {
5575        return Ok(());
5576    }
5577
5578    // Check if the meta table exists.
5579    if conn.query("SELECT key FROM meta;").is_err() {
5580        // No meta table → fresh database, let MigrationRunner handle it.
5581        return Ok(());
5582    }
5583
5584    // Read the current schema version from the meta table.
5585    let rows = conn
5586        .query("SELECT value FROM meta WHERE key = 'schema_version';")
5587        .with_context(|| "reading schema_version from meta")?;
5588
5589    let current_version: i64 = rows
5590        .first()
5591        .and_then(|row| row.get_typed::<String>(0).ok())
5592        .and_then(|s| s.parse().ok())
5593        .unwrap_or(0);
5594
5595    if current_version == 0 {
5596        // Check if tables actually exist (corrupted state: tables present but version=0).
5597        if conn.query("SELECT id FROM conversations LIMIT 1;").is_err() {
5598            // Truly fresh DB (meta table exists but empty/reset). Let MigrationRunner handle it.
5599            return Ok(());
5600        }
5601
5602        // Tables exist but version=0: corrupted state. Log and skip transition;
5603        // MigrationRunner will fail on "table already exists" and surface the error.
5604        info!("meta.schema_version=0 but tables exist; skipping transition (corrupted state)");
5605        return Ok(());
5606    }
5607
5608    // Create _schema_migrations and backfill entries for all applied versions.
5609    info!(
5610        current_version,
5611        "transitioning schema tracking from meta table to _schema_migrations"
5612    );
5613
5614    conn.execute(
5615        "CREATE TABLE IF NOT EXISTS _schema_migrations (\
5616            version INTEGER PRIMARY KEY, \
5617            name TEXT NOT NULL, \
5618            applied_at TEXT NOT NULL DEFAULT (strftime('%Y-%m-%dT%H:%M:%SZ', 'now'))\
5619        );",
5620    )
5621    .with_context(|| "creating _schema_migrations table for transition")?;
5622
5623    let backfill_through_version = if (10..13).contains(&current_version) {
5624        13
5625    } else {
5626        current_version
5627    };
5628
5629    for &(version, name) in &MIGRATION_NAMES {
5630        if version > backfill_through_version {
5631            break;
5632        }
5633        conn.execute_compat(
5634            "INSERT INTO _schema_migrations (version, name) VALUES (?1, ?2);",
5635            &[ParamValue::from(version), ParamValue::from(name)],
5636        )
5637        .with_context(|| format!("backfilling _schema_migrations version {version}"))?;
5638    }
5639
5640    info!(
5641        current_version,
5642        backfill_through_version,
5643        "schema version transition complete: backfilled legacy meta schema versions"
5644    );
5645
5646    Ok(())
5647}
5648
/// `(table name, probe query)` pairs. Each probe is a `LIMIT 1` select of one
/// or two columns from the named table.
///
/// NOTE(review): presumably a failing probe marks the table (or probed column)
/// as missing from the current schema — confirm against the consumer.
const REQUIRED_CURRENT_SCHEMA_TABLE_PROBES: &[(&str, &str)] = &[
    ("sources", "SELECT id FROM sources LIMIT 1;"),
    ("daily_stats", "SELECT day_id FROM daily_stats LIMIT 1;"),
    (
        "conversation_external_lookup",
        "SELECT lookup_key FROM conversation_external_lookup LIMIT 1;",
    ),
    (
        "conversation_tail_state",
        "SELECT conversation_id FROM conversation_tail_state LIMIT 1;",
    ),
    (
        "conversation_external_tail_lookup",
        "SELECT lookup_key, last_message_idx FROM conversation_external_tail_lookup LIMIT 1;",
    ),
    ("embedding_jobs", "SELECT id FROM embedding_jobs LIMIT 1;"),
    ("token_usage", "SELECT id FROM token_usage LIMIT 1;"),
    (
        "token_daily_stats",
        "SELECT day_id FROM token_daily_stats LIMIT 1;",
    ),
    (
        "model_pricing",
        "SELECT model_pattern FROM model_pricing LIMIT 1;",
    ),
    (
        "message_metrics",
        "SELECT message_id FROM message_metrics LIMIT 1;",
    ),
    ("usage_hourly", "SELECT hour_id FROM usage_hourly LIMIT 1;"),
    ("usage_daily", "SELECT day_id FROM usage_daily LIMIT 1;"),
    (
        "usage_models_daily",
        "SELECT day_id FROM usage_models_daily LIMIT 1;",
    ),
];
5685
/// `(column name, SQL type)` pairs for token/cost/count columns.
///
/// NOTE(review): the name suggests these columns are expected on the
/// `conversations` table — confirm against the consumer.
const REQUIRED_CONVERSATION_TOKEN_COLUMNS: &[(&str, &str)] = &[
    ("total_input_tokens", "INTEGER"),
    ("total_output_tokens", "INTEGER"),
    ("total_cache_read_tokens", "INTEGER"),
    ("total_cache_creation_tokens", "INTEGER"),
    ("grand_total_tokens", "INTEGER"),
    ("estimated_cost_usd", "REAL"),
    ("primary_model", "TEXT"),
    ("api_call_count", "INTEGER"),
    ("tool_call_count", "INTEGER"),
    ("user_message_count", "INTEGER"),
    ("assistant_message_count", "INTEGER"),
];
5699
/// Returns `true` when the error's rendered text contains "no such table",
/// compared ASCII-case-insensitively.
fn error_indicates_missing_table(err: &impl std::fmt::Display) -> bool {
    let mut rendered = err.to_string();
    rendered.make_ascii_lowercase();
    rendered.contains("no such table")
}
5705
/// Returns `true` when the error's rendered text contains "no such column",
/// compared ASCII-case-insensitively.
fn error_indicates_missing_column(err: &impl std::fmt::Display) -> bool {
    let mut rendered = err.to_string();
    rendered.make_ascii_lowercase();
    rendered.contains("no such column")
}
5711
/// Maximum number of ids handled per step of orphan cleanup: it bounds how many
/// ids are bound into one `IN (...)` delete statement and is also the row limit
/// passed to the orphan id page queries.
const ORPHAN_FK_ID_CHUNK_SIZE: usize = 256;
5713
5714fn collect_orphan_message_ids(conn: &FrankenConnection) -> Result<Vec<i64>> {
5715    conn.query_map_collect(
5716        "SELECT m.id
5717         FROM messages AS m
5718         WHERE NOT EXISTS (
5719             SELECT 1
5720             FROM conversations AS c
5721             WHERE c.id = m.conversation_id
5722         )",
5723        fparams![],
5724        |row| row.get_typed(0),
5725    )
5726    .context("listing orphan message ids for orphan FK cleanup")
5727}
5728
5729fn delete_rows_by_i64_chunks(
5730    tx: &FrankenTransaction<'_>,
5731    delete_many_sql_prefix: &'static str,
5732    ids: &[i64],
5733) -> Result<usize> {
5734    if ids.is_empty() {
5735        return Ok(0);
5736    }
5737
5738    let full_chunk_sql = delete_rows_by_i64_sql(delete_many_sql_prefix, ORPHAN_FK_ID_CHUNK_SIZE);
5739    let tail_len = ids.len() % ORPHAN_FK_ID_CHUNK_SIZE;
5740    let tail_sql =
5741        (tail_len != 0).then(|| delete_rows_by_i64_sql(delete_many_sql_prefix, tail_len));
5742
5743    let mut deleted = 0;
5744    for chunk in ids.chunks(ORPHAN_FK_ID_CHUNK_SIZE) {
5745        let sql = if chunk.len() == ORPHAN_FK_ID_CHUNK_SIZE {
5746            &full_chunk_sql
5747        } else {
5748            tail_sql.as_ref().unwrap_or(&full_chunk_sql)
5749        };
5750        let params = chunk
5751            .iter()
5752            .map(|id| SqliteValue::from(*id))
5753            .collect::<Vec<_>>();
5754        deleted += tx.execute_with_params(sql, &params)?;
5755    }
5756    Ok(deleted)
5757}
5758
5759fn delete_rows_by_i64_sql(delete_many_sql_prefix: &'static str, count: usize) -> String {
5760    let placeholders = sql_placeholders(count);
5761    format!("{delete_many_sql_prefix} ({placeholders})")
5762}
5763
/// Builds `count` `?` placeholders joined by `", "` (empty string for zero).
fn sql_placeholders(count: usize) -> String {
    let mut list = String::new();
    for position in 0..count {
        if position > 0 {
            list.push_str(", ");
        }
        list.push('?');
    }
    list
}
5767
5768fn delete_orphan_message_ids_bisecting_oom(conn: &FrankenConnection, ids: &[i64]) -> Result<usize> {
5769    let mut deleted = 0usize;
5770    for chunk in ids.chunks(ORPHAN_FK_ID_CHUNK_SIZE) {
5771        deleted = deleted.saturating_add(delete_orphan_message_id_chunk(conn, chunk)?);
5772    }
5773    Ok(deleted)
5774}
5775
5776fn delete_orphan_message_id_chunk(conn: &FrankenConnection, ids: &[i64]) -> Result<usize> {
5777    if ids.is_empty() {
5778        return Ok(0);
5779    }
5780
5781    match delete_orphan_message_id_chunk_once(conn, ids) {
5782        Ok(deleted) => Ok(deleted),
5783        Err(err) if is_out_of_memory_error(&err) && ids.len() > 1 => {
5784            let split_at = ids.len() / 2;
5785            tracing::warn!(
5786                target: "cass::fk_repair",
5787                rows = ids.len(),
5788                left = split_at,
5789                right = ids.len().saturating_sub(split_at),
5790                error = %err,
5791                "orphan-message cleanup ran out of memory; retrying as smaller batches"
5792            );
5793            let left = delete_orphan_message_id_chunk(conn, &ids[..split_at])?;
5794            let right = delete_orphan_message_id_chunk(conn, &ids[split_at..])?;
5795            Ok(left.saturating_add(right))
5796        }
5797        Err(err) => Err(err),
5798    }
5799}
5800
5801fn delete_orphan_message_id_chunk_once(conn: &FrankenConnection, ids: &[i64]) -> Result<usize> {
5802    let mut tx = conn.transaction()?;
5803    let mut deleted = 0usize;
5804    for entry in ORPHAN_MESSAGE_DEPENDENT_TABLES {
5805        match delete_rows_by_i64_chunks(&tx, entry.delete_many_sql_prefix, ids) {
5806            Ok(count) => {
5807                deleted = deleted.saturating_add(count);
5808            }
5809            Err(err) if error_indicates_missing_table(&err) => {
5810                tracing::debug!(
5811                    target: "cass::fk_repair",
5812                    child_table = entry.child_table,
5813                    error = %err,
5814                    "skipping orphan-message dependent cleanup (table unavailable)"
5815                );
5816            }
5817            Err(err) => {
5818                return Err(err).with_context(|| {
5819                    format!(
5820                        "deleting rows from {} that depend on orphan messages",
5821                        entry.child_table
5822                    )
5823                });
5824            }
5825        }
5826    }
5827    deleted = deleted.saturating_add(
5828        delete_rows_by_i64_chunks(&tx, "DELETE FROM messages WHERE id IN", ids)
5829            .context("deleting orphan rows from messages")?,
5830    );
5831    tx.commit()?;
5832    Ok(deleted)
5833}
5834
5835fn collect_direct_orphan_id_page(
5836    conn: &FrankenConnection,
5837    entry: &'static OrphanFkTable,
5838) -> Result<Vec<i64>> {
5839    Ok(conn.query_map_collect(
5840        entry.orphan_id_page_sql,
5841        fparams![i64::try_from(ORPHAN_FK_ID_CHUNK_SIZE).unwrap_or(i64::MAX)],
5842        |row| row.get_typed(0),
5843    )?)
5844}
5845
5846fn delete_direct_orphan_ids_bisecting_oom(
5847    conn: &FrankenConnection,
5848    entry: &'static OrphanFkTable,
5849    ids: &[i64],
5850) -> Result<usize> {
5851    let mut deleted = 0usize;
5852    for chunk in ids.chunks(ORPHAN_FK_ID_CHUNK_SIZE) {
5853        deleted = deleted.saturating_add(delete_direct_orphan_id_chunk(conn, entry, chunk)?);
5854    }
5855    Ok(deleted)
5856}
5857
5858fn delete_direct_orphan_id_chunk(
5859    conn: &FrankenConnection,
5860    entry: &'static OrphanFkTable,
5861    ids: &[i64],
5862) -> Result<usize> {
5863    if ids.is_empty() {
5864        return Ok(0);
5865    }
5866
5867    match delete_direct_orphan_id_chunk_once(conn, entry, ids) {
5868        Ok(deleted) => Ok(deleted),
5869        Err(err) if is_out_of_memory_error(&err) && ids.len() > 1 => {
5870            let split_at = ids.len() / 2;
5871            tracing::warn!(
5872                target: "cass::fk_repair",
5873                child_table = entry.child_table,
5874                rows = ids.len(),
5875                left = split_at,
5876                right = ids.len().saturating_sub(split_at),
5877                error = %err,
5878                "direct orphan cleanup ran out of memory; retrying as smaller batches"
5879            );
5880            let left = delete_direct_orphan_id_chunk(conn, entry, &ids[..split_at])?;
5881            let right = delete_direct_orphan_id_chunk(conn, entry, &ids[split_at..])?;
5882            Ok(left.saturating_add(right))
5883        }
5884        Err(err) => Err(err),
5885    }
5886}
5887
5888fn delete_direct_orphan_id_chunk_once(
5889    conn: &FrankenConnection,
5890    entry: &'static OrphanFkTable,
5891    ids: &[i64],
5892) -> Result<usize> {
5893    let mut tx = conn.transaction()?;
5894    let deleted = delete_rows_by_i64_chunks(&tx, entry.delete_many_sql_prefix, ids)?;
5895    tx.commit()?;
5896    Ok(deleted)
5897}
5898
/// Tables whose FK parent rows can go missing when an index transaction is
/// dropped mid-flight. The select and delete SQL strings are intentionally
/// static (no dynamic table names) so they can be audited at a glance and so
/// they cannot be subverted by injected identifiers. The select statement
/// yields the integer FK key used by the matching chunked delete.
struct OrphanFkTable {
    // Table name, used for log fields.
    child_table: &'static str,
    // Query returning one page of orphaned FK keys; takes the page size as `?1`.
    orphan_id_page_sql: &'static str,
    // `DELETE ... WHERE <key> IN` prefix; the placeholder list is appended by
    // `delete_rows_by_i64_sql`.
    delete_many_sql_prefix: &'static str,
}
5909
/// Child tables probed directly for rows whose parent is missing. The first
/// three entries key on `message_id` (parent: `messages`); `conversation_tags`
/// keys on `conversation_id` (parent: `conversations`). Every page query is
/// ordered by its key and limited by `?1`.
const ORPHAN_DIRECT_CHILD_TABLES: &[OrphanFkTable] = &[
    OrphanFkTable {
        child_table: "message_metrics",
        orphan_id_page_sql: "SELECT message_id FROM message_metrics \
                             WHERE NOT EXISTS (\
                                 SELECT 1 FROM messages \
                                 WHERE messages.id = message_metrics.message_id\
                             ) \
                             ORDER BY message_id \
                             LIMIT ?1",
        delete_many_sql_prefix: "DELETE FROM message_metrics WHERE message_id IN",
    },
    OrphanFkTable {
        child_table: "token_usage",
        orphan_id_page_sql: "SELECT message_id FROM token_usage \
                             WHERE NOT EXISTS (\
                                 SELECT 1 FROM messages \
                                 WHERE messages.id = token_usage.message_id\
                             ) \
                             ORDER BY message_id \
                             LIMIT ?1",
        delete_many_sql_prefix: "DELETE FROM token_usage WHERE message_id IN",
    },
    OrphanFkTable {
        child_table: "snippets",
        orphan_id_page_sql: "SELECT message_id FROM snippets \
                             WHERE NOT EXISTS (\
                                 SELECT 1 FROM messages \
                                 WHERE messages.id = snippets.message_id\
                             ) \
                             ORDER BY message_id \
                             LIMIT ?1",
        delete_many_sql_prefix: "DELETE FROM snippets WHERE message_id IN",
    },
    OrphanFkTable {
        child_table: "conversation_tags",
        orphan_id_page_sql: "SELECT conversation_id FROM conversation_tags \
                             WHERE NOT EXISTS (\
                                 SELECT 1 FROM conversations \
                                 WHERE conversations.id = conversation_tags.conversation_id\
                             ) \
                             ORDER BY conversation_id \
                             LIMIT ?1",
        delete_many_sql_prefix: "DELETE FROM conversation_tags WHERE conversation_id IN",
    },
];
5956
/// A table whose rows reference `messages` by `message_id` and are deleted
/// alongside an orphan message.
struct OrphanMessageDependentTable {
    // Table name, used for log fields and error context.
    child_table: &'static str,
    // `DELETE ... WHERE message_id IN` prefix; the placeholder list is appended
    // by `delete_rows_by_i64_sql`.
    delete_many_sql_prefix: &'static str,
}
5961
/// Dependent tables cleared, in this order, before orphan `messages` rows are
/// deleted (see `delete_orphan_message_id_chunk_once`).
const ORPHAN_MESSAGE_DEPENDENT_TABLES: &[OrphanMessageDependentTable] = &[
    OrphanMessageDependentTable {
        child_table: "message_metrics",
        delete_many_sql_prefix: "DELETE FROM message_metrics WHERE message_id IN",
    },
    OrphanMessageDependentTable {
        child_table: "token_usage",
        delete_many_sql_prefix: "DELETE FROM token_usage WHERE message_id IN",
    },
    OrphanMessageDependentTable {
        child_table: "snippets",
        delete_many_sql_prefix: "DELETE FROM snippets WHERE message_id IN",
    },
];
5976
/// Summary of orphan rows detected and removed by `cleanup_orphan_fk_rows`.
///
/// Message-root counts come from the probe phase, while direct child counts
/// come from bounded page deletes. The function is intended to run as a single
/// indexer-startup pass holding the index run lock, so no concurrent writers
/// exist and the counts match the primary orphan roots identified and removed
/// during cleanup. Rows that merely depend on an orphan message
/// (`message_metrics` / `token_usage` / `snippets`) disappear as a consequence
/// of removing that root and are *not* separately counted in `total` or
/// `per_table`.
#[derive(Debug, Default, Clone)]
pub(crate) struct OrphanFkCleanupReport {
    pub total: i64,
    pub per_table: Vec<(&'static str, i64)>,
}

impl OrphanFkCleanupReport {
    /// Adds `count` to the entry for `child_table` (appending a new entry on
    /// first sight, so `per_table` keeps first-recorded order) and to `total`.
    /// Both additions saturate instead of overflowing.
    fn record(&mut self, child_table: &'static str, count: i64) {
        let slot = self
            .per_table
            .iter_mut()
            .find(|entry| entry.0 == child_table);
        match slot {
            Some(entry) => entry.1 = entry.1.saturating_add(count),
            None => self.per_table.push((child_table, count)),
        }
        self.total = self.total.saturating_add(count);
    }
}
6007
/// Result record for a conversation insert.
///
/// NOTE(review): field meanings below are inferred from names only — confirm
/// against the code that constructs this value.
pub struct InsertOutcome {
    // presumably the row id of the affected conversation
    pub conversation_id: i64,
    // presumably `true` when a new conversation row was created — verify
    pub conversation_inserted: bool,
    // presumably the `idx` values of messages that were inserted — verify
    pub inserted_indices: Vec<i64>,
}
6013
/// Test-only counters and accumulated timings for the sub-stages of message
/// insertion. The duration fields are reported as the `msg_*_ms` columns of
/// `InsertConversationTreePerfProfile::log_summary`.
#[cfg(test)]
#[derive(Debug, Clone, Default)]
struct MessageInsertSubstageProfile {
    single_row_calls: usize,
    batch_calls: usize,
    batch_rows: usize,
    payload_duration: Duration,
    sql_build_duration: Duration,
    param_build_duration: Duration,
    execute_duration: Duration,
    rowid_duration: Duration,
}
6026
/// Test-only profile of conversation-tree inserts: call/message counters plus
/// one accumulated duration per stage. `log_summary` prints it as a single
/// `CASS_INSERT_TREE_STAGE_PROFILE` line.
#[cfg(test)]
#[derive(Debug, Clone, Default)]
struct InsertConversationTreePerfProfile {
    invocations: usize,
    messages: usize,
    inserted_messages: usize,
    total_duration: Duration,
    source_duration: Duration,
    tx_open_duration: Duration,
    existing_lookup_duration: Duration,
    existing_idx_lookup_duration: Duration,
    existing_replay_lookup_duration: Duration,
    dedupe_filter_duration: Duration,
    conversation_row_duration: Duration,
    message_insert_duration: Duration,
    // Finer-grained breakdown of message insertion; not added to the
    // "accounted" sum in `log_summary`.
    message_insert_breakdown: MessageInsertSubstageProfile,
    snippet_insert_duration: Duration,
    fts_entry_duration: Duration,
    fts_flush_duration: Duration,
    analytics_duration: Duration,
    commit_duration: Duration,
}
6049
#[cfg(test)]
impl InsertConversationTreePerfProfile {
    /// Converts a duration to fractional milliseconds.
    fn millis(duration: Duration) -> f64 {
        duration.as_secs_f64() * 1000.0
    }

    /// Prints one `CASS_INSERT_TREE_STAGE_PROFILE` line to stderr with the
    /// counters, every stage duration in milliseconds, the message-insert
    /// sub-stage durations, the unaccounted residual, and per-call averages.
    fn log_summary(&self, label: &str) {
        // Clamp to 1 so the averages never divide by zero.
        let calls = self.invocations.max(1) as f64;
        // Sum of the top-level stages only; the `message_insert_breakdown`
        // sub-stages are not added here.
        let accounted_duration = self.source_duration
            + self.tx_open_duration
            + self.existing_lookup_duration
            + self.existing_idx_lookup_duration
            + self.existing_replay_lookup_duration
            + self.dedupe_filter_duration
            + self.conversation_row_duration
            + self.message_insert_duration
            + self.snippet_insert_duration
            + self.fts_entry_duration
            + self.fts_flush_duration
            + self.analytics_duration
            + self.commit_duration;
        // Time not attributed to any stage; saturates at zero if the stage sum
        // exceeds the total.
        let residual_duration = self.total_duration.saturating_sub(accounted_duration);
        // The arguments below are positional: their order must match the
        // placeholders in the format string exactly.
        eprintln!(
            concat!(
                "CASS_INSERT_TREE_STAGE_PROFILE ",
                "label={} calls={} messages={} inserted_messages={} ",
                "total_ms={:.3} source_ms={:.3} tx_open_ms={:.3} existing_lookup_ms={:.3} ",
                "existing_idx_lookup_ms={:.3} existing_replay_lookup_ms={:.3} dedupe_filter_ms={:.3} ",
                "conversation_row_ms={:.3} message_insert_ms={:.3} snippet_insert_ms={:.3} ",
                "fts_entry_ms={:.3} fts_flush_ms={:.3} analytics_ms={:.3} commit_ms={:.3} ",
                "msg_payload_ms={:.3} msg_sql_ms={:.3} msg_param_ms={:.3} msg_execute_ms={:.3} msg_rowid_ms={:.3} ",
                "residual_ms={:.3} avg_total_ms={:.3} avg_message_insert_ms={:.3} ",
                "avg_msg_execute_ms={:.3} avg_msg_payload_ms={:.3} avg_snippet_insert_ms={:.3} avg_fts_entry_ms={:.3} avg_commit_ms={:.3}"
            ),
            label,
            self.invocations,
            self.messages,
            self.inserted_messages,
            Self::millis(self.total_duration),
            Self::millis(self.source_duration),
            Self::millis(self.tx_open_duration),
            Self::millis(self.existing_lookup_duration),
            Self::millis(self.existing_idx_lookup_duration),
            Self::millis(self.existing_replay_lookup_duration),
            Self::millis(self.dedupe_filter_duration),
            Self::millis(self.conversation_row_duration),
            Self::millis(self.message_insert_duration),
            Self::millis(self.snippet_insert_duration),
            Self::millis(self.fts_entry_duration),
            Self::millis(self.fts_flush_duration),
            Self::millis(self.analytics_duration),
            Self::millis(self.commit_duration),
            Self::millis(self.message_insert_breakdown.payload_duration),
            Self::millis(self.message_insert_breakdown.sql_build_duration),
            Self::millis(self.message_insert_breakdown.param_build_duration),
            Self::millis(self.message_insert_breakdown.execute_duration),
            Self::millis(self.message_insert_breakdown.rowid_duration),
            Self::millis(residual_duration),
            Self::millis(self.total_duration) / calls,
            Self::millis(self.message_insert_duration) / calls,
            Self::millis(self.message_insert_breakdown.execute_duration) / calls,
            Self::millis(self.message_insert_breakdown.payload_duration) / calls,
            Self::millis(self.snippet_insert_duration) / calls,
            Self::millis(self.fts_entry_duration) / calls,
            Self::millis(self.commit_duration) / calls,
        );
    }
}
6118
/// Hashable identity for a conversation, in one of two shapes.
///
/// NOTE(review): "pending" presumably refers to conversations that are queued
/// but not yet written — confirm at the use sites.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
enum PendingConversationKey {
    /// Identified by source, agent, and an external id.
    External {
        source_id: String,
        agent_id: i64,
        external_id: String,
    },
    /// Identified by source, agent, source path, and an optional start time.
    SourcePath {
        source_id: String,
        agent_id: i64,
        source_path: String,
        started_at: Option<i64>,
    },
}
6133
/// Builds the lookup key
/// `<source chars>:<source_id>:<agent_id>:<external chars>:<external_id>`.
///
/// The two counts are numbers of `char`s (not bytes) in the respective ids.
fn conversation_external_lookup_key(source_id: &str, agent_id: i64, external_id: &str) -> String {
    let source_len = source_id.chars().count();
    let external_len = external_id.chars().count();
    format!("{source_len}:{source_id}:{agent_id}:{external_len}:{external_id}")
}
6141
6142fn conversation_external_lookup_key_for_conv(agent_id: i64, conv: &Conversation) -> Option<String> {
6143    conv.external_id
6144        .as_deref()
6145        .map(|external_id| conversation_external_lookup_key(&conv.source_id, agent_id, external_id))
6146}
6147
/// Identity of a message at a specific position: its `idx`, timestamp, role,
/// author, and the BLAKE3 hash of its content (see
/// `message_merge_fingerprint`).
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
struct MessageMergeFingerprint {
    idx: i64,
    created_at: Option<i64>,
    role: MessageRole,
    author: Option<String>,
    // BLAKE3 digest of the message content bytes.
    content_hash: [u8; 32],
}
6156
/// Position-independent identity of a message: the same fields as
/// `MessageMergeFingerprint` without `idx`, so the same message appearing at a
/// different index compares equal.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
struct MessageReplayFingerprint {
    created_at: Option<i64>,
    role: MessageRole,
    author: Option<String>,
    // BLAKE3 digest of the message content bytes.
    content_hash: [u8; 32],
}
6164
/// Measurements describing how strongly two conversations overlap.
///
/// NOTE(review): field meanings are inferred from names only — presumably
/// `exact_overlap` / `replay_overlap` count shared merge / replay
/// fingerprints, `smaller_replay_set` is the size of the smaller replay set,
/// and `start_distance_ms` is the gap between start times. Confirm against
/// the code that builds this value.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
struct ConversationMergeEvidence {
    exact_overlap: usize,
    replay_overlap: usize,
    smaller_replay_set: usize,
    started_close: bool,
    start_distance_ms: i64,
}
6173
/// Output of `collect_new_messages_for_existing_conversation`.
struct ExistingConversationNewMessages<'a> {
    // Incoming messages that were neither at an already-known `idx` nor
    // replay duplicates, in input order.
    messages: Vec<&'a Message>,
    // Sum of `content.len()` (bytes) over `messages`.
    new_chars: i64,
    // Incoming messages whose `idx` already exists with a different fingerprint.
    idx_collision_count: usize,
    // `idx` of the first such collision, if any.
    first_collision_idx: Option<i64>,
}
6180
/// Complete tail information for a stored conversation: the last message's
/// `idx` and timestamp are both known; `ended_at` may still be absent.
#[derive(Debug, Clone, Copy)]
struct ExistingConversationTailState {
    last_message_idx: i64,
    last_message_created_at: i64,
    ended_at: Option<i64>,
}
6187
6188#[derive(Debug, Clone, Copy)]
6189struct ExistingConversationTailMetadata {
6190    last_message_idx: Option<i64>,
6191    last_message_created_at: Option<i64>,
6192    ended_at: Option<i64>,
6193}
6194
6195impl ExistingConversationTailMetadata {
6196    fn complete_tail_state(self) -> Option<ExistingConversationTailState> {
6197        existing_conversation_tail_state_from_cached(
6198            self.last_message_idx,
6199            self.last_message_created_at,
6200            self.ended_at,
6201        )
6202    }
6203}
6204
/// A stored conversation's id paired with its tail state, when one is
/// available.
#[derive(Debug, Clone, Copy)]
struct ExistingConversationWithTail {
    // presumably the `conversations.id` row id — verify against the constructor
    id: i64,
    tail_state: Option<ExistingConversationTailState>,
}
6210
6211fn conversation_effective_started_at(conv: &Conversation) -> Option<i64> {
6212    conv.started_at
6213        .or_else(|| conv.messages.iter().filter_map(|msg| msg.created_at).min())
6214}
6215
6216fn conversation_tail_state(conv: &Conversation) -> (Option<i64>, Option<i64>) {
6217    (
6218        conv.messages.iter().map(|msg| msg.idx).max(),
6219        conv.messages.iter().filter_map(|msg| msg.created_at).max(),
6220    )
6221}
6222
6223fn conversation_tail_ended_at_candidate(conv: &Conversation) -> Option<i64> {
6224    let max_message_created_at = conv.messages.iter().filter_map(|msg| msg.created_at).max();
6225    max_message_created_at.max(conv.ended_at)
6226}
6227
6228fn borrowed_messages_tail_state(messages: &[&Message]) -> (Option<i64>, Option<i64>) {
6229    (
6230        messages.iter().map(|msg| msg.idx).max(),
6231        messages.iter().filter_map(|msg| msg.created_at).max(),
6232    )
6233}
6234
6235fn role_from_str(role: &str) -> MessageRole {
6236    match role {
6237        "user" => MessageRole::User,
6238        "agent" | "assistant" => MessageRole::Agent,
6239        "tool" => MessageRole::Tool,
6240        "system" => MessageRole::System,
6241        other => MessageRole::Other(other.to_string()),
6242    }
6243}
6244
6245fn message_merge_fingerprint(msg: &Message) -> MessageMergeFingerprint {
6246    MessageMergeFingerprint {
6247        idx: msg.idx,
6248        created_at: msg.created_at,
6249        role: msg.role.clone(),
6250        author: msg.author.clone(),
6251        content_hash: *blake3::hash(msg.content.as_bytes()).as_bytes(),
6252    }
6253}
6254
6255fn message_replay_fingerprint(msg: &Message) -> MessageReplayFingerprint {
6256    MessageReplayFingerprint {
6257        created_at: msg.created_at,
6258        role: msg.role.clone(),
6259        author: msg.author.clone(),
6260        content_hash: *blake3::hash(msg.content.as_bytes()).as_bytes(),
6261    }
6262}
6263
6264fn conversation_message_fingerprints(conv: &Conversation) -> HashSet<MessageMergeFingerprint> {
6265    conv.messages
6266        .iter()
6267        .map(message_merge_fingerprint)
6268        .collect()
6269}
6270
6271fn conversation_message_replay_fingerprints(
6272    conv: &Conversation,
6273) -> HashSet<MessageReplayFingerprint> {
6274    conv.messages
6275        .iter()
6276        .map(message_replay_fingerprint)
6277        .collect()
6278}
6279
6280fn replay_fingerprint_from_merge(
6281    fingerprint: &MessageMergeFingerprint,
6282) -> MessageReplayFingerprint {
6283    MessageReplayFingerprint {
6284        created_at: fingerprint.created_at,
6285        role: fingerprint.role.clone(),
6286        author: fingerprint.author.clone(),
6287        content_hash: fingerprint.content_hash,
6288    }
6289}
6290
6291fn replay_fingerprints_from_merge_set(
6292    fingerprints: &HashSet<MessageMergeFingerprint>,
6293) -> HashSet<MessageReplayFingerprint> {
6294    fingerprints
6295        .iter()
6296        .map(replay_fingerprint_from_merge)
6297        .collect()
6298}
6299
6300fn collect_new_messages_for_existing_conversation<'a>(
6301    conversation_id: i64,
6302    conv: &'a Conversation,
6303    existing_messages: &mut HashMap<i64, MessageMergeFingerprint>,
6304    existing_replay_fingerprints: &mut HashSet<MessageReplayFingerprint>,
6305    replay_skip_log: &'static str,
6306) -> ExistingConversationNewMessages<'a> {
6307    let mut idx_collision_count = 0usize;
6308    let mut first_collision_idx: Option<i64> = None;
6309    let mut new_chars: i64 = 0;
6310    let mut messages = Vec::new();
6311
6312    for msg in &conv.messages {
6313        let incoming_fingerprint = message_merge_fingerprint(msg);
6314        if let Some(existing_fingerprint) = existing_messages.get(&msg.idx) {
6315            if existing_fingerprint != &incoming_fingerprint {
6316                idx_collision_count = idx_collision_count.saturating_add(1);
6317                first_collision_idx.get_or_insert(msg.idx);
6318            }
6319            continue;
6320        }
6321
6322        let incoming_replay = replay_fingerprint_from_merge(&incoming_fingerprint);
6323        if existing_replay_fingerprints.contains(&incoming_replay) {
6324            tracing::debug!(
6325                conversation_id,
6326                idx = msg.idx,
6327                source_path = %conv.source_path.display(),
6328                "{replay_skip_log}"
6329            );
6330            continue;
6331        }
6332
6333        existing_messages.insert(msg.idx, incoming_fingerprint);
6334        existing_replay_fingerprints.insert(incoming_replay);
6335        new_chars += msg.content.len() as i64;
6336        messages.push(msg);
6337    }
6338
6339    ExistingConversationNewMessages {
6340        messages,
6341        new_chars,
6342        idx_collision_count,
6343        first_collision_idx,
6344    }
6345}
6346
/// Resolves the tail state (last message idx / timestamp, `ended_at`) of an
/// existing conversation, trying three sources in order:
///
/// 1. the `conversation_tail_state` row for `conversation_id`;
/// 2. the tail columns on the `conversations` row, which are written back via
///    `franken_insert_conversation_tail_state` when usable;
/// 3. `MAX(idx)` / `MAX(created_at)` over the conversation's `messages`, which
///    are written back via `franken_update_conversation_tail_state` when both
///    are non-null.
///
/// Returns `Ok(None)` when none of the sources yields a complete state.
///
/// # Errors
///
/// Propagates query and write-back failures. The `conversations` lookup is not
/// optional, so a missing conversation row surfaces as an error.
fn franken_existing_conversation_append_tail_state(
    tx: &FrankenTransaction<'_>,
    conversation_id: i64,
) -> Result<Option<ExistingConversationTailState>> {
    // Source 1: dedicated tail-state table (row may be absent).
    let cached: Option<(Option<i64>, Option<i64>, Option<i64>)> = tx
        .query_row_map(
            "SELECT last_message_idx, last_message_created_at, ended_at
             FROM conversation_tail_state
             WHERE conversation_id = ?1",
            fparams![conversation_id],
            |row| Ok((row.get_typed(0)?, row.get_typed(1)?, row.get_typed(2)?)),
        )
        .optional()?;
    if let Some(cached) = cached {
        let (_, _, cached_ended_at) = cached;
        if let Some(tail_state) =
            existing_conversation_tail_state_from_cached(cached.0, cached.1, cached_ended_at)
        {
            return Ok(Some(tail_state));
        }
    }

    // Source 2: tail columns stored on the conversation row itself.
    let legacy_cached: (Option<i64>, Option<i64>, Option<i64>) = tx.query_row_map(
        "SELECT last_message_idx, last_message_created_at, ended_at
         FROM conversations
         WHERE id = ?1",
        fparams![conversation_id],
        |row| Ok((row.get_typed(0)?, row.get_typed(1)?, row.get_typed(2)?)),
    )?;
    // This `ended_at` (from `conversations`) is also reused by source 3 below.
    let (_, _, cached_ended_at) = legacy_cached;
    if let Some(tail_state) = existing_conversation_tail_state_from_cached(
        legacy_cached.0,
        legacy_cached.1,
        cached_ended_at,
    ) {
        franken_insert_conversation_tail_state(
            tx,
            conversation_id,
            cached_ended_at,
            Some(tail_state.last_message_idx),
            Some(tail_state.last_message_created_at),
        )?;
        return Ok(Some(tail_state));
    }

    // Source 3: recompute from the messages table.
    let (max_idx, max_created_at): (Option<i64>, Option<i64>) = tx.query_row_map(
        "SELECT MAX(idx), MAX(created_at)
         FROM messages
         WHERE conversation_id = ?1",
        fparams![conversation_id],
        |row| Ok((row.get_typed(0)?, row.get_typed(1)?)),
    )?;
    // Both aggregates must be non-null to form a complete state.
    if let Some((last_message_idx, last_message_created_at)) = max_idx.zip(max_created_at) {
        // NOTE(review): `None` is passed in the `ended_at` position here while
        // the returned state carries `cached_ended_at` — confirm the helper
        // treats `None` as "leave unchanged".
        franken_update_conversation_tail_state(
            tx,
            conversation_id,
            None,
            Some(last_message_idx),
            Some(last_message_created_at),
        )?;
        return Ok(Some(ExistingConversationTailState {
            last_message_idx,
            last_message_created_at,
            ended_at: cached_ended_at,
        }));
    }
    Ok(None)
}
6415
6416fn franken_cached_existing_conversation_tail_metadata(
6417    tx: &FrankenTransaction<'_>,
6418    conversation_id: i64,
6419) -> Result<ExistingConversationTailMetadata> {
6420    let cached: Option<(Option<i64>, Option<i64>, Option<i64>)> = tx
6421        .query_row_map(
6422            "SELECT last_message_idx, last_message_created_at, ended_at
6423             FROM conversation_tail_state
6424             WHERE conversation_id = ?1",
6425            fparams![conversation_id],
6426            |row| Ok((row.get_typed(0)?, row.get_typed(1)?, row.get_typed(2)?)),
6427        )
6428        .optional()?;
6429    if let Some(cached) = cached {
6430        return Ok(ExistingConversationTailMetadata {
6431            last_message_idx: cached.0,
6432            last_message_created_at: cached.1,
6433            ended_at: cached.2,
6434        });
6435    }
6436
6437    let legacy_cached: (Option<i64>, Option<i64>, Option<i64>) = tx.query_row_map(
6438        "SELECT last_message_idx, last_message_created_at, ended_at
6439         FROM conversations
6440         WHERE id = ?1",
6441        fparams![conversation_id],
6442        |row| Ok((row.get_typed(0)?, row.get_typed(1)?, row.get_typed(2)?)),
6443    )?;
6444    Ok(ExistingConversationTailMetadata {
6445        last_message_idx: legacy_cached.0,
6446        last_message_created_at: legacy_cached.1,
6447        ended_at: legacy_cached.2,
6448    })
6449}
6450
6451fn existing_conversation_tail_state_from_cached(
6452    last_message_idx: Option<i64>,
6453    last_message_created_at: Option<i64>,
6454    ended_at: Option<i64>,
6455) -> Option<ExistingConversationTailState> {
6456    let (last_message_idx, last_message_created_at) =
6457        last_message_idx.zip(last_message_created_at)?;
6458    Some(ExistingConversationTailState {
6459        last_message_idx,
6460        last_message_created_at,
6461        ended_at,
6462    })
6463}
6464
6465fn franken_find_existing_conversation_with_tail_by_key(
6466    tx: &FrankenTransaction<'_>,
6467    key: &PendingConversationKey,
6468    conv: Option<&Conversation>,
6469) -> Result<Option<ExistingConversationWithTail>> {
6470    if let PendingConversationKey::External {
6471        source_id,
6472        agent_id,
6473        external_id,
6474    } = key
6475    {
6476        let lookup_key = conversation_external_lookup_key(source_id, *agent_id, external_id);
6477        if let Some(existing) = franken_find_external_conversation_tail_lookup(tx, &lookup_key)? {
6478            return Ok(Some(existing));
6479        }
6480        return Ok(None);
6481    }
6482
6483    let Some(id) = franken_find_existing_conversation_by_key(tx, key, conv)? else {
6484        return Ok(None);
6485    };
6486    let tail_state = franken_existing_conversation_append_tail_state(tx, id)?;
6487    Ok(Some(ExistingConversationWithTail { id, tail_state }))
6488}
6489
6490fn franken_insert_conversation_tail_state(
6491    tx: &FrankenTransaction<'_>,
6492    conversation_id: i64,
6493    ended_at: Option<i64>,
6494    last_message_idx: Option<i64>,
6495    last_message_created_at: Option<i64>,
6496) -> Result<()> {
6497    if ended_at.is_none() && last_message_idx.is_none() && last_message_created_at.is_none() {
6498        return Ok(());
6499    }
6500    tx.execute_compat(
6501        "INSERT OR REPLACE INTO conversation_tail_state (
6502             conversation_id, ended_at, last_message_idx, last_message_created_at
6503         ) VALUES (?1, ?2, ?3, ?4)",
6504        fparams![
6505            conversation_id,
6506            ended_at,
6507            last_message_idx,
6508            last_message_created_at
6509        ],
6510    )?;
6511    Ok(())
6512}
6513
6514fn franken_update_conversation_tail_columns(
6515    tx: &FrankenTransaction<'_>,
6516    conversation_id: i64,
6517    ended_at_candidate: Option<i64>,
6518    last_message_idx_candidate: Option<i64>,
6519    last_message_created_at_candidate: Option<i64>,
6520) -> Result<()> {
6521    if ended_at_candidate.is_none()
6522        && last_message_idx_candidate.is_none()
6523        && last_message_created_at_candidate.is_none()
6524    {
6525        return Ok(());
6526    }
6527
6528    tx.execute_compat(
6529        "UPDATE conversations
6530         SET ended_at = CASE
6531                 WHEN ?1 IS NULL THEN ended_at
6532                 WHEN ended_at IS NULL OR ended_at < ?1 THEN ?1
6533                 ELSE ended_at
6534             END,
6535             last_message_idx = CASE
6536                 WHEN ?2 IS NULL THEN last_message_idx
6537                 WHEN last_message_idx IS NULL OR last_message_idx < ?2 THEN ?2
6538                 ELSE last_message_idx
6539             END,
6540             last_message_created_at = CASE
6541                 WHEN ?3 IS NULL THEN last_message_created_at
6542                 WHEN last_message_created_at IS NULL OR last_message_created_at < ?3 THEN ?3
6543                 ELSE last_message_created_at
6544             END
6545         WHERE id = ?4",
6546        fparams![
6547            ended_at_candidate,
6548            last_message_idx_candidate,
6549            last_message_created_at_candidate,
6550            conversation_id
6551        ],
6552    )?;
6553    Ok(())
6554}
6555
6556fn franken_tail_state_insert_ended_at(
6557    tx: &FrankenTransaction<'_>,
6558    conversation_id: i64,
6559    candidate: Option<i64>,
6560) -> Result<Option<i64>> {
6561    let canonical: Option<i64> = tx
6562        .query_row_map(
6563            "SELECT ended_at FROM conversations WHERE id = ?1",
6564            fparams![conversation_id],
6565            |row| row.get_typed(0),
6566        )
6567        .optional()?
6568        .flatten();
6569    Ok(canonical.max(candidate))
6570}
6571
6572fn franken_update_conversation_tail_state(
6573    tx: &FrankenTransaction<'_>,
6574    conversation_id: i64,
6575    ended_at_candidate: Option<i64>,
6576    last_message_idx_candidate: Option<i64>,
6577    last_message_created_at_candidate: Option<i64>,
6578) -> Result<()> {
6579    if ended_at_candidate.is_none()
6580        && last_message_idx_candidate.is_none()
6581        && last_message_created_at_candidate.is_none()
6582    {
6583        return Ok(());
6584    }
6585
6586    let changed = tx.execute_compat(
6587        "UPDATE conversation_tail_state
6588         SET ended_at = CASE
6589                 WHEN ?1 IS NULL THEN ended_at
6590                 ELSE MAX(IFNULL(ended_at, 0), ?1)
6591             END,
6592             last_message_idx = CASE
6593                 WHEN ?2 IS NULL THEN last_message_idx
6594                 WHEN last_message_idx IS NULL OR last_message_idx < ?2 THEN ?2
6595                 ELSE last_message_idx
6596             END,
6597             last_message_created_at = CASE
6598                 WHEN ?3 IS NULL THEN last_message_created_at
6599                 WHEN last_message_created_at IS NULL OR last_message_created_at < ?3 THEN ?3
6600                 ELSE last_message_created_at
6601             END
6602         WHERE conversation_id = ?4",
6603        fparams![
6604            ended_at_candidate,
6605            last_message_idx_candidate,
6606            last_message_created_at_candidate,
6607            conversation_id
6608        ],
6609    )?;
6610    if changed == 0 {
6611        let insert_ended_at =
6612            franken_tail_state_insert_ended_at(tx, conversation_id, ended_at_candidate)?;
6613        franken_insert_conversation_tail_state(
6614            tx,
6615            conversation_id,
6616            insert_ended_at,
6617            last_message_idx_candidate,
6618            last_message_created_at_candidate,
6619        )?;
6620    }
6621    franken_update_conversation_tail_columns(
6622        tx,
6623        conversation_id,
6624        ended_at_candidate,
6625        last_message_idx_candidate,
6626        last_message_created_at_candidate,
6627    )?;
6628    Ok(())
6629}
6630
6631fn franken_set_conversation_tail_state_after_append(
6632    tx: &FrankenTransaction<'_>,
6633    conversation_id: i64,
6634    ended_at: i64,
6635    last_message_idx: i64,
6636    last_message_created_at: i64,
6637) -> Result<()> {
6638    let changed = tx.execute_compat(
6639        "UPDATE conversation_tail_state
6640         SET ended_at = ?1,
6641             last_message_idx = ?2,
6642             last_message_created_at = ?3
6643         WHERE conversation_id = ?4",
6644        fparams![
6645            ended_at,
6646            last_message_idx,
6647            last_message_created_at,
6648            conversation_id
6649        ],
6650    )?;
6651    if changed == 0 {
6652        let insert_ended_at =
6653            franken_tail_state_insert_ended_at(tx, conversation_id, Some(ended_at))?;
6654        franken_insert_conversation_tail_state(
6655            tx,
6656            conversation_id,
6657            insert_ended_at,
6658            Some(last_message_idx),
6659            Some(last_message_created_at),
6660        )?;
6661    }
6662    franken_update_conversation_tail_columns(
6663        tx,
6664        conversation_id,
6665        Some(ended_at),
6666        Some(last_message_idx),
6667        Some(last_message_created_at),
6668    )?;
6669    Ok(())
6670}
6671
6672fn collect_append_only_tail_messages<'a>(
6673    conv: &'a Conversation,
6674    existing_max_idx: i64,
6675    existing_max_created_at: i64,
6676) -> Option<ExistingConversationNewMessages<'a>> {
6677    if conv.messages.is_empty() {
6678        return Some(ExistingConversationNewMessages {
6679            messages: Vec::new(),
6680            new_chars: 0,
6681            idx_collision_count: 0,
6682            first_collision_idx: None,
6683        });
6684    }
6685
6686    let mut split_idx = None;
6687    let mut prev_idx = None;
6688    for (pos, msg) in conv.messages.iter().enumerate() {
6689        if prev_idx.is_some_and(|prev| msg.idx < prev) {
6690            return None;
6691        }
6692        prev_idx = Some(msg.idx);
6693        if split_idx.is_none() && msg.idx > existing_max_idx {
6694            split_idx = Some(pos);
6695        }
6696    }
6697    let split_idx = split_idx?;
6698    if split_idx != 0 {
6699        return None;
6700    }
6701
6702    let mut seen_tail_idx = HashSet::new();
6703    let mut seen_tail_replay = HashSet::new();
6704    let mut new_chars = 0i64;
6705    let mut messages = Vec::new();
6706    for msg in &conv.messages[split_idx..] {
6707        let created_at = msg.created_at?;
6708        if created_at <= existing_max_created_at {
6709            return None;
6710        }
6711
6712        if !seen_tail_idx.insert(msg.idx) {
6713            return None;
6714        }
6715
6716        let replay_fingerprint = message_replay_fingerprint(msg);
6717        if !seen_tail_replay.insert(replay_fingerprint) {
6718            return None;
6719        }
6720
6721        new_chars += msg.content.len() as i64;
6722        messages.push(msg);
6723    }
6724
6725    Some(ExistingConversationNewMessages {
6726        messages,
6727        new_chars,
6728        idx_collision_count: 0,
6729        first_collision_idx: None,
6730    })
6731}
6732
6733fn collect_existing_conversation_noop_from_idx_tail<'a>(
6734    conv: &'a Conversation,
6735    _existing_max_idx: i64,
6736) -> Option<ExistingConversationNewMessages<'a>> {
6737    if conv.messages.is_empty() {
6738        return Some(ExistingConversationNewMessages {
6739            messages: Vec::new(),
6740            new_chars: 0,
6741            idx_collision_count: 0,
6742            first_collision_idx: None,
6743        });
6744    }
6745
6746    // A max idx alone does not prove lower idx rows exist. Sparse historical
6747    // imports can have idx 2..3 first and later recover idx 0..1, so non-empty
6748    // no-op decisions must use the bounded message lookup.
6749    None
6750}
6751
6752fn collect_existing_conversation_noop_from_conversation_ended_at<'a>(
6753    conv: &'a Conversation,
6754    existing_ended_at: i64,
6755) -> Option<ExistingConversationNewMessages<'a>> {
6756    if conv.messages.is_empty()
6757        && conv
6758            .ended_at
6759            .is_none_or(|ended_at| ended_at <= existing_ended_at)
6760    {
6761        return Some(ExistingConversationNewMessages {
6762            messages: Vec::new(),
6763            new_chars: 0,
6764            idx_collision_count: 0,
6765            first_collision_idx: None,
6766        });
6767    }
6768
6769    // A conversation-level ended_at says nothing about whether every earlier
6770    // message row was archived. Defer non-empty batches to the bounded lookup.
6771    None
6772}
6773
6774fn collect_existing_conversation_tail_from_ended_at<'a>(
6775    conv: &'a Conversation,
6776    existing_ended_at: i64,
6777) -> Option<ExistingConversationNewMessages<'a>> {
6778    if conv.messages.is_empty() {
6779        return Some(ExistingConversationNewMessages {
6780            messages: Vec::new(),
6781            new_chars: 0,
6782            idx_collision_count: 0,
6783            first_collision_idx: None,
6784        });
6785    }
6786
6787    let mut prev_idx = None;
6788    for msg in conv.messages.iter() {
6789        if prev_idx.is_some_and(|prev| msg.idx <= prev) {
6790            return None;
6791        }
6792        prev_idx = Some(msg.idx);
6793        if msg.created_at? <= existing_ended_at {
6794            return None;
6795        }
6796    }
6797
6798    let mut seen_tail_replay = HashSet::new();
6799    let mut new_chars = 0i64;
6800    let mut messages = Vec::new();
6801    for msg in &conv.messages {
6802        let replay_fingerprint = message_replay_fingerprint(msg);
6803        if !seen_tail_replay.insert(replay_fingerprint) {
6804            return None;
6805        }
6806
6807        new_chars += msg.content.len() as i64;
6808        messages.push(msg);
6809    }
6810
6811    Some(ExistingConversationNewMessages {
6812        messages,
6813        new_chars,
6814        idx_collision_count: 0,
6815        first_collision_idx: None,
6816    })
6817}
6818
6819fn trace_existing_conversation_lookup_fallback(
6820    conversation_id: i64,
6821    conv: &Conversation,
6822    tail_state: Option<ExistingConversationTailState>,
6823    existing_ended_at: Option<i64>,
6824) {
6825    if !MESSAGE_LOOKUP_TRACE_ENABLED.load(Ordering::Relaxed) {
6826        return;
6827    }
6828
6829    let mut prev_idx = None;
6830    let mut idx_order_violations = 0usize;
6831    let mut duplicate_idx_count = 0usize;
6832    let mut seen_idx = HashSet::new();
6833    let mut missing_created_at = 0usize;
6834    let mut min_idx = None;
6835    let mut max_idx = None;
6836    let mut min_created_at = None;
6837    let mut max_created_at = None;
6838    for msg in &conv.messages {
6839        if prev_idx.is_some_and(|prev| msg.idx < prev) {
6840            idx_order_violations = idx_order_violations.saturating_add(1);
6841        }
6842        prev_idx = Some(msg.idx);
6843        if !seen_idx.insert(msg.idx) {
6844            duplicate_idx_count = duplicate_idx_count.saturating_add(1);
6845        }
6846        min_idx = Some(min_idx.map_or(msg.idx, |current: i64| current.min(msg.idx)));
6847        max_idx = Some(max_idx.map_or(msg.idx, |current: i64| current.max(msg.idx)));
6848        if let Some(created_at) = msg.created_at {
6849            min_created_at =
6850                Some(min_created_at.map_or(created_at, |current: i64| current.min(created_at)));
6851            max_created_at =
6852                Some(max_created_at.map_or(created_at, |current: i64| current.max(created_at)));
6853        } else {
6854            missing_created_at = missing_created_at.saturating_add(1);
6855        }
6856    }
6857
6858    let first_idx_after_tail = tail_state.and_then(|state| {
6859        conv.messages
6860            .iter()
6861            .find(|msg| msg.idx > state.last_message_idx)
6862            .map(|msg| msg.idx)
6863    });
6864    let first_created_after_tail = tail_state.and_then(|state| {
6865        conv.messages
6866            .iter()
6867            .find(|msg| {
6868                msg.created_at
6869                    .is_some_and(|created_at| created_at > state.last_message_created_at)
6870            })
6871            .and_then(|msg| msg.created_at)
6872    });
6873    let first_created_after_ended_at = existing_ended_at.and_then(|ended_at| {
6874        conv.messages
6875            .iter()
6876            .find(|msg| {
6877                msg.created_at
6878                    .is_some_and(|created_at| created_at > ended_at)
6879            })
6880            .and_then(|msg| msg.created_at)
6881    });
6882
6883    let payload = serde_json::json!({
6884        "event": "existing_conversation_message_lookup_fallback",
6885        "conversation_id": conversation_id,
6886        "agent_slug": conv.agent_slug,
6887        "source_path": conv.source_path,
6888        "external_id": conv.external_id,
6889        "messages": conv.messages.len(),
6890        "min_idx": min_idx,
6891        "max_idx": max_idx,
6892        "missing_created_at": missing_created_at,
6893        "min_created_at": min_created_at,
6894        "max_created_at": max_created_at,
6895        "idx_order_violations": idx_order_violations,
6896        "duplicate_idx_count": duplicate_idx_count,
6897        "tail_state": tail_state.map(|state| {
6898            serde_json::json!({
6899                "last_message_idx": state.last_message_idx,
6900                "last_message_created_at": state.last_message_created_at,
6901                "ended_at": state.ended_at,
6902            })
6903        }),
6904        "existing_ended_at": existing_ended_at,
6905        "first_idx_after_tail": first_idx_after_tail,
6906        "first_created_after_tail": first_created_after_tail,
6907        "first_created_after_ended_at": first_created_after_ended_at,
6908    });
6909    if let Ok(line) = serde_json::to_string(&payload) {
6910        eprintln!("{line}");
6911    }
6912}
6913
6914fn franken_existing_conversation_ended_at(
6915    tx: &FrankenTransaction<'_>,
6916    conversation_id: i64,
6917) -> Result<Option<i64>> {
6918    let ended_at: Option<Option<i64>> = tx
6919        .query_row_map(
6920            "SELECT ended_at
6921             FROM conversations
6922             WHERE id = ?1",
6923            fparams![conversation_id],
6924            |row| row.get_typed(0),
6925        )
6926        .optional()?;
6927    Ok(ended_at.flatten())
6928}
6929
/// Absolute distance between two optional timestamps, in the same unit as the
/// inputs.
///
/// Returns `i64::MAX` when either side is missing or when the distance does
/// not fit in an `i64`.
fn start_distance_ms(left: Option<i64>, right: Option<i64>) -> i64 {
    let (Some(left), Some(right)) = (left, right) else {
        return i64::MAX;
    };
    // `abs_diff` cannot overflow; only the conversion back to `i64` can fail.
    i64::try_from(left.abs_diff(right)).unwrap_or(i64::MAX)
}
6939
6940fn conversation_merge_evidence(
6941    incoming_exact: &HashSet<MessageMergeFingerprint>,
6942    incoming_replay: &HashSet<MessageReplayFingerprint>,
6943    existing_exact: &HashSet<MessageMergeFingerprint>,
6944    existing_replay: &HashSet<MessageReplayFingerprint>,
6945    incoming_started_at: Option<i64>,
6946    existing_started_at: Option<i64>,
6947) -> Option<ConversationMergeEvidence> {
6948    let exact_overlap = incoming_exact.intersection(existing_exact).count();
6949    let replay_overlap = incoming_replay.intersection(existing_replay).count();
6950    if exact_overlap == 0 && replay_overlap == 0 {
6951        return None;
6952    }
6953
6954    let smaller_replay_set = incoming_replay.len().min(existing_replay.len());
6955    let started_close = timestamps_within_tolerance(
6956        incoming_started_at,
6957        existing_started_at,
6958        SOURCE_PATH_MERGE_START_TOLERANCE_MS,
6959    );
6960    let full_replay_subset_match = smaller_replay_set >= 2 && replay_overlap == smaller_replay_set;
6961
6962    let merge_allowed = if started_close {
6963        exact_overlap >= 1 || replay_overlap >= 2
6964    } else {
6965        exact_overlap >= 2 || full_replay_subset_match
6966    };
6967
6968    merge_allowed.then_some(ConversationMergeEvidence {
6969        exact_overlap,
6970        replay_overlap,
6971        smaller_replay_set,
6972        started_close,
6973        start_distance_ms: start_distance_ms(incoming_started_at, existing_started_at),
6974    })
6975}
6976
/// Whether two timestamps are both present and at most `tolerance_ms` apart.
///
/// Returns `false` when either timestamp is missing. A negative tolerance can
/// never be satisfied.
fn timestamps_within_tolerance(left: Option<i64>, right: Option<i64>, tolerance_ms: i64) -> bool {
    left.zip(right).is_some_and(|(left, right)| {
        // Widen to i128 so the unsigned distance and a possibly negative
        // tolerance can be compared directly.
        i128::from(left.abs_diff(right)) <= i128::from(tolerance_ms)
    })
}
6985
6986fn conversation_merge_key(agent_id: i64, conv: &Conversation) -> PendingConversationKey {
6987    if let Some(external_id) = conv.external_id.clone() {
6988        PendingConversationKey::External {
6989            source_id: conv.source_id.clone(),
6990            agent_id,
6991            external_id,
6992        }
6993    } else {
6994        PendingConversationKey::SourcePath {
6995            source_id: conv.source_id.clone(),
6996            agent_id,
6997            source_path: path_to_string(&conv.source_path),
6998            started_at: conversation_effective_started_at(conv),
6999        }
7000    }
7001}
7002
/// Message data needed for semantic embedding generation.
pub struct MessageForEmbedding {
    /// Identifier of the message (presumably the `messages` row id; confirm
    /// against the query that fills this struct).
    pub message_id: i64,
    /// Message timestamp, if one was recorded.
    pub created_at: Option<i64>,
    /// Identifier of the agent the message belongs to.
    pub agent_id: i64,
    /// Identifier of the workspace, when the message is tied to one.
    pub workspace_id: Option<i64>,
    /// 32-bit hash standing in for the source id (hash function not visible
    /// here).
    pub source_id_hash: u32,
    /// Message role as text.
    pub role: String,
    /// Message body text.
    pub content: String,
}
7013
7014// =========================================================================
7015// FrankenStorage CRUD operations
7016// =========================================================================
7017
7018impl FrankenStorage {
7019    /// Ensure an agent exists in the database, returning its ID.
7020    pub fn ensure_agent(&self, agent: &Agent) -> Result<i64> {
7021        let cache_key = EnsuredAgentKey::from_agent(agent);
7022        if let Some(id) = self.cached_agent_id(&cache_key) {
7023            return Ok(id);
7024        }
7025
7026        let now = Self::now_millis();
7027        self.conn.execute_compat(
7028            "INSERT INTO agents(slug, name, version, kind, created_at, updated_at)
7029             VALUES(?1, ?2, ?3, ?4, ?5, ?6)
7030             ON CONFLICT(slug) DO UPDATE SET
7031                 name = excluded.name,
7032                 version = excluded.version,
7033                 kind = excluded.kind,
7034                 updated_at = excluded.updated_at
7035             WHERE NOT (
7036                 agents.name IS excluded.name
7037                 AND agents.version IS excluded.version
7038                 AND agents.kind IS excluded.kind
7039             )",
7040            fparams![
7041                agent.slug.as_str(),
7042                agent.name.as_str(),
7043                agent.version.as_deref(),
7044                cache_key.kind.as_str(),
7045                now,
7046                now
7047            ],
7048        )?;
7049
7050        let id = self
7051            .conn
7052            .query_row_map(
7053                "SELECT id FROM agents WHERE slug = ?1 LIMIT 1",
7054                fparams![agent.slug.as_str()],
7055                |row| row.get_typed(0),
7056            )
7057            .with_context(|| format!("fetching agent id for {}", agent.slug))?;
7058        self.mark_agent_ensured(cache_key, id);
7059        Ok(id)
7060    }
7061
7062    /// Ensure a workspace exists in the database, returning its ID.
7063    pub fn ensure_workspace(&self, path: &Path, display_name: Option<&str>) -> Result<i64> {
7064        let path_str = path.to_string_lossy().to_string();
7065        let cache_key = EnsuredWorkspaceKey::new(path_str.clone(), display_name);
7066        if let Some(id) = self.cached_workspace_id(&cache_key) {
7067            return Ok(id);
7068        }
7069
7070        if let Some(display_name) = display_name {
7071            self.conn.execute_compat(
7072                "INSERT INTO workspaces(path, display_name)
7073                 VALUES(?1, ?2)
7074                 ON CONFLICT(path) DO UPDATE SET
7075                     display_name = excluded.display_name
7076                 WHERE NOT (workspaces.display_name IS excluded.display_name)",
7077                fparams![path_str.as_str(), display_name],
7078            )?;
7079        } else {
7080            self.conn.execute_compat(
7081                "INSERT OR IGNORE INTO workspaces(path, display_name) VALUES(?1, NULL)",
7082                fparams![path_str.as_str()],
7083            )?;
7084        }
7085
7086        let id = self
7087            .conn
7088            .query_row_map(
7089                "SELECT id FROM workspaces WHERE path = ?1 LIMIT 1",
7090                fparams![path_str.as_str()],
7091                |row| row.get_typed(0),
7092            )
7093            .with_context(|| format!("fetching workspace id for {path_str}"))?;
7094        self.mark_workspace_ensured(cache_key, id);
7095        Ok(id)
7096    }
7097
7098    /// Get current time as milliseconds since epoch.
7099    pub fn now_millis() -> i64 {
7100        SystemTime::now()
7101            .duration_since(UNIX_EPOCH)
7102            .map(|d| i64::try_from(d.as_millis()).unwrap_or(i64::MAX))
7103            .unwrap_or(0)
7104    }
7105
7106    /// Convert a millisecond timestamp to a day ID (days since 2020-01-01).
7107    pub fn day_id_from_millis(timestamp_ms: i64) -> i64 {
7108        const EPOCH_2020_SECS: i64 = 1_577_836_800;
7109        let secs = timestamp_ms.div_euclid(1000);
7110        (secs - EPOCH_2020_SECS).div_euclid(86400)
7111    }
7112
7113    /// Convert a millisecond timestamp to an hour ID (hours since 2020-01-01 00:00 UTC).
7114    pub fn hour_id_from_millis(timestamp_ms: i64) -> i64 {
7115        const EPOCH_2020_SECS: i64 = 1_577_836_800;
7116        let secs = timestamp_ms.div_euclid(1000);
7117        (secs - EPOCH_2020_SECS).div_euclid(3600)
7118    }
7119
7120    /// Convert a day ID back to milliseconds (start of day).
7121    pub fn millis_from_day_id(day_id: i64) -> i64 {
7122        const EPOCH_2020_SECS: i64 = 1_577_836_800;
7123        (EPOCH_2020_SECS + day_id * 86400) * 1000
7124    }
7125
7126    /// Convert an hour ID back to milliseconds (start of hour).
7127    pub fn millis_from_hour_id(hour_id: i64) -> i64 {
7128        const EPOCH_2020_SECS: i64 = 1_577_836_800;
7129        (EPOCH_2020_SECS + hour_id * 3600) * 1000
7130    }
7131
7132    /// Get the timestamp of the last successful scan.
7133    pub fn get_last_scan_ts(&self) -> Result<Option<i64>> {
7134        let result: Result<String, _> = self.conn.query_row_map(
7135            "SELECT value FROM meta WHERE key = 'last_scan_ts'",
7136            fparams![],
7137            |row| row.get_typed(0),
7138        );
7139        match result.optional() {
7140            Ok(Some(s)) => Ok(s.parse().ok()),
7141            Ok(None) => Ok(None),
7142            Err(e) => Err(e.into()),
7143        }
7144    }
7145
7146    /// Set the timestamp of the last successful scan (milliseconds since epoch).
7147    pub fn set_last_scan_ts(&self, ts: i64) -> Result<()> {
7148        self.conn.execute_compat(
7149            "INSERT OR REPLACE INTO meta(key, value) VALUES('last_scan_ts', ?1)",
7150            fparams![ts.to_string()],
7151        )?;
7152        Ok(())
7153    }
7154
7155    fn connector_last_scan_ts_meta_key(connector_name: &str) -> String {
7156        format!(
7157            "last_scan_ts:connector:{}",
7158            connector_name.trim().to_ascii_lowercase()
7159        )
7160    }
7161
7162    fn connector_agent_slug_candidates(connector_name: &str) -> SmallVec<[String; 3]> {
7163        let normalized = connector_name.trim().to_ascii_lowercase();
7164        let mut candidates = SmallVec::<[String; 3]>::new();
7165        if normalized.is_empty() {
7166            return candidates;
7167        }
7168
7169        candidates.push(normalized.clone());
7170        match normalized.as_str() {
7171            "claude" | "claude-code" | "claude_code" => {
7172                candidates.push("claude_code".to_string());
7173                candidates.push("claude-code".to_string());
7174            }
7175            _ => {}
7176        }
7177        candidates.sort();
7178        candidates.dedup();
7179        candidates
7180    }
7181
7182    /// Get the timestamp of the last successful scan for a specific connector.
7183    pub fn get_connector_last_scan_ts(&self, connector_name: &str) -> Result<Option<i64>> {
7184        let key = Self::connector_last_scan_ts_meta_key(connector_name);
7185        let result: Result<String, _> = self.conn.query_row_map(
7186            "SELECT value FROM meta WHERE key = ?1",
7187            fparams![key.as_str()],
7188            |row| row.get_typed(0),
7189        );
7190        match result.optional() {
7191            Ok(Some(s)) => Ok(s.parse().ok()),
7192            Ok(None) => Ok(None),
7193            Err(e) => Err(e.into()),
7194        }
7195    }
7196
7197    /// Set the timestamp of the last successful scan for a specific connector.
7198    pub fn set_connector_last_scan_ts(&self, connector_name: &str, ts: i64) -> Result<()> {
7199        let key = Self::connector_last_scan_ts_meta_key(connector_name);
7200        self.conn.execute_compat(
7201            "INSERT OR REPLACE INTO meta(key, value) VALUES(?1, ?2)",
7202            fparams![key.as_str(), ts.to_string()],
7203        )?;
7204        Ok(())
7205    }
7206
7207    /// Load per-connector scan watermarks and archived-row presence in one
7208    /// explicit transaction.
7209    ///
7210    /// These reads run during index startup. On large file-backed archives,
7211    /// issuing them one-at-a-time in autocommit can force frankensqlite's
7212    /// clean prepared-read path to refresh the file-backed MemDatabase before
7213    /// any connector work starts. A single explicit transaction keeps the
7214    /// startup path on the pager-backed read path while preserving the same
7215    /// newly-enabled-connector semantics as the scalar helpers below.
7216    pub fn connector_scan_states(
7217        &self,
7218        connector_names: &[&str],
7219    ) -> Result<HashMap<String, (Option<i64>, bool)>> {
7220        let requested = connector_names
7221            .iter()
7222            .map(|name| name.trim().to_ascii_lowercase())
7223            .filter(|name| !name.is_empty())
7224            .collect::<HashSet<_>>();
7225        let mut states = requested
7226            .iter()
7227            .map(|name| (name.clone(), (None, false)))
7228            .collect::<HashMap<_, _>>();
7229        if states.is_empty() {
7230            return Ok(states);
7231        }
7232
7233        let mut tx = self.conn.transaction()?;
7234        let watermark_rows: Vec<(String, String)> = tx.query_map_collect(
7235            "SELECT key, value FROM meta WHERE key LIKE 'last_scan_ts:connector:%'",
7236            fparams![],
7237            |row| {
7238                let key: String = row.get_typed(0)?;
7239                let value: String = row.get_typed(1)?;
7240                Ok((key, value))
7241            },
7242        )?;
7243
7244        for (key, value) in watermark_rows {
7245            let Some(connector_name) = key.strip_prefix("last_scan_ts:connector:") else {
7246                continue;
7247            };
7248            if let Some((last_scan_ts, _)) =
7249                states.get_mut(connector_name.trim().to_ascii_lowercase().as_str())
7250            {
7251                *last_scan_ts = value.parse().ok();
7252            }
7253        }
7254
7255        let archived_agent_slugs = tx
7256            .query_map_collect(
7257                "SELECT DISTINCT a.slug
7258                 FROM agents a
7259                 JOIN conversations c ON c.agent_id = a.id",
7260                fparams![],
7261                |row| row.get_typed::<String>(0),
7262            )?
7263            .into_iter()
7264            .map(|slug| slug.trim().to_ascii_lowercase())
7265            .collect::<HashSet<_>>();
7266
7267        for connector_name in requested {
7268            if Self::connector_agent_slug_candidates(&connector_name)
7269                .iter()
7270                .any(|slug| archived_agent_slugs.contains(slug))
7271                && let Some((_, has_conversations)) = states.get_mut(connector_name.as_str())
7272            {
7273                *has_conversations = true;
7274            }
7275        }
7276
7277        tx.rollback()?;
7278        Ok(states)
7279    }
7280
7281    /// Whether this connector already has archived conversations.
7282    pub fn connector_has_conversations(&self, connector_name: &str) -> Result<bool> {
7283        let candidate_slugs = Self::connector_agent_slug_candidates(connector_name);
7284        if candidate_slugs.is_empty() {
7285            return Ok(false);
7286        }
7287
7288        for slug in candidate_slugs {
7289            let exists: i64 = self.conn.query_row_map(
7290                "SELECT EXISTS(
7291                     SELECT 1
7292                     FROM conversations c
7293                     JOIN agents a ON a.id = c.agent_id
7294                     WHERE a.slug = ?1
7295                     LIMIT 1
7296                 )",
7297                fparams![slug.as_str()],
7298                |row| row.get_typed(0),
7299            )?;
7300            if exists != 0 {
7301                return Ok(true);
7302            }
7303        }
7304        Ok(false)
7305    }
7306
7307    /// Get the timestamp of the last successful index completion.
7308    pub fn get_last_indexed_at(&self) -> Result<Option<i64>> {
7309        let result: Result<String, _> = self.conn.query_row_map(
7310            "SELECT value FROM meta WHERE key = 'last_indexed_at'",
7311            fparams![],
7312            |row| row.get_typed(0),
7313        );
7314        match result.optional() {
7315            Ok(Some(s)) => Ok(s.parse().ok()),
7316            Ok(None) => Ok(None),
7317            Err(e) => Err(e.into()),
7318        }
7319    }
7320
7321    /// Set the timestamp of the last successful index completion (milliseconds since epoch).
7322    pub fn set_last_indexed_at(&self, ts: i64) -> Result<()> {
7323        self.conn.execute_compat(
7324            "INSERT OR REPLACE INTO meta(key, value) VALUES('last_indexed_at', ?1)",
7325            fparams![ts.to_string()],
7326        )?;
7327        Ok(())
7328    }
7329
7330    /// List all registered agents.
7331    pub fn list_agents(&self) -> Result<Vec<Agent>> {
7332        self.conn
7333            .query_map_collect(
7334                "SELECT id, slug, name, version, kind FROM agents ORDER BY slug",
7335                fparams![],
7336                |row| {
7337                    let kind: String = row.get_typed(4)?;
7338                    Ok(Agent {
7339                        id: Some(row.get_typed(0)?),
7340                        slug: row.get_typed(1)?,
7341                        name: row.get_typed(2)?,
7342                        version: row.get_typed(3)?,
7343                        kind: match kind.as_str() {
7344                            "cli" => AgentKind::Cli,
7345                            "vscode" => AgentKind::VsCode,
7346                            _ => AgentKind::Hybrid,
7347                        },
7348                    })
7349                },
7350            )
7351            .with_context(|| "listing agents")
7352    }
7353
7354    /// Count all archived conversations.
7355    pub fn total_conversation_count(&self) -> Result<usize> {
7356        let count: i64 =
7357            self.conn
7358                .query_row_map("SELECT COUNT(*) FROM conversations", fparams![], |row| {
7359                    row.get_typed(0)
7360                })?;
7361        Ok(count.max(0) as usize)
7362    }
7363
7364    /// Count all archived messages.
7365    pub fn total_message_count(&self) -> Result<usize> {
7366        let count: i64 =
7367            self.conn
7368                .query_row_map("SELECT COUNT(*) FROM messages", fparams![], |row| {
7369                    row.get_typed(0)
7370                })?;
7371        Ok(count.max(0) as usize)
7372    }
7373
7374    /// Remove all archived conversations/messages for one agent slug.
7375    ///
7376    /// This only affects cass's local archive database. Source session files on
7377    /// disk are untouched.
7378    pub fn purge_agent_archive_data(&self, agent_slug: &str) -> Result<AgentArchivePurgeResult> {
7379        let normalized = agent_slug.trim().to_ascii_lowercase();
7380        if normalized.is_empty() {
7381            return Err(anyhow!("agent slug cannot be empty"));
7382        }
7383
7384        let Some(agent_id) = self
7385            .conn
7386            .query_row_map(
7387                "SELECT id FROM agents WHERE slug = ?1",
7388                fparams![normalized.as_str()],
7389                |row| row.get_typed::<i64>(0),
7390            )
7391            .optional()?
7392        else {
7393            return Ok(AgentArchivePurgeResult::default());
7394        };
7395
7396        let conversations_deleted: i64 = self.conn.query_row_map(
7397            "SELECT COUNT(*) FROM conversations WHERE agent_id = ?1",
7398            fparams![agent_id],
7399            |row| row.get_typed(0),
7400        )?;
7401        if conversations_deleted == 0 {
7402            return Ok(AgentArchivePurgeResult::default());
7403        }
7404
7405        let messages_deleted: i64 = self.conn.query_row_map(
7406            "SELECT COUNT(*)
7407             FROM messages
7408             WHERE conversation_id IN (
7409                 SELECT id FROM conversations WHERE agent_id = ?1
7410             )",
7411            fparams![agent_id],
7412            |row| row.get_typed(0),
7413        )?;
7414
7415        let mut tx = self.conn.transaction()?;
7416        tx.execute_compat(
7417            "DELETE FROM conversation_external_lookup
7418             WHERE conversation_id IN (
7419                 SELECT id FROM conversations WHERE agent_id = ?1
7420             )",
7421            fparams![agent_id],
7422        )?;
7423        tx.execute_compat(
7424            "DELETE FROM conversation_external_tail_lookup
7425             WHERE conversation_id IN (
7426                 SELECT id FROM conversations WHERE agent_id = ?1
7427             )",
7428            fparams![agent_id],
7429        )?;
7430        tx.execute_compat(
7431            "DELETE FROM conversations WHERE agent_id = ?1",
7432            fparams![agent_id],
7433        )?;
7434        tx.execute_compat(
7435            "DELETE FROM agents
7436             WHERE id = ?1
7437               AND NOT EXISTS (
7438                   SELECT 1 FROM conversations WHERE agent_id = ?1
7439               )",
7440            fparams![agent_id],
7441        )?;
7442        tx.commit()?;
7443
7444        Ok(AgentArchivePurgeResult {
7445            conversations_deleted: conversations_deleted.max(0) as usize,
7446            messages_deleted: messages_deleted.max(0) as usize,
7447        })
7448    }
7449
7450    /// List all registered workspaces.
7451    pub fn list_workspaces(&self) -> Result<Vec<crate::model::types::Workspace>> {
7452        self.conn
7453            .query_map_collect(
7454                "SELECT id, path, display_name FROM workspaces ORDER BY path",
7455                fparams![],
7456                |row| {
7457                    let path_str: String = row.get_typed(1)?;
7458                    Ok(crate::model::types::Workspace {
7459                        id: Some(row.get_typed(0)?),
7460                        path: Path::new(&path_str).to_path_buf(),
7461                        display_name: row.get_typed(2)?,
7462                    })
7463                },
7464            )
7465            .with_context(|| "listing workspaces")
7466    }
7467
7468    /// List conversations with pagination.
7469    pub fn list_conversations(&self, limit: i64, offset: i64) -> Result<Vec<Conversation>> {
7470        // Avoid the multi-table JOIN with LIMIT/OFFSET that triggers
7471        // frankensqlite's materialization fallback (see c38edcd9, 860acb12).
7472        // Use correlated subqueries for the tiny agents (~20 rows) and
7473        // workspaces (~30 rows) lookup tables and degrade NULL agent_id to
7474        // the same 'unknown' sentinel that 8a0c547c established for the
7475        // lexical rebuild path.
7476        self.conn
7477            .query_map_collect(
7478                r"SELECT c.id,
7479                         COALESCE((SELECT a.slug FROM agents a WHERE a.id = c.agent_id), 'unknown'),
7480                         (SELECT w.path FROM workspaces w WHERE w.id = c.workspace_id),
7481                         c.external_id, c.title, c.source_path,
7482                         c.started_at,
7483                         COALESCE(
7484                             (SELECT ts.ended_at
7485                              FROM conversation_tail_state ts
7486                              WHERE ts.conversation_id = c.id),
7487                             c.ended_at
7488                         ),
7489                         c.approx_tokens, c.metadata_json,
7490                         c.source_id, c.origin_host, c.metadata_bin
7491                FROM conversations c
7492                ORDER BY CASE WHEN c.started_at IS NULL THEN 1 ELSE 0 END, c.started_at DESC, c.id DESC
7493                LIMIT ?1 OFFSET ?2",
7494                fparams![limit, offset],
7495                |row| {
7496                    let workspace_path: Option<String> = row.get_typed(2)?;
7497                    let source_path: String = row.get_typed(5)?;
7498                    let raw_source_id: Option<String> = row.get_typed(10)?;
7499                    let raw_origin_host: Option<String> = row.get_typed(11)?;
7500                    let (source_id, _, origin_host) = normalized_storage_source_parts(
7501                        raw_source_id.as_deref(),
7502                        None,
7503                        raw_origin_host.as_deref(),
7504                    );
7505                    Ok(Conversation {
7506                        id: Some(row.get_typed(0)?),
7507                        agent_slug: row.get_typed(1)?,
7508                        workspace: workspace_path.map(|p| Path::new(&p).to_path_buf()),
7509                        external_id: row.get_typed(3)?,
7510                        title: row.get_typed(4)?,
7511                        source_path: Path::new(&source_path).to_path_buf(),
7512                        started_at: row.get_typed(6)?,
7513                        ended_at: row.get_typed(7)?,
7514                        approx_tokens: row.get_typed(8)?,
7515                        metadata_json: franken_read_metadata_compat(row, 9, 12),
7516                        messages: Vec::new(),
7517                        source_id,
7518                        origin_host,
7519                    })
7520                },
7521            )
7522            .with_context(|| "listing conversations")
7523    }
7524
7525    /// Build lookup maps for agents and workspaces to avoid JOINs in
7526    /// paged conversation queries.  Both tables are tiny (tens of rows)
7527    /// so this is effectively free.
7528    pub fn build_lexical_rebuild_lookups(
7529        &self,
7530    ) -> Result<(HashMap<i64, String>, HashMap<i64, PathBuf>)> {
7531        let agents: HashMap<i64, String> = self
7532            .conn
7533            .query_map_collect("SELECT id, slug FROM agents", fparams![], |row| {
7534                Ok((row.get_typed::<i64>(0)?, row.get_typed::<String>(1)?))
7535            })
7536            .with_context(|| "loading agent lookup for lexical rebuild")?
7537            .into_iter()
7538            .collect();
7539        let workspaces: HashMap<i64, PathBuf> = self
7540            .conn
7541            .query_map_collect("SELECT id, path FROM workspaces", fparams![], |row| {
7542                let path_str: String = row.get_typed(1)?;
7543                Ok((row.get_typed::<i64>(0)?, PathBuf::from(path_str)))
7544            })
7545            .with_context(|| "loading workspace lookup for lexical rebuild")?
7546            .into_iter()
7547            .collect();
7548        Ok((agents, workspaces))
7549    }
7550
7551    /// List per-conversation message footprints in primary-key order.
7552    ///
7553    /// This deliberately avoids rebuild-path JOINs. Instead we merge ordered
7554    /// single-table reads over `conversations` and the narrow
7555    /// `conversation_tail_state` cache in Rust, then use `last_message_idx + 1`
7556    /// as a planning estimate.
7557    ///
7558    /// The planner only needs a sizing heuristic; exact message and byte
7559    /// accounting is performed later by the rebuild packet pipeline as it reads
7560    /// message content for indexing. Rows missing both tail-cache sources fall
7561    /// back to `MAX(messages.idx) + 1`, which preserves legacy upgraded
7562    /// databases without treating populated conversations as empty.
7563    pub fn list_conversation_footprints_for_lexical_rebuild(
7564        &self,
7565    ) -> Result<Vec<LexicalRebuildConversationFootprintRow>> {
7566        let tail_state_rows: Vec<(i64, Option<i64>)> = match self.conn.query_map_collect(
7567            "SELECT conversation_id, last_message_idx
7568             FROM conversation_tail_state
7569             ORDER BY conversation_id ASC",
7570            fparams![],
7571            |row| Ok((row.get_typed::<i64>(0)?, row.get_typed::<Option<i64>>(1)?)),
7572        ) {
7573            Ok(rows) => rows,
7574            Err(err) if error_indicates_missing_table(&err) => Vec::new(),
7575            Err(err) => {
7576                return Err(err).with_context(|| "listing lexical rebuild tail-state estimates");
7577            }
7578        };
7579        let tail_state_by_conversation: HashMap<i64, Option<i64>> =
7580            tail_state_rows.into_iter().collect();
7581
7582        let rows: Vec<(i64, Option<i64>)> = match self.conn.query_map_collect(
7583            "SELECT id, last_message_idx
7584             FROM conversations
7585             ORDER BY id ASC",
7586            fparams![],
7587            |row| Ok((row.get_typed::<i64>(0)?, row.get_typed::<Option<i64>>(1)?)),
7588        ) {
7589            Ok(rows) => rows,
7590            Err(err) if error_indicates_missing_column(&err) => self
7591                .conn
7592                .query_map_collect(
7593                    "SELECT id
7594                     FROM conversations
7595                     ORDER BY id ASC",
7596                    fparams![],
7597                    |row| Ok((row.get_typed::<i64>(0)?, None)),
7598                )
7599                .with_context(|| {
7600                    "listing lexical rebuild conversation ids after missing tail column fallback"
7601                })?,
7602            Err(err) => {
7603                return Err(err)
7604                    .with_context(|| "listing lexical rebuild conversation footprint estimates");
7605            }
7606        };
7607
7608        let mut footprints = Vec::with_capacity(rows.len());
7609        let mut missing_tail_positions = HashMap::new();
7610        for (conversation_id, conversation_last_message_idx) in rows {
7611            let last_message_idx = tail_state_by_conversation
7612                .get(&conversation_id)
7613                .copied()
7614                .flatten()
7615                .or(conversation_last_message_idx);
7616            let Some(message_count) = lexical_rebuild_message_count_from_tail_idx(last_message_idx)
7617            else {
7618                missing_tail_positions.insert(conversation_id, footprints.len());
7619                footprints.push(LexicalRebuildConversationFootprintRow {
7620                    conversation_id,
7621                    message_count: 0,
7622                    message_bytes: 0,
7623                });
7624                continue;
7625            };
7626            footprints.push(lexical_rebuild_conversation_footprint_from_count(
7627                conversation_id,
7628                message_count,
7629            ));
7630        }
7631
7632        let every_footprint_was_missing_tail = missing_tail_positions.len() == footprints.len();
7633        if !missing_tail_positions.is_empty() {
7634            self.fill_missing_lexical_rebuild_footprint_tails(
7635                &mut footprints,
7636                &missing_tail_positions,
7637            )?;
7638        }
7639        if !every_footprint_was_missing_tail {
7640            self.raise_lexical_rebuild_footprints_to_exact_message_counts(&mut footprints)?;
7641        }
7642
7643        Ok(footprints)
7644    }
7645
7646    pub fn lexical_rebuild_has_tail_footprint_metadata(&self) -> Result<bool> {
7647        let total_conversations: i64 = self
7648            .conn
7649            .query_row_map("SELECT COUNT(*) FROM conversations", fparams![], |row| {
7650                row.get_typed(0)
7651            })
7652            .with_context(|| "counting conversations for lexical rebuild tail metadata coverage")?;
7653        let total_conversations = usize::try_from(total_conversations.max(0)).unwrap_or(usize::MAX);
7654        if total_conversations == 0 {
7655            return Ok(true);
7656        }
7657
7658        let conversation_columns = franken_table_column_names(&self.conn, "conversations")?;
7659        let conversations_have_tail_column = conversation_columns.contains("last_message_idx");
7660        let tail_state_has_tail_column =
7661            match franken_table_column_names(&self.conn, "conversation_tail_state") {
7662                Ok(columns) => columns.contains("last_message_idx"),
7663                Err(err) if error_indicates_missing_table(&err) => false,
7664                Err(err) => {
7665                    return Err(err)
7666                        .with_context(|| "reading lexical rebuild tail-state metadata columns");
7667                }
7668            };
7669        if !conversations_have_tail_column && !tail_state_has_tail_column {
7670            return Ok(false);
7671        }
7672
7673        let covered_sql = match (conversations_have_tail_column, tail_state_has_tail_column) {
7674            (true, true) => {
7675                "SELECT COUNT(*)
7676                 FROM conversations c
7677                 LEFT JOIN conversation_tail_state ts ON ts.conversation_id = c.id
7678                 WHERE c.last_message_idx IS NOT NULL
7679                    OR ts.last_message_idx IS NOT NULL"
7680            }
7681            (true, false) => {
7682                "SELECT COUNT(*)
7683                 FROM conversations
7684                 WHERE last_message_idx IS NOT NULL"
7685            }
7686            (false, true) => {
7687                "SELECT COUNT(*)
7688                 FROM conversations c
7689                 WHERE EXISTS (
7690                     SELECT 1
7691                     FROM conversation_tail_state ts
7692                     WHERE ts.conversation_id = c.id
7693                       AND ts.last_message_idx IS NOT NULL
7694                 )"
7695            }
7696            (false, false) => unreachable!("checked before covered_sql selection"),
7697        };
7698        let covered_conversations: i64 = self
7699            .conn
7700            .query_row_map(covered_sql, fparams![], |row| row.get_typed(0))
7701            .with_context(
7702                || "counting conversations covered by lexical rebuild tail footprint metadata",
7703            )?;
7704        let covered_conversations =
7705            usize::try_from(covered_conversations.max(0)).unwrap_or(usize::MAX);
7706
7707        Ok(lexical_rebuild_tail_metadata_coverage_is_sufficient(
7708            total_conversations,
7709            covered_conversations,
7710        ))
7711    }
7712
7713    fn raise_lexical_rebuild_footprints_to_exact_message_counts(
7714        &self,
7715        footprints: &mut [LexicalRebuildConversationFootprintRow],
7716    ) -> Result<()> {
7717        if footprints.is_empty() {
7718            return Ok(());
7719        }
7720
7721        let positions_by_conversation: HashMap<i64, usize> = footprints
7722            .iter()
7723            .enumerate()
7724            .map(|(position, footprint)| (footprint.conversation_id, position))
7725            .collect();
7726        self.conn
7727            .query_with_params_for_each(
7728                "SELECT conversation_id, COUNT(*) AS message_count
7729                 FROM messages
7730                 GROUP BY conversation_id
7731                 ORDER BY conversation_id ASC",
7732                &[] as &[SqliteValue],
7733                |row| {
7734                    let conversation_id: i64 = row.get_typed(0)?;
7735                    let exact_count: i64 = row.get_typed(1)?;
7736                    let Some(position) = positions_by_conversation.get(&conversation_id) else {
7737                        return Ok(());
7738                    };
7739                    let exact_count = usize::try_from(exact_count.max(0)).unwrap_or(usize::MAX);
7740                    let footprint = &mut footprints[*position];
7741                    if exact_count > footprint.message_count {
7742                        footprint.message_count = exact_count;
7743                        footprint.message_bytes =
7744                            footprint.message_bytes.max(exact_count.saturating_mul(
7745                                LEXICAL_REBUILD_PLANNER_ESTIMATED_BYTES_PER_MESSAGE,
7746                            ));
7747                    }
7748                    Ok(())
7749                },
7750            )
7751            .with_context(|| "raising lexical rebuild footprints to exact message counts")?;
7752        Ok(())
7753    }
7754
7755    fn fill_missing_lexical_rebuild_footprint_tails(
7756        &self,
7757        footprints: &mut [LexicalRebuildConversationFootprintRow],
7758        missing_tail_positions: &HashMap<i64, usize>,
7759    ) -> Result<()> {
7760        self.fill_missing_lexical_rebuild_footprint_tails_from_grouped_messages(
7761            footprints,
7762            missing_tail_positions,
7763            "SELECT conversation_id, MAX(idx) AS last_message_idx
7764             FROM messages INDEXED BY idx_messages_conv_idx
7765             GROUP BY conversation_id
7766             ORDER BY conversation_id ASC",
7767        )
7768        .or_else(|err| {
7769            if err
7770                .to_string()
7771                .contains("no such index: idx_messages_conv_idx")
7772            {
7773                return self.fill_missing_lexical_rebuild_footprint_tails_from_grouped_messages(
7774                    footprints,
7775                    missing_tail_positions,
7776                    "SELECT conversation_id, MAX(idx) AS last_message_idx
7777                     FROM messages
7778                     GROUP BY conversation_id
7779                     ORDER BY conversation_id ASC",
7780                );
7781            }
7782            Err(err)
7783        })
7784        .with_context(|| "grouping missing lexical rebuild tail estimates from messages")?;
7785
7786        Ok(())
7787    }
7788
7789    fn fill_missing_lexical_rebuild_footprint_tails_from_grouped_messages(
7790        &self,
7791        footprints: &mut [LexicalRebuildConversationFootprintRow],
7792        missing_tail_positions: &HashMap<i64, usize>,
7793        sql: &str,
7794    ) -> Result<()> {
7795        self.conn
7796            .query_with_params_for_each(sql, &[] as &[SqliteValue], |row| {
7797                let conversation_id: i64 = row.get_typed(0)?;
7798                let last_message_idx: Option<i64> = row.get_typed(1)?;
7799                let Some(position) = missing_tail_positions.get(&conversation_id) else {
7800                    return Ok(());
7801                };
7802                if let Some(message_count) =
7803                    lexical_rebuild_message_count_from_tail_idx(last_message_idx)
7804                {
7805                    footprints[*position] = lexical_rebuild_conversation_footprint_from_count(
7806                        conversation_id,
7807                        message_count,
7808                    );
7809                }
7810                Ok(())
7811            })
7812            .with_context(|| "grouping lexical rebuild missing tail estimates")
7813    }
7814
7815    /// List conversation ids in the stable order used by lexical rebuilds.
7816    pub fn list_conversation_ids_for_lexical_rebuild(&self) -> Result<Vec<i64>> {
7817        self.conn
7818            .query_map_collect(
7819                "SELECT id FROM conversations ORDER BY id ASC",
7820                fparams![],
7821                |row| row.get_typed(0),
7822            )
7823            .with_context(|| "listing conversation ids for lexical rebuild")
7824    }
7825    /// Legacy OFFSET-based traversal for one-time checkpoint migration only.
7826    ///
7827    /// New code must use `list_conversations_for_lexical_rebuild_after_id`
7828    /// for keyset pagination.
7829    pub fn list_conversations_for_lexical_rebuild_by_offset(
7830        &self,
7831        limit: i64,
7832        offset: i64,
7833        agent_slugs: &HashMap<i64, String>,
7834        workspace_paths: &HashMap<i64, PathBuf>,
7835    ) -> Result<Vec<LexicalRebuildConversationRow>> {
7836        // Single-table query avoids the 3-table JOIN that triggers
7837        // frankensqlite's full-materialization fallback path.
7838        self.conn
7839            .query_map_collect(
7840                r"SELECT id, agent_id, workspace_id, external_id, title, source_path,
7841                       started_at,
7842                       COALESCE(
7843                           (SELECT ts.ended_at
7844                            FROM conversation_tail_state ts
7845                            WHERE ts.conversation_id = conversations.id),
7846                           ended_at
7847                       ),
7848                       source_id, origin_host
7849                FROM conversations
7850                ORDER BY id ASC
7851                LIMIT ?1 OFFSET ?2",
7852                fparams![limit, offset],
7853                |row| {
7854                    let agent_id: Option<i64> = row.get_typed(1)?;
7855                    let workspace_id: Option<i64> = row.get_typed(2)?;
7856                    let source_path: String = row.get_typed(5)?;
7857                    let raw_source_id: Option<String> = row.get_typed(8)?;
7858                    let raw_origin_host: Option<String> = row.get_typed(9)?;
7859                    let (source_id, _, origin_host) = normalized_storage_source_parts(
7860                        raw_source_id.as_deref(),
7861                        None,
7862                        raw_origin_host.as_deref(),
7863                    );
7864                    Ok(LexicalRebuildConversationRow {
7865                        id: Some(row.get_typed(0)?),
7866                        agent_slug: agent_id
7867                            .and_then(|aid| agent_slugs.get(&aid).cloned())
7868                            .unwrap_or_else(|| "unknown".to_string()),
7869                        workspace: workspace_id.and_then(|wid| workspace_paths.get(&wid).cloned()),
7870                        external_id: row.get_typed(3)?,
7871                        title: row.get_typed(4)?,
7872                        source_path: Path::new(&source_path).to_path_buf(),
7873                        started_at: row.get_typed(6)?,
7874                        ended_at: row.get_typed(7)?,
7875                        source_id,
7876                        origin_host,
7877                    })
7878                },
7879            )
7880            .with_context(|| "listing conversations for lexical rebuild")
7881    }
7882
7883    /// List lexical rebuild conversations strictly after the given primary key.
7884    ///
7885    /// Keyset pagination keeps later rebuild pages as cheap as earlier ones,
7886    /// avoiding the ever-growing `OFFSET` scan cost during large rebuilds.
7887    pub fn list_conversations_for_lexical_rebuild_after_id(
7888        &self,
7889        limit: i64,
7890        after_conversation_id: i64,
7891        agent_slugs: &HashMap<i64, String>,
7892        workspace_paths: &HashMap<i64, PathBuf>,
7893    ) -> Result<Vec<LexicalRebuildConversationRow>> {
7894        self.conn
7895            .query_map_collect(
7896                r"SELECT id, agent_id, workspace_id, external_id, title, source_path,
7897                       started_at,
7898                       COALESCE(
7899                           (SELECT ts.ended_at
7900                            FROM conversation_tail_state ts
7901                            WHERE ts.conversation_id = conversations.id),
7902                           ended_at
7903                       ),
7904                       source_id, origin_host
7905                FROM conversations
7906                WHERE id > ?2
7907                ORDER BY id ASC
7908                LIMIT ?1",
7909                fparams![limit, after_conversation_id],
7910                |row| {
7911                    let agent_id: Option<i64> = row.get_typed(1)?;
7912                    let workspace_id: Option<i64> = row.get_typed(2)?;
7913                    let source_path: String = row.get_typed(5)?;
7914                    let raw_source_id: Option<String> = row.get_typed(8)?;
7915                    let raw_origin_host: Option<String> = row.get_typed(9)?;
7916                    let (source_id, _, origin_host) = normalized_storage_source_parts(
7917                        raw_source_id.as_deref(),
7918                        None,
7919                        raw_origin_host.as_deref(),
7920                    );
7921                    Ok(LexicalRebuildConversationRow {
7922                        id: Some(row.get_typed(0)?),
7923                        agent_slug: agent_id
7924                            .and_then(|aid| agent_slugs.get(&aid).cloned())
7925                            .unwrap_or_else(|| "unknown".to_string()),
7926                        workspace: workspace_id.and_then(|wid| workspace_paths.get(&wid).cloned()),
7927                        external_id: row.get_typed(3)?,
7928                        title: row.get_typed(4)?,
7929                        source_path: Path::new(&source_path).to_path_buf(),
7930                        started_at: row.get_typed(6)?,
7931                        ended_at: row.get_typed(7)?,
7932                        source_id,
7933                        origin_host,
7934                    })
7935                },
7936            )
7937            .with_context(|| {
7938                format!(
7939                    "listing conversations for lexical rebuild after id {after_conversation_id}"
7940                )
7941            })
7942    }
7943
7944    /// List lexical rebuild conversations inside an `(after_id, through_id]`
7945    /// primary-key window.
7946    ///
7947    /// This lets the rebuild producer respect planned shard boundaries without
7948    /// falling back to client-side trimming or multi-table joins.
7949    pub fn list_conversations_for_lexical_rebuild_after_id_through_id(
7950        &self,
7951        limit: i64,
7952        after_conversation_id: i64,
7953        through_conversation_id: i64,
7954        agent_slugs: &HashMap<i64, String>,
7955        workspace_paths: &HashMap<i64, PathBuf>,
7956    ) -> Result<Vec<LexicalRebuildConversationRow>> {
7957        if through_conversation_id <= after_conversation_id {
7958            return Ok(Vec::new());
7959        }
7960        self.conn
7961            .query_map_collect(
7962                r"SELECT id, agent_id, workspace_id, external_id, title, source_path,
7963                       started_at,
7964                       COALESCE(
7965                           (SELECT ts.ended_at
7966                            FROM conversation_tail_state ts
7967                            WHERE ts.conversation_id = conversations.id),
7968                           ended_at
7969                       ),
7970                       source_id, origin_host
7971                FROM conversations
7972                WHERE id > ?2 AND id <= ?3
7973                ORDER BY id ASC
7974                LIMIT ?1",
7975                fparams![limit, after_conversation_id, through_conversation_id],
7976                |row| {
7977                    let agent_id: Option<i64> = row.get_typed(1)?;
7978                    let workspace_id: Option<i64> = row.get_typed(2)?;
7979                    let source_path: String = row.get_typed(5)?;
7980                    let raw_source_id: Option<String> = row.get_typed(8)?;
7981                    let raw_origin_host: Option<String> = row.get_typed(9)?;
7982                    let (source_id, _, origin_host) = normalized_storage_source_parts(
7983                        raw_source_id.as_deref(),
7984                        None,
7985                        raw_origin_host.as_deref(),
7986                    );
7987                    Ok(LexicalRebuildConversationRow {
7988                        id: Some(row.get_typed(0)?),
7989                        agent_slug: agent_id
7990                            .and_then(|aid| agent_slugs.get(&aid).cloned())
7991                            .unwrap_or_else(|| "unknown".to_string()),
7992                        workspace: workspace_id.and_then(|wid| workspace_paths.get(&wid).cloned()),
7993                        external_id: row.get_typed(3)?,
7994                        title: row.get_typed(4)?,
7995                        source_path: Path::new(&source_path).to_path_buf(),
7996                        started_at: row.get_typed(6)?,
7997                        ended_at: row.get_typed(7)?,
7998                        source_id,
7999                        origin_host,
8000                    })
8001                },
8002            )
8003            .with_context(|| {
8004                format!(
8005                    "listing conversations for lexical rebuild after id {after_conversation_id} through id {through_conversation_id}"
8006                )
8007            })
8008    }
8009
8010    /// Fetch messages for a conversation.
8011    pub fn fetch_messages(&self, conversation_id: i64) -> Result<Vec<Message>> {
8012        let hinted_sql = "SELECT id, idx, role, author, created_at, content, extra_json, extra_bin \
8013             FROM messages INDEXED BY sqlite_autoindex_messages_1 \
8014             WHERE conversation_id = ?1 ORDER BY idx";
8015        let fallback_sql = "SELECT id, idx, role, author, created_at, content, extra_json, extra_bin \
8016             FROM messages \
8017             WHERE conversation_id = ?1 ORDER BY idx";
8018
8019        self.conn
8020            .query_map_collect(hinted_sql, fparams![conversation_id], |row| {
8021                let role: String = row.get_typed(2)?;
8022                Ok(Message {
8023                    id: Some(row.get_typed(0)?),
8024                    idx: row.get_typed(1)?,
8025                    role: match role.as_str() {
8026                        "user" => MessageRole::User,
8027                        "agent" | "assistant" => MessageRole::Agent,
8028                        "tool" => MessageRole::Tool,
8029                        "system" => MessageRole::System,
8030                        other => MessageRole::Other(other.to_string()),
8031                    },
8032                    author: row.get_typed(3)?,
8033                    created_at: row.get_typed(4)?,
8034                    content: row.get_typed(5)?,
8035                    extra_json: franken_read_message_extra_compat(row, 6, 7),
8036                    snippets: Vec::new(),
8037                })
8038            })
8039            .or_else(|err| {
8040                if err
8041                    .to_string()
8042                    .contains("no such index: sqlite_autoindex_messages_1")
8043                {
8044                    return self.conn.query_map_collect(
8045                        fallback_sql,
8046                        fparams![conversation_id],
8047                        |row| {
8048                            let role: String = row.get_typed(2)?;
8049                            Ok(Message {
8050                                id: Some(row.get_typed(0)?),
8051                                idx: row.get_typed(1)?,
8052                                role: match role.as_str() {
8053                                    "user" => MessageRole::User,
8054                                    "agent" | "assistant" => MessageRole::Agent,
8055                                    "tool" => MessageRole::Tool,
8056                                    "system" => MessageRole::System,
8057                                    other => MessageRole::Other(other.to_string()),
8058                                },
8059                                author: row.get_typed(3)?,
8060                                created_at: row.get_typed(4)?,
8061                                content: row.get_typed(5)?,
8062                                extra_json: franken_read_message_extra_compat(row, 6, 7),
8063                                snippets: Vec::new(),
8064                            })
8065                        },
8066                    );
8067                }
8068                Err(err)
8069            })
8070            .with_context(|| format!("fetching messages for conversation {conversation_id}"))
8071    }
8072
8073    /// Fetch messages for lexical index rebuilds without deserializing extra metadata.
8074    ///
8075    /// Tantivy only needs message text and core envelope fields, so avoiding
8076    /// `extra_json` here prevents rebuilds from rehydrating enormous historical
8077    /// payloads that are irrelevant to lexical search.
8078    pub fn fetch_messages_for_lexical_rebuild(&self, conversation_id: i64) -> Result<Vec<Message>> {
8079        let hinted_sql = "SELECT id, idx, role, author, created_at, content \
8080                 FROM messages INDEXED BY sqlite_autoindex_messages_1 \
8081                 WHERE conversation_id = ?1 ORDER BY idx";
8082        let fallback_sql = "SELECT id, idx, role, author, created_at, content \
8083                 FROM messages \
8084                 WHERE conversation_id = ?1 ORDER BY idx";
8085
8086        self.conn
8087            .query_map_collect(hinted_sql, fparams![conversation_id], |row| {
8088                let role: String = row.get_typed(2)?;
8089                Ok(Message {
8090                    id: Some(row.get_typed(0)?),
8091                    idx: row.get_typed(1)?,
8092                    role: match role.as_str() {
8093                        "user" => MessageRole::User,
8094                        "agent" | "assistant" => MessageRole::Agent,
8095                        "tool" => MessageRole::Tool,
8096                        "system" => MessageRole::System,
8097                        other => MessageRole::Other(other.to_string()),
8098                    },
8099                    author: row.get_typed(3)?,
8100                    created_at: row.get_typed(4)?,
8101                    content: row.get_typed(5)?,
8102                    extra_json: serde_json::Value::Null,
8103                    snippets: Vec::new(),
8104                })
8105            })
8106            .or_else(|err| {
8107                if err
8108                    .to_string()
8109                    .contains("no such index: sqlite_autoindex_messages_1")
8110                {
8111                    return self.conn.query_map_collect(
8112                        fallback_sql,
8113                        fparams![conversation_id],
8114                        |row| {
8115                            let role: String = row.get_typed(2)?;
8116                            Ok(Message {
8117                                id: Some(row.get_typed(0)?),
8118                                idx: row.get_typed(1)?,
8119                                role: match role.as_str() {
8120                                    "user" => MessageRole::User,
8121                                    "agent" | "assistant" => MessageRole::Agent,
8122                                    "tool" => MessageRole::Tool,
8123                                    "system" => MessageRole::System,
8124                                    other => MessageRole::Other(other.to_string()),
8125                                },
8126                                author: row.get_typed(3)?,
8127                                created_at: row.get_typed(4)?,
8128                                content: row.get_typed(5)?,
8129                                extra_json: serde_json::Value::Null,
8130                                snippets: Vec::new(),
8131                            })
8132                        },
8133                    );
8134                }
8135                Err(err)
8136            })
8137            .with_context(|| {
8138                format!("fetching messages for lexical rebuild of conversation {conversation_id}")
8139            })
8140    }
8141
8142    /// Fetch messages for multiple conversations during lexical rebuilds.
8143    ///
8144    /// This preserves the lightweight lexical-rebuild projection while avoiding
8145    /// one round-trip per conversation when rebuilding large canonical indexes.
8146    pub fn fetch_messages_for_lexical_rebuild_batch(
8147        &self,
8148        conversation_ids: &[i64],
8149        max_messages: Option<usize>,
8150        max_content_bytes: Option<usize>,
8151    ) -> Result<HashMap<i64, Vec<Message>>> {
8152        if conversation_ids.is_empty() {
8153            return Ok(HashMap::new());
8154        }
8155
8156        let mut grouped: HashMap<i64, Vec<Message>> =
8157            HashMap::with_capacity(conversation_ids.len());
8158        let mut fetched_conversation_ids = HashSet::with_capacity(conversation_ids.len());
8159        let mut total_messages = 0usize;
8160        let mut total_content_bytes = 0usize;
8161
8162        // The apparent single-query shape (`WHERE conversation_id IN (...) ORDER BY ...`)
8163        // is a bad frankensqlite plan for large live databases: it can
8164        // materialize far more of `messages` than the requested conversations.
8165        // Reuse the hinted per-conversation primary-key lookup instead.
8166        for conversation_id in conversation_ids {
8167            if !fetched_conversation_ids.insert(*conversation_id) {
8168                continue;
8169            }
8170
8171            let messages = self
8172                .fetch_messages_for_lexical_rebuild(*conversation_id)
8173                .with_context(|| {
8174                    format!("fetching lexical rebuild messages for conversation {conversation_id}")
8175                })?;
8176            total_messages = total_messages.saturating_add(messages.len());
8177            if let Some(limit) = max_messages
8178                && total_messages > limit
8179            {
8180                return Err(anyhow!(
8181                    "lexical rebuild batch fetch exceeded message guardrail: messages={total_messages} limit={limit} conversations={}",
8182                    conversation_ids.len()
8183                ));
8184            }
8185
8186            let message_bytes = messages
8187                .iter()
8188                .map(|message| message.content.len())
8189                .sum::<usize>();
8190            total_content_bytes = total_content_bytes.saturating_add(message_bytes);
8191            if let Some(limit) = max_content_bytes
8192                && total_content_bytes > limit
8193            {
8194                return Err(anyhow!(
8195                    "lexical rebuild batch fetch exceeded content-byte guardrail: bytes={total_content_bytes} limit={limit} conversations={}",
8196                    conversation_ids.len()
8197                ));
8198            }
8199
8200            if !messages.is_empty() {
8201                grouped.insert(*conversation_id, messages);
8202            }
8203        }
8204
8205        Ok(grouped)
8206    }
8207
8208    /// Stream lexical rebuild message rows in `(conversation_id, idx)` order
8209    /// without materializing the full result set.
8210    pub fn stream_messages_for_lexical_rebuild_between_conversation_ids<F>(
8211        &self,
8212        start_conversation_id: i64,
8213        end_conversation_id: i64,
8214        mut f: F,
8215    ) -> Result<()>
8216    where
8217        F: FnMut(LexicalRebuildMessageRow) -> Result<()>,
8218    {
8219        if end_conversation_id < start_conversation_id {
8220            return Ok(());
8221        }
8222
8223        let conversation_ids: Vec<i64> = self
8224            .conn
8225            .query_map_collect(
8226                "SELECT id FROM conversations WHERE id >= ?1 AND id <= ?2 ORDER BY id ASC",
8227                fparams![start_conversation_id, end_conversation_id],
8228                |row| row.get_typed(0),
8229            )
8230            .with_context(|| "listing conversation ids for streamed lexical rebuild")?;
8231
8232        for conversation_id in conversation_ids {
8233            let messages = self
8234                .fetch_messages_for_lexical_rebuild(conversation_id)
8235                .with_context(|| {
8236                    format!("streaming lexical rebuild messages for conversation {conversation_id}")
8237                })?;
8238
8239            for message in messages {
8240                let message_id = message.id.ok_or_else(|| {
8241                    anyhow!(
8242                        "lexical rebuild message missing id for conversation {conversation_id} idx {}",
8243                        message.idx
8244                    )
8245                })?;
8246                f(LexicalRebuildMessageRow {
8247                    conversation_id,
8248                    id: message_id,
8249                    idx: message.idx,
8250                    role: role_str(&message.role),
8251                    author: message.author,
8252                    created_at: message.created_at,
8253                    content: message.content,
8254                })?;
8255            }
8256        }
8257
8258        Ok(())
8259    }
8260
8261    /// Stream grouped lexical rebuild message rows in `(conversation_id, idx)`
8262    /// order by reusing the canonical per-message stream and coalescing rows
8263    /// per conversation.
8264    pub fn stream_grouped_messages_for_lexical_rebuild_between_conversation_ids<F>(
8265        &self,
8266        start_conversation_id: i64,
8267        end_conversation_id: i64,
8268        mut f: F,
8269    ) -> Result<()>
8270    where
8271        F: FnMut(i64, LexicalRebuildGroupedMessageRows, i64) -> Result<()>,
8272    {
8273        if end_conversation_id < start_conversation_id {
8274            return Ok(());
8275        }
8276
8277        let mut current_conversation_id: Option<i64> = None;
8278        let mut current_messages: LexicalRebuildGroupedMessageRows = SmallVec::new();
8279        let mut current_last_message_id = 0i64;
8280        let mut flush_current = |current_conversation_id: &mut Option<i64>,
8281                                 current_messages: &mut LexicalRebuildGroupedMessageRows,
8282                                 current_last_message_id: &mut i64|
8283         -> Result<()> {
8284            let Some(conversation_id) = current_conversation_id.take() else {
8285                return Ok(());
8286            };
8287            let messages = std::mem::take(current_messages);
8288            let last_message_id = std::mem::take(current_last_message_id);
8289            f(conversation_id, messages, last_message_id)
8290        };
8291
8292        self.stream_messages_for_lexical_rebuild_between_conversation_ids(
8293            start_conversation_id,
8294            end_conversation_id,
8295            |row| {
8296                if current_conversation_id != Some(row.conversation_id) {
8297                    flush_current(
8298                        &mut current_conversation_id,
8299                        &mut current_messages,
8300                        &mut current_last_message_id,
8301                    )?;
8302                    current_conversation_id = Some(row.conversation_id);
8303                }
8304                current_last_message_id = row.id;
8305                current_messages.push(LexicalRebuildGroupedMessageRow {
8306                    idx: row.idx,
8307                    is_tool_role: row.role == "tool",
8308                    created_at: row.created_at,
8309                    content: row.content,
8310                });
8311                Ok(())
8312            },
8313        )
8314        .with_context(|| "streaming grouped lexical rebuild messages")?;
8315
8316        flush_current(
8317            &mut current_conversation_id,
8318            &mut current_messages,
8319            &mut current_last_message_id,
8320        )
8321        .with_context(|| "flushing grouped lexical rebuild messages")
8322    }
8323
8324    /// Stream grouped lexical rebuild message rows from a starting conversation
8325    /// id to the end of the table.
8326    pub fn stream_grouped_messages_for_lexical_rebuild_from_conversation_id<F>(
8327        &self,
8328        start_conversation_id: i64,
8329        f: F,
8330    ) -> Result<()>
8331    where
8332        F: FnMut(i64, LexicalRebuildGroupedMessageRows, i64) -> Result<()>,
8333    {
8334        self.stream_grouped_messages_for_lexical_rebuild_between_conversation_ids(
8335            start_conversation_id,
8336            i64::MAX,
8337            f,
8338        )
8339    }
8340
8341    /// Stream lexical rebuild message rows from a starting conversation id to
8342    /// the end of the table.
8343    pub fn stream_messages_for_lexical_rebuild_from_conversation_id<F>(
8344        &self,
8345        start_conversation_id: i64,
8346        f: F,
8347    ) -> Result<()>
8348    where
8349        F: FnMut(LexicalRebuildMessageRow) -> Result<()>,
8350    {
8351        self.stream_messages_for_lexical_rebuild_between_conversation_ids(
8352            start_conversation_id,
8353            i64::MAX,
8354            f,
8355        )
8356    }
8357
8358    /// Get a source by ID.
8359    pub fn get_source(&self, id: &str) -> Result<Option<Source>> {
8360        let result = self.conn.query_row_map(
8361            "SELECT id, kind, host_label, machine_id, platform, config_json, created_at, updated_at FROM sources WHERE id = ?1",
8362            fparams![id],
8363            |row| {
8364                let kind_str: String = row.get_typed(1)?;
8365                let config_json_str: Option<String> = row.get_typed(5)?;
8366                Ok(Source {
8367                    id: row.get_typed(0)?,
8368                    kind: SourceKind::parse(&kind_str).unwrap_or_default(),
8369                    host_label: row.get_typed(2)?,
8370                    machine_id: row.get_typed(3)?,
8371                    platform: row.get_typed(4)?,
8372                    config_json: config_json_str.and_then(|s| serde_json::from_str(&s).ok()),
8373                    created_at: row.get_typed(6)?,
8374                    updated_at: row.get_typed(7)?,
8375                })
8376            },
8377        );
8378        Ok(result.optional()?)
8379    }
8380
8381    /// List all sources.
8382    pub fn list_sources(&self) -> Result<Vec<Source>> {
8383        self.conn
8384            .query_map_collect(
8385                "SELECT id, kind, host_label, machine_id, platform, config_json, created_at, updated_at FROM sources ORDER BY id",
8386                fparams![],
8387                |row| {
8388                    let kind_str: String = row.get_typed(1)?;
8389                    let config_json_str: Option<String> = row.get_typed(5)?;
8390                    Ok(Source {
8391                        id: row.get_typed(0)?,
8392                        kind: SourceKind::parse(&kind_str).unwrap_or_default(),
8393                        host_label: row.get_typed(2)?,
8394                        machine_id: row.get_typed(3)?,
8395                        platform: row.get_typed(4)?,
8396                        config_json: config_json_str.and_then(|s| serde_json::from_str(&s).ok()),
8397                        created_at: row.get_typed(6)?,
8398                        updated_at: row.get_typed(7)?,
8399                    })
8400                },
8401            )
8402            .with_context(|| "listing sources")
8403    }
8404
8405    /// Get IDs of all non-local sources.
8406    pub fn get_source_ids(&self) -> Result<Vec<String>> {
8407        self.conn
8408            .query_map_collect(
8409                "SELECT id FROM sources WHERE id != 'local' ORDER BY id",
8410                fparams![],
8411                |row| row.get_typed(0),
8412            )
8413            .with_context(|| "listing source ids")
8414    }
8415
8416    /// Create or update a source.
8417    pub fn upsert_source(&self, source: &Source) -> Result<()> {
8418        self.invalidate_conversation_source_cache(source.id.as_str());
8419        let now = Self::now_millis();
8420        let kind_str = source.kind.to_string();
8421        let config_json_str = source
8422            .config_json
8423            .as_ref()
8424            .map(serde_json::to_string)
8425            .transpose()?;
8426
8427        // Re-indexing commonly reuses the same normalized source metadata
8428        // across many conversations. Skip the write entirely when the row is
8429        // already identical so we avoid needless WAL churn and timestamp bumps.
8430        self.conn.execute_compat(
8431            "INSERT INTO sources(id, kind, host_label, machine_id, platform, config_json, created_at, updated_at)
8432             VALUES(?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8)
8433             ON CONFLICT(id) DO UPDATE SET
8434                 kind = excluded.kind,
8435                 host_label = excluded.host_label,
8436                 machine_id = excluded.machine_id,
8437                 platform = excluded.platform,
8438                 config_json = excluded.config_json,
8439                 updated_at = excluded.updated_at
8440             WHERE NOT (
8441                 sources.kind IS excluded.kind
8442                 AND sources.host_label IS excluded.host_label
8443                 AND sources.machine_id IS excluded.machine_id
8444                 AND sources.platform IS excluded.platform
8445                 AND sources.config_json IS excluded.config_json
8446             )",
8447            fparams![
8448                source.id.as_str(),
8449                kind_str.as_str(),
8450                source.host_label.as_deref(),
8451                source.machine_id.as_deref(),
8452                source.platform.as_deref(),
8453                config_json_str.as_deref(),
8454                source.created_at.unwrap_or(now),
8455                now
8456            ],
8457        )?;
8458        Ok(())
8459    }
8460
8461    fn historical_bundle_key_hash(
8462        version: u32,
8463        bundle: &HistoricalDatabaseBundle,
8464        include_bundle_stats: bool,
8465    ) -> String {
8466        let signature = if include_bundle_stats {
8467            format!(
8468                "{}:{}:{}:{}",
8469                version,
8470                bundle.root_path.display(),
8471                bundle.total_bytes,
8472                bundle.modified_at_ms
8473            )
8474        } else {
8475            format!("{}:{}", version, bundle.root_path.display())
8476        };
8477        blake3::hash(signature.as_bytes()).to_hex().to_string()
8478    }
8479
8480    fn historical_bundle_meta_key(bundle: &HistoricalDatabaseBundle) -> String {
8481        format!(
8482            "historical_bundle_salvaged:{}",
8483            Self::historical_bundle_key_hash(HISTORICAL_SALVAGE_LEDGER_VERSION, bundle, false)
8484        )
8485    }
8486
8487    fn historical_bundle_legacy_meta_key(bundle: &HistoricalDatabaseBundle) -> String {
8488        let signature = format!(
8489            "{}:{}:{}:{}",
8490            HISTORICAL_SALVAGE_LEDGER_VERSION,
8491            bundle.root_path.display(),
8492            bundle.total_bytes,
8493            bundle.modified_at_ms
8494        );
8495        format!(
8496            "historical_bundle_salvaged:{}",
8497            blake3::hash(signature.as_bytes()).to_hex()
8498        )
8499    }
8500
8501    fn historical_bundle_progress_key(bundle: &HistoricalDatabaseBundle) -> String {
8502        format!(
8503            "historical_bundle_progress:{}",
8504            Self::historical_bundle_key_hash(HISTORICAL_SALVAGE_PROGRESS_VERSION, bundle, false)
8505        )
8506    }
8507
8508    fn historical_bundle_legacy_progress_key(bundle: &HistoricalDatabaseBundle) -> String {
8509        let signature = format!(
8510            "{}:{}:{}:{}",
8511            HISTORICAL_SALVAGE_PROGRESS_VERSION,
8512            bundle.root_path.display(),
8513            bundle.total_bytes,
8514            bundle.modified_at_ms
8515        );
8516        format!(
8517            "historical_bundle_progress:{}",
8518            blake3::hash(signature.as_bytes()).to_hex()
8519        )
8520    }
8521
8522    fn historical_bundle_already_imported(
8523        &self,
8524        bundle: &HistoricalDatabaseBundle,
8525    ) -> Result<bool> {
8526        for key in [
8527            Self::historical_bundle_meta_key(bundle),
8528            Self::historical_bundle_legacy_meta_key(bundle),
8529        ] {
8530            let existing: Option<String> = self
8531                .conn
8532                .query_row_map(
8533                    "SELECT value FROM meta WHERE key = ?1",
8534                    fparams![key.as_str()],
8535                    |row| row.get_typed(0),
8536                )
8537                .optional()?;
8538            if existing.is_some() {
8539                return Ok(true);
8540            }
8541        }
8542        Ok(false)
8543    }
8544
8545    pub(crate) fn has_pending_historical_bundles(&self, canonical_db_path: &Path) -> Result<bool> {
8546        for bundle in discover_historical_database_bundles(canonical_db_path) {
8547            if !self.historical_bundle_already_imported(&bundle)? {
8548                return Ok(true);
8549            }
8550        }
8551        Ok(false)
8552    }
8553
8554    fn load_historical_bundle_progress(
8555        &self,
8556        bundle: &HistoricalDatabaseBundle,
8557    ) -> Result<Option<HistoricalBundleProgress>> {
8558        for key in [
8559            Self::historical_bundle_progress_key(bundle),
8560            Self::historical_bundle_legacy_progress_key(bundle),
8561        ] {
8562            let raw: Option<String> = self
8563                .conn
8564                .query_row_map(
8565                    "SELECT value FROM meta WHERE key = ?1",
8566                    fparams![key.as_str()],
8567                    |row| row.get_typed(0),
8568                )
8569                .optional()?;
8570            let Some(raw) = raw else {
8571                continue;
8572            };
8573            let parsed: HistoricalBundleProgress =
8574                serde_json::from_str(&raw).with_context(|| {
8575                    format!(
8576                        "parsing historical salvage progress checkpoint for {}",
8577                        bundle.root_path.display()
8578                    )
8579                })?;
8580            if parsed.progress_version == HISTORICAL_SALVAGE_PROGRESS_VERSION {
8581                return Ok(Some(parsed));
8582            }
8583        }
8584        Ok(None)
8585    }
8586
8587    fn record_historical_bundle_progress(
8588        &self,
8589        bundle: &HistoricalDatabaseBundle,
8590        method: &str,
8591        last_completed_source_row_id: i64,
8592        conversations_imported: usize,
8593        messages_imported: usize,
8594    ) -> Result<()> {
8595        let key = Self::historical_bundle_progress_key(bundle);
8596        let value = HistoricalBundleProgress {
8597            progress_version: HISTORICAL_SALVAGE_PROGRESS_VERSION,
8598            path: bundle.root_path.display().to_string(),
8599            bytes: bundle.total_bytes,
8600            modified_at_ms: bundle.modified_at_ms,
8601            method: method.to_string(),
8602            last_completed_source_row_id,
8603            conversations_imported,
8604            messages_imported,
8605            updated_at_ms: Self::now_millis(),
8606        };
8607        let value_str = serde_json::to_string(&value)?;
8608        self.conn.execute_compat(
8609            "INSERT OR REPLACE INTO meta(key, value) VALUES(?1, ?2)",
8610            fparams![key.as_str(), value_str.as_str()],
8611        )?;
8612        Ok(())
8613    }
8614
8615    fn clear_historical_bundle_progress(&self, bundle: &HistoricalDatabaseBundle) -> Result<()> {
8616        for key in [
8617            Self::historical_bundle_progress_key(bundle),
8618            Self::historical_bundle_legacy_progress_key(bundle),
8619        ] {
8620            self.conn
8621                .execute_compat("DELETE FROM meta WHERE key = ?1", fparams![key.as_str()])?;
8622        }
8623        Ok(())
8624    }
8625
8626    fn record_historical_bundle_import(
8627        &self,
8628        bundle: &HistoricalDatabaseBundle,
8629        method: &str,
8630        conversations_imported: usize,
8631        messages_imported: usize,
8632    ) -> Result<()> {
8633        let key = Self::historical_bundle_meta_key(bundle);
8634        let value = serde_json::json!({
8635            "salvage_version": HISTORICAL_SALVAGE_LEDGER_VERSION,
8636            "path": bundle.root_path.display().to_string(),
8637            "bytes": bundle.total_bytes,
8638            "modified_at_ms": bundle.modified_at_ms,
8639            "method": method,
8640            "conversations_imported": conversations_imported,
8641            "messages_imported": messages_imported,
8642            "recorded_at_ms": Self::now_millis(),
8643        });
8644        let value_str = serde_json::to_string(&value)?;
8645        self.conn.execute_compat(
8646            "INSERT OR REPLACE INTO meta(key, value) VALUES(?1, ?2)",
8647            fparams![key.as_str(), value_str.as_str()],
8648        )?;
8649        Ok(())
8650    }
8651
8652    fn historical_import_error_is_split_retryable(err: &anyhow::Error) -> bool {
8653        const RETRYABLE_PATTERNS: &[&str] = &[
8654            "out of memory",
8655            "string or blob too big",
8656            "too many sql variables",
8657        ];
8658        err.chain().any(|cause| {
8659            let rendered = cause.to_string().to_ascii_lowercase();
8660            RETRYABLE_PATTERNS
8661                .iter()
8662                .any(|pattern| rendered.contains(pattern))
8663        })
8664    }
8665
8666    fn split_historical_batch_entry_messages(
8667        entry: &HistoricalBatchEntry,
8668    ) -> Option<(HistoricalBatchEntry, HistoricalBatchEntry)> {
8669        if entry.conversation.messages.len() < 2 {
8670            return None;
8671        }
8672        let split_at = entry.conversation.messages.len() / 2;
8673        if split_at == 0 || split_at >= entry.conversation.messages.len() {
8674            return None;
8675        }
8676
8677        let mut left = entry.clone();
8678        left.conversation.messages = entry.conversation.messages[..split_at].to_vec();
8679
8680        let mut right = entry.clone();
8681        right.conversation.messages = entry.conversation.messages[split_at..].to_vec();
8682
8683        Some((left, right))
8684    }
8685
8686    fn import_historical_batch_with_retry<F>(
8687        entries: &[HistoricalBatchEntry],
8688        insert_batch: &mut F,
8689    ) -> Result<HistoricalBatchImportTotals>
8690    where
8691        F: FnMut(&[HistoricalBatchEntry]) -> Result<HistoricalBatchImportTotals>,
8692    {
8693        match insert_batch(entries) {
8694            Ok(totals) => Ok(totals),
8695            Err(err) if Self::historical_import_error_is_split_retryable(&err) => {
8696                if entries.len() > 1 {
8697                    let mid = entries.len() / 2;
8698                    tracing::warn!(
8699                        batch_entries = entries.len(),
8700                        split_left = mid,
8701                        split_right = entries.len() - mid,
8702                        error = %err,
8703                        "historical salvage batch failed; retrying in smaller sub-batches"
8704                    );
8705                    let left =
8706                        Self::import_historical_batch_with_retry(&entries[..mid], insert_batch)?;
8707                    let right =
8708                        Self::import_historical_batch_with_retry(&entries[mid..], insert_batch)?;
8709                    return Ok(HistoricalBatchImportTotals {
8710                        inserted_source_rows: left.inserted_source_rows
8711                            + right.inserted_source_rows,
8712                        inserted_messages: left.inserted_messages + right.inserted_messages,
8713                    });
8714                }
8715
8716                if let Some(entry) = entries.first()
8717                    && let Some((left, right)) = Self::split_historical_batch_entry_messages(entry)
8718                {
8719                    tracing::warn!(
8720                        source_row_id = entry.source_row_id,
8721                        message_count = entry.conversation.messages.len(),
8722                        error = %err,
8723                        "historical salvage conversation failed; retrying in smaller message slices"
8724                    );
8725                    let left_totals = Self::import_historical_batch_with_retry(
8726                        std::slice::from_ref(&left),
8727                        insert_batch,
8728                    )?;
8729                    let right_totals = Self::import_historical_batch_with_retry(
8730                        std::slice::from_ref(&right),
8731                        insert_batch,
8732                    )?;
8733                    return Ok(HistoricalBatchImportTotals {
8734                        inserted_source_rows: usize::from(
8735                            left_totals.inserted_source_rows > 0
8736                                || right_totals.inserted_source_rows > 0,
8737                        ),
8738                        inserted_messages: left_totals
8739                            .inserted_messages
8740                            .saturating_add(right_totals.inserted_messages),
8741                    });
8742                }
8743
8744                Err(err)
8745            }
8746            Err(err) => Err(err),
8747        }
8748    }
8749
8750    fn import_historical_sources(&self, source_conn: &FrankenConnection) -> Result<()> {
8751        let sources: Vec<Source> = match source_conn.query_map_collect(
8752            "SELECT id, kind, host_label, machine_id, platform, config_json, created_at, updated_at
8753             FROM sources",
8754            fparams![],
8755            |row| {
8756                let raw_source_id: String = row.get_typed(0)?;
8757                let kind_str: String = row.get_typed(1)?;
8758                let raw_host_label: Option<String> = row.get_typed(2)?;
8759                let config_json_raw: Option<String> = row.get_typed(5)?;
8760                let (source_id, source_kind, host_label) = normalized_storage_source_parts(
8761                    Some(raw_source_id.as_str()),
8762                    Some(kind_str.as_str()),
8763                    raw_host_label.as_deref(),
8764                );
8765                Ok(Source {
8766                    id: source_id,
8767                    kind: source_kind,
8768                    host_label,
8769                    machine_id: row.get_typed(3)?,
8770                    platform: row.get_typed(4)?,
8771                    config_json: config_json_raw.and_then(|raw| serde_json::from_str(&raw).ok()),
8772                    created_at: row.get_typed(6)?,
8773                    updated_at: row.get_typed(7)?,
8774                })
8775            },
8776        ) {
8777            Ok(rows) => rows,
8778            Err(err) => {
8779                tracing::warn!(error = %err, "historical sources table unavailable; skipping source import");
8780                return Ok(());
8781            }
8782        };
8783
8784        for source in sources {
8785            self.upsert_source(&source)?;
8786        }
8787        Ok(())
8788    }
8789
8790    fn import_historical_conversations(
8791        &self,
8792        bundle: &HistoricalDatabaseBundle,
8793        salvage_method: &str,
8794        source_conn: &FrankenConnection,
8795    ) -> Result<(usize, usize)> {
8796        let batch_limits = historical_import_batch_limits();
8797        let cache_enabled = IndexingCache::is_enabled();
8798        let mut indexing_cache = IndexingCache::new();
8799        let mut known_sources: HashSet<String> = self
8800            .list_sources()?
8801            .into_iter()
8802            .map(|source| source.id)
8803            .collect();
8804        let resume_progress = self.load_historical_bundle_progress(bundle)?;
8805        let resume_after_row_id = resume_progress
8806            .as_ref()
8807            .map(|progress| progress.last_completed_source_row_id)
8808            .filter(|row_id| *row_id > 0);
8809
8810        tracing::info!(
8811            target: "cass::historical_salvage",
8812            batch_conversations = batch_limits.conversations,
8813            batch_messages = batch_limits.messages,
8814            batch_payload_chars = batch_limits.payload_chars,
8815            cache_enabled,
8816            resume_after_row_id,
8817            "configured historical salvage batch limits"
8818        );
8819
8820        if let Some(progress) = &resume_progress {
8821            tracing::info!(
8822                target: "cass::historical_salvage",
8823                path = %bundle.root_path.display(),
8824                resume_after_row_id = progress.last_completed_source_row_id,
8825                prior_conversations_imported = progress.conversations_imported,
8826                prior_messages_imported = progress.messages_imported,
8827                "resuming historical salvage bundle from durable checkpoint"
8828            );
8829        }
8830
8831        // LEFT JOIN + COALESCE on agents so legacy source databases with NULL
8832        // agent_id (the V1 schema did not require NOT NULL) still have their
8833        // conversations imported, degrading to 'unknown' slug like the other
8834        // rebuild paths.  Using INNER JOIN here would silently drop those
8835        // conversations during historical salvage, which is data loss.
8836        let conv_sql = if resume_after_row_id.is_some() {
8837            "SELECT
8838                c.id,
8839                COALESCE(a.slug, 'unknown'),
8840                w.path,
8841                c.external_id,
8842                c.title,
8843                c.source_path,
8844                c.started_at,
8845                c.ended_at,
8846                c.approx_tokens,
8847                c.metadata_json,
8848                c.source_id,
8849                c.origin_host
8850             FROM conversations c
8851             LEFT JOIN agents a ON c.agent_id = a.id
8852             LEFT JOIN workspaces w ON c.workspace_id = w.id
8853             WHERE c.id > ?1
8854             ORDER BY c.id"
8855        } else {
8856            "SELECT
8857                c.id,
8858                COALESCE(a.slug, 'unknown'),
8859                w.path,
8860                c.external_id,
8861                c.title,
8862                c.source_path,
8863                c.started_at,
8864                c.ended_at,
8865                c.approx_tokens,
8866                c.metadata_json,
8867                c.source_id,
8868                c.origin_host
8869             FROM conversations c
8870             LEFT JOIN agents a ON c.agent_id = a.id
8871             LEFT JOIN workspaces w ON c.workspace_id = w.id
8872             ORDER BY c.id"
8873        };
8874        let conv_params: &[ParamValue] =
8875            if let Some(last_completed_source_row_id) = resume_after_row_id {
8876                &[ParamValue::from(last_completed_source_row_id)]
8877            } else {
8878                &[]
8879            };
8880
8881        #[allow(clippy::type_complexity)]
8882        let conv_rows: Vec<(
8883            i64,
8884            String,
8885            Option<String>,
8886            Option<String>,
8887            Option<String>,
8888            String,
8889            Option<i64>,
8890            Option<i64>,
8891            Option<i64>,
8892            Option<String>,
8893            Option<String>,
8894            Option<String>,
8895        )> = source_conn
8896            .query_map_collect(conv_sql, conv_params, |row| {
8897                Ok((
8898                    row.get_typed::<i64>(0)?,
8899                    row.get_typed::<String>(1)?,
8900                    row.get_typed::<Option<String>>(2)?,
8901                    row.get_typed::<Option<String>>(3)?,
8902                    row.get_typed::<Option<String>>(4)?,
8903                    row.get_typed::<String>(5)?,
8904                    row.get_typed::<Option<i64>>(6)?,
8905                    row.get_typed::<Option<i64>>(7)?,
8906                    row.get_typed::<Option<i64>>(8)?,
8907                    row.get_typed::<Option<String>>(9)?,
8908                    row.get_typed::<Option<String>>(10)?,
8909                    row.get_typed::<Option<String>>(11)?,
8910                ))
8911            })
8912            .context("querying historical conversations")?;
8913
8914        let msg_sql = "SELECT idx, role, author, created_at, content, extra_json
8915             FROM messages
8916             WHERE conversation_id = ?1
8917             ORDER BY idx";
8918
8919        let mut imported_conversations = resume_progress
8920            .as_ref()
8921            .map(|progress| progress.conversations_imported)
8922            .unwrap_or(0);
8923        let mut imported_messages = resume_progress
8924            .as_ref()
8925            .map(|progress| progress.messages_imported)
8926            .unwrap_or(0);
8927        let mut pending_batch: Vec<HistoricalBatchEntry> = Vec::new();
8928        let mut pending_batch_messages = 0usize;
8929        let mut pending_batch_chars = 0usize;
8930        let mut pending_batch_first_row_id: Option<i64> = None;
8931        let mut pending_batch_last_row_id: Option<i64> = None;
8932
8933        let flush_batch = |storage: &FrankenStorage,
8934                           batch: &mut Vec<HistoricalBatchEntry>,
8935                           pending_messages: &mut usize,
8936                           pending_chars: &mut usize,
8937                           first_row_id: &mut Option<i64>,
8938                           last_row_id: &mut Option<i64>,
8939                           imported_conversations: &mut usize,
8940                           imported_messages: &mut usize|
8941         -> Result<()> {
8942            if batch.is_empty() {
8943                return Ok(());
8944            }
8945
8946            let batch_first_row_id = *first_row_id;
8947            let batch_last_row_id = *last_row_id;
8948            if historical_salvage_debug_enabled() {
8949                eprintln!(
8950                    "[historical-salvage] flushing batch rows {:?}..{:?} conversations={} messages={} payload_chars={}",
8951                    batch_first_row_id,
8952                    batch_last_row_id,
8953                    batch.len(),
8954                    *pending_messages,
8955                    *pending_chars
8956                );
8957            }
8958            tracing::info!(
8959                target: "cass::historical_salvage",
8960                batch_conversations = batch.len(),
8961                batch_messages = *pending_messages,
8962                batch_payload_chars = *pending_chars,
8963                first_source_row_id = batch_first_row_id,
8964                last_source_row_id = batch_last_row_id,
8965                "flushing historical salvage batch"
8966            );
8967
8968            let mut insert_batch =
8969                |entries: &[HistoricalBatchEntry]| -> Result<HistoricalBatchImportTotals> {
8970                    let borrowed_batch: Vec<(i64, Option<i64>, &Conversation)> = entries
8971                        .iter()
8972                        .map(|entry| (entry.agent_id, entry.workspace_id, &entry.conversation))
8973                        .collect();
8974                    let outcomes = storage
8975                        .insert_conversations_batched(&borrowed_batch)
8976                        .with_context(|| {
8977                            let first_source_row_id =
8978                                entries.first().map(|entry| entry.source_row_id);
8979                            let last_source_row_id =
8980                                entries.last().map(|entry| entry.source_row_id);
8981                            format!(
8982                                "inserting historical salvage batch source rows {:?}..{:?}",
8983                                first_source_row_id, last_source_row_id
8984                            )
8985                        })?;
8986                    let mut totals = HistoricalBatchImportTotals::default();
8987                    for outcome in outcomes {
8988                        if !outcome.inserted_indices.is_empty() {
8989                            totals.inserted_source_rows += 1;
8990                            totals.inserted_messages += outcome.inserted_indices.len();
8991                        }
8992                    }
8993                    Ok(totals)
8994                };
8995            let totals =
8996                Self::import_historical_batch_with_retry(batch.as_slice(), &mut insert_batch)?;
8997            *imported_conversations =
8998                (*imported_conversations).saturating_add(totals.inserted_source_rows);
8999            *imported_messages = (*imported_messages).saturating_add(totals.inserted_messages);
9000            if let Some(last_completed_row_id) = batch_last_row_id {
9001                storage.record_historical_bundle_progress(
9002                    bundle,
9003                    salvage_method,
9004                    last_completed_row_id,
9005                    *imported_conversations,
9006                    *imported_messages,
9007                )?;
9008            }
9009            tracing::info!(
9010                target: "cass::historical_salvage",
9011                batch_conversations = batch.len(),
9012                batch_messages = *pending_messages,
9013                imported_conversations = *imported_conversations,
9014                imported_messages = *imported_messages,
9015                first_source_row_id = batch_first_row_id,
9016                last_source_row_id = batch_last_row_id,
9017                "historical salvage batch committed"
9018            );
9019            if historical_salvage_debug_enabled() {
9020                eprintln!(
9021                    "[historical-salvage] committed batch rows {:?}..{:?} imported_conversations={} imported_messages={}",
9022                    batch_first_row_id,
9023                    batch_last_row_id,
9024                    *imported_conversations,
9025                    *imported_messages
9026                );
9027            }
9028            batch.clear();
9029            *pending_messages = 0;
9030            *pending_chars = 0;
9031            *first_row_id = None;
9032            *last_row_id = None;
9033            Ok(())
9034        };
9035
9036        for (
9037            conversation_row_id,
9038            agent_slug,
9039            workspace_path,
9040            external_id,
9041            title,
9042            source_path,
9043            started_at,
9044            ended_at,
9045            approx_tokens,
9046            metadata_json_raw,
9047            raw_source_id,
9048            raw_origin_host,
9049        ) in conv_rows
9050        {
9051            let source_id = crate::search::tantivy::normalized_index_source_id(
9052                raw_source_id.as_deref(),
9053                None,
9054                raw_origin_host.as_deref(),
9055            );
9056            let origin_host =
9057                crate::search::tantivy::normalized_index_origin_host(raw_origin_host.as_deref());
9058
9059            let messages: Vec<Message> = source_conn
9060                .query_map_collect(msg_sql, fparams![conversation_row_id], |msg_row| {
9061                    let role: String = msg_row.get_typed(1)?;
9062                    Ok(Message {
9063                        id: None,
9064                        idx: msg_row.get_typed(0)?,
9065                        role: match role.as_str() {
9066                            "user" => MessageRole::User,
9067                            "agent" | "assistant" => MessageRole::Agent,
9068                            "tool" => MessageRole::Tool,
9069                            "system" => MessageRole::System,
9070                            other => MessageRole::Other(other.to_string()),
9071                        },
9072                        author: msg_row.get_typed(2)?,
9073                        created_at: msg_row.get_typed(3)?,
9074                        content: msg_row.get_typed(4)?,
9075                        extra_json: parse_historical_json_column(msg_row.get_typed(5)?),
9076                        snippets: Vec::new(),
9077                    })
9078                })
9079                .context("collecting historical message rows")?;
9080
9081            if messages.is_empty() {
9082                continue;
9083            }
9084
9085            let conversation_message_count = messages.len();
9086            let conversation_chars = messages
9087                .iter()
9088                .map(message_payload_size_hint)
9089                .sum::<usize>();
9090
9091            let conversation = Conversation {
9092                id: None,
9093                agent_slug: agent_slug.clone(),
9094                workspace: workspace_path.map(PathBuf::from),
9095                external_id,
9096                title,
9097                source_path: PathBuf::from(source_path),
9098                started_at,
9099                ended_at,
9100                approx_tokens,
9101                metadata_json: parse_json_column(metadata_json_raw),
9102                messages,
9103                source_id,
9104                origin_host,
9105            };
9106
9107            if !known_sources.contains(&conversation.source_id) {
9108                let placeholder = if conversation.source_id == LOCAL_SOURCE_ID {
9109                    Source::local()
9110                } else {
9111                    Source {
9112                        id: conversation.source_id.clone(),
9113                        kind: SourceKind::Ssh,
9114                        host_label: conversation.origin_host.clone(),
9115                        machine_id: None,
9116                        platform: None,
9117                        config_json: None,
9118                        created_at: None,
9119                        updated_at: None,
9120                    }
9121                };
9122                self.upsert_source(&placeholder)?;
9123                known_sources.insert(conversation.source_id.clone());
9124            }
9125
9126            let agent = Agent {
9127                id: None,
9128                slug: agent_slug.clone(),
9129                name: agent_slug,
9130                version: None,
9131                kind: AgentKind::Cli,
9132            };
9133            let agent_id = if cache_enabled {
9134                indexing_cache.get_or_insert_agent(self, &agent)?
9135            } else {
9136                self.ensure_agent(&agent)?
9137            };
9138            let workspace_id = if let Some(workspace) = &conversation.workspace {
9139                if cache_enabled {
9140                    Some(indexing_cache.get_or_insert_workspace(self, workspace, None)?)
9141                } else {
9142                    Some(self.ensure_workspace(workspace, None)?)
9143                }
9144            } else {
9145                None
9146            };
9147
9148            let exceeds_pending_limits = !pending_batch.is_empty()
9149                && (pending_batch.len() >= batch_limits.conversations
9150                    || pending_batch_messages.saturating_add(conversation_message_count)
9151                        > batch_limits.messages
9152                    || pending_batch_chars.saturating_add(conversation_chars)
9153                        > batch_limits.payload_chars);
9154            if exceeds_pending_limits {
9155                flush_batch(
9156                    self,
9157                    &mut pending_batch,
9158                    &mut pending_batch_messages,
9159                    &mut pending_batch_chars,
9160                    &mut pending_batch_first_row_id,
9161                    &mut pending_batch_last_row_id,
9162                    &mut imported_conversations,
9163                    &mut imported_messages,
9164                )?;
9165            }
9166
9167            if pending_batch_first_row_id.is_none() {
9168                pending_batch_first_row_id = Some(conversation_row_id);
9169            }
9170            pending_batch_last_row_id = Some(conversation_row_id);
9171            pending_batch_messages =
9172                pending_batch_messages.saturating_add(conversation_message_count);
9173            pending_batch_chars = pending_batch_chars.saturating_add(conversation_chars);
9174            pending_batch.push(HistoricalBatchEntry {
9175                source_row_id: conversation_row_id,
9176                agent_id,
9177                workspace_id,
9178                conversation,
9179            });
9180
9181            if pending_batch.len() >= batch_limits.conversations
9182                || pending_batch_messages >= batch_limits.messages
9183                || pending_batch_chars >= batch_limits.payload_chars
9184            {
9185                flush_batch(
9186                    self,
9187                    &mut pending_batch,
9188                    &mut pending_batch_messages,
9189                    &mut pending_batch_chars,
9190                    &mut pending_batch_first_row_id,
9191                    &mut pending_batch_last_row_id,
9192                    &mut imported_conversations,
9193                    &mut imported_messages,
9194                )?;
9195            }
9196        }
9197
9198        flush_batch(
9199            self,
9200            &mut pending_batch,
9201            &mut pending_batch_messages,
9202            &mut pending_batch_chars,
9203            &mut pending_batch_first_row_id,
9204            &mut pending_batch_last_row_id,
9205            &mut imported_conversations,
9206            &mut imported_messages,
9207        )?;
9208
9209        if cache_enabled {
9210            let (hits, misses, hit_rate) = indexing_cache.stats();
9211            tracing::info!(
9212                target: "cass::historical_salvage",
9213                hits,
9214                misses,
9215                hit_rate = format!("{:.1}%", hit_rate * 100.0),
9216                agents = indexing_cache.agent_count(),
9217                workspaces = indexing_cache.workspace_count(),
9218                sources = known_sources.len(),
9219                "historical salvage cache stats"
9220            );
9221        }
9222
9223        Ok((imported_conversations, imported_messages))
9224    }
9225
9226    pub fn salvage_historical_databases(
9227        &self,
9228        canonical_db_path: &Path,
9229    ) -> Result<HistoricalSalvageOutcome> {
9230        let ordered_bundles = discover_historical_database_bundles(canonical_db_path);
9231        let mut outcome = HistoricalSalvageOutcome {
9232            bundles_considered: ordered_bundles.len(),
9233            ..HistoricalSalvageOutcome::default()
9234        };
9235
9236        for bundle in ordered_bundles {
9237            if self.historical_bundle_already_imported(&bundle)? {
9238                self.clear_historical_bundle_progress(&bundle)?;
9239                continue;
9240            }
9241
9242            let source = match open_historical_bundle_for_salvage(&bundle).with_context(|| {
9243                format!(
9244                    "opening historical bundle {} for salvage",
9245                    bundle.root_path.display()
9246                )
9247            }) {
9248                Ok(source) => source,
9249                Err(err) => {
9250                    tracing::warn!(
9251                        path = %bundle.root_path.display(),
9252                        error = %err,
9253                        "skipping unreadable historical cass database bundle during salvage"
9254                    );
9255                    self.clear_historical_bundle_progress(&bundle)?;
9256                    continue;
9257                }
9258            };
9259
9260            // #247 (coding_agent_session_search-r8pcy): if a per-bundle progress
9261            // checkpoint already covers the backup's entire conversation row-id
9262            // space, the bundle was effectively fully imported but the daemon was
9263            // killed (e.g. OOM) before the completion ledger marker landed.
9264            // Re-scanning it is a pure O(n) no-op — every batch commits
9265            // imported=0 while taking 5-12 min. Detect it via the high-water
9266            // checkpoint, write the ledger marker, drop the checkpoint, and skip.
9267            if let Some(progress) = self.load_historical_bundle_progress(&bundle)? {
9268                let backup_max_conversation_id: i64 = source
9269                    .conn
9270                    .query_row_map(
9271                        "SELECT COALESCE(MAX(id), 0) FROM conversations",
9272                        fparams![],
9273                        |row| row.get_typed(0),
9274                    )
9275                    .unwrap_or(0);
9276                if backup_max_conversation_id > 0
9277                    && progress.last_completed_source_row_id >= backup_max_conversation_id
9278                {
9279                    self.record_historical_bundle_import(
9280                        &bundle,
9281                        source.method,
9282                        progress.conversations_imported,
9283                        progress.messages_imported,
9284                    )?;
9285                    self.clear_historical_bundle_progress(&bundle)?;
9286                    tracing::info!(
9287                        path = %bundle.root_path.display(),
9288                        last_completed_source_row_id = progress.last_completed_source_row_id,
9289                        backup_max_conversation_id,
9290                        conversations_imported = progress.conversations_imported,
9291                        messages_imported = progress.messages_imported,
9292                        "historical bundle already fully imported per checkpoint; marking salvaged and skipping O(n) re-scan"
9293                    );
9294                    continue;
9295                }
9296            }
9297
9298            self.import_historical_sources(&source.conn)?;
9299            let (imported_conversations, imported_messages) =
9300                self.import_historical_conversations(&bundle, source.method, &source.conn)?;
9301            self.record_historical_bundle_import(
9302                &bundle,
9303                source.method,
9304                imported_conversations,
9305                imported_messages,
9306            )?;
9307            self.clear_historical_bundle_progress(&bundle)?;
9308
9309            outcome.bundles_imported += 1;
9310            outcome.conversations_imported += imported_conversations;
9311            outcome.messages_imported += imported_messages;
9312
9313            tracing::info!(
9314                path = %bundle.root_path.display(),
9315                bytes = bundle.total_bytes,
9316                method = source.method,
9317                imported_conversations,
9318                imported_messages,
9319                "salvaged historical cass database bundle"
9320            );
9321        }
9322
9323        Ok(outcome)
9324    }
9325
9326    /// Delete a source by ID. Returns true if a row was deleted.
9327    pub fn delete_source(&self, id: &str, _cascade: bool) -> Result<bool> {
9328        if id == LOCAL_SOURCE_ID {
9329            anyhow::bail!("cannot delete the local source");
9330        }
9331        let count = self
9332            .conn
9333            .execute_compat("DELETE FROM sources WHERE id = ?1", fparams![id])?;
9334        if count > 0 {
9335            self.invalidate_conversation_source_cache(id);
9336        }
9337        Ok(count > 0)
9338    }
9339
9340    /// Insert a conversation tree (conversation + messages + snippets + FTS).
9341    pub fn insert_conversation_tree(
9342        &self,
9343        agent_id: i64,
9344        workspace_id: Option<i64>,
9345        conv: &Conversation,
9346    ) -> Result<InsertOutcome> {
9347        let normalized_conv = normalized_conversation_for_storage(conv);
9348        let conv = normalized_conv.as_ref();
9349        self.ensure_source_for_conversation(conv)?;
9350        let defer_lexical_updates = defer_storage_lexical_updates_enabled();
9351        let defer_analytics_updates = defer_analytics_updates_enabled();
9352        let conversation_key = conversation_merge_key(agent_id, conv);
9353        let mut tx = self.conn.transaction()?;
9354        let existing = franken_find_existing_conversation_with_tail_by_key(
9355            &tx,
9356            &conversation_key,
9357            Some(conv),
9358        )?;
9359        if let Some(existing) = existing {
9360            let outcome = self.franken_append_messages_with_tail_in_tx(
9361                &tx,
9362                agent_id,
9363                existing.id,
9364                conv,
9365                existing.tail_state,
9366                defer_lexical_updates,
9367                defer_analytics_updates,
9368            )?;
9369            tx.commit()?;
9370            return Ok(outcome);
9371        }
9372
9373        let conv_id = match franken_insert_conversation_or_get_existing_after_miss(
9374            &tx,
9375            agent_id,
9376            workspace_id,
9377            conv,
9378            &conversation_key,
9379        )? {
9380            ConversationInsertStatus::Inserted(conv_id) => conv_id,
9381            ConversationInsertStatus::Existing(existing_id) => {
9382                let ExistingMessageLookup {
9383                    by_idx: mut existing_messages,
9384                    replay: mut existing_replay_fingerprints,
9385                } = franken_existing_message_lookup(&tx, existing_id, &conv.messages)?;
9386                let ExistingConversationNewMessages {
9387                    messages: new_messages,
9388                    new_chars: _planned_new_chars,
9389                    idx_collision_count,
9390                    first_collision_idx,
9391                } = collect_new_messages_for_existing_conversation(
9392                    existing_id,
9393                    conv,
9394                    &mut existing_messages,
9395                    &mut existing_replay_fingerprints,
9396                    "skipping replay-equivalent recovered message with shifted idx",
9397                );
9398                let (inserted_last_idx, inserted_last_created_at) =
9399                    borrowed_messages_tail_state(&new_messages);
9400                let mut inserted_indices = Vec::new();
9401                let mut fts_entries = Vec::new();
9402                let mut fts_pending_chars = 0usize;
9403                let mut _fts_inserted_total = 0usize;
9404                let inserted_messages =
9405                    franken_append_insert_new_messages(&tx, existing_id, &new_messages)?;
9406                let inserted_chars = inserted_messages
9407                    .iter()
9408                    .map(|(_, msg)| msg.content.len() as i64)
9409                    .sum::<i64>();
9410                for (msg_id, msg) in inserted_messages {
9411                    franken_insert_snippets(&tx, msg_id, &msg.snippets)?;
9412                    if !defer_lexical_updates {
9413                        fts_entries.push(FtsEntry::from_message(msg_id, msg, conv));
9414                        fts_pending_chars = fts_pending_chars.saturating_add(msg.content.len());
9415                        if fts_entries.len() >= FTS_ENTRY_BATCH_MAX_DOCS
9416                            || fts_pending_chars >= FTS_ENTRY_BATCH_MAX_CHARS
9417                        {
9418                            flush_pending_fts_entries(
9419                                self,
9420                                &tx,
9421                                &mut fts_entries,
9422                                &mut fts_pending_chars,
9423                                &mut _fts_inserted_total,
9424                            )?;
9425                        }
9426                    }
9427                    inserted_indices.push(msg.idx);
9428                }
9429
9430                if idx_collision_count > 0 {
9431                    tracing::warn!(
9432                        conversation_id = existing_id,
9433                        collision_count = idx_collision_count,
9434                        first_idx = first_collision_idx,
9435                        source_path = %conv.source_path.display(),
9436                        "message idx collisions encountered while merging recovered conversation; retaining canonical message variants"
9437                    );
9438                }
9439
9440                if !defer_lexical_updates {
9441                    flush_pending_fts_entries(
9442                        self,
9443                        &tx,
9444                        &mut fts_entries,
9445                        &mut fts_pending_chars,
9446                        &mut _fts_inserted_total,
9447                    )?;
9448                }
9449
9450                let conv_last_ts = conversation_tail_ended_at_candidate(conv);
9451                franken_update_conversation_tail_state(
9452                    &tx,
9453                    existing_id,
9454                    conv_last_ts,
9455                    inserted_last_idx,
9456                    inserted_last_created_at,
9457                )?;
9458                if let Some(lookup_key) = conversation_external_lookup_key_for_conv(agent_id, conv)
9459                {
9460                    franken_update_external_conversation_tail_lookup_key(
9461                        &tx,
9462                        &lookup_key,
9463                        conv_last_ts,
9464                        inserted_last_idx,
9465                        inserted_last_created_at,
9466                    )?;
9467                }
9468
9469                if !defer_analytics_updates && !inserted_indices.is_empty() {
9470                    franken_update_daily_stats_in_tx(
9471                        self,
9472                        &tx,
9473                        &conv.agent_slug,
9474                        &conv.source_id,
9475                        conversation_effective_started_at(conv),
9476                        StatsDelta {
9477                            session_count_delta: 0,
9478                            message_count_delta: inserted_indices.len() as i64,
9479                            total_chars_delta: inserted_chars,
9480                        },
9481                    )?;
9482                }
9483
9484                tx.commit()?;
9485                return Ok(InsertOutcome {
9486                    conversation_id: existing_id,
9487                    conversation_inserted: false,
9488                    inserted_indices,
9489                });
9490            }
9491        };
9492        let mut fts_entries = Vec::new();
9493        let mut fts_pending_chars = 0usize;
9494        let mut _fts_inserted_total = 0usize;
9495        let mut total_chars: i64 = 0;
9496        let mut inserted_indices = Vec::new();
9497        let mut pending_messages = HashMap::new();
9498        let mut pending_replay_fingerprints = HashSet::new();
9499        let mut idx_collision_count = 0usize;
9500        let mut first_collision_idx: Option<i64> = None;
9501        let mut new_messages = Vec::new();
9502        for msg in &conv.messages {
9503            let incoming_fingerprint = message_merge_fingerprint(msg);
9504            if let Some(existing_fingerprint) = pending_messages.get(&msg.idx) {
9505                if existing_fingerprint != &incoming_fingerprint {
9506                    idx_collision_count = idx_collision_count.saturating_add(1);
9507                    first_collision_idx.get_or_insert(msg.idx);
9508                }
9509                continue;
9510            }
9511            let incoming_replay = message_replay_fingerprint(msg);
9512            if pending_replay_fingerprints.contains(&incoming_replay) {
9513                tracing::debug!(
9514                    conversation_id = conv_id,
9515                    idx = msg.idx,
9516                    source_path = %conv.source_path.display(),
9517                    "skipping replay-equivalent duplicate message within new conversation insert"
9518                );
9519                continue;
9520            }
9521            pending_messages.insert(msg.idx, incoming_fingerprint);
9522            pending_replay_fingerprints.insert(incoming_replay);
9523            new_messages.push(msg);
9524        }
9525        let inserted_message_ids = franken_batch_insert_new_messages(&tx, conv_id, &new_messages)?;
9526        for (msg_id, msg) in inserted_message_ids.into_iter().zip(new_messages) {
9527            franken_insert_snippets(&tx, msg_id, &msg.snippets)?;
9528            if !defer_lexical_updates {
9529                fts_entries.push(FtsEntry::from_message(msg_id, msg, conv));
9530                fts_pending_chars = fts_pending_chars.saturating_add(msg.content.len());
9531                if fts_entries.len() >= FTS_ENTRY_BATCH_MAX_DOCS
9532                    || fts_pending_chars >= FTS_ENTRY_BATCH_MAX_CHARS
9533                {
9534                    flush_pending_fts_entries(
9535                        self,
9536                        &tx,
9537                        &mut fts_entries,
9538                        &mut fts_pending_chars,
9539                        &mut _fts_inserted_total,
9540                    )?;
9541                }
9542            }
9543            total_chars += msg.content.len() as i64;
9544            inserted_indices.push(msg.idx);
9545        }
9546        if idx_collision_count > 0 {
9547            tracing::warn!(
9548                conversation_id = conv_id,
9549                collision_count = idx_collision_count,
9550                first_idx = first_collision_idx,
9551                source_path = %conv.source_path.display(),
9552                "message idx collisions encountered while inserting a new conversation; retaining the first canonical variant per idx"
9553            );
9554        }
9555        if !defer_lexical_updates {
9556            flush_pending_fts_entries(
9557                self,
9558                &tx,
9559                &mut fts_entries,
9560                &mut fts_pending_chars,
9561                &mut _fts_inserted_total,
9562            )?;
9563        }
9564
9565        if !defer_analytics_updates {
9566            franken_update_daily_stats_in_tx(
9567                self,
9568                &tx,
9569                &conv.agent_slug,
9570                &conv.source_id,
9571                conversation_effective_started_at(conv),
9572                StatsDelta {
9573                    session_count_delta: 1,
9574                    message_count_delta: inserted_indices.len() as i64,
9575                    total_chars_delta: total_chars,
9576                },
9577            )?;
9578        }
9579
9580        tx.commit()?;
9581        Ok(InsertOutcome {
9582            conversation_id: conv_id,
9583            conversation_inserted: true,
9584            inserted_indices,
9585        })
9586    }
9587
9588    #[cfg(test)]
9589    fn insert_conversation_tree_with_profile(
9590        &self,
9591        agent_id: i64,
9592        workspace_id: Option<i64>,
9593        conv: &Conversation,
9594        profile: &mut InsertConversationTreePerfProfile,
9595    ) -> Result<InsertOutcome> {
9596        let total_start = Instant::now();
9597        let normalized_conv = normalized_conversation_for_storage(conv);
9598        let conv = normalized_conv.as_ref();
9599
9600        let source_start = Instant::now();
9601        self.ensure_source_for_conversation(conv)?;
9602        profile.source_duration += source_start.elapsed();
9603
9604        let defer_lexical_updates = defer_storage_lexical_updates_enabled();
9605        let defer_analytics_updates = defer_analytics_updates_enabled();
9606        let conversation_key = conversation_merge_key(agent_id, conv);
9607
9608        let tx_open_start = Instant::now();
9609        let mut tx = self.conn.transaction()?;
9610        profile.tx_open_duration += tx_open_start.elapsed();
9611
9612        let existing_lookup_start = Instant::now();
9613        let existing =
9614            franken_find_existing_conversation_by_key(&tx, &conversation_key, Some(conv))?;
9615        profile.existing_lookup_duration += existing_lookup_start.elapsed();
9616        if let Some(existing_id) = existing {
9617            return Err(anyhow!(
9618                "profile helper expects new conversation path, found existing id {existing_id}"
9619            ));
9620        }
9621
9622        let conversation_row_start = Instant::now();
9623        let conv_id = match franken_insert_conversation_or_get_existing_after_miss(
9624            &tx,
9625            agent_id,
9626            workspace_id,
9627            conv,
9628            &conversation_key,
9629        )? {
9630            ConversationInsertStatus::Inserted(conv_id) => conv_id,
9631            ConversationInsertStatus::Existing(existing_id) => {
9632                return Err(anyhow!(
9633                    "profile helper expected inserted conversation row, reused existing id {existing_id}"
9634                ));
9635            }
9636        };
9637        profile.conversation_row_duration += conversation_row_start.elapsed();
9638
9639        let mut fts_entries = Vec::new();
9640        let mut fts_pending_chars = 0usize;
9641        let mut fts_inserted_total = 0usize;
9642        let mut total_chars: i64 = 0;
9643        let mut inserted_indices = Vec::new();
9644        let mut pending_messages = HashMap::new();
9645        let mut pending_replay_fingerprints = HashSet::new();
9646        let mut idx_collision_count = 0usize;
9647        let mut first_collision_idx: Option<i64> = None;
9648        let mut new_messages = Vec::new();
9649
9650        for msg in &conv.messages {
9651            let incoming_fingerprint = message_merge_fingerprint(msg);
9652            if let Some(existing_fingerprint) = pending_messages.get(&msg.idx) {
9653                if existing_fingerprint != &incoming_fingerprint {
9654                    idx_collision_count = idx_collision_count.saturating_add(1);
9655                    first_collision_idx.get_or_insert(msg.idx);
9656                }
9657                continue;
9658            }
9659
9660            let incoming_replay = message_replay_fingerprint(msg);
9661            if pending_replay_fingerprints.contains(&incoming_replay) {
9662                tracing::debug!(
9663                    conversation_id = conv_id,
9664                    idx = msg.idx,
9665                    source_path = %conv.source_path.display(),
9666                    "skipping replay-equivalent duplicate message within profiled new conversation insert"
9667                );
9668                continue;
9669            }
9670
9671            pending_messages.insert(msg.idx, incoming_fingerprint);
9672            pending_replay_fingerprints.insert(incoming_replay);
9673            new_messages.push(msg);
9674        }
9675
9676        let message_insert_start = Instant::now();
9677        let inserted_message_ids = franken_batch_insert_new_messages_with_profile(
9678            &tx,
9679            conv_id,
9680            &new_messages,
9681            &mut profile.message_insert_breakdown,
9682        )?;
9683        profile.message_insert_duration += message_insert_start.elapsed();
9684
9685        for (msg_id, msg) in inserted_message_ids.into_iter().zip(new_messages) {
9686            let snippet_insert_start = Instant::now();
9687            franken_insert_snippets(&tx, msg_id, &msg.snippets)?;
9688            profile.snippet_insert_duration += snippet_insert_start.elapsed();
9689
9690            if !defer_lexical_updates {
9691                let fts_entry_start = Instant::now();
9692                fts_entries.push(FtsEntry::from_message(msg_id, msg, conv));
9693                fts_pending_chars = fts_pending_chars.saturating_add(msg.content.len());
9694                profile.fts_entry_duration += fts_entry_start.elapsed();
9695                if fts_entries.len() >= FTS_ENTRY_BATCH_MAX_DOCS
9696                    || fts_pending_chars >= FTS_ENTRY_BATCH_MAX_CHARS
9697                {
9698                    let fts_flush_start = Instant::now();
9699                    flush_pending_fts_entries(
9700                        self,
9701                        &tx,
9702                        &mut fts_entries,
9703                        &mut fts_pending_chars,
9704                        &mut fts_inserted_total,
9705                    )?;
9706                    profile.fts_flush_duration += fts_flush_start.elapsed();
9707                }
9708            }
9709
9710            total_chars += msg.content.len() as i64;
9711            inserted_indices.push(msg.idx);
9712        }
9713
9714        if idx_collision_count > 0 {
9715            tracing::warn!(
9716                conversation_id = conv_id,
9717                collision_count = idx_collision_count,
9718                first_idx = first_collision_idx,
9719                source_path = %conv.source_path.display(),
9720                "message idx collisions encountered while profiling a new conversation insert; retaining the first canonical variant per idx"
9721            );
9722        }
9723
9724        if !defer_lexical_updates {
9725            let fts_flush_start = Instant::now();
9726            flush_pending_fts_entries(
9727                self,
9728                &tx,
9729                &mut fts_entries,
9730                &mut fts_pending_chars,
9731                &mut fts_inserted_total,
9732            )?;
9733            profile.fts_flush_duration += fts_flush_start.elapsed();
9734        }
9735
9736        if !defer_analytics_updates {
9737            let analytics_start = Instant::now();
9738            franken_update_daily_stats_in_tx(
9739                self,
9740                &tx,
9741                &conv.agent_slug,
9742                &conv.source_id,
9743                conversation_effective_started_at(conv),
9744                StatsDelta {
9745                    session_count_delta: 1,
9746                    message_count_delta: inserted_indices.len() as i64,
9747                    total_chars_delta: total_chars,
9748                },
9749            )?;
9750            profile.analytics_duration += analytics_start.elapsed();
9751        }
9752
9753        let commit_start = Instant::now();
9754        tx.commit()?;
9755        profile.commit_duration += commit_start.elapsed();
9756        profile.invocations += 1;
9757        profile.messages += conv.messages.len();
9758        profile.inserted_messages += inserted_indices.len();
9759        profile.total_duration += total_start.elapsed();
9760
9761        Ok(InsertOutcome {
9762            conversation_id: conv_id,
9763            conversation_inserted: true,
9764            inserted_indices,
9765        })
9766    }
9767
9768    #[cfg(test)]
9769    fn append_existing_conversation_with_profile(
9770        &self,
9771        agent_id: i64,
9772        _workspace_id: Option<i64>,
9773        conv: &Conversation,
9774        profile: &mut InsertConversationTreePerfProfile,
9775    ) -> Result<InsertOutcome> {
9776        let total_start = Instant::now();
9777        let normalized_conv = normalized_conversation_for_storage(conv);
9778        let conv = normalized_conv.as_ref();
9779
9780        let source_start = Instant::now();
9781        self.ensure_source_for_conversation(conv)?;
9782        profile.source_duration += source_start.elapsed();
9783
9784        let defer_lexical_updates = defer_storage_lexical_updates_enabled();
9785        let defer_analytics_updates = defer_analytics_updates_enabled();
9786        let conversation_key = conversation_merge_key(agent_id, conv);
9787
9788        let tx_open_start = Instant::now();
9789        let mut tx = self.conn.transaction()?;
9790        profile.tx_open_duration += tx_open_start.elapsed();
9791
9792        let existing_lookup_start = Instant::now();
9793        let existing = franken_find_existing_conversation_with_tail_by_key(
9794            &tx,
9795            &conversation_key,
9796            Some(conv),
9797        )?;
9798        profile.existing_lookup_duration += existing_lookup_start.elapsed();
9799        let existing = existing.ok_or_else(|| {
9800            anyhow!("append profile helper expects existing conversation for {conversation_key:?}")
9801        })?;
9802        let existing_id = existing.id;
9803
9804        let existing_idx_lookup_start = Instant::now();
9805        let append_tail_state = existing.tail_state;
9806        let append_tail_ended_at = append_tail_state.and_then(|state| state.ended_at);
9807        let existing_plan = append_tail_state.as_ref().and_then(|state| {
9808            collect_append_only_tail_messages(
9809                conv,
9810                state.last_message_idx,
9811                state.last_message_created_at,
9812            )
9813        });
9814        let used_append_tail_plan = existing_plan.is_some();
9815        profile.existing_idx_lookup_duration += existing_idx_lookup_start.elapsed();
9816
9817        let dedupe_filter_start = Instant::now();
9818        let ExistingConversationNewMessages {
9819            messages: new_messages,
9820            new_chars,
9821            idx_collision_count,
9822            first_collision_idx,
9823        } = if let Some(existing_plan) = existing_plan {
9824            existing_plan
9825        } else {
9826            let ExistingMessageLookup {
9827                by_idx: mut existing_messages,
9828                replay: mut existing_replay_fingerprints,
9829            } = franken_existing_message_lookup(&tx, existing_id, &conv.messages)?;
9830            collect_new_messages_for_existing_conversation(
9831                existing_id,
9832                conv,
9833                &mut existing_messages,
9834                &mut existing_replay_fingerprints,
9835                "skipping replay-equivalent profiled append message with shifted idx",
9836            )
9837        };
9838        profile.dedupe_filter_duration += dedupe_filter_start.elapsed();
9839
9840        let mut inserted_indices = Vec::new();
9841        let mut fts_entries = Vec::new();
9842        let mut fts_pending_chars = 0usize;
9843        let mut fts_inserted_total = 0usize;
9844        let (inserted_last_idx, inserted_last_created_at) =
9845            borrowed_messages_tail_state(&new_messages);
9846
9847        let message_insert_start = Instant::now();
9848        let inserted_message_ids = franken_append_insert_new_messages_with_profile(
9849            &tx,
9850            existing_id,
9851            &new_messages,
9852            &mut profile.message_insert_breakdown,
9853        )?;
9854        profile.message_insert_duration += message_insert_start.elapsed();
9855
9856        for (msg_id, msg) in inserted_message_ids.into_iter().zip(new_messages) {
9857            let snippet_insert_start = Instant::now();
9858            franken_insert_snippets(&tx, msg_id, &msg.snippets)?;
9859            profile.snippet_insert_duration += snippet_insert_start.elapsed();
9860
9861            if !defer_lexical_updates {
9862                let fts_entry_start = Instant::now();
9863                fts_entries.push(FtsEntry::from_message(msg_id, msg, conv));
9864                fts_pending_chars = fts_pending_chars.saturating_add(msg.content.len());
9865                profile.fts_entry_duration += fts_entry_start.elapsed();
9866                if fts_entries.len() >= FTS_ENTRY_BATCH_MAX_DOCS
9867                    || fts_pending_chars >= FTS_ENTRY_BATCH_MAX_CHARS
9868                {
9869                    let fts_flush_start = Instant::now();
9870                    flush_pending_fts_entries(
9871                        self,
9872                        &tx,
9873                        &mut fts_entries,
9874                        &mut fts_pending_chars,
9875                        &mut fts_inserted_total,
9876                    )?;
9877                    profile.fts_flush_duration += fts_flush_start.elapsed();
9878                }
9879            }
9880
9881            inserted_indices.push(msg.idx);
9882        }
9883
9884        if idx_collision_count > 0 {
9885            tracing::warn!(
9886                conversation_id = existing_id,
9887                collision_count = idx_collision_count,
9888                first_idx = first_collision_idx,
9889                source_path = %conv.source_path.display(),
9890                "message idx collisions encountered while profiling append merge; retaining canonical message variants"
9891            );
9892        }
9893
9894        if !defer_lexical_updates {
9895            let fts_flush_start = Instant::now();
9896            flush_pending_fts_entries(
9897                self,
9898                &tx,
9899                &mut fts_entries,
9900                &mut fts_pending_chars,
9901                &mut fts_inserted_total,
9902            )?;
9903            profile.fts_flush_duration += fts_flush_start.elapsed();
9904        }
9905
9906        let conversation_row_start = Instant::now();
9907        let mut exact_append_tail_set = false;
9908        if used_append_tail_plan {
9909            if let (Some(last_message_idx), Some(last_message_created_at)) =
9910                (inserted_last_idx, inserted_last_created_at)
9911            {
9912                if append_tail_ended_at.is_none_or(|ended_at| ended_at <= last_message_created_at) {
9913                    franken_set_conversation_tail_state_after_append(
9914                        &tx,
9915                        existing_id,
9916                        last_message_created_at,
9917                        last_message_idx,
9918                        last_message_created_at,
9919                    )?;
9920                    exact_append_tail_set = true;
9921                } else {
9922                    franken_update_conversation_tail_state(
9923                        &tx,
9924                        existing_id,
9925                        Some(last_message_created_at),
9926                        inserted_last_idx,
9927                        inserted_last_created_at,
9928                    )?;
9929                }
9930            }
9931        } else {
9932            let conv_last_ts = conversation_tail_ended_at_candidate(conv);
9933            franken_update_conversation_tail_state(
9934                &tx,
9935                existing_id,
9936                conv_last_ts,
9937                inserted_last_idx,
9938                inserted_last_created_at,
9939            )?;
9940        }
9941        franken_update_external_conversation_tail_after_append(
9942            &tx,
9943            agent_id,
9944            conv,
9945            used_append_tail_plan,
9946            exact_append_tail_set,
9947            inserted_last_idx,
9948            inserted_last_created_at,
9949        )?;
9950        profile.conversation_row_duration += conversation_row_start.elapsed();
9951
9952        if !defer_analytics_updates && !inserted_indices.is_empty() {
9953            let analytics_start = Instant::now();
9954            franken_update_daily_stats_in_tx(
9955                self,
9956                &tx,
9957                &conv.agent_slug,
9958                &conv.source_id,
9959                conversation_effective_started_at(conv),
9960                StatsDelta {
9961                    session_count_delta: 0,
9962                    message_count_delta: inserted_indices.len() as i64,
9963                    total_chars_delta: new_chars,
9964                },
9965            )?;
9966            profile.analytics_duration += analytics_start.elapsed();
9967        }
9968
9969        let commit_start = Instant::now();
9970        tx.commit()?;
9971        profile.commit_duration += commit_start.elapsed();
9972        profile.invocations += 1;
9973        profile.messages += conv.messages.len();
9974        profile.inserted_messages += inserted_indices.len();
9975        profile.total_duration += total_start.elapsed();
9976
9977        Ok(InsertOutcome {
9978            conversation_id: existing_id,
9979            conversation_inserted: false,
9980            inserted_indices,
9981        })
9982    }
9983
9984    /// Append new messages to an existing conversation within an active transaction.
9985    #[allow(clippy::too_many_arguments)]
9986    fn franken_append_messages_with_tail_in_tx(
9987        &self,
9988        tx: &FrankenTransaction<'_>,
9989        agent_id: i64,
9990        conversation_id: i64,
9991        conv: &Conversation,
9992        append_tail_state: Option<ExistingConversationTailState>,
9993        defer_lexical_updates: bool,
9994        defer_analytics_updates: bool,
9995    ) -> Result<InsertOutcome> {
9996        let append_tail_ended_at = append_tail_state.and_then(|state| state.ended_at);
9997        let append_plan = append_tail_state.as_ref().and_then(|state| {
9998            collect_append_only_tail_messages(
9999                conv,
10000                state.last_message_idx,
10001                state.last_message_created_at,
10002            )
10003        });
10004        let used_append_tail_plan = append_plan.is_some();
10005        let ExistingConversationNewMessages {
10006            messages: new_messages,
10007            new_chars: _planned_new_chars,
10008            idx_collision_count,
10009            first_collision_idx,
10010        } = if let Some(append_plan) = append_plan {
10011            append_plan
10012        } else {
10013            let ExistingMessageLookup {
10014                by_idx: mut existing_messages,
10015                replay: mut existing_replay_fingerprints,
10016            } = franken_existing_message_lookup(tx, conversation_id, &conv.messages)?;
10017            collect_new_messages_for_existing_conversation(
10018                conversation_id,
10019                conv,
10020                &mut existing_messages,
10021                &mut existing_replay_fingerprints,
10022                "skipping replay-equivalent recovered message with shifted idx",
10023            )
10024        };
10025
10026        let mut inserted_indices = Vec::new();
10027        let mut fts_entries = Vec::new();
10028        let mut fts_pending_chars = 0usize;
10029        let mut _fts_inserted_total = 0usize;
10030        let (inserted_last_idx, inserted_last_created_at) =
10031            borrowed_messages_tail_state(&new_messages);
10032        let inserted_messages =
10033            franken_append_insert_new_messages(tx, conversation_id, &new_messages)?;
10034        let inserted_chars = inserted_messages
10035            .iter()
10036            .map(|(_, msg)| msg.content.len() as i64)
10037            .sum::<i64>();
10038        for (msg_id, msg) in inserted_messages {
10039            franken_insert_snippets(tx, msg_id, &msg.snippets)?;
10040            if !defer_lexical_updates {
10041                fts_entries.push(FtsEntry::from_message(msg_id, msg, conv));
10042                fts_pending_chars = fts_pending_chars.saturating_add(msg.content.len());
10043                if fts_entries.len() >= FTS_ENTRY_BATCH_MAX_DOCS
10044                    || fts_pending_chars >= FTS_ENTRY_BATCH_MAX_CHARS
10045                {
10046                    flush_pending_fts_entries(
10047                        self,
10048                        tx,
10049                        &mut fts_entries,
10050                        &mut fts_pending_chars,
10051                        &mut _fts_inserted_total,
10052                    )?;
10053                }
10054            }
10055            inserted_indices.push(msg.idx);
10056        }
10057
10058        if idx_collision_count > 0 {
10059            tracing::warn!(
10060                conversation_id,
10061                collision_count = idx_collision_count,
10062                first_idx = first_collision_idx,
10063                source_path = %conv.source_path.display(),
10064                "message idx collisions encountered while appending to an existing conversation; retaining canonical message variants"
10065            );
10066        }
10067
10068        if !defer_lexical_updates {
10069            flush_pending_fts_entries(
10070                self,
10071                tx,
10072                &mut fts_entries,
10073                &mut fts_pending_chars,
10074                &mut _fts_inserted_total,
10075            )?;
10076        }
10077
10078        let mut exact_append_tail_set = false;
10079        if used_append_tail_plan {
10080            if let (Some(last_message_idx), Some(last_message_created_at)) =
10081                (inserted_last_idx, inserted_last_created_at)
10082            {
10083                if append_tail_ended_at.is_none_or(|ended_at| ended_at <= last_message_created_at) {
10084                    franken_set_conversation_tail_state_after_append(
10085                        tx,
10086                        conversation_id,
10087                        last_message_created_at,
10088                        last_message_idx,
10089                        last_message_created_at,
10090                    )?;
10091                    exact_append_tail_set = true;
10092                } else {
10093                    franken_update_conversation_tail_state(
10094                        tx,
10095                        conversation_id,
10096                        Some(last_message_created_at),
10097                        inserted_last_idx,
10098                        inserted_last_created_at,
10099                    )?;
10100                }
10101            }
10102        } else {
10103            let conv_last_ts = conversation_tail_ended_at_candidate(conv);
10104            franken_update_conversation_tail_state(
10105                tx,
10106                conversation_id,
10107                conv_last_ts,
10108                inserted_last_idx,
10109                inserted_last_created_at,
10110            )?;
10111        }
10112        franken_update_external_conversation_tail_after_append(
10113            tx,
10114            agent_id,
10115            conv,
10116            used_append_tail_plan,
10117            exact_append_tail_set,
10118            inserted_last_idx,
10119            inserted_last_created_at,
10120        )?;
10121
10122        if !defer_analytics_updates && !inserted_indices.is_empty() {
10123            let message_count = inserted_indices.len() as i64;
10124            franken_update_daily_stats_in_tx(
10125                self,
10126                tx,
10127                &conv.agent_slug,
10128                &conv.source_id,
10129                conversation_effective_started_at(conv),
10130                StatsDelta {
10131                    session_count_delta: 0,
10132                    message_count_delta: message_count,
10133                    total_chars_delta: inserted_chars,
10134                },
10135            )?;
10136        }
10137
10138        Ok(InsertOutcome {
10139            conversation_id,
10140            conversation_inserted: false,
10141            inserted_indices,
10142        })
10143    }
10144
10145    /// Rebuild the FTS5 index from scratch (chunked to avoid OOM on large databases, #110).
10146    pub fn rebuild_fts(&self) -> Result<()> {
10147        self.rebuild_fts_via_frankensqlite().map(|_| ())
10148    }
10149
10150    /// Best-effort repair for the derived SQLite FTS fallback index.
10151    ///
10152    /// The canonical archive and Tantivy index remain authoritative, so callers
10153    /// should invoke this from maintenance paths rather than ordinary opens.
10154    pub(crate) fn ensure_search_fallback_fts_consistency(&self) -> Result<FtsConsistencyRepair> {
10155        self.ensure_fts_consistency_via_frankensqlite()
10156    }
10157
10158    pub(crate) fn validate_fts_messages_integrity(&self) -> Result<()> {
10159        validate_fts_messages_integrity_for_connection(&self.conn)
10160    }
10161
10162    pub(crate) fn fallback_fts_is_known_healthy_for_archive_fingerprint(
10163        &self,
10164        archive_fingerprint: &str,
10165    ) -> Result<bool> {
10166        Ok(
10167            self.read_fts_franken_rebuild_generation()? == Some(FTS_FRANKEN_REBUILD_GENERATION)
10168                && self
10169                    .read_fts_franken_rebuild_archive_fingerprint()?
10170                    .as_deref()
10171                    == Some(archive_fingerprint),
10172        )
10173    }
10174
10175    pub(crate) fn record_search_fallback_fts_archive_fingerprint(
10176        &self,
10177        archive_fingerprint: &str,
10178    ) -> Result<()> {
10179        self.conn
10180            .execute_compat(
10181                "INSERT OR REPLACE INTO meta(key, value) VALUES(?1, ?2)",
10182                fparams![
10183                    FTS_FRANKEN_REBUILD_FINGERPRINT_META_KEY,
10184                    archive_fingerprint.to_string()
10185                ],
10186            )
10187            .with_context(|| "recording frankensqlite FTS archive fingerprint")?;
10188        Ok(())
10189    }
10190
10191    pub(crate) fn daily_stats_is_known_healthy_for_archive_fingerprint(
10192        &self,
10193        archive_fingerprint: &str,
10194    ) -> Result<bool> {
10195        Ok(
10196            self.read_daily_stats_health_generation()? == Some(DAILY_STATS_HEALTH_GENERATION)
10197                && self.read_daily_stats_archive_fingerprint()?.as_deref()
10198                    == Some(archive_fingerprint),
10199        )
10200    }
10201
10202    pub(crate) fn record_daily_stats_archive_fingerprint(
10203        &self,
10204        archive_fingerprint: &str,
10205    ) -> Result<()> {
10206        self.conn
10207            .execute_compat(
10208                "INSERT OR REPLACE INTO meta(key, value) VALUES(?1, ?2)",
10209                fparams![
10210                    DAILY_STATS_HEALTH_GENERATION_META_KEY,
10211                    DAILY_STATS_HEALTH_GENERATION.to_string()
10212                ],
10213            )
10214            .with_context(|| "recording daily_stats health generation")?;
10215        self.conn
10216            .execute_compat(
10217                "INSERT OR REPLACE INTO meta(key, value) VALUES(?1, ?2)",
10218                fparams![DAILY_STATS_HEALTH_META_KEY, archive_fingerprint.to_string()],
10219            )
10220            .with_context(|| "recording daily_stats archive fingerprint")?;
10221        Ok(())
10222    }
10223
10224    fn read_fts_franken_rebuild_generation(&self) -> Result<Option<i64>> {
10225        let value: Option<String> = self
10226            .conn
10227            .query_row_map(
10228                "SELECT value FROM meta WHERE key = ?1",
10229                fparams![FTS_FRANKEN_REBUILD_META_KEY],
10230                |row| row.get_typed(0),
10231            )
10232            .optional()?;
10233        Ok(value.and_then(|v| v.parse::<i64>().ok()))
10234    }
10235
10236    fn read_fts_franken_rebuild_archive_fingerprint(&self) -> Result<Option<String>> {
10237        Ok(self
10238            .conn
10239            .query_row_map(
10240                "SELECT value FROM meta WHERE key = ?1",
10241                fparams![FTS_FRANKEN_REBUILD_FINGERPRINT_META_KEY],
10242                |row| row.get_typed(0),
10243            )
10244            .optional()?)
10245    }
10246
10247    fn read_daily_stats_health_generation(&self) -> Result<Option<i64>> {
10248        let value: Option<String> = self
10249            .conn
10250            .query_row_map(
10251                "SELECT value FROM meta WHERE key = ?1",
10252                fparams![DAILY_STATS_HEALTH_GENERATION_META_KEY],
10253                |row| row.get_typed(0),
10254            )
10255            .optional()?;
10256        Ok(value.and_then(|value| value.parse::<i64>().ok()))
10257    }
10258
10259    fn read_daily_stats_archive_fingerprint(&self) -> Result<Option<String>> {
10260        Ok(self
10261            .conn
10262            .query_row_map(
10263                "SELECT value FROM meta WHERE key = ?1",
10264                fparams![DAILY_STATS_HEALTH_META_KEY],
10265                |row| row.get_typed(0),
10266            )
10267            .optional()?)
10268    }
10269
10270    fn record_fts_franken_rebuild_generation(&self) -> Result<()> {
10271        self.conn
10272            .execute_compat(
10273                "INSERT OR REPLACE INTO meta(key, value) VALUES(?1, ?2)",
10274                fparams![
10275                    FTS_FRANKEN_REBUILD_META_KEY,
10276                    FTS_FRANKEN_REBUILD_GENERATION.to_string()
10277                ],
10278            )
10279            .with_context(|| "recording frankensqlite FTS rebuild generation")?;
10280        Ok(())
10281    }
10282
10283    fn ensure_fts_consistency_via_frankensqlite(&self) -> Result<FtsConsistencyRepair> {
10284        if self.read_fts_franken_rebuild_generation()? != Some(FTS_FRANKEN_REBUILD_GENERATION) {
10285            // Before triggering an expensive full rebuild, probe whether
10286            // fts_messages is already populated and consistent.  On large
10287            // databases the rebuild can take hours and OOM — skip it when
10288            // the only thing missing is the generation marker (#184).
10289            let fts_already_healthy = (|| -> Result<bool> {
10290                let fts_exists: i64 = self.conn.query_row_map(
10291                    "SELECT COUNT(*) FROM sqlite_master WHERE name = 'fts_messages'",
10292                    fparams![],
10293                    |row| row.get_typed(0),
10294                )?;
10295                if fts_exists != 1 {
10296                    return Ok(false);
10297                }
10298                let total: i64 = self.conn.query_row_map(
10299                    "SELECT COUNT(*) FROM messages",
10300                    fparams![],
10301                    |row| row.get_typed(0),
10302                )?;
10303                if total == 0 {
10304                    return Ok(false);
10305                }
10306                let indexed: i64 = self.conn.query_row_map(
10307                    "SELECT COUNT(*) FROM fts_messages",
10308                    fparams![],
10309                    |row| row.get_typed(0),
10310                )?;
10311                // Consider healthy if >=90% of messages are indexed
10312                Ok(indexed > 0 && indexed * 100 >= total * 90)
10313            })()
10314            .unwrap_or(false);
10315
10316            if fts_already_healthy {
10317                tracing::info!(
10318                    target: "cass::fts_rebuild",
10319                    "FTS already populated and consistent; setting generation marker without rebuild"
10320                );
10321                self.record_fts_franken_rebuild_generation()?;
10322                self.set_fts_messages_present_cache(true);
10323            } else {
10324                let inserted_rows = self.rebuild_fts_via_frankensqlite()?;
10325                self.record_fts_franken_rebuild_generation()?;
10326                return Ok(FtsConsistencyRepair::Rebuilt { inserted_rows });
10327            }
10328        }
10329
10330        let inspection = (|| -> Result<(i64, bool)> {
10331            let fts_schema_rows = self.conn.query_row_map(
10332                "SELECT COUNT(*) FROM sqlite_master WHERE name = 'fts_messages'",
10333                fparams![],
10334                |row| row.get_typed::<i64>(0),
10335            )?;
10336            let fts_queryable = fts_schema_rows == 1
10337                && self.conn.query("SELECT COUNT(*) FROM fts_messages").is_ok();
10338            Ok((fts_schema_rows, fts_queryable))
10339        })();
10340
10341        let (fts_schema_rows, fts_queryable) = match inspection {
10342            Ok(result) => result,
10343            Err(err) => {
10344                tracing::warn!(
10345                    error = %err,
10346                    "frankensqlite FTS consistency probe failed; rebuilding authoritative FTS"
10347                );
10348                let inserted_rows = self.rebuild_fts_via_frankensqlite()?;
10349                self.record_fts_franken_rebuild_generation()?;
10350                return Ok(FtsConsistencyRepair::Rebuilt { inserted_rows });
10351            }
10352        };
10353
10354        if fts_schema_rows != 1 || !fts_queryable {
10355            let inserted_rows = self.rebuild_fts_via_frankensqlite()?;
10356            self.record_fts_franken_rebuild_generation()?;
10357            return Ok(FtsConsistencyRepair::Rebuilt { inserted_rows });
10358        }
10359
10360        let total_messages =
10361            self.conn
10362                .query_row_map("SELECT COUNT(*) FROM messages", fparams![], |row| {
10363                    row.get_typed::<i64>(0)
10364                })?;
10365        let indexed_messages =
10366            self.conn
10367                .query_row_map("SELECT COUNT(*) FROM fts_messages", fparams![], |row| {
10368                    row.get_typed::<i64>(0)
10369                })?;
10370
10371        if indexed_messages == total_messages {
10372            self.set_fts_messages_present_cache(true);
10373            return Ok(FtsConsistencyRepair::AlreadyHealthy {
10374                rows: usize::try_from(total_messages.max(0)).unwrap_or(usize::MAX),
10375            });
10376        }
10377
10378        if indexed_messages > total_messages {
10379            let inserted_rows = self.rebuild_fts_via_frankensqlite()?;
10380            self.record_fts_franken_rebuild_generation()?;
10381            return Ok(FtsConsistencyRepair::Rebuilt { inserted_rows });
10382        }
10383
10384        let inserted_rows = self
10385            .stream_fts_rows_via_frankensqlite(true)
10386            .with_context(|| "incrementally repairing missing FTS rows via frankensqlite")?;
10387        let repaired_rows =
10388            self.conn
10389                .query_row_map("SELECT COUNT(*) FROM fts_messages", fparams![], |row| {
10390                    row.get_typed::<i64>(0)
10391                })?;
10392        if repaired_rows == total_messages {
10393            self.set_fts_messages_present_cache(true);
10394            return Ok(FtsConsistencyRepair::IncrementalCatchUp {
10395                inserted_rows,
10396                total_rows: usize::try_from(repaired_rows.max(0)).unwrap_or(usize::MAX),
10397            });
10398        }
10399
10400        // The incremental catch-up found nothing to insert, yet the gap
10401        // between total_messages (all rows, including orphans) and
10402        // indexed_messages (only rows with valid conversation_id, since the
10403        // FTS INSERT inner-joins on conversations) remains.  A full rebuild
10404        // cannot close this gap either — the orphaned messages will be
10405        // excluded again — so falling through to one would just re-do ~5 min
10406        // of work on every startup.  Accept the current state.
10407        if inserted_rows == 0 {
10408            tracing::debug!(
10409                target: "cass::fts_rebuild",
10410                indexed_messages = repaired_rows,
10411                total_messages,
10412                un_indexable_gap = total_messages.saturating_sub(repaired_rows),
10413                "FTS catch-up inserted 0 rows; remaining gap is un-indexable (likely orphaned messages with dangling conversation_id)"
10414            );
10415            self.set_fts_messages_present_cache(true);
10416            return Ok(FtsConsistencyRepair::IncrementalCatchUp {
10417                inserted_rows: 0,
10418                total_rows: usize::try_from(repaired_rows.max(0)).unwrap_or(usize::MAX),
10419            });
10420        }
10421
10422        // Incremental made progress but didn't fully close the gap — something
10423        // is genuinely inconsistent, so do a full rebuild.
10424        let inserted_rows = self.rebuild_fts_via_frankensqlite()?;
10425        self.record_fts_franken_rebuild_generation()?;
10426        Ok(FtsConsistencyRepair::Rebuilt { inserted_rows })
10427    }
10428
10429    pub(crate) fn rebuild_fts_via_frankensqlite(&self) -> Result<usize> {
10430        self.invalidate_fts_messages_present_cache();
10431        self.conn
10432            .execute("DROP TABLE IF EXISTS fts_messages;")
10433            .with_context(|| "dropping derived fts_messages before frankensqlite rebuild")?;
10434        self.conn
10435            .execute_compat(FTS5_REGISTER_SQL, fparams![])
10436            .with_context(|| "creating derived fts_messages via frankensqlite rebuild")?;
10437        self.set_fts_messages_present_cache(true);
10438
10439        self.stream_fts_rows_via_frankensqlite(false)
10440    }
10441
10442    fn stream_fts_rows_via_frankensqlite(&self, missing_only: bool) -> Result<usize> {
10443        let batch_size = fts_rebuild_batch_size().max(1);
10444        let batch_limit = i64::try_from(batch_size).unwrap_or(i64::MAX);
10445        let mut total_inserted: usize = 0;
10446        let mut total_skipped_orphans: usize = 0;
10447        let mut total_skipped_existing: usize = 0;
10448        let mut last_rowid: i64 = 0;
10449        let conversation_by_id = self.load_fts_conversation_projection_map()?;
10450        let agent_slug_by_id = self.load_fts_agent_slug_map()?;
10451        let workspace_path_by_id = self.load_fts_workspace_path_map()?;
10452        let existing_fts_rowids = if missing_only {
10453            Some(self.load_fts_message_rowid_set()?)
10454        } else {
10455            None
10456        };
10457        let mut entries = Vec::new();
10458        let mut pending_chars = 0usize;
10459
10460        loop {
10461            let rows = self.fetch_fts_rebuild_message_rows(last_rowid, batch_limit)?;
10462            let fetched_count = rows.len();
10463            if fetched_count == 0 {
10464                break;
10465            }
10466
10467            let inserted_before_batch = total_inserted;
10468            let skipped_before_batch = total_skipped_orphans;
10469            let existing_before_batch = total_skipped_existing;
10470
10471            for row in rows {
10472                last_rowid = row.rowid;
10473                if existing_fts_rowids
10474                    .as_ref()
10475                    .is_some_and(|rowids| rowids.contains(&row.message_id))
10476                {
10477                    total_skipped_existing = total_skipped_existing.saturating_add(1);
10478                    continue;
10479                }
10480                let Some(conversation) = conversation_by_id.get(&row.conversation_id) else {
10481                    total_skipped_orphans = total_skipped_orphans.saturating_add(1);
10482                    continue;
10483                };
10484                let agent = conversation
10485                    .agent_id
10486                    .and_then(|agent_id| agent_slug_by_id.get(&agent_id))
10487                    .filter(|slug| !slug.is_empty())
10488                    .cloned()
10489                    .unwrap_or_else(|| "unknown".to_string());
10490                let workspace = conversation
10491                    .workspace_id
10492                    .and_then(|workspace_id| workspace_path_by_id.get(&workspace_id))
10493                    .cloned()
10494                    .unwrap_or_default();
10495                pending_chars = pending_chars.saturating_add(row.content.len());
10496                entries.push(FtsEntry {
10497                    content: row.content,
10498                    title: conversation.title.clone(),
10499                    agent,
10500                    workspace,
10501                    source_path: conversation.source_path.clone(),
10502                    created_at: row.created_at,
10503                    message_id: row.message_id,
10504                });
10505                if entries.len() >= FTS_ENTRY_BATCH_MAX_DOCS
10506                    || pending_chars >= FTS_ENTRY_BATCH_MAX_CHARS
10507                {
10508                    total_inserted = total_inserted.saturating_add(
10509                        franken_batch_insert_fts_on_connection(&self.conn, &entries)?,
10510                    );
10511                    entries.clear();
10512                    pending_chars = 0;
10513                }
10514            }
10515
10516            if !entries.is_empty() {
10517                total_inserted = total_inserted.saturating_add(
10518                    franken_batch_insert_fts_on_connection(&self.conn, &entries)?,
10519                );
10520                entries.clear();
10521                pending_chars = 0;
10522            }
10523
10524            tracing::debug!(
10525                target: "cass::fts_rebuild",
10526                batch_rows = fetched_count,
10527                batch_inserted = total_inserted.saturating_sub(inserted_before_batch),
10528                batch_skipped_orphans = total_skipped_orphans.saturating_sub(skipped_before_batch),
10529                batch_skipped_existing = total_skipped_existing.saturating_sub(existing_before_batch),
10530                total_inserted,
10531                total_skipped_orphans,
10532                total_skipped_existing,
10533                last_rowid,
10534                missing_only,
10535                "FTS streaming maintenance batch complete"
10536            );
10537
10538            if fetched_count < batch_size {
10539                break;
10540            }
10541        }
10542
10543        Ok(total_inserted)
10544    }
10545
10546    fn fetch_fts_rebuild_message_rows(
10547        &self,
10548        last_rowid: i64,
10549        batch_limit: i64,
10550    ) -> Result<Vec<FtsRebuildMessageRow>> {
10551        self.conn
10552            .query_map_collect(
10553                "SELECT m.rowid, m.id, m.conversation_id, m.content, m.created_at
10554                 FROM messages m
10555                 WHERE m.rowid > ?1
10556                 ORDER BY m.rowid
10557                 LIMIT ?2",
10558                fparams![last_rowid, batch_limit],
10559                |row| {
10560                    Ok(FtsRebuildMessageRow {
10561                        rowid: row.get_typed(0)?,
10562                        message_id: row.get_typed(1)?,
10563                        conversation_id: row.get_typed(2)?,
10564                        content: row.get_typed::<Option<String>>(3)?.unwrap_or_default(),
10565                        created_at: row.get_typed(4)?,
10566                    })
10567                },
10568            )
10569            .with_context(|| format!("fetching FTS maintenance messages after rowid {last_rowid}"))
10570    }
10571
10572    fn load_fts_message_rowid_set(&self) -> Result<HashSet<i64>> {
10573        let rows: Vec<i64> = self
10574            .conn
10575            .query_map_collect("SELECT rowid FROM fts_messages", fparams![], |row| {
10576                row.get_typed(0)
10577            })
10578            .with_context(|| "loading existing FTS message rowids")?;
10579        Ok(rows.into_iter().collect())
10580    }
10581
10582    fn load_fts_conversation_projection_map(
10583        &self,
10584    ) -> Result<HashMap<i64, FtsConversationProjection>> {
10585        let rows: Vec<(i64, FtsConversationProjection)> = self
10586            .conn
10587            .query_map_collect(
10588                "SELECT id, title, agent_id, workspace_id, source_path
10589                 FROM conversations",
10590                fparams![],
10591                |row| {
10592                    Ok((
10593                        row.get_typed(0)?,
10594                        FtsConversationProjection {
10595                            title: row.get_typed::<Option<String>>(1)?.unwrap_or_default(),
10596                            agent_id: row.get_typed(2)?,
10597                            workspace_id: row.get_typed(3)?,
10598                            source_path: row.get_typed::<Option<String>>(4)?.unwrap_or_default(),
10599                        },
10600                    ))
10601                },
10602            )
10603            .with_context(|| "loading FTS conversation projection map")?;
10604        Ok(rows.into_iter().collect())
10605    }
10606
10607    fn load_fts_agent_slug_map(&self) -> Result<HashMap<i64, String>> {
10608        let rows: Vec<(i64, String)> = self
10609            .conn
10610            .query_map_collect("SELECT id, slug FROM agents", fparams![], |row| {
10611                Ok((
10612                    row.get_typed(0)?,
10613                    row.get_typed::<Option<String>>(1)?
10614                        .unwrap_or_else(|| "unknown".to_string()),
10615                ))
10616            })
10617            .with_context(|| "loading FTS agent slug map")?;
10618        Ok(rows.into_iter().collect())
10619    }
10620
10621    fn load_fts_workspace_path_map(&self) -> Result<HashMap<i64, String>> {
10622        let rows: Vec<(i64, String)> = self
10623            .conn
10624            .query_map_collect("SELECT id, path FROM workspaces", fparams![], |row| {
10625                Ok((
10626                    row.get_typed(0)?,
10627                    row.get_typed::<Option<String>>(1)?.unwrap_or_default(),
10628                ))
10629            })
10630            .with_context(|| "loading FTS workspace path map")?;
10631        Ok(rows.into_iter().collect())
10632    }
10633
10634    /// Fetch all messages for embedding generation.
10635    pub fn fetch_messages_for_embedding(&self) -> Result<Vec<MessageForEmbedding>> {
10636        // COALESCE(c.agent_id, 0) so legacy V1 conversations with NULL
10637        // agent_id don't cause a runtime row-decode failure (agent_id in
10638        // MessageForEmbedding is i64).  saturating_u32_from_i64 downstream
10639        // turns 0 into the "unknown agent" sentinel for doc-id hashing.
10640        self.conn
10641            .query_map_collect(
10642                "SELECT m.id, m.created_at, COALESCE(c.agent_id, 0), c.workspace_id, c.source_id, m.role, m.content
10643                 FROM messages m
10644                 JOIN conversations c ON m.conversation_id = c.id
10645                 ORDER BY m.id",
10646                fparams![],
10647                |row| {
10648                    let source_id: String = row.get_typed::<Option<String>>(4)?
10649                        .unwrap_or_else(|| "local".to_string());
10650                    Ok(MessageForEmbedding {
10651                        message_id: row.get_typed(0)?,
10652                        created_at: row.get_typed(1)?,
10653                        agent_id: row.get_typed(2)?,
10654                        workspace_id: row.get_typed(3)?,
10655                        source_id_hash: crc32fast::hash(source_id.as_bytes()),
10656                        role: row.get_typed(5)?,
10657                        content: row.get_typed(6)?,
10658                    })
10659                },
10660            )
10661            .with_context(|| "fetching messages for embedding")
10662    }
10663
10664    /// Get the watermark for incremental semantic embedding.
10665    pub fn get_last_embedded_message_id(&self) -> Result<Option<i64>> {
10666        let result: Result<String, _> = self.conn.query_row_map(
10667            "SELECT value FROM meta WHERE key = 'last_embedded_message_id'",
10668            fparams![],
10669            |row| row.get_typed(0),
10670        );
10671        match result.optional() {
10672            Ok(Some(s)) => Ok(s.parse().ok()),
10673            Ok(None) => Ok(None),
10674            Err(e) => Err(e.into()),
10675        }
10676    }
10677
10678    /// Set the watermark for incremental semantic embedding.
10679    pub fn set_last_embedded_message_id(&self, id: i64) -> Result<()> {
10680        self.conn.execute_compat(
10681            "INSERT OR REPLACE INTO meta(key, value) VALUES('last_embedded_message_id', ?1)",
10682            fparams![id.to_string()],
10683        )?;
10684        Ok(())
10685    }
10686
10687    /// Get embedding jobs for a database path.
10688    pub fn get_embedding_jobs(&self, db_path: &str) -> Result<Vec<EmbeddingJobRow>> {
10689        self.conn
10690            .query_map_collect(
10691                "SELECT id, db_path, model_id, status, total_docs, completed_docs, error_message, created_at, started_at, completed_at
10692                 FROM embedding_jobs WHERE db_path = ?1 ORDER BY id DESC",
10693                fparams![db_path],
10694                |row| {
10695                    Ok(EmbeddingJobRow {
10696                        id: row.get_typed(0)?,
10697                        db_path: row.get_typed(1)?,
10698                        model_id: row.get_typed(2)?,
10699                        status: row.get_typed(3)?,
10700                        total_docs: row.get_typed(4)?,
10701                        completed_docs: row.get_typed(5)?,
10702                        error_message: row.get_typed(6)?,
10703                        created_at: row.get_typed(7)?,
10704                        started_at: row.get_typed(8)?,
10705                        completed_at: row.get_typed(9)?,
10706                    })
10707                },
10708            )
10709            .with_context(|| format!("fetching embedding jobs for {db_path}"))
10710    }
10711
10712    /// Create or update an embedding job.
10713    pub fn upsert_embedding_job(
10714        &self,
10715        db_path: &str,
10716        model_id: &str,
10717        total_docs: i64,
10718    ) -> Result<i64> {
10719        let updated = self.conn.execute_compat(
10720            "UPDATE embedding_jobs
10721             SET total_docs = ?3
10722             WHERE db_path = ?1 AND model_id = ?2 AND status IN ('pending', 'running')",
10723            fparams![db_path, model_id, total_docs],
10724        )?;
10725        if updated == 0 {
10726            let insert_result = self.conn.execute_compat(
10727                "INSERT INTO embedding_jobs(db_path, model_id, total_docs) VALUES(?1,?2,?3)",
10728                fparams![db_path, model_id, total_docs],
10729            );
10730            if let Err(err) = insert_result {
10731                if !matches!(err, frankensqlite::FrankenError::UniqueViolation { .. }) {
10732                    return Err(err.into());
10733                }
10734                self.conn.execute_compat(
10735                    "UPDATE embedding_jobs
10736                     SET total_docs = ?3
10737                     WHERE db_path = ?1 AND model_id = ?2 AND status IN ('pending', 'running')",
10738                    fparams![db_path, model_id, total_docs],
10739                )?;
10740            }
10741        }
10742        self.conn
10743            .query_row_map(
10744                "SELECT id FROM embedding_jobs
10745                 WHERE db_path = ?1 AND model_id = ?2 AND status IN ('pending', 'running')
10746                 ORDER BY id DESC
10747                 LIMIT 1",
10748                fparams![db_path, model_id],
10749                |row| row.get_typed(0),
10750            )
10751            .with_context(|| "resolving embedding job id after upsert")
10752    }
10753
10754    /// Mark an embedding job as started.
10755    pub fn start_embedding_job(&self, job_id: i64) -> Result<()> {
10756        self.conn.execute_compat(
10757            "UPDATE embedding_jobs SET status = 'running', started_at = datetime('now') WHERE id = ?1",
10758            fparams![job_id],
10759        )?;
10760        Ok(())
10761    }
10762
10763    /// Mark an embedding job as completed.
10764    pub fn complete_embedding_job(&self, job_id: i64) -> Result<()> {
10765        self.conn.execute_compat(
10766            "UPDATE embedding_jobs SET status = 'completed', completed_at = datetime('now') WHERE id = ?1",
10767            fparams![job_id],
10768        )?;
10769        Ok(())
10770    }
10771
10772    /// Mark an embedding job as failed.
10773    pub fn fail_embedding_job(&self, job_id: i64, error: &str) -> Result<()> {
10774        self.conn.execute_compat(
10775            "UPDATE embedding_jobs SET status = 'failed', error_message = ?2, completed_at = datetime('now') WHERE id = ?1",
10776            fparams![job_id, error],
10777        )?;
10778        Ok(())
10779    }
10780
10781    /// Cancel embedding jobs for a database path.
10782    pub fn cancel_embedding_jobs(&self, db_path: &str, model_id: Option<&str>) -> Result<usize> {
10783        if let Some(mid) = model_id {
10784            Ok(self.conn.execute_compat(
10785                "UPDATE embedding_jobs SET status = 'cancelled' WHERE db_path = ?1 AND model_id = ?2 AND status IN ('pending', 'running')",
10786                fparams![db_path, mid],
10787            )?)
10788        } else {
10789            Ok(self.conn.execute_compat(
10790                "UPDATE embedding_jobs SET status = 'cancelled' WHERE db_path = ?1 AND status IN ('pending', 'running')",
10791                fparams![db_path],
10792            )?)
10793        }
10794    }
10795
10796    /// Update embedding job progress.
10797    pub fn update_job_progress(&self, job_id: i64, completed_docs: i64) -> Result<()> {
10798        self.conn.execute_compat(
10799            "UPDATE embedding_jobs SET completed_docs = ?2 WHERE id = ?1",
10800            fparams![job_id, completed_docs],
10801        )?;
10802        Ok(())
10803    }
10804
10805    // =====================================================================
10806    // Analytics query methods
10807    // =====================================================================
10808
10809    /// Get session count for a date range using materialized stats.
10810    /// Returns (count, is_from_cache) where is_from_cache is true if from daily_stats.
10811    ///
10812    /// Falls back to COUNT(*) query when daily_stats table is empty or stale.
10813    pub fn count_sessions_in_range(
10814        &self,
10815        start_ts_ms: Option<i64>,
10816        end_ts_ms: Option<i64>,
10817        agent_slug: Option<&str>,
10818        source_id: Option<&str>,
10819    ) -> Result<(i64, bool)> {
10820        let agent = agent_slug.unwrap_or("all");
10821        let source = source_id.unwrap_or("all");
10822
10823        // Check if we have materialized stats
10824        let stats_count: i64 = self
10825            .conn
10826            .query_row_map("SELECT COUNT(*) FROM daily_stats", fparams![], |row| {
10827                row.get_typed(0)
10828            })
10829            .unwrap_or(0);
10830
10831        if stats_count == 0 {
10832            return self.count_sessions_direct(start_ts_ms, end_ts_ms, agent_slug, source_id);
10833        }
10834
10835        // Use materialized stats
10836        let start_day = start_ts_ms.map(Self::day_id_from_millis);
10837        let end_day = end_ts_ms.map(Self::day_id_from_millis);
10838
10839        let count: i64 = match (start_day, end_day) {
10840            (Some(start), Some(end)) => self.conn.query_row_map(
10841                "SELECT COALESCE(SUM(session_count), 0) FROM daily_stats
10842                 WHERE day_id BETWEEN ?1 AND ?2 AND agent_slug = ?3 AND source_id = ?4",
10843                fparams![start, end, agent, source],
10844                |row| row.get_typed(0),
10845            )?,
10846            (Some(start), None) => self.conn.query_row_map(
10847                "SELECT COALESCE(SUM(session_count), 0) FROM daily_stats
10848                 WHERE day_id >= ?1 AND agent_slug = ?2 AND source_id = ?3",
10849                fparams![start, agent, source],
10850                |row| row.get_typed(0),
10851            )?,
10852            (None, Some(end)) => self.conn.query_row_map(
10853                "SELECT COALESCE(SUM(session_count), 0) FROM daily_stats
10854                 WHERE day_id <= ?1 AND agent_slug = ?2 AND source_id = ?3",
10855                fparams![end, agent, source],
10856                |row| row.get_typed(0),
10857            )?,
10858            (None, None) => self.conn.query_row_map(
10859                "SELECT COALESCE(SUM(session_count), 0) FROM daily_stats
10860                 WHERE agent_slug = ?1 AND source_id = ?2",
10861                fparams![agent, source],
10862                |row| row.get_typed(0),
10863            )?,
10864        };
10865
10866        Ok((count, true))
10867    }
10868
10869    /// Direct COUNT(*) query as fallback when daily_stats is empty.
10870    fn count_sessions_direct(
10871        &self,
10872        start_ts_ms: Option<i64>,
10873        end_ts_ms: Option<i64>,
10874        agent_slug: Option<&str>,
10875        source_id: Option<&str>,
10876    ) -> Result<(i64, bool)> {
10877        // Build dynamic SQL with positional params.  Single-table scan of
10878        // conversations; filter on agent slug via an EXISTS subquery only
10879        // when that filter is actually requested.  This avoids the unneeded
10880        // 2-table JOIN (which also silently dropped legacy conversations
10881        // with NULL agent_id) and sidesteps frankensqlite's materialization
10882        // fallback entirely.
10883        let mut sql = "SELECT COUNT(*) FROM conversations c WHERE 1=1".to_string();
10884        let mut param_values: Vec<ParamValue> = Vec::new();
10885        let mut idx = 1;
10886
10887        if let Some(start) = start_ts_ms {
10888            sql.push_str(&format!(" AND c.started_at >= ?{idx}"));
10889            param_values.push(ParamValue::from(start));
10890            idx += 1;
10891        }
10892        if let Some(end) = end_ts_ms {
10893            sql.push_str(&format!(" AND c.started_at <= ?{idx}"));
10894            param_values.push(ParamValue::from(end));
10895            idx += 1;
10896        }
10897        if let Some(agent) = agent_slug
10898            && agent != "all"
10899        {
10900            sql.push_str(&format!(
10901                " AND EXISTS (SELECT 1 FROM agents a WHERE a.id = c.agent_id AND a.slug = ?{idx})"
10902            ));
10903            param_values.push(ParamValue::from(agent));
10904            idx += 1;
10905        }
10906        if let Some(source) = source_id
10907            && source != "all"
10908        {
10909            sql.push_str(&format!(" AND c.source_id = ?{idx}"));
10910            param_values.push(ParamValue::from(source));
10911            let _ = idx; // suppress unused warning
10912        }
10913
10914        let count: i64 = self
10915            .conn
10916            .query_row_map(&sql, &param_values, |row| row.get_typed(0))?;
10917        Ok((count, false))
10918    }
10919
10920    /// Get daily histogram data for a date range.
10921    pub fn get_daily_histogram(
10922        &self,
10923        start_ts_ms: i64,
10924        end_ts_ms: i64,
10925        agent_slug: Option<&str>,
10926        source_id: Option<&str>,
10927    ) -> Result<Vec<DailyCount>> {
10928        let start_day = Self::day_id_from_millis(start_ts_ms);
10929        let end_day = Self::day_id_from_millis(end_ts_ms);
10930        let agent = agent_slug.unwrap_or("all");
10931        let source = source_id.unwrap_or("all");
10932
10933        let rows = self.conn.query_map_collect(
10934            "SELECT day_id, session_count, message_count, total_chars
10935             FROM daily_stats
10936             WHERE day_id BETWEEN ?1 AND ?2 AND agent_slug = ?3 AND source_id = ?4
10937             ORDER BY day_id",
10938            fparams![start_day, end_day, agent, source],
10939            |row| {
10940                Ok(DailyCount {
10941                    day_id: row.get_typed(0)?,
10942                    sessions: row.get_typed(1)?,
10943                    messages: row.get_typed(2)?,
10944                    chars: row.get_typed(3)?,
10945                })
10946            },
10947        )?;
10948
10949        Ok(rows)
10950    }
10951
10952    /// Check health of daily stats table.
10953    pub fn daily_stats_health(&self) -> Result<DailyStatsHealth> {
10954        let row_count: i64 =
10955            self.conn
10956                .query_row_map("SELECT COUNT(*) FROM daily_stats", fparams![], |row| {
10957                    row.get_typed(0)
10958                })?;
10959
10960        let oldest_update: Option<i64> = self.conn.query_row_map(
10961            "SELECT MIN(last_updated) FROM daily_stats",
10962            fparams![],
10963            |row| row.get_typed(0),
10964        )?;
10965
10966        let conversation_count: i64 =
10967            self.conn
10968                .query_row_map("SELECT COUNT(*) FROM conversations", fparams![], |row| {
10969                    row.get_typed(0)
10970                })?;
10971
10972        let materialized_total: i64 = self.conn.query_row_map(
10973            "SELECT COALESCE(SUM(session_count), 0) FROM daily_stats
10974                 WHERE agent_slug = 'all' AND source_id = 'all'",
10975            fparams![],
10976            |row| row.get_typed(0),
10977        )?;
10978
10979        Ok(DailyStatsHealth {
10980            populated: row_count > 0,
10981            row_count,
10982            oldest_update_ms: oldest_update,
10983            conversation_count,
10984            materialized_total,
10985            drift: (conversation_count - materialized_total).abs(),
10986        })
10987    }
10988
10989    /// Batch insert multiple conversations with full analytics (token usage,
10990    /// message metrics, rollups).  Frankensqlite equivalent of
10991    /// `SqliteStorage::insert_conversations_batched`.
10992    pub fn insert_conversations_batched(
10993        &self,
10994        conversations: &[(i64, Option<i64>, &Conversation)],
10995    ) -> Result<Vec<InsertOutcome>> {
10996        if conversations.is_empty() {
10997            return Ok(Vec::new());
10998        }
10999
11000        self.ensure_sources_for_batch(conversations)?;
11001
11002        let defer_lexical_updates = defer_storage_lexical_updates_enabled();
11003        let defer_analytics_updates = defer_analytics_updates_enabled();
11004
11005        let pricing_table = PricingTable::franken_load(&self.conn).unwrap_or_else(|e| {
11006            tracing::warn!(target: "cass::analytics::pricing", error = %e, "failed to load pricing table");
11007            PricingTable { entries: Vec::new() }
11008        });
11009        let mut pricing_diag = PricingDiagnostics::default();
11010
11011        let mut tx = self.conn.transaction()?;
11012
11013        // Bug #167: Ensure all referenced agents, workspaces, and sources
11014        // exist inside the transaction so FK checks pass.  The caller resolves
11015        // IDs via ensure_agent / ensure_workspace / ensure_sources_for_batch
11016        // outside the transaction, but those autocommit writes may not be
11017        // visible inside the transaction snapshot in frankensqlite.  Re-verify
11018        // (and insert if missing) within the tx.
11019        ensure_agents_in_tx(&tx, conversations)?;
11020        ensure_workspaces_in_tx(&tx, conversations)?;
11021        ensure_sources_in_tx(&tx, conversations)?;
11022
11023        let mut outcomes = Vec::with_capacity(conversations.len());
11024        let mut fts_entries = Vec::new();
11025        let mut fts_pending_chars = 0usize;
11026        let mut fts_inserted_total = 0usize;
11027        let mut fts_count_total = 0usize;
11028        let mut stats = StatsAggregator::new();
11029        let mut token_stats = TokenStatsAggregator::new();
11030        let mut token_entries: Vec<TokenUsageEntry> = Vec::new();
11031        let mut metrics_entries: Vec<MessageMetricsEntry> = Vec::new();
11032        let mut rollup_agg = AnalyticsRollupAggregator::new();
11033        let mut conv_ids_to_summarize: Vec<i64> = Vec::new();
11034        let mut pending_conversation_ids: HashMap<PendingConversationKey, i64> = HashMap::new();
11035        let mut pending_message_fingerprints: HashMap<i64, HashMap<i64, MessageMergeFingerprint>> =
11036            HashMap::new();
11037        let mut pending_message_replay_fingerprints: HashMap<
11038            i64,
11039            HashSet<MessageReplayFingerprint>,
11040        > = HashMap::new();
11041
11042        for &(agent_id, workspace_id, raw_conv) in conversations {
11043            let normalized_conv = normalized_conversation_for_storage(raw_conv);
11044            let conv = normalized_conv.as_ref();
11045            let mut total_chars: i64 = 0;
11046            let mut inserted_indices = Vec::with_capacity(conv.messages.len());
11047            let mut inserted_messages: Vec<(i64, &Message)> =
11048                Vec::with_capacity(conv.messages.len());
11049            let mut session_count_delta = 1_i64;
11050            let conversation_key = conversation_merge_key(agent_id, conv);
11051
11052            let existing_conv_id = if let Some(existing_id) =
11053                pending_conversation_ids.get(&conversation_key)
11054            {
11055                Some(*existing_id)
11056            } else {
11057                let existing_id =
11058                    franken_find_existing_conversation_by_key(&tx, &conversation_key, Some(conv))?;
11059                if let Some(existing_id) = existing_id {
11060                    pending_conversation_ids.insert(conversation_key.clone(), existing_id);
11061                }
11062                existing_id
11063            };
11064
11065            let conv_id = if let Some(existing_id) = existing_conv_id {
11066                session_count_delta = 0;
11067                let (
11068                    ExistingConversationNewMessages {
11069                        messages: new_messages,
11070                        new_chars: _planned_new_chars,
11071                        idx_collision_count,
11072                        first_collision_idx,
11073                    },
11074                    existing_messages,
11075                    existing_replay_fingerprints,
11076                ) = franken_collect_batched_existing_new_messages(
11077                    &tx,
11078                    existing_id,
11079                    conv,
11080                    &mut pending_message_fingerprints,
11081                    &mut pending_message_replay_fingerprints,
11082                    "skipping replay-equivalent recovered message with shifted idx during batched merge",
11083                )?;
11084                let (inserted_last_idx, inserted_last_created_at) =
11085                    borrowed_messages_tail_state(&new_messages);
11086                let inserted_append_messages =
11087                    franken_append_insert_new_messages(&tx, existing_id, &new_messages)?;
11088                total_chars += inserted_append_messages
11089                    .iter()
11090                    .map(|(_, msg)| msg.content.len() as i64)
11091                    .sum::<i64>();
11092                for (msg_id, msg) in inserted_append_messages {
11093                    franken_insert_snippets(&tx, msg_id, &msg.snippets)?;
11094                    if !defer_lexical_updates {
11095                        fts_entries.push(FtsEntry::from_message(msg_id, msg, conv));
11096                        fts_count_total += 1;
11097                        fts_pending_chars = fts_pending_chars.saturating_add(msg.content.len());
11098                        if fts_entries.len() >= FTS_ENTRY_BATCH_MAX_DOCS
11099                            || fts_pending_chars >= FTS_ENTRY_BATCH_MAX_CHARS
11100                        {
11101                            flush_pending_fts_entries(
11102                                self,
11103                                &tx,
11104                                &mut fts_entries,
11105                                &mut fts_pending_chars,
11106                                &mut fts_inserted_total,
11107                            )?;
11108                        }
11109                    }
11110                    inserted_indices.push(msg.idx);
11111                    inserted_messages.push((msg_id, msg));
11112                }
11113
11114                if idx_collision_count > 0 {
11115                    tracing::warn!(
11116                        conversation_id = existing_id,
11117                        collision_count = idx_collision_count,
11118                        first_idx = first_collision_idx,
11119                        source_path = %conv.source_path.display(),
11120                        "message idx collisions encountered during batched conversation merge; retaining canonical message variants"
11121                    );
11122                }
11123
11124                let conv_last_ts = conversation_tail_ended_at_candidate(conv);
11125                franken_update_conversation_tail_state(
11126                    &tx,
11127                    existing_id,
11128                    conv_last_ts,
11129                    inserted_last_idx,
11130                    inserted_last_created_at,
11131                )?;
11132                if let Some(lookup_key) = conversation_external_lookup_key_for_conv(agent_id, conv)
11133                {
11134                    franken_update_external_conversation_tail_lookup_key(
11135                        &tx,
11136                        &lookup_key,
11137                        conv_last_ts,
11138                        inserted_last_idx,
11139                        inserted_last_created_at,
11140                    )?;
11141                }
11142
11143                pending_message_fingerprints.insert(existing_id, existing_messages);
11144                pending_message_replay_fingerprints
11145                    .insert(existing_id, existing_replay_fingerprints);
11146
11147                existing_id
11148            } else {
11149                match franken_insert_conversation_or_get_existing(
11150                    &tx,
11151                    agent_id,
11152                    workspace_id,
11153                    conv,
11154                )? {
11155                    ConversationInsertStatus::Inserted(new_conv_id) => {
11156                        pending_conversation_ids.insert(conversation_key.clone(), new_conv_id);
11157                        let pending_messages =
11158                            pending_message_fingerprints.entry(new_conv_id).or_default();
11159                        let pending_replay_fingerprints = pending_message_replay_fingerprints
11160                            .entry(new_conv_id)
11161                            .or_default();
11162                        let mut new_messages = Vec::new();
11163                        for msg in &conv.messages {
11164                            let incoming_replay = message_replay_fingerprint(msg);
11165                            if pending_messages.contains_key(&msg.idx)
11166                                || pending_replay_fingerprints.contains(&incoming_replay)
11167                            {
11168                                continue;
11169                            }
11170                            pending_messages.insert(msg.idx, message_merge_fingerprint(msg));
11171                            pending_replay_fingerprints.insert(incoming_replay);
11172                            new_messages.push(msg);
11173                        }
11174                        let inserted_message_ids =
11175                            franken_batch_insert_new_messages(&tx, new_conv_id, &new_messages)?;
11176                        for (msg_id, msg) in inserted_message_ids.into_iter().zip(new_messages) {
11177                            franken_insert_snippets(&tx, msg_id, &msg.snippets)?;
11178                            if !defer_lexical_updates {
11179                                fts_entries.push(FtsEntry::from_message(msg_id, msg, conv));
11180                                fts_count_total += 1;
11181                                fts_pending_chars =
11182                                    fts_pending_chars.saturating_add(msg.content.len());
11183                                if fts_entries.len() >= FTS_ENTRY_BATCH_MAX_DOCS
11184                                    || fts_pending_chars >= FTS_ENTRY_BATCH_MAX_CHARS
11185                                {
11186                                    flush_pending_fts_entries(
11187                                        self,
11188                                        &tx,
11189                                        &mut fts_entries,
11190                                        &mut fts_pending_chars,
11191                                        &mut fts_inserted_total,
11192                                    )?;
11193                                }
11194                            }
11195                            total_chars += msg.content.len() as i64;
11196                            inserted_indices.push(msg.idx);
11197                            inserted_messages.push((msg_id, msg));
11198                        }
11199                        new_conv_id
11200                    }
11201                    ConversationInsertStatus::Existing(existing_id) => {
11202                        session_count_delta = 0;
11203                        pending_conversation_ids.insert(conversation_key.clone(), existing_id);
11204                        let (
11205                            ExistingConversationNewMessages {
11206                                messages: new_messages,
11207                                new_chars: _planned_new_chars,
11208                                idx_collision_count,
11209                                first_collision_idx,
11210                            },
11211                            existing_messages,
11212                            existing_replay_fingerprints,
11213                        ) = franken_collect_batched_existing_new_messages(
11214                            &tx,
11215                            existing_id,
11216                            conv,
11217                            &mut pending_message_fingerprints,
11218                            &mut pending_message_replay_fingerprints,
11219                            "skipping replay-equivalent recovered message with shifted idx after duplicate conversation recovery",
11220                        )?;
11221                        let (inserted_last_idx, inserted_last_created_at) =
11222                            borrowed_messages_tail_state(&new_messages);
11223                        let inserted_append_messages =
11224                            franken_append_insert_new_messages(&tx, existing_id, &new_messages)?;
11225                        total_chars += inserted_append_messages
11226                            .iter()
11227                            .map(|(_, msg)| msg.content.len() as i64)
11228                            .sum::<i64>();
11229                        for (msg_id, msg) in inserted_append_messages {
11230                            franken_insert_snippets(&tx, msg_id, &msg.snippets)?;
11231                            if !defer_lexical_updates {
11232                                fts_entries.push(FtsEntry::from_message(msg_id, msg, conv));
11233                                fts_count_total += 1;
11234                                fts_pending_chars =
11235                                    fts_pending_chars.saturating_add(msg.content.len());
11236                                if fts_entries.len() >= FTS_ENTRY_BATCH_MAX_DOCS
11237                                    || fts_pending_chars >= FTS_ENTRY_BATCH_MAX_CHARS
11238                                {
11239                                    flush_pending_fts_entries(
11240                                        self,
11241                                        &tx,
11242                                        &mut fts_entries,
11243                                        &mut fts_pending_chars,
11244                                        &mut fts_inserted_total,
11245                                    )?;
11246                                }
11247                            }
11248                            inserted_indices.push(msg.idx);
11249                            inserted_messages.push((msg_id, msg));
11250                        }
11251
11252                        if idx_collision_count > 0 {
11253                            tracing::warn!(
11254                                conversation_id = existing_id,
11255                                collision_count = idx_collision_count,
11256                                first_idx = first_collision_idx,
11257                                source_path = %conv.source_path.display(),
11258                                "message idx collisions encountered after duplicate conversation recovery; retaining canonical message variants"
11259                            );
11260                        }
11261
11262                        let conv_last_ts = conversation_tail_ended_at_candidate(conv);
11263                        franken_update_conversation_tail_state(
11264                            &tx,
11265                            existing_id,
11266                            conv_last_ts,
11267                            inserted_last_idx,
11268                            inserted_last_created_at,
11269                        )?;
11270                        if let Some(lookup_key) =
11271                            conversation_external_lookup_key_for_conv(agent_id, conv)
11272                        {
11273                            franken_update_external_conversation_tail_lookup_key(
11274                                &tx,
11275                                &lookup_key,
11276                                conv_last_ts,
11277                                inserted_last_idx,
11278                                inserted_last_created_at,
11279                            )?;
11280                        }
11281
11282                        pending_message_fingerprints.insert(existing_id, existing_messages);
11283                        pending_message_replay_fingerprints
11284                            .insert(existing_id, existing_replay_fingerprints);
11285
11286                        existing_id
11287                    }
11288                }
11289            };
11290
11291            if !defer_analytics_updates {
11292                let delta = StatsDelta {
11293                    session_count_delta,
11294                    message_count_delta: inserted_messages.len() as i64,
11295                    total_chars_delta: total_chars,
11296                };
11297
11298                let effective_started_at = conversation_effective_started_at(conv);
11299                let day_id = effective_started_at
11300                    .map(FrankenStorage::day_id_from_millis)
11301                    .unwrap_or(0);
11302                stats.record_delta(
11303                    &conv.agent_slug,
11304                    &conv.source_id,
11305                    day_id,
11306                    delta.session_count_delta,
11307                    delta.message_count_delta,
11308                    delta.total_chars_delta,
11309                );
11310
11311                let conv_day_id = day_id;
11312                let mut session_model_family = String::from("unknown");
11313                let mut has_any_tokens = false;
11314
11315                for &(message_id, msg) in &inserted_messages {
11316                    let role_s = role_str(&msg.role);
11317                    let usage = if historical_raw_json(&msg.extra_json).is_some() {
11318                        crate::connectors::extract_tokens_for_agent(
11319                            &conv.agent_slug,
11320                            &serde_json::Value::Null,
11321                            &msg.content,
11322                            &role_s,
11323                        )
11324                    } else {
11325                        crate::connectors::extract_tokens_for_agent(
11326                            &conv.agent_slug,
11327                            &msg.extra_json,
11328                            &msg.content,
11329                            &role_s,
11330                        )
11331                    };
11332
11333                    let msg_ts = msg
11334                        .created_at
11335                        .or(conversation_effective_started_at(conv))
11336                        .unwrap_or(0);
11337                    let msg_day_id = if msg_ts > 0 {
11338                        FrankenStorage::day_id_from_millis(msg_ts)
11339                    } else {
11340                        conv_day_id
11341                    };
11342
11343                    let model_info = usage
11344                        .model_name
11345                        .as_deref()
11346                        .map(crate::connectors::normalize_model);
11347
11348                    let model_family = model_info
11349                        .as_ref()
11350                        .map(|i| i.family.clone())
11351                        .unwrap_or_else(|| "unknown".into());
11352                    let model_tier = model_info
11353                        .as_ref()
11354                        .map(|i| i.tier.clone())
11355                        .unwrap_or_else(|| "unknown".into());
11356                    let provider = usage
11357                        .provider
11358                        .clone()
11359                        .or_else(|| model_info.as_ref().map(|i| i.provider.clone()))
11360                        .unwrap_or_else(|| "unknown".into());
11361
11362                    if model_family != "unknown" {
11363                        session_model_family = model_family.clone();
11364                    }
11365
11366                    let estimated_cost = pricing_table.compute_cost(
11367                        usage.model_name.as_deref(),
11368                        msg_day_id,
11369                        usage.input_tokens,
11370                        usage.output_tokens,
11371                        usage.cache_read_tokens,
11372                        usage.cache_creation_tokens,
11373                    );
11374                    if estimated_cost.is_some() {
11375                        pricing_diag.record_priced();
11376                    } else if usage.has_token_data() {
11377                        pricing_diag.record_unpriced(usage.model_name.as_deref());
11378                    }
11379
11380                    token_stats.record(
11381                        &conv.agent_slug,
11382                        &conv.source_id,
11383                        msg_day_id,
11384                        &model_family,
11385                        &role_s,
11386                        &usage,
11387                        msg.content.len() as i64,
11388                        estimated_cost.unwrap_or(0.0),
11389                    );
11390
11391                    if usage.has_token_data() {
11392                        has_any_tokens = true;
11393                    }
11394
11395                    let content_chars = msg.content.len() as i64;
11396                    let content_tokens_est = content_chars / 4;
11397                    let msg_hour_id = FrankenStorage::hour_id_from_millis(msg_ts);
11398                    let has_plan = has_plan_for_role(&role_s, &msg.content);
11399
11400                    token_entries.push(TokenUsageEntry {
11401                        message_id,
11402                        conversation_id: conv_id,
11403                        agent_id,
11404                        workspace_id,
11405                        source_id: conv.source_id.clone(),
11406                        timestamp_ms: msg_ts,
11407                        day_id: msg_day_id,
11408                        model_name: usage.model_name.clone(),
11409                        model_family: Some(model_family.clone()),
11410                        model_tier: Some(model_tier.clone()),
11411                        service_tier: usage.service_tier.clone(),
11412                        provider: Some(provider.clone()),
11413                        input_tokens: usage.input_tokens,
11414                        output_tokens: usage.output_tokens,
11415                        cache_read_tokens: usage.cache_read_tokens,
11416                        cache_creation_tokens: usage.cache_creation_tokens,
11417                        thinking_tokens: usage.thinking_tokens,
11418                        total_tokens: usage.total_tokens(),
11419                        estimated_cost_usd: estimated_cost,
11420                        role: role_s.to_string(),
11421                        content_chars,
11422                        has_tool_calls: usage.has_tool_calls,
11423                        tool_call_count: usage.tool_call_count,
11424                        data_source: usage.data_source.as_str().to_string(),
11425                    });
11426
11427                    let mm = MessageMetricsEntry {
11428                        message_id,
11429                        created_at_ms: msg_ts,
11430                        hour_id: msg_hour_id,
11431                        day_id: msg_day_id,
11432                        agent_slug: conv.agent_slug.clone(),
11433                        workspace_id: workspace_id.unwrap_or(0),
11434                        source_id: conv.source_id.clone(),
11435                        role: role_s.to_string(),
11436                        content_chars,
11437                        content_tokens_est,
11438                        model_name: usage.model_name.clone(),
11439                        model_family: model_family.clone(),
11440                        model_tier: model_tier.clone(),
11441                        provider,
11442                        api_input_tokens: usage.input_tokens,
11443                        api_output_tokens: usage.output_tokens,
11444                        api_cache_read_tokens: usage.cache_read_tokens,
11445                        api_cache_creation_tokens: usage.cache_creation_tokens,
11446                        api_thinking_tokens: usage.thinking_tokens,
11447                        api_service_tier: usage.service_tier.clone(),
11448                        api_data_source: usage.data_source.as_str().to_string(),
11449                        tool_call_count: usage.tool_call_count as i64,
11450                        has_tool_calls: usage.has_tool_calls,
11451                        has_plan,
11452                    };
11453                    rollup_agg.record(&mm);
11454                    metrics_entries.push(mm);
11455                }
11456
11457                if session_count_delta > 0 {
11458                    token_stats.record_session(
11459                        &conv.agent_slug,
11460                        &conv.source_id,
11461                        conv_day_id,
11462                        &session_model_family,
11463                    );
11464                }
11465
11466                if has_any_tokens {
11467                    conv_ids_to_summarize.push(conv_id);
11468                }
11469            }
11470
11471            outcomes.push(InsertOutcome {
11472                conversation_id: conv_id,
11473                conversation_inserted: session_count_delta > 0,
11474                inserted_indices,
11475            });
11476        }
11477
11478        // Batch insert all FTS entries at once
11479        if !defer_lexical_updates {
11480            flush_pending_fts_entries(
11481                self,
11482                &tx,
11483                &mut fts_entries,
11484                &mut fts_pending_chars,
11485                &mut fts_inserted_total,
11486            )?;
11487        }
11488        if !defer_lexical_updates && fts_count_total > 0 {
11489            tracing::debug!(
11490                target: "cass::perf::fts5",
11491                total = fts_count_total,
11492                inserted = fts_inserted_total,
11493                conversations = conversations.len(),
11494                "franken_batch_fts_insert_complete"
11495            );
11496        }
11497
11498        // Batched daily_stats update
11499        if !defer_analytics_updates && !stats.is_empty() {
11500            let entries = stats.expand();
11501            let affected = franken_update_daily_stats_batched_in_tx(&tx, &entries)?;
11502            tracing::debug!(
11503                target: "cass::perf::daily_stats",
11504                raw = stats.raw_entry_count(),
11505                expanded = entries.len(),
11506                affected = affected,
11507                "franken_batched_stats_update_complete"
11508            );
11509        }
11510
11511        // Batch insert token_usage rows
11512        if !defer_analytics_updates && !token_entries.is_empty() {
11513            let token_count = token_entries.len();
11514            let inserted = franken_insert_token_usage_batched_in_tx(&tx, &token_entries)?;
11515            tracing::debug!(
11516                target: "cass::perf::token_usage",
11517                total = token_count,
11518                inserted = inserted,
11519                "franken_batch_token_usage_insert_complete"
11520            );
11521        }
11522
11523        // Batched token_daily_stats update
11524        if !defer_analytics_updates && !token_stats.is_empty() {
11525            let entries = token_stats.expand();
11526            let affected = franken_update_token_daily_stats_batched_in_tx(&tx, &entries)?;
11527            tracing::debug!(
11528                target: "cass::perf::token_daily_stats",
11529                raw = token_stats.raw_entry_count(),
11530                expanded = entries.len(),
11531                affected = affected,
11532                "franken_batched_token_stats_update_complete"
11533            );
11534        }
11535
11536        // Batch insert message_metrics rows
11537        if !defer_analytics_updates && !metrics_entries.is_empty() {
11538            let mm_count = metrics_entries.len();
11539            let inserted = franken_insert_message_metrics_batched_in_tx(&tx, &metrics_entries)?;
11540            tracing::debug!(
11541                target: "cass::perf::message_metrics",
11542                total = mm_count,
11543                inserted = inserted,
11544                "franken_batch_message_metrics_insert_complete"
11545            );
11546        }
11547
11548        // Flush usage_hourly + usage_daily rollups
11549        if !defer_analytics_updates && !rollup_agg.is_empty() {
11550            let (hourly, daily, models_daily) =
11551                franken_flush_analytics_rollups_in_tx(&tx, &rollup_agg)?;
11552            tracing::debug!(
11553                target: "cass::perf::usage_rollups",
11554                hourly_buckets = rollup_agg.hourly_entry_count(),
11555                daily_buckets = rollup_agg.daily_entry_count(),
11556                models_daily_buckets = rollup_agg.models_daily_entry_count(),
11557                hourly_affected = hourly,
11558                daily_affected = daily,
11559                models_daily_affected = models_daily,
11560                "franken_batched_usage_rollups_complete"
11561            );
11562        }
11563
11564        // Update conversation-level token summaries
11565        if !defer_analytics_updates {
11566            for conv_id in &conv_ids_to_summarize {
11567                franken_update_conversation_token_summaries_in_tx(&tx, *conv_id)?;
11568            }
11569        }
11570
11571        tx.commit()?;
11572
11573        pricing_diag.log_summary();
11574
11575        Ok(outcomes)
11576    }
11577}
11578
11579fn normalized_storage_source_parts(
11580    source_id: Option<&str>,
11581    origin_kind: Option<&str>,
11582    origin_host: Option<&str>,
11583) -> (String, SourceKind, Option<String>) {
11584    let host_label = crate::search::tantivy::normalized_index_origin_host(origin_host);
11585    let source_id = crate::search::tantivy::normalized_index_source_id(
11586        source_id,
11587        origin_kind,
11588        host_label.as_deref(),
11589    );
11590
11591    if source_id == LOCAL_SOURCE_ID {
11592        (source_id, SourceKind::Local, None)
11593    } else {
11594        (source_id, SourceKind::Ssh, host_label)
11595    }
11596}
11597
11598fn normalized_source_for_conversation(conv: &Conversation) -> Source {
11599    let (id, kind, host_label) = normalized_storage_source_parts(
11600        Some(conv.source_id.as_str()),
11601        None,
11602        conv.origin_host.as_deref(),
11603    );
11604    Source {
11605        id,
11606        kind,
11607        host_label,
11608        machine_id: None,
11609        platform: None,
11610        config_json: None,
11611        created_at: None,
11612        updated_at: None,
11613    }
11614}
11615
11616fn is_bootstrap_local_source(source: &Source) -> bool {
11617    source.id == LOCAL_SOURCE_ID
11618        && matches!(source.kind, SourceKind::Local)
11619        && source.host_label.is_none()
11620        && source.machine_id.is_none()
11621        && source.platform.is_none()
11622        && source.config_json.is_none()
11623        && source.created_at.is_none()
11624        && source.updated_at.is_none()
11625}
11626
11627fn normalized_conversation_for_storage<'a>(conv: &'a Conversation) -> Cow<'a, Conversation> {
11628    let normalized_source = normalized_source_for_conversation(conv);
11629    if normalized_source.id == conv.source_id && normalized_source.host_label == conv.origin_host {
11630        Cow::Borrowed(conv)
11631    } else {
11632        let mut normalized = conv.clone();
11633        normalized.source_id = normalized_source.id;
11634        normalized.origin_host = normalized_source.host_label;
11635        Cow::Owned(normalized)
11636    }
11637}
11638
11639impl FrankenStorage {
11640    fn ensure_source_for_conversation(&self, conv: &Conversation) -> Result<()> {
11641        let source = normalized_source_for_conversation(conv);
11642        if is_bootstrap_local_source(&source) {
11643            // `open()` and schema repair always seed the canonical local source row.
11644            // Avoid an autocommit UPDATE on every local conversation insert.
11645            return Ok(());
11646        }
11647        let cache_key = EnsuredConversationSourceKey::from_source(&source);
11648        if self.conversation_source_already_ensured(&cache_key) {
11649            return Ok(());
11650        }
11651        self.upsert_source(&source)?;
11652        self.mark_conversation_source_ensured(cache_key);
11653        Ok(())
11654    }
11655
11656    fn ensure_sources_for_batch(
11657        &self,
11658        conversations: &[(i64, Option<i64>, &Conversation)],
11659    ) -> Result<()> {
11660        let mut seen = HashSet::with_capacity(conversations.len());
11661        for &(_, _, conv) in conversations {
11662            let source = normalized_source_for_conversation(conv);
11663            if seen.insert(source.id.clone()) {
11664                if is_bootstrap_local_source(&source) {
11665                    continue;
11666                }
11667                self.upsert_source(&source)?;
11668                self.mark_conversation_source_ensured(EnsuredConversationSourceKey::from_source(
11669                    &source,
11670                ));
11671            }
11672        }
11673        Ok(())
11674    }
11675}
11676
11677// =========================================================================
11678// FrankenStorage transaction helper functions
11679// =========================================================================
11680
11681/// Get last_insert_rowid from a frankensqlite transaction.
11682fn franken_last_rowid(tx: &FrankenTransaction<'_>) -> Result<i64> {
11683    tx.last_insert_rowid()
11684        .ok()
11685        .filter(|&id| id > 0)
11686        .with_context(|| "last_insert_rowid() returned NULL or 0 after INSERT")
11687}
11688
11689/// Bug #167: Ensure all agents referenced by a batch exist within the
11690/// transaction.  The caller already resolved `agent_id` values via
11691/// `ensure_agent` outside the transaction, but those autocommit writes may
11692/// not be visible inside a frankensqlite transaction snapshot.  This function
11693/// checks each unique agent_id and creates a stub row if it's missing.
11694fn ensure_agents_in_tx(
11695    tx: &FrankenTransaction<'_>,
11696    conversations: &[(i64, Option<i64>, &Conversation)],
11697) -> Result<()> {
11698    let mut seen = HashSet::new();
11699    let now = FrankenStorage::now_millis();
11700    for &(agent_id, _, conv) in conversations {
11701        if !seen.insert(agent_id) {
11702            continue;
11703        }
11704        let exists: i64 = tx.query_row_map(
11705            "SELECT COUNT(*) FROM agents WHERE id = ?1",
11706            fparams![agent_id],
11707            |row| row.get_typed(0),
11708        )?;
11709        if exists == 0 {
11710            tracing::debug!(
11711                target: "cass::fk_guard",
11712                agent_id,
11713                slug = %conv.agent_slug,
11714                "inserting agent row inside transaction to satisfy FK constraint"
11715            );
11716            // INSERT OR IGNORE: the slug might already exist with a different
11717            // id from a concurrent writer.  If the slug row exists, the FK
11718            // constraint is already satisfied (the caller just got a stale id).
11719            tx.execute_compat(
11720                "INSERT OR IGNORE INTO agents(id, slug, name, kind, created_at, updated_at)
11721                 VALUES(?1, ?2, ?3, 'cli', ?4, ?5)",
11722                fparams![
11723                    agent_id,
11724                    conv.agent_slug.as_str(),
11725                    conv.agent_slug.as_str(),
11726                    now,
11727                    now
11728                ],
11729            )?;
11730        }
11731    }
11732    Ok(())
11733}
11734
11735/// Bug #167: Ensure all workspaces referenced by a batch exist within the
11736/// transaction.  Same rationale as `ensure_agents_in_tx`.
11737fn ensure_workspaces_in_tx(
11738    tx: &FrankenTransaction<'_>,
11739    conversations: &[(i64, Option<i64>, &Conversation)],
11740) -> Result<()> {
11741    let mut seen = HashSet::new();
11742    for &(_, workspace_id, conv) in conversations {
11743        let ws_id = match workspace_id {
11744            Some(id) => id,
11745            None => continue,
11746        };
11747        if !seen.insert(ws_id) {
11748            continue;
11749        }
11750        let exists: i64 = tx.query_row_map(
11751            "SELECT COUNT(*) FROM workspaces WHERE id = ?1",
11752            fparams![ws_id],
11753            |row| row.get_typed(0),
11754        )?;
11755        if exists == 0 {
11756            let path_str = conv
11757                .workspace
11758                .as_ref()
11759                .map(|p| p.to_string_lossy().to_string())
11760                .unwrap_or_default();
11761            tracing::debug!(
11762                target: "cass::fk_guard",
11763                workspace_id = ws_id,
11764                path = %path_str,
11765                "inserting workspace row inside transaction to satisfy FK constraint"
11766            );
11767            tx.execute_compat(
11768                "INSERT OR IGNORE INTO workspaces(id, path) VALUES(?1, ?2)",
11769                fparams![ws_id, path_str.as_str()],
11770            )?;
11771        }
11772    }
11773    Ok(())
11774}
11775
11776/// Bug #167: Ensure all sources referenced by a batch exist within the
11777/// transaction.  Same rationale as `ensure_agents_in_tx` — source_id is a
11778/// TEXT FK on the conversations table.
11779fn ensure_sources_in_tx(
11780    tx: &FrankenTransaction<'_>,
11781    conversations: &[(i64, Option<i64>, &Conversation)],
11782) -> Result<()> {
11783    let mut seen = HashSet::new();
11784    for &(_, _, conv) in conversations {
11785        let (source_id, source_kind, host_label) = normalized_storage_source_parts(
11786            Some(conv.source_id.as_str()),
11787            None,
11788            conv.origin_host.as_deref(),
11789        );
11790        if !seen.insert(source_id.clone()) {
11791            continue;
11792        }
11793        let exists: i64 = tx.query_row_map(
11794            "SELECT COUNT(*) FROM sources WHERE id = ?1",
11795            fparams![source_id.as_str()],
11796            |row| row.get_typed(0),
11797        )?;
11798        if exists == 0 {
11799            let kind_str = source_kind.to_string();
11800            let now = FrankenStorage::now_millis();
11801            tracing::debug!(
11802                target: "cass::fk_guard",
11803                source_id = %source_id,
11804                kind = kind_str.as_str(),
11805                "inserting source row inside transaction to satisfy FK constraint"
11806            );
11807            tx.execute_compat(
11808                "INSERT OR IGNORE INTO sources(id, kind, host_label, created_at, updated_at)
11809                 VALUES(?1, ?2, ?3, ?4, ?5)",
11810                fparams![
11811                    source_id.as_str(),
11812                    kind_str.as_str(),
11813                    host_label.as_deref(),
11814                    now,
11815                    now
11816                ],
11817            )?;
11818        }
11819    }
11820    Ok(())
11821}
11822
11823fn env_flag_enabled(name: &str) -> bool {
11824    dotenvy::var(name).ok().is_some_and(|v| {
11825        matches!(
11826            v.trim(),
11827            "1" | "true" | "TRUE" | "yes" | "YES" | "on" | "ON"
11828        )
11829    })
11830}
11831
11832fn defer_storage_lexical_updates_enabled() -> bool {
11833    env_flag_enabled("CASS_DEFER_LEXICAL_UPDATES")
11834}
11835
11836fn defer_analytics_updates_enabled() -> bool {
11837    if env_flag_enabled("CASS_DEFER_ANALYTICS_UPDATES") {
11838        return true;
11839    }
11840    if env_flag_enabled("CASS_INLINE_ANALYTICS_UPDATES") {
11841        return false;
11842    }
11843    DEFAULT_DEFER_ANALYTICS_UPDATES.load(Ordering::Relaxed)
11844}
11845
/// Outcome of trying to store a conversation: either a new row was created or an
/// already-stored conversation was matched instead.
enum ConversationInsertStatus {
    /// A new conversation row was inserted; carries the new row's id.
    Inserted(i64),
    /// A matching conversation already existed; carries the existing row's id.
    Existing(i64),
}
11850
11851fn franken_find_external_conversation_tail_lookup(
11852    tx: &FrankenTransaction<'_>,
11853    lookup_key: &str,
11854) -> Result<Option<ExistingConversationWithTail>> {
11855    let params = [SqliteValue::from(lookup_key)];
11856    let row = tx
11857        .query_row_with_params(
11858            "SELECT conversation_id, ended_at, last_message_idx, last_message_created_at
11859             FROM conversation_external_tail_lookup
11860             WHERE lookup_key = ?1",
11861            &params,
11862        )
11863        .optional()?;
11864    let Some(row) = row else {
11865        return Ok(None);
11866    };
11867    let id = row.get_typed(0)?;
11868    let ended_at = row.get_typed(1)?;
11869    let last_message_idx = row.get_typed(2)?;
11870    let last_message_created_at = row.get_typed(3)?;
11871    Ok(Some(ExistingConversationWithTail {
11872        id,
11873        tail_state: existing_conversation_tail_state_from_cached(
11874            last_message_idx,
11875            last_message_created_at,
11876            ended_at,
11877        ),
11878    }))
11879}
11880
11881fn franken_find_external_conversation_lookup(
11882    tx: &FrankenTransaction<'_>,
11883    lookup_key: &str,
11884) -> Result<Option<i64>> {
11885    Ok(franken_find_external_conversation_tail_lookup(tx, lookup_key)?.map(|existing| existing.id))
11886}
11887
11888fn franken_insert_external_conversation_tail_lookup_key(
11889    tx: &FrankenTransaction<'_>,
11890    lookup_key: &str,
11891    conversation_id: i64,
11892    ended_at: Option<i64>,
11893    last_message_idx: Option<i64>,
11894    last_message_created_at: Option<i64>,
11895) -> Result<()> {
11896    let params = [
11897        SqliteValue::from(lookup_key),
11898        SqliteValue::from(conversation_id),
11899        SqliteValue::from(ended_at),
11900        SqliteValue::from(last_message_idx),
11901        SqliteValue::from(last_message_created_at),
11902    ];
11903    tx.execute_with_params(
11904        "INSERT OR REPLACE INTO conversation_external_tail_lookup(
11905             lookup_key, conversation_id, ended_at, last_message_idx, last_message_created_at
11906         ) VALUES(?1, ?2, ?3, ?4, ?5)",
11907        &params,
11908    )?;
11909    Ok(())
11910}
11911
11912fn franken_insert_external_conversation_tail_lookup(
11913    tx: &FrankenTransaction<'_>,
11914    source_id: &str,
11915    agent_id: i64,
11916    external_id: &str,
11917    existing: ExistingConversationWithTail,
11918) -> Result<()> {
11919    let lookup_key = conversation_external_lookup_key(source_id, agent_id, external_id);
11920    let ended_at = existing.tail_state.and_then(|state| state.ended_at);
11921    let last_message_idx = existing.tail_state.map(|state| state.last_message_idx);
11922    let last_message_created_at = existing
11923        .tail_state
11924        .map(|state| state.last_message_created_at);
11925    franken_insert_external_conversation_tail_lookup_key(
11926        tx,
11927        &lookup_key,
11928        existing.id,
11929        ended_at,
11930        last_message_idx,
11931        last_message_created_at,
11932    )
11933}
11934
11935fn franken_update_external_conversation_tail_lookup_key(
11936    tx: &FrankenTransaction<'_>,
11937    lookup_key: &str,
11938    ended_at_candidate: Option<i64>,
11939    last_message_idx_candidate: Option<i64>,
11940    last_message_created_at_candidate: Option<i64>,
11941) -> Result<()> {
11942    if ended_at_candidate.is_none()
11943        && last_message_idx_candidate.is_none()
11944        && last_message_created_at_candidate.is_none()
11945    {
11946        return Ok(());
11947    }
11948    tx.execute_compat(
11949        "UPDATE conversation_external_tail_lookup
11950         SET ended_at = CASE
11951                 WHEN ?1 IS NULL THEN ended_at
11952                 ELSE MAX(IFNULL(ended_at, 0), ?1)
11953             END,
11954             last_message_idx = CASE
11955                 WHEN ?2 IS NULL THEN last_message_idx
11956                 WHEN last_message_idx IS NULL OR last_message_idx < ?2 THEN ?2
11957                 ELSE last_message_idx
11958             END,
11959             last_message_created_at = CASE
11960                 WHEN ?3 IS NULL THEN last_message_created_at
11961                 WHEN last_message_created_at IS NULL OR last_message_created_at < ?3 THEN ?3
11962                 ELSE last_message_created_at
11963             END
11964         WHERE lookup_key = ?4",
11965        fparams![
11966            ended_at_candidate,
11967            last_message_idx_candidate,
11968            last_message_created_at_candidate,
11969            lookup_key
11970        ],
11971    )?;
11972    Ok(())
11973}
11974
11975fn franken_set_external_conversation_tail_lookup_after_append(
11976    tx: &FrankenTransaction<'_>,
11977    lookup_key: &str,
11978    ended_at: i64,
11979    last_message_idx: i64,
11980    last_message_created_at: i64,
11981) -> Result<()> {
11982    tx.execute_compat(
11983        "UPDATE conversation_external_tail_lookup
11984         SET ended_at = ?1,
11985             last_message_idx = ?2,
11986             last_message_created_at = ?3
11987         WHERE lookup_key = ?4",
11988        fparams![
11989            ended_at,
11990            last_message_idx,
11991            last_message_created_at,
11992            lookup_key
11993        ],
11994    )?;
11995    Ok(())
11996}
11997
11998fn franken_update_external_conversation_tail_after_append(
11999    tx: &FrankenTransaction<'_>,
12000    agent_id: i64,
12001    conv: &Conversation,
12002    used_append_tail_plan: bool,
12003    exact_append_set: bool,
12004    inserted_last_idx: Option<i64>,
12005    inserted_last_created_at: Option<i64>,
12006) -> Result<()> {
12007    let Some(lookup_key) = conversation_external_lookup_key_for_conv(agent_id, conv) else {
12008        return Ok(());
12009    };
12010
12011    if exact_append_set
12012        && let (Some(last_message_idx), Some(last_message_created_at)) =
12013            (inserted_last_idx, inserted_last_created_at)
12014    {
12015        return franken_set_external_conversation_tail_lookup_after_append(
12016            tx,
12017            &lookup_key,
12018            last_message_created_at,
12019            last_message_idx,
12020            last_message_created_at,
12021        );
12022    }
12023
12024    let ended_at_candidate = if used_append_tail_plan {
12025        inserted_last_created_at
12026    } else {
12027        conv.messages.iter().filter_map(|m| m.created_at).max()
12028    };
12029    franken_update_external_conversation_tail_lookup_key(
12030        tx,
12031        &lookup_key,
12032        ended_at_candidate,
12033        inserted_last_idx,
12034        inserted_last_created_at,
12035    )
12036}
12037
12038fn franken_find_existing_conversation_by_key(
12039    tx: &FrankenTransaction<'_>,
12040    key: &PendingConversationKey,
12041    conv: Option<&Conversation>,
12042) -> Result<Option<i64>> {
12043    franken_find_existing_conversation_by_key_impl(tx, key, conv, false)
12044}
12045
12046fn franken_find_existing_conversation_by_key_after_conflict(
12047    tx: &FrankenTransaction<'_>,
12048    key: &PendingConversationKey,
12049    conv: Option<&Conversation>,
12050) -> Result<Option<i64>> {
12051    franken_find_existing_conversation_by_key_impl(tx, key, conv, true)
12052}
12053
12054fn franken_find_existing_conversation_by_key_impl(
12055    tx: &FrankenTransaction<'_>,
12056    key: &PendingConversationKey,
12057    conv: Option<&Conversation>,
12058    allow_legacy_external_scan: bool,
12059) -> Result<Option<i64>> {
12060    match key {
12061        PendingConversationKey::External {
12062            source_id,
12063            agent_id,
12064            external_id,
12065        } => {
12066            let lookup_key = conversation_external_lookup_key(source_id, *agent_id, external_id);
12067            if let Some(existing_id) = franken_find_external_conversation_lookup(tx, &lookup_key)? {
12068                return Ok(Some(existing_id));
12069            }
12070            if !allow_legacy_external_scan {
12071                return Ok(None);
12072            }
12073
12074            let existing_id = tx
12075                .query_row_map(
12076                    "SELECT id
12077                 FROM conversations
12078                 WHERE source_id = ?1 AND agent_id = ?2 AND external_id = ?3",
12079                    fparams![source_id.as_str(), *agent_id, external_id.as_str()],
12080                    |row| row.get_typed(0),
12081                )
12082                .optional()?;
12083            if let Some(existing_id) = existing_id {
12084                let tail_state = franken_existing_conversation_append_tail_state(tx, existing_id)?;
12085                franken_insert_external_conversation_tail_lookup_key(
12086                    tx,
12087                    &lookup_key,
12088                    existing_id,
12089                    tail_state.and_then(|state| state.ended_at),
12090                    tail_state.map(|state| state.last_message_idx),
12091                    tail_state.map(|state| state.last_message_created_at),
12092                )?;
12093                Ok(Some(existing_id))
12094            } else {
12095                Ok(None)
12096            }
12097        }
12098        PendingConversationKey::SourcePath {
12099            source_id,
12100            agent_id,
12101            source_path,
12102            started_at,
12103        } => {
12104            let exact_match = tx
12105                .query_row_map(
12106                    "SELECT c.id
12107                     FROM conversations c
12108                     WHERE c.source_id = ?1
12109                       AND c.agent_id = ?2
12110                       AND c.source_path = ?3
12111                       AND ((
12112                            COALESCE(
12113                                c.started_at,
12114                                (SELECT MIN(created_at)
12115                                 FROM messages
12116                                 WHERE conversation_id = c.id
12117                                   AND created_at IS NOT NULL)
12118                            ) IS NULL
12119                            AND ?4 IS NULL
12120                       ) OR COALESCE(
12121                            c.started_at,
12122                            (SELECT MIN(created_at)
12123                             FROM messages
12124                             WHERE conversation_id = c.id
12125                               AND created_at IS NOT NULL)
12126                       ) = ?4)
12127                     ORDER BY c.id
12128                     LIMIT 1",
12129                    fparams![
12130                        source_id.as_str(),
12131                        *agent_id,
12132                        source_path.as_str(),
12133                        *started_at
12134                    ],
12135                    |row| row.get_typed(0),
12136                )
12137                .optional()?;
12138            if exact_match.is_some() {
12139                return Ok(exact_match);
12140            }
12141
12142            let Some(conv) = conv else {
12143                return Ok(None);
12144            };
12145            let incoming_fingerprints = conversation_message_fingerprints(conv);
12146            if incoming_fingerprints.is_empty() {
12147                return Ok(None);
12148            }
12149            let incoming_replay_fingerprints = conversation_message_replay_fingerprints(conv);
12150
12151            let candidates: Vec<(i64, Option<i64>)> = tx.query_map_collect(
12152                "SELECT
12153                     c.id,
12154                     COALESCE(
12155                         c.started_at,
12156                         (SELECT MIN(created_at)
12157                          FROM messages
12158                          WHERE conversation_id = c.id
12159                            AND created_at IS NOT NULL)
12160                     ) AS effective_started_at
12161                 FROM conversations c
12162                 WHERE c.source_id = ?1
12163                   AND c.agent_id = ?2
12164                   AND c.source_path = ?3
12165                 ORDER BY c.id",
12166                fparams![source_id.as_str(), *agent_id, source_path.as_str()],
12167                |row| Ok((row.get_typed(0)?, row.get_typed(1)?)),
12168            )?;
12169
12170            let mut best_candidate: Option<(i64, ConversationMergeEvidence)> = None;
12171            for (candidate_id, candidate_started_at) in candidates {
12172                let existing_fingerprints =
12173                    franken_existing_message_fingerprints(tx, candidate_id)?;
12174                let existing_replay_fingerprints =
12175                    replay_fingerprints_from_merge_set(&existing_fingerprints);
12176                let Some(evidence) = conversation_merge_evidence(
12177                    &incoming_fingerprints,
12178                    &incoming_replay_fingerprints,
12179                    &existing_fingerprints,
12180                    &existing_replay_fingerprints,
12181                    *started_at,
12182                    candidate_started_at,
12183                ) else {
12184                    continue;
12185                };
12186
12187                let candidate_key = (
12188                    evidence.exact_overlap,
12189                    evidence.replay_overlap,
12190                    evidence.started_close,
12191                    evidence.smaller_replay_set,
12192                    std::cmp::Reverse(evidence.start_distance_ms),
12193                );
12194                let should_replace = best_candidate
12195                    .as_ref()
12196                    .map(|(_, best_evidence)| {
12197                        candidate_key
12198                            > (
12199                                best_evidence.exact_overlap,
12200                                best_evidence.replay_overlap,
12201                                best_evidence.started_close,
12202                                best_evidence.smaller_replay_set,
12203                                std::cmp::Reverse(best_evidence.start_distance_ms),
12204                            )
12205                    })
12206                    .unwrap_or(true);
12207
12208                if should_replace {
12209                    best_candidate = Some((candidate_id, evidence));
12210                }
12211            }
12212
12213            Ok(best_candidate.map(|(candidate_id, _)| candidate_id))
12214        }
12215    }
12216}
12217
12218fn franken_insert_conversation_or_get_existing(
12219    tx: &FrankenTransaction<'_>,
12220    agent_id: i64,
12221    workspace_id: Option<i64>,
12222    conv: &Conversation,
12223) -> Result<ConversationInsertStatus> {
12224    let conversation_key = conversation_merge_key(agent_id, conv);
12225    if let Some(existing_id) =
12226        franken_find_existing_conversation_by_key(tx, &conversation_key, Some(conv))?
12227    {
12228        return Ok(ConversationInsertStatus::Existing(existing_id));
12229    }
12230
12231    franken_insert_conversation_or_get_existing_after_miss(
12232        tx,
12233        agent_id,
12234        workspace_id,
12235        conv,
12236        &conversation_key,
12237    )
12238}
12239
12240fn franken_insert_conversation_or_get_existing_after_miss(
12241    tx: &FrankenTransaction<'_>,
12242    agent_id: i64,
12243    workspace_id: Option<i64>,
12244    conv: &Conversation,
12245    conversation_key: &PendingConversationKey,
12246) -> Result<ConversationInsertStatus> {
12247    match franken_insert_conversation(tx, agent_id, workspace_id, conv) {
12248        Ok(Some(conv_id)) => Ok(ConversationInsertStatus::Inserted(conv_id)),
12249        Ok(None) => {
12250            // A concurrent writer won the unique-provenance race. Resolve the
12251            // canonical row so callers can merge messages into it.
12252            let existing_id =
12253                franken_find_existing_conversation_by_key_after_conflict(
12254                    tx,
12255                    conversation_key,
12256                    Some(conv),
12257                )?
12258                    .with_context(|| {
12259                        format!(
12260                            "conversation INSERT produced a duplicate conflict but existing row was not found for source_id={} agent_id={} external_id={:?} source_path={}",
12261                            conv.source_id,
12262                            agent_id,
12263                            conv.external_id,
12264                            conv.source_path.display()
12265                        )
12266                    })?;
12267            tracing::warn!(
12268                source_id = %conv.source_id,
12269                agent_id,
12270                external_id = ?conv.external_id,
12271                existing_id,
12272                source_path = %conv.source_path.display(),
12273                "conversation INSERT: duplicate gracefully recovered, reusing existing row"
12274            );
12275            Ok(ConversationInsertStatus::Existing(existing_id))
12276        }
12277        Err(error) => {
12278            tracing::error!(
12279                source_id = %conv.source_id,
12280                agent_id,
12281                external_id = ?conv.external_id,
12282                error = %error,
12283                source_path = %conv.source_path.display(),
12284                "franken_insert_conversation failed"
12285            );
12286            Err(error)
12287        }
12288    }
12289}
12290
12291/// Insert a conversation into the DB within a frankensqlite transaction.
12292///
12293/// Uses a plain `INSERT` so the common miss path stays on the slim direct
12294/// insert lane. Duplicate provenance conflicts are converted into `Ok(None)`
12295/// so callers can recover the canonical row and merge messages into it.
12296fn franken_insert_conversation(
12297    tx: &FrankenTransaction<'_>,
12298    agent_id: i64,
12299    workspace_id: Option<i64>,
12300    conv: &Conversation,
12301) -> Result<Option<i64>> {
12302    let (metadata_json_str, metadata_bin) = franken_metadata_insert_payload(&conv.metadata_json)?;
12303    let (last_message_idx, last_message_created_at) = conversation_tail_state(conv);
12304    let metadata_bin_bytes = metadata_bin.as_deref();
12305
12306    match tx.execute_compat(
12307        "INSERT INTO conversations(
12308            agent_id, workspace_id, source_id, external_id, title, source_path,
12309            started_at, ended_at, approx_tokens, metadata_json, origin_host, metadata_bin,
12310            last_message_idx, last_message_created_at
12311        ) VALUES(?1,?2,?3,?4,?5,?6,?7,?8,?9,?10,?11,?12,?13,?14)",
12312        fparams![
12313            agent_id,
12314            workspace_id,
12315            conv.source_id.as_str(),
12316            conv.external_id.as_deref(),
12317            conv.title.as_deref(),
12318            path_to_string(&conv.source_path),
12319            conv.started_at,
12320            conv.ended_at,
12321            conv.approx_tokens,
12322            metadata_json_str.as_deref(),
12323            conv.origin_host.as_deref(),
12324            metadata_bin_bytes,
12325            last_message_idx,
12326            last_message_created_at
12327        ],
12328    ) {
12329        Ok(_) => {
12330            let conv_id = franken_last_rowid(tx)?;
12331            franken_insert_conversation_tail_state(
12332                tx,
12333                conv_id,
12334                conv.ended_at,
12335                last_message_idx,
12336                last_message_created_at,
12337            )?;
12338            if let Some(external_id) = conv.external_id.as_deref() {
12339                franken_insert_external_conversation_tail_lookup(
12340                    tx,
12341                    conv.source_id.as_str(),
12342                    agent_id,
12343                    external_id,
12344                    ExistingConversationWithTail {
12345                        id: conv_id,
12346                        tail_state: existing_conversation_tail_state_from_cached(
12347                            last_message_idx,
12348                            last_message_created_at,
12349                            conv.ended_at,
12350                        ),
12351                    },
12352                )?;
12353            }
12354            Ok(Some(conv_id))
12355        }
12356        Err(frankensqlite::FrankenError::UniqueViolation { .. }) => {
12357            tracing::debug!(
12358                source_id = %conv.source_id,
12359                agent_id,
12360                external_id = ?conv.external_id,
12361                source_path = %conv.source_path.display(),
12362                "conversation INSERT: duplicate provenance conflict"
12363            );
12364            Ok(None)
12365        }
12366        Err(error) => Err(error.into()),
12367    }
12368}
12369
12370type MetadataInsertPayload<'a> = (Option<Cow<'a, str>>, Option<Vec<u8>>);
12371
12372fn franken_metadata_insert_payload(value: &serde_json::Value) -> Result<MetadataInsertPayload<'_>> {
12373    if let Some(raw) = historical_raw_json(value) {
12374        Ok((Some(Cow::Borrowed(raw)), None))
12375    } else if value.is_null() {
12376        Ok((Some(Cow::Borrowed("null")), None))
12377    } else if value.as_object().is_some_and(|object| object.is_empty()) {
12378        Ok((None, None))
12379    } else if let Some(metadata_bin) = serialize_json_to_msgpack(value) {
12380        Ok((None, Some(metadata_bin)))
12381    } else {
12382        Ok((Some(Cow::Owned(serde_json::to_string(value)?)), None))
12383    }
12384}
12385
12386fn franken_insert_new_message(
12387    tx: &FrankenTransaction<'_>,
12388    conversation_id: i64,
12389    msg: &Message,
12390) -> Result<i64> {
12391    let (extra_json_str, extra_bin) = franken_message_insert_payload(msg)?;
12392    let extra_bin_bytes = extra_bin.as_deref();
12393
12394    tx.execute_compat(
12395        "INSERT INTO messages(conversation_id, idx, role, author, created_at, content, extra_json, extra_bin)
12396         VALUES(?1,?2,?3,?4,?5,?6,?7,?8)",
12397            fparams![
12398                conversation_id,
12399                msg.idx,
12400                role_as_str(&msg.role),
12401                msg.author.as_deref(),
12402                msg.created_at,
12403                msg.content.as_str(),
12404                extra_json_str.as_deref(),
12405                extra_bin_bytes
12406        ],
12407    )?;
12408    franken_last_rowid(tx)
12409}
12410
12411fn franken_insert_new_message_ignore_duplicate(
12412    tx: &FrankenTransaction<'_>,
12413    conversation_id: i64,
12414    msg: &Message,
12415) -> Result<Option<i64>> {
12416    let (extra_json_str, extra_bin) = franken_message_insert_payload(msg)?;
12417    let extra_bin_bytes = extra_bin.as_deref();
12418
12419    let changed = tx.execute_compat(
12420        "INSERT OR IGNORE INTO messages(conversation_id, idx, role, author, created_at, content, extra_json, extra_bin)
12421         VALUES(?1,?2,?3,?4,?5,?6,?7,?8)",
12422            fparams![
12423                conversation_id,
12424                msg.idx,
12425                role_as_str(&msg.role),
12426                msg.author.as_deref(),
12427                msg.created_at,
12428                msg.content.as_str(),
12429                extra_json_str.as_deref(),
12430                extra_bin_bytes
12431        ],
12432    )?;
12433    if changed == 0 {
12434        return Ok(None);
12435    }
12436    franken_last_rowid(tx).map(Some)
12437}
12438
12439type MessageInsertPayload<'a> = (Option<Cow<'a, str>>, Option<Vec<u8>>);
12440
12441fn franken_message_insert_payload(msg: &Message) -> Result<MessageInsertPayload<'_>> {
12442    if let Some(raw) = historical_raw_json(&msg.extra_json) {
12443        Ok((Some(Cow::Borrowed(raw)), None))
12444    } else if msg.extra_json.is_null() {
12445        Ok((None, None))
12446    } else {
12447        let extra_bin = serialize_json_to_msgpack(&msg.extra_json);
12448        if extra_bin.is_some() {
12449            Ok((None, extra_bin))
12450        } else {
12451            Ok((
12452                Some(Cow::Owned(serde_json::to_string(&msg.extra_json)?)),
12453                None,
12454            ))
12455        }
12456    }
12457}
12458
/// Rows per multi-row `INSERT` when writing proven-new messages.
///
/// Every message row binds 8 parameters, so a full batch binds 800. That fits
/// under the 999-parameter `SQLITE_MAX_VARIABLE_NUMBER` default used by SQLite
/// releases before 3.32.0 (later releases default to 32766) while still
/// amortizing statement parse cost over many rows.
const MESSAGE_INSERT_BATCH_SIZE: usize = 100;

/// Rows per multi-row `INSERT` on the append path.
///
/// Once the tail-state hot table removed conversation-row rewrites from the
/// append path, 50-row chunks outperformed the previous 20-row setting on the
/// append-merge profile, while 100-row chunks slightly regressed the
/// 20-message workload.
const APPEND_MESSAGE_INSERT_BATCH_SIZE: usize = 50;

/// Return the cached multi-row `INSERT INTO messages …` statement for
/// `row_count` rows.
///
/// The statements for every row count from 1 up to the larger of the two
/// batch-size constants are generated once, on first use, and kept for the
/// lifetime of the process. Index 0 holds an empty string.
///
/// # Panics
/// Panics when `row_count` exceeds the largest cached batch size.
fn message_insert_batch_sql(row_count: usize) -> &'static str {
    static SQL_BY_ROW_COUNT: std::sync::OnceLock<Vec<String>> = std::sync::OnceLock::new();

    let table = SQL_BY_ROW_COUNT.get_or_init(|| {
        let largest_batch = MESSAGE_INSERT_BATCH_SIZE.max(APPEND_MESSAGE_INSERT_BATCH_SIZE);
        (0..=largest_batch)
            .map(|rows| {
                // Slot 0 is a placeholder so the vector can be indexed by row count.
                if rows == 0 {
                    return String::new();
                }
                let row_groups = (0..rows)
                    .map(|row| {
                        // Row `row` owns bind positions row*8+1 ..= row*8+8.
                        let binds: Vec<String> = (1..=8)
                            .map(|column| format!("?{}", row * 8 + column))
                            .collect();
                        format!("({})", binds.join(","))
                    })
                    .collect::<Vec<_>>()
                    .join(",");
                format!(
                    "INSERT INTO messages(conversation_id, idx, role, author, created_at, content, extra_json, extra_bin) VALUES {row_groups}"
                )
            })
            .collect()
    });

    table
        .get(row_count)
        .map(String::as_str)
        .expect("message insert batch size must be covered by the cached SQL table")
}
12509
12510fn franken_batch_insert_new_messages(
12511    tx: &FrankenTransaction<'_>,
12512    conversation_id: i64,
12513    messages: &[&Message],
12514) -> Result<Vec<i64>> {
12515    franken_batch_insert_new_messages_with_batch_size(
12516        tx,
12517        conversation_id,
12518        messages,
12519        MESSAGE_INSERT_BATCH_SIZE,
12520    )
12521}
12522
12523fn franken_append_insert_new_messages<'a>(
12524    tx: &FrankenTransaction<'_>,
12525    conversation_id: i64,
12526    messages: &[&'a Message],
12527) -> Result<Vec<(i64, &'a Message)>> {
12528    let mut inserted = Vec::with_capacity(messages.len());
12529    for msg in messages {
12530        if let Some(message_id) =
12531            franken_insert_new_message_ignore_duplicate(tx, conversation_id, msg)?
12532        {
12533            inserted.push((message_id, *msg));
12534        }
12535    }
12536    Ok(inserted)
12537}
12538
12539fn franken_batch_insert_new_messages_with_batch_size(
12540    tx: &FrankenTransaction<'_>,
12541    conversation_id: i64,
12542    messages: &[&Message],
12543    batch_size: usize,
12544) -> Result<Vec<i64>> {
12545    let batch_size = batch_size.max(1);
12546    let mut inserted_ids = Vec::with_capacity(messages.len());
12547    for chunk in messages.chunks(batch_size) {
12548        if chunk.len() == 1 {
12549            inserted_ids.push(franken_insert_new_message(tx, conversation_id, chunk[0])?);
12550            continue;
12551        }
12552        let sql = message_insert_batch_sql(chunk.len());
12553
12554        let mut param_values: Vec<SqliteValue> = Vec::with_capacity(chunk.len() * 8);
12555        for msg in chunk {
12556            let (extra_json_str, extra_bin) = franken_message_insert_payload(msg)?;
12557            param_values.push(SqliteValue::from(conversation_id));
12558            param_values.push(SqliteValue::from(msg.idx));
12559            param_values.push(SqliteValue::from(role_as_str(&msg.role)));
12560            param_values.push(SqliteValue::from(msg.author.as_deref()));
12561            param_values.push(SqliteValue::from(msg.created_at));
12562            param_values.push(SqliteValue::from(msg.content.as_str()));
12563            param_values.push(SqliteValue::from(extra_json_str.as_deref()));
12564            param_values.push(SqliteValue::from(extra_bin.as_deref()));
12565        }
12566
12567        tx.execute_with_params(sql, &param_values)?;
12568
12569        let last_id = franken_last_rowid(tx)?;
12570        let first_id = last_id
12571            .checked_sub((chunk.len() - 1) as i64)
12572            .with_context(|| {
12573                format!(
12574                    "inferring rowid range for {}-row message batch ending at {last_id}",
12575                    chunk.len()
12576                )
12577            })?;
12578        inserted_ids.extend((0..chunk.len()).map(|offset| first_id + offset as i64));
12579    }
12580
12581    Ok(inserted_ids)
12582}
12583
/// Test-only variant of `franken_insert_new_message` that records timings.
///
/// Increments `single_row_calls` and `batch_rows` on `profile`, then adds the
/// elapsed time of payload construction, statement execution and rowid
/// retrieval to `payload_duration`, `execute_duration` and `rowid_duration`.
#[cfg(test)]
fn franken_insert_new_message_with_profile(
    tx: &FrankenTransaction<'_>,
    conversation_id: i64,
    msg: &Message,
    profile: &mut MessageInsertSubstageProfile,
) -> Result<i64> {
    profile.single_row_calls += 1;
    profile.batch_rows += 1;

    let payload_timer = Instant::now();
    let (payload_json, payload_bin) = franken_message_insert_payload(msg)?;
    profile.payload_duration += payload_timer.elapsed();
    let payload_bin_bytes = payload_bin.as_deref();

    let execute_timer = Instant::now();
    tx.execute_compat(
        "INSERT INTO messages(conversation_id, idx, role, author, created_at, content, extra_json, extra_bin)
         VALUES(?1,?2,?3,?4,?5,?6,?7,?8)",
        fparams![
            conversation_id,
            msg.idx,
            role_as_str(&msg.role),
            msg.author.as_deref(),
            msg.created_at,
            msg.content.as_str(),
            payload_json.as_deref(),
            payload_bin_bytes
        ],
    )?;
    profile.execute_duration += execute_timer.elapsed();

    let rowid_timer = Instant::now();
    let message_id = franken_last_rowid(tx)?;
    profile.rowid_duration += rowid_timer.elapsed();

    Ok(message_id)
}
12621
/// Test-only profiled batch insert using `MESSAGE_INSERT_BATCH_SIZE` rows per
/// statement.
///
/// Thin wrapper over
/// `franken_batch_insert_new_messages_with_profile_batch_size`.
#[cfg(test)]
fn franken_batch_insert_new_messages_with_profile(
    tx: &FrankenTransaction<'_>,
    conversation_id: i64,
    messages: &[&Message],
    profile: &mut MessageInsertSubstageProfile,
) -> Result<Vec<i64>> {
    let rows_per_statement = MESSAGE_INSERT_BATCH_SIZE;
    franken_batch_insert_new_messages_with_profile_batch_size(
        tx,
        conversation_id,
        messages,
        profile,
        rows_per_statement,
    )
}
12637
/// Test-only profiled batch insert using `APPEND_MESSAGE_INSERT_BATCH_SIZE`
/// rows per statement.
///
/// Thin wrapper over
/// `franken_batch_insert_new_messages_with_profile_batch_size`.
#[cfg(test)]
fn franken_append_insert_new_messages_with_profile(
    tx: &FrankenTransaction<'_>,
    conversation_id: i64,
    messages: &[&Message],
    profile: &mut MessageInsertSubstageProfile,
) -> Result<Vec<i64>> {
    let rows_per_statement = APPEND_MESSAGE_INSERT_BATCH_SIZE;
    franken_batch_insert_new_messages_with_profile_batch_size(
        tx,
        conversation_id,
        messages,
        profile,
        rows_per_statement,
    )
}
12653
/// Test-only variant of `franken_batch_insert_new_messages_with_batch_size`
/// that records per-stage timings and counters on `profile`.
///
/// `batch_size` is clamped to `1..=max(MESSAGE_INSERT_BATCH_SIZE,
/// APPEND_MESSAGE_INSERT_BATCH_SIZE)`. `message_insert_batch_sql` only caches
/// statements up to that row count and panics beyond it.
///
/// Single-message chunks are delegated to
/// `franken_insert_new_message_with_profile`. For larger chunks the function
/// bumps `batch_calls` / `batch_rows` and accumulates `sql_build_duration`,
/// `payload_duration`, `param_build_duration`, `execute_duration` and
/// `rowid_duration`.
///
/// # Errors
/// Propagates payload-serialization and statement errors, and fails if the
/// inferred first rowid of a batch would underflow `i64`.
#[cfg(test)]
fn franken_batch_insert_new_messages_with_profile_batch_size(
    tx: &FrankenTransaction<'_>,
    conversation_id: i64,
    messages: &[&Message],
    profile: &mut MessageInsertSubstageProfile,
    batch_size: usize,
) -> Result<Vec<i64>> {
    // Keep every chunk inside the range the cached SQL table covers.
    let max_cached_rows = MESSAGE_INSERT_BATCH_SIZE.max(APPEND_MESSAGE_INSERT_BATCH_SIZE);
    let batch_size = batch_size.clamp(1, max_cached_rows);

    let mut inserted_ids = Vec::with_capacity(messages.len());
    for chunk in messages.chunks(batch_size) {
        if chunk.len() == 1 {
            inserted_ids.push(franken_insert_new_message_with_profile(
                tx,
                conversation_id,
                chunk[0],
                profile,
            )?);
            continue;
        }

        profile.batch_calls += 1;
        profile.batch_rows += chunk.len();

        let sql_build_start = Instant::now();
        let sql = message_insert_batch_sql(chunk.len());
        profile.sql_build_duration += sql_build_start.elapsed();

        let mut param_values: Vec<SqliteValue> = Vec::with_capacity(chunk.len() * 8);
        for msg in chunk {
            let payload_start = Instant::now();
            let (extra_json_str, extra_bin) = franken_message_insert_payload(msg)?;
            profile.payload_duration += payload_start.elapsed();

            let param_build_start = Instant::now();
            param_values.push(SqliteValue::from(conversation_id));
            param_values.push(SqliteValue::from(msg.idx));
            param_values.push(SqliteValue::from(role_as_str(&msg.role)));
            param_values.push(SqliteValue::from(msg.author.as_deref()));
            param_values.push(SqliteValue::from(msg.created_at));
            param_values.push(SqliteValue::from(msg.content.as_str()));
            param_values.push(SqliteValue::from(extra_json_str.as_deref()));
            param_values.push(SqliteValue::from(extra_bin.as_deref()));
            profile.param_build_duration += param_build_start.elapsed();
        }

        let execute_start = Instant::now();
        tx.execute_with_params(sql, &param_values)?;
        profile.execute_duration += execute_start.elapsed();

        // NOTE(review): the ids are inferred, not read back. This assumes the
        // rows of one multi-row INSERT receive consecutive rowids ending at
        // the reported last rowid — confirm this holds for frankensqlite.
        let rowid_start = Instant::now();
        let last_id = franken_last_rowid(tx)?;
        // `chunk.len()` is at most `max_cached_rows`, so the cast cannot truncate.
        let first_id = last_id
            .checked_sub((chunk.len() - 1) as i64)
            .with_context(|| {
                format!(
                    "inferring rowid range for {}-row message batch ending at {last_id}",
                    chunk.len()
                )
            })?;
        inserted_ids.extend(first_id..=last_id);
        profile.rowid_duration += rowid_start.elapsed();
    }

    Ok(inserted_ids)
}
12720
12721/// Insert snippets within a frankensqlite transaction.
12722fn franken_insert_snippets(
12723    tx: &FrankenTransaction<'_>,
12724    message_id: i64,
12725    snippets: &[Snippet],
12726) -> Result<()> {
12727    for snip in snippets {
12728        let file_path_str = snip.file_path.as_ref().map(path_to_string);
12729        tx.execute_compat(
12730            "INSERT INTO snippets(message_id, file_path, start_line, end_line, language, snippet_text)
12731             VALUES(?1,?2,?3,?4,?5,?6)",
12732            fparams![
12733                message_id,
12734                file_path_str.as_deref(),
12735                snip.start_line,
12736                snip.end_line,
12737                snip.language.as_deref(),
12738                snip.snippet_text.as_deref()
12739            ],
12740        )?;
12741    }
12742    Ok(())
12743}
12744
12745fn franken_existing_message_fingerprints(
12746    tx: &FrankenTransaction<'_>,
12747    conversation_id: i64,
12748) -> Result<HashSet<MessageMergeFingerprint>> {
12749    let rows = tx.query_params(
12750        "SELECT idx, role, author, created_at, content
12751         FROM messages
12752         WHERE conversation_id = ?1",
12753        fparams![conversation_id],
12754    )?;
12755    let mut fingerprints = HashSet::with_capacity(rows.len());
12756    for row in rows {
12757        let role: String = row.get_typed(1)?;
12758        let content: String = row.get_typed(4)?;
12759        fingerprints.insert(MessageMergeFingerprint {
12760            idx: row.get_typed(0)?,
12761            created_at: row.get_typed(3)?,
12762            role: role_from_str(&role),
12763            author: row.get_typed(2)?,
12764            content_hash: *blake3::hash(content.as_bytes()).as_bytes(),
12765        });
12766    }
12767    Ok(fingerprints)
12768}
12769
12770struct ExistingMessageLookup {
12771    by_idx: HashMap<i64, MessageMergeFingerprint>,
12772    replay: HashSet<MessageReplayFingerprint>,
12773}
12774
12775fn existing_message_lookup_from_rows(
12776    rows: Vec<FrankenRow>,
12777    min_idx: i64,
12778    max_idx: i64,
12779    created_bounds: Option<(i64, i64)>,
12780    replay_full_scan: bool,
12781) -> Result<ExistingMessageLookup> {
12782    let mut by_idx = HashMap::with_capacity(rows.len());
12783    let mut replay = HashSet::with_capacity(rows.len());
12784    for row in rows {
12785        let idx: i64 = row.get_typed(0)?;
12786        let role: String = row.get_typed(1)?;
12787        let author: Option<String> = row.get_typed(2)?;
12788        let created_at: Option<i64> = row.get_typed(3)?;
12789        let content: String = row.get_typed(4)?;
12790        let role = role_from_str(&role);
12791        let content_hash = *blake3::hash(content.as_bytes()).as_bytes();
12792
12793        if idx >= min_idx && idx <= max_idx {
12794            by_idx.insert(
12795                idx,
12796                MessageMergeFingerprint {
12797                    idx,
12798                    created_at,
12799                    role: role.clone(),
12800                    author: author.clone(),
12801                    content_hash,
12802                },
12803            );
12804        }
12805
12806        let replay_matches = if replay_full_scan {
12807            true
12808        } else if let Some((min_created_at, max_created_at)) = created_bounds {
12809            created_at.is_some_and(|ts| ts >= min_created_at && ts <= max_created_at)
12810        } else {
12811            true
12812        };
12813        if replay_matches {
12814            replay.insert(MessageReplayFingerprint {
12815                created_at,
12816                role,
12817                author,
12818                content_hash,
12819            });
12820        }
12821    }
12822    Ok(ExistingMessageLookup { by_idx, replay })
12823}
12824
12825fn franken_existing_message_lookup(
12826    tx: &FrankenTransaction<'_>,
12827    conversation_id: i64,
12828    incoming_messages: &[Message],
12829) -> Result<ExistingMessageLookup> {
12830    if incoming_messages.is_empty() {
12831        return Ok(ExistingMessageLookup {
12832            by_idx: HashMap::new(),
12833            replay: HashSet::new(),
12834        });
12835    }
12836
12837    let min_idx = incoming_messages
12838        .iter()
12839        .map(|msg| msg.idx)
12840        .min()
12841        .unwrap_or(0);
12842    let max_idx = incoming_messages
12843        .iter()
12844        .map(|msg| msg.idx)
12845        .max()
12846        .unwrap_or(min_idx);
12847    let idx_rows = tx.query_params(
12848        "SELECT idx
12849         FROM messages INDEXED BY sqlite_autoindex_messages_1
12850         WHERE conversation_id = ?1
12851           AND idx >= ?2
12852           AND idx <= ?3",
12853        fparams![conversation_id, min_idx, max_idx],
12854    )?;
12855    record_message_lookup_bounded_queries(1, idx_rows.len());
12856
12857    let mut existing_indices = HashSet::with_capacity(idx_rows.len());
12858    for row in idx_rows {
12859        let idx: i64 = row.get_typed(0)?;
12860        existing_indices.insert(idx);
12861    }
12862
12863    let mut by_idx = HashMap::with_capacity(incoming_messages.len().min(existing_indices.len()));
12864    let mut missing_messages = Vec::new();
12865    for msg in incoming_messages {
12866        if existing_indices.contains(&msg.idx) {
12867            // Same-idx messages are skipped by merge policy even when content has
12868            // diverged. Use the incoming fingerprint as a lightweight presence
12869            // marker so normal reprocessing does not need to read stored content.
12870            by_idx.insert(msg.idx, message_merge_fingerprint(msg));
12871        } else {
12872            missing_messages.push(msg);
12873        }
12874    }
12875
12876    if missing_messages.is_empty() {
12877        return Ok(ExistingMessageLookup {
12878            by_idx,
12879            replay: HashSet::new(),
12880        });
12881    }
12882
12883    let requires_full_scan = missing_messages.iter().any(|msg| msg.created_at.is_none());
12884    let created_bounds = missing_messages
12885        .iter()
12886        .filter_map(|msg| msg.created_at)
12887        .fold(None, |bounds: Option<(i64, i64)>, created_at| {
12888            Some(match bounds {
12889                Some((min_created_at, max_created_at)) => (
12890                    min_created_at.min(created_at),
12891                    max_created_at.max(created_at),
12892                ),
12893                None => (created_at, created_at),
12894            })
12895        });
12896
12897    let mut replay = HashSet::new();
12898    if requires_full_scan {
12899        let rows = tx.query_params(
12900            "SELECT idx, role, author, created_at, content
12901             FROM messages INDEXED BY sqlite_autoindex_messages_1
12902             WHERE conversation_id = ?1",
12903            fparams![conversation_id],
12904        )?;
12905        record_message_lookup_full_scan_query(rows.len());
12906        let content_lookup =
12907            existing_message_lookup_from_rows(rows, min_idx, max_idx, created_bounds, true)?;
12908        by_idx.extend(content_lookup.by_idx);
12909        replay.extend(content_lookup.replay);
12910    } else if let Some((min_created_at, max_created_at)) = created_bounds {
12911        let rows = tx.query_params(
12912            "SELECT idx, role, author, created_at, content
12913             FROM messages INDEXED BY sqlite_autoindex_messages_1
12914             WHERE conversation_id = ?1
12915               AND created_at IS NOT NULL
12916               AND created_at >= ?2
12917               AND created_at <= ?3",
12918            fparams![conversation_id, min_created_at, max_created_at],
12919        )?;
12920        record_message_lookup_bounded_queries(1, rows.len());
12921        let created_lookup =
12922            existing_message_lookup_from_rows(rows, min_idx, max_idx, created_bounds, false)?;
12923        by_idx.extend(created_lookup.by_idx);
12924        replay.extend(created_lookup.replay);
12925    }
12926
12927    Ok(ExistingMessageLookup { by_idx, replay })
12928}
12929
12930fn franken_existing_message_lookup_with_pending(
12931    tx: &FrankenTransaction<'_>,
12932    conversation_id: i64,
12933    incoming_messages: &[Message],
12934    pending_message_fingerprints: &mut HashMap<i64, HashMap<i64, MessageMergeFingerprint>>,
12935    pending_message_replay_fingerprints: &mut HashMap<i64, HashSet<MessageReplayFingerprint>>,
12936) -> Result<ExistingMessageLookup> {
12937    if let (Some(by_idx), Some(replay)) = (
12938        pending_message_fingerprints.get(&conversation_id),
12939        pending_message_replay_fingerprints.get(&conversation_id),
12940    ) {
12941        if incoming_messages.iter().all(|msg| {
12942            by_idx.contains_key(&msg.idx) || replay.contains(&message_replay_fingerprint(msg))
12943        }) {
12944            return Ok(ExistingMessageLookup {
12945                by_idx: by_idx.clone(),
12946                replay: replay.clone(),
12947            });
12948        }
12949
12950        let fresh = franken_existing_message_lookup(tx, conversation_id, incoming_messages)?;
12951        let mut merged_by_idx = by_idx.clone();
12952        let mut merged_replay = replay.clone();
12953        merged_by_idx.extend(fresh.by_idx);
12954        merged_replay.extend(fresh.replay);
12955        pending_message_fingerprints.insert(conversation_id, merged_by_idx.clone());
12956        pending_message_replay_fingerprints.insert(conversation_id, merged_replay.clone());
12957        return Ok(ExistingMessageLookup {
12958            by_idx: merged_by_idx,
12959            replay: merged_replay,
12960        });
12961    }
12962
12963    let lookup = franken_existing_message_lookup(tx, conversation_id, incoming_messages)?;
12964    pending_message_fingerprints.insert(conversation_id, lookup.by_idx.clone());
12965    pending_message_replay_fingerprints.insert(conversation_id, lookup.replay.clone());
12966    Ok(lookup)
12967}
12968
12969fn franken_collect_batched_existing_new_messages<'a>(
12970    tx: &FrankenTransaction<'_>,
12971    conversation_id: i64,
12972    conv: &'a Conversation,
12973    pending_message_fingerprints: &mut HashMap<i64, HashMap<i64, MessageMergeFingerprint>>,
12974    pending_message_replay_fingerprints: &mut HashMap<i64, HashSet<MessageReplayFingerprint>>,
12975    replay_skip_log: &'static str,
12976) -> Result<(
12977    ExistingConversationNewMessages<'a>,
12978    HashMap<i64, MessageMergeFingerprint>,
12979    HashSet<MessageReplayFingerprint>,
12980)> {
12981    let tail_metadata = franken_cached_existing_conversation_tail_metadata(tx, conversation_id)?;
12982    let tail_state = tail_metadata.complete_tail_state();
12983    if let Some(tail_state) = tail_state
12984        && let Some(tail_plan) = collect_append_only_tail_messages(
12985            conv,
12986            tail_state.last_message_idx,
12987            tail_state.last_message_created_at,
12988        )
12989    {
12990        let mut by_idx = pending_message_fingerprints
12991            .remove(&conversation_id)
12992            .unwrap_or_default();
12993        let mut replay = pending_message_replay_fingerprints
12994            .remove(&conversation_id)
12995            .unwrap_or_default();
12996        for msg in &tail_plan.messages {
12997            let fingerprint = message_merge_fingerprint(msg);
12998            by_idx.insert(msg.idx, fingerprint.clone());
12999            replay.insert(replay_fingerprint_from_merge(&fingerprint));
13000        }
13001        return Ok((tail_plan, by_idx, replay));
13002    }
13003
13004    let timestamp_data_incomplete = tail_metadata.last_message_created_at.is_none()
13005        || conv.messages.iter().any(|msg| msg.created_at.is_none());
13006    if timestamp_data_incomplete
13007        && let Some(existing_ended_at) = tail_metadata.ended_at
13008        && let Some(noop_plan) =
13009            collect_existing_conversation_noop_from_conversation_ended_at(conv, existing_ended_at)
13010    {
13011        let by_idx = pending_message_fingerprints
13012            .remove(&conversation_id)
13013            .unwrap_or_default();
13014        let replay = pending_message_replay_fingerprints
13015            .remove(&conversation_id)
13016            .unwrap_or_default();
13017        return Ok((noop_plan, by_idx, replay));
13018    }
13019
13020    if timestamp_data_incomplete
13021        && let Some(last_message_idx) = tail_metadata.last_message_idx
13022        && let Some(tail_plan) =
13023            collect_existing_conversation_noop_from_idx_tail(conv, last_message_idx)
13024    {
13025        let mut by_idx = pending_message_fingerprints
13026            .remove(&conversation_id)
13027            .unwrap_or_default();
13028        let mut replay = pending_message_replay_fingerprints
13029            .remove(&conversation_id)
13030            .unwrap_or_default();
13031        for msg in &tail_plan.messages {
13032            let fingerprint = message_merge_fingerprint(msg);
13033            by_idx.insert(msg.idx, fingerprint.clone());
13034            replay.insert(replay_fingerprint_from_merge(&fingerprint));
13035        }
13036        return Ok((tail_plan, by_idx, replay));
13037    }
13038
13039    let existing_ended_at = if tail_metadata.ended_at.is_some() {
13040        tail_metadata.ended_at
13041    } else {
13042        franken_existing_conversation_ended_at(tx, conversation_id)?
13043    };
13044    if let Some(existing_ended_at) = existing_ended_at
13045        && let Some(tail_plan) =
13046            collect_existing_conversation_tail_from_ended_at(conv, existing_ended_at)
13047    {
13048        let mut by_idx = pending_message_fingerprints
13049            .remove(&conversation_id)
13050            .unwrap_or_default();
13051        let mut replay = pending_message_replay_fingerprints
13052            .remove(&conversation_id)
13053            .unwrap_or_default();
13054        for msg in &tail_plan.messages {
13055            let fingerprint = message_merge_fingerprint(msg);
13056            by_idx.insert(msg.idx, fingerprint.clone());
13057            replay.insert(replay_fingerprint_from_merge(&fingerprint));
13058        }
13059        return Ok((tail_plan, by_idx, replay));
13060    }
13061
13062    trace_existing_conversation_lookup_fallback(
13063        conversation_id,
13064        conv,
13065        tail_state,
13066        existing_ended_at,
13067    );
13068
13069    let ExistingMessageLookup {
13070        by_idx: mut existing_messages,
13071        replay: mut existing_replay_fingerprints,
13072    } = franken_existing_message_lookup_with_pending(
13073        tx,
13074        conversation_id,
13075        &conv.messages,
13076        pending_message_fingerprints,
13077        pending_message_replay_fingerprints,
13078    )?;
13079    let new_messages = collect_new_messages_for_existing_conversation(
13080        conversation_id,
13081        conv,
13082        &mut existing_messages,
13083        &mut existing_replay_fingerprints,
13084        replay_skip_log,
13085    );
13086    Ok((
13087        new_messages,
13088        existing_messages,
13089        existing_replay_fingerprints,
13090    ))
13091}
13092
13093/// Batch insert FTS5 entries within a frankensqlite transaction.
13094fn franken_batch_insert_fts(tx: &FrankenTransaction<'_>, entries: &[FtsEntry]) -> Result<usize> {
13095    if entries.is_empty() {
13096        return Ok(0);
13097    }
13098
13099    let mut inserted = 0;
13100
13101    for chunk in entries.chunks(FTS5_BATCH_SIZE) {
13102        let placeholders: String = chunk
13103            .iter()
13104            .enumerate()
13105            .map(|(i, _)| {
13106                let base = i * 7 + 1; // +1 for 1-indexed params
13107                format!(
13108                    "(?{},?{},?{},?{},?{},?{},?{})",
13109                    base,
13110                    base + 1,
13111                    base + 2,
13112                    base + 3,
13113                    base + 4,
13114                    base + 5,
13115                    base + 6
13116                )
13117            })
13118            .collect::<Vec<_>>()
13119            .join(",");
13120
13121        let sql = format!(
13122            "INSERT INTO fts_messages(rowid, content, title, agent, workspace, source_path, created_at) VALUES {placeholders}"
13123        );
13124
13125        let mut param_values: Vec<SqliteValue> = Vec::with_capacity(chunk.len() * 7);
13126        for entry in chunk {
13127            param_values.push(SqliteValue::from(entry.message_id));
13128            param_values.push(SqliteValue::from(entry.content.as_str()));
13129            param_values.push(SqliteValue::from(entry.title.as_str()));
13130            param_values.push(SqliteValue::from(entry.agent.as_str()));
13131            param_values.push(SqliteValue::from(entry.workspace.as_str()));
13132            param_values.push(SqliteValue::from(entry.source_path.as_str()));
13133            param_values.push(SqliteValue::from(entry.created_at));
13134        }
13135
13136        match tx.execute_with_params(&sql, &param_values) {
13137            Ok(_) => {
13138                inserted += chunk.len();
13139            }
13140            Err(err) => {
13141                tracing::warn!(
13142                    error = %err,
13143                    chunk_docs = chunk.len(),
13144                    "frankensqlite FTS batch insert failed; skipping db-resident FTS maintenance because Tantivy is authoritative"
13145                );
13146                return Ok(inserted);
13147            }
13148        }
13149    }
13150
13151    Ok(inserted)
13152}
13153
13154fn franken_batch_insert_fts_on_connection(
13155    conn: &FrankenConnection,
13156    entries: &[FtsEntry],
13157) -> Result<usize> {
13158    if entries.is_empty() {
13159        return Ok(0);
13160    }
13161
13162    let mut inserted = 0;
13163
13164    for chunk in entries.chunks(FTS5_BATCH_SIZE) {
13165        let placeholders: String = chunk
13166            .iter()
13167            .enumerate()
13168            .map(|(i, _)| {
13169                let base = i * 7 + 1;
13170                format!(
13171                    "(?{},?{},?{},?{},?{},?{},?{})",
13172                    base,
13173                    base + 1,
13174                    base + 2,
13175                    base + 3,
13176                    base + 4,
13177                    base + 5,
13178                    base + 6
13179                )
13180            })
13181            .collect::<Vec<_>>()
13182            .join(",");
13183
13184        let sql = format!(
13185            "INSERT INTO fts_messages(rowid, content, title, agent, workspace, source_path, created_at) VALUES {placeholders}"
13186        );
13187
13188        let mut param_values: Vec<SqliteValue> = Vec::with_capacity(chunk.len() * 7);
13189        for entry in chunk {
13190            param_values.push(SqliteValue::from(entry.message_id));
13191            param_values.push(SqliteValue::from(entry.content.as_str()));
13192            param_values.push(SqliteValue::from(entry.title.as_str()));
13193            param_values.push(SqliteValue::from(entry.agent.as_str()));
13194            param_values.push(SqliteValue::from(entry.workspace.as_str()));
13195            param_values.push(SqliteValue::from(entry.source_path.as_str()));
13196            param_values.push(SqliteValue::from(entry.created_at));
13197        }
13198
13199        conn.execute_with_params(&sql, &param_values)
13200            .with_context(|| {
13201                format!(
13202                    "inserting {} rows into fts_messages during streaming FTS maintenance",
13203                    chunk.len()
13204                )
13205            })?;
13206        inserted += chunk.len();
13207    }
13208
13209    Ok(inserted)
13210}
13211
13212/// Update daily stats within a frankensqlite transaction.
13213fn franken_update_daily_stats_in_tx(
13214    storage: &FrankenStorage,
13215    tx: &FrankenTransaction<'_>,
13216    agent_slug: &str,
13217    source_id: &str,
13218    started_at: Option<i64>,
13219    delta: StatsDelta,
13220) -> Result<()> {
13221    let day_id = started_at
13222        .map(FrankenStorage::day_id_from_millis)
13223        .unwrap_or(0);
13224    let now = FrankenStorage::now_millis();
13225
13226    let targets = [
13227        DailyStatsTarget {
13228            day_id,
13229            agent_slug,
13230            source_id,
13231        },
13232        DailyStatsTarget {
13233            day_id,
13234            agent_slug: "all",
13235            source_id,
13236        },
13237        DailyStatsTarget {
13238            day_id,
13239            agent_slug,
13240            source_id: "all",
13241        },
13242        DailyStatsTarget {
13243            day_id,
13244            agent_slug: "all",
13245            source_id: "all",
13246        },
13247    ];
13248
13249    if agent_slug != "all"
13250        && source_id != "all"
13251        && franken_update_ensured_daily_stats_targets_in_tx(storage, tx, &targets, now, delta)?
13252    {
13253        return Ok(());
13254    }
13255
13256    for target in targets {
13257        franken_apply_daily_stats_delta_in_tx(storage, tx, target, now, delta)?;
13258    }
13259
13260    Ok(())
13261}
13262
/// Identifies one `daily_stats` row by its `(day_id, agent_slug, source_id)` key.
///
/// The slug and source id are borrowed, so a target is cheap to copy and pass
/// by value.
#[derive(Debug, Clone, Copy)]
struct DailyStatsTarget<'a> {
    /// Day bucket the row belongs to.
    day_id: i64,
    /// Agent slug of the row, or the literal `"all"` for a roll-up row.
    agent_slug: &'a str,
    /// Source id of the row, or the literal `"all"` for a roll-up row.
    source_id: &'a str,
}
13269
13270fn franken_update_ensured_daily_stats_targets_in_tx(
13271    storage: &FrankenStorage,
13272    tx: &FrankenTransaction<'_>,
13273    targets: &[DailyStatsTarget<'_>; 4],
13274    now: i64,
13275    delta: StatsDelta,
13276) -> Result<bool> {
13277    let cache_keys = targets.map(|target| {
13278        EnsuredDailyStatsKey::new(target.day_id, target.agent_slug, target.source_id)
13279    });
13280    if !storage.daily_stats_keys_already_ensured(&cache_keys) {
13281        return Ok(false);
13282    }
13283
13284    let primary = targets[0];
13285    let rows_changed = tx.execute_compat(
13286        "UPDATE daily_stats
13287         SET session_count = session_count + ?4,
13288             message_count = message_count + ?5,
13289             total_chars = total_chars + ?6,
13290             last_updated = ?7
13291         WHERE day_id = ?1
13292           AND ((agent_slug = ?2 AND source_id = ?3)
13293                OR (agent_slug = 'all' AND source_id = ?3)
13294                OR (agent_slug = ?2 AND source_id = 'all')
13295                OR (agent_slug = 'all' AND source_id = 'all'))",
13296        fparams![
13297            primary.day_id,
13298            primary.agent_slug,
13299            primary.source_id,
13300            delta.session_count_delta,
13301            delta.message_count_delta,
13302            delta.total_chars_delta,
13303            now
13304        ],
13305    )?;
13306    if rows_changed == targets.len() {
13307        return Ok(true);
13308    }
13309
13310    for (target, cache_key) in targets.iter().copied().zip(cache_keys) {
13311        let exists = tx
13312            .query_row_map(
13313                "SELECT 1 FROM daily_stats
13314                 WHERE day_id = ?1 AND agent_slug = ?2 AND source_id = ?3
13315                 LIMIT 1",
13316                fparams![target.day_id, target.agent_slug, target.source_id],
13317                |row| row.get_typed::<i64>(0),
13318            )
13319            .optional()?
13320            .is_some();
13321        if exists {
13322            continue;
13323        }
13324
13325        tx.execute_compat(
13326            "INSERT INTO daily_stats(day_id, agent_slug, source_id, session_count, message_count, total_chars, last_updated)
13327             VALUES(?1,?2,?3,?4,?5,?6,?7)",
13328            fparams![
13329                target.day_id,
13330                target.agent_slug,
13331                target.source_id,
13332                delta.session_count_delta,
13333                delta.message_count_delta,
13334                delta.total_chars_delta,
13335                now
13336            ],
13337        )?;
13338        storage.mark_daily_stats_key_ensured(cache_key);
13339    }
13340
13341    Ok(true)
13342}
13343
13344fn franken_apply_daily_stats_delta_in_tx(
13345    storage: &FrankenStorage,
13346    tx: &FrankenTransaction<'_>,
13347    target: DailyStatsTarget<'_>,
13348    now: i64,
13349    delta: StatsDelta,
13350) -> Result<()> {
13351    let cache_key = EnsuredDailyStatsKey::new(target.day_id, target.agent_slug, target.source_id);
13352    if storage.daily_stats_key_already_ensured(&cache_key) {
13353        let rows_changed = tx.execute_compat(
13354            "UPDATE daily_stats
13355             SET session_count = session_count + ?4,
13356                 message_count = message_count + ?5,
13357                 total_chars = total_chars + ?6,
13358                 last_updated = ?7
13359             WHERE day_id = ?1 AND agent_slug = ?2 AND source_id = ?3",
13360            fparams![
13361                target.day_id,
13362                target.agent_slug,
13363                target.source_id,
13364                delta.session_count_delta,
13365                delta.message_count_delta,
13366                delta.total_chars_delta,
13367                now
13368            ],
13369        )?;
13370        if rows_changed > 0 {
13371            return Ok(());
13372        }
13373    }
13374
13375    tx.execute_compat(
13376        "INSERT INTO daily_stats(day_id, agent_slug, source_id, session_count, message_count, total_chars, last_updated)
13377         VALUES(?1,?2,?3,?4,?5,?6,?7)
13378         ON CONFLICT(day_id, agent_slug, source_id) DO UPDATE SET
13379            session_count = session_count + excluded.session_count,
13380            message_count = message_count + excluded.message_count,
13381            total_chars = total_chars + excluded.total_chars,
13382            last_updated = excluded.last_updated",
13383        fparams![
13384            target.day_id,
13385            target.agent_slug,
13386            target.source_id,
13387            delta.session_count_delta,
13388            delta.message_count_delta,
13389            delta.total_chars_delta,
13390            now
13391        ],
13392    )?;
13393    storage.mark_daily_stats_key_ensured(cache_key);
13394    Ok(())
13395}
13396
13397// -------------------------------------------------------------------------
13398// Frankensqlite batch helpers
13399// -------------------------------------------------------------------------
13400
13401/// Batch upsert daily_stats within a frankensqlite transaction.
13402fn franken_update_daily_stats_batched_in_tx(
13403    tx: &FrankenTransaction<'_>,
13404    entries: &[(i64, String, String, StatsDelta)],
13405) -> Result<usize> {
13406    if entries.is_empty() {
13407        return Ok(0);
13408    }
13409
13410    let now = FrankenStorage::now_millis();
13411    let mut total_affected = 0;
13412
13413    // Keep frankensqlite UPSERTs row-wise inside the transaction. The
13414    // multi-row VALUES ... ON CONFLICT form still falls back through
13415    // INSERT...SELECT in fsqlite-core, which rejects UPSERT/RETURNING during
13416    // real cass indexing.
13417    for (day_id, agent, source, delta) in entries {
13418        total_affected += tx.execute_compat(
13419            "INSERT INTO daily_stats (day_id, agent_slug, source_id, session_count, message_count, total_chars, last_updated)
13420             VALUES(?1,?2,?3,?4,?5,?6,?7)
13421             ON CONFLICT(day_id, agent_slug, source_id) DO UPDATE SET
13422                 session_count = session_count + excluded.session_count,
13423                 message_count = message_count + excluded.message_count,
13424                 total_chars = total_chars + excluded.total_chars,
13425                 last_updated = excluded.last_updated",
13426            fparams![
13427                *day_id,
13428                agent.as_str(),
13429                source.as_str(),
13430                delta.session_count_delta,
13431                delta.message_count_delta,
13432                delta.total_chars_delta,
13433                now
13434            ],
13435        )?;
13436    }
13437
13438    Ok(total_affected)
13439}
13440
13441/// Batch insert token_usage rows within a frankensqlite transaction.
13442///
13443/// Uses row-wise INSERT OR IGNORE to avoid the frankensqlite limitation where
13444/// multi-row VALUES lists fall through to INSERT...SELECT, which rejects
13445/// UPSERT/OR IGNORE conflict clauses.
13446fn franken_insert_token_usage_batched_in_tx(
13447    tx: &FrankenTransaction<'_>,
13448    entries: &[TokenUsageEntry],
13449) -> Result<usize> {
13450    if entries.is_empty() {
13451        return Ok(0);
13452    }
13453
13454    let mut total_inserted = 0;
13455
13456    for e in entries {
13457        let params_vec: Vec<ParamValue> = vec![
13458            ParamValue::from(e.message_id),
13459            ParamValue::from(e.conversation_id),
13460            ParamValue::from(e.agent_id),
13461            ParamValue::from(e.workspace_id),
13462            ParamValue::from(e.source_id.clone()),
13463            ParamValue::from(e.timestamp_ms),
13464            ParamValue::from(e.day_id),
13465            ParamValue::from(e.model_name.clone()),
13466            ParamValue::from(e.model_family.clone()),
13467            ParamValue::from(e.model_tier.clone()),
13468            ParamValue::from(e.service_tier.clone()),
13469            ParamValue::from(e.provider.clone()),
13470            ParamValue::from(e.input_tokens),
13471            ParamValue::from(e.output_tokens),
13472            ParamValue::from(e.cache_read_tokens),
13473            ParamValue::from(e.cache_creation_tokens),
13474            ParamValue::from(e.thinking_tokens),
13475            ParamValue::from(e.total_tokens),
13476            ParamValue::from(e.estimated_cost_usd),
13477            ParamValue::from(e.role.clone()),
13478            ParamValue::from(e.content_chars),
13479            ParamValue::from(e.has_tool_calls as i64),
13480            ParamValue::from(e.tool_call_count as i64),
13481            ParamValue::from(e.data_source.clone()),
13482        ];
13483
13484        let values = param_slice_to_values(&params_vec);
13485        total_inserted += tx.execute_with_params(
13486            "INSERT OR IGNORE INTO token_usage (
13487                message_id, conversation_id, agent_id, workspace_id, source_id,
13488                timestamp_ms, day_id,
13489                model_name, model_family, model_tier, service_tier, provider,
13490                input_tokens, output_tokens, cache_read_tokens, cache_creation_tokens,
13491                thinking_tokens, total_tokens, estimated_cost_usd,
13492                role, content_chars, has_tool_calls, tool_call_count, data_source
13493            )
13494            VALUES(?1,?2,?3,?4,?5,?6,?7,?8,?9,?10,?11,?12,?13,?14,?15,?16,?17,?18,?19,?20,?21,?22,?23,?24)",
13495            &values,
13496        )?;
13497    }
13498
13499    Ok(total_inserted)
13500}
13501
13502/// Batch upsert token_daily_stats within a frankensqlite transaction.
13503fn franken_update_token_daily_stats_batched_in_tx(
13504    tx: &FrankenTransaction<'_>,
13505    entries: &[(i64, String, String, String, TokenStatsDelta)],
13506) -> Result<usize> {
13507    if entries.is_empty() {
13508        return Ok(0);
13509    }
13510
13511    let now = FrankenStorage::now_millis();
13512    let mut total_affected = 0;
13513
13514    for (day_id, agent, source, model, delta) in entries {
13515        total_affected += tx.execute_compat(
13516            "INSERT INTO token_daily_stats (
13517                day_id, agent_slug, source_id, model_family,
13518                api_call_count, user_message_count, assistant_message_count, tool_message_count,
13519                total_input_tokens, total_output_tokens, total_cache_read_tokens,
13520                total_cache_creation_tokens, total_thinking_tokens, grand_total_tokens,
13521                total_content_chars, total_tool_calls, estimated_cost_usd, session_count,
13522                last_updated
13523            )
13524            VALUES(?1,?2,?3,?4,?5,?6,?7,?8,?9,?10,?11,?12,?13,?14,?15,?16,?17,?18,?19)
13525            ON CONFLICT(day_id, agent_slug, source_id, model_family) DO UPDATE SET
13526                api_call_count = api_call_count + excluded.api_call_count,
13527                user_message_count = user_message_count + excluded.user_message_count,
13528                assistant_message_count = assistant_message_count + excluded.assistant_message_count,
13529                tool_message_count = tool_message_count + excluded.tool_message_count,
13530                total_input_tokens = total_input_tokens + excluded.total_input_tokens,
13531                total_output_tokens = total_output_tokens + excluded.total_output_tokens,
13532                total_cache_read_tokens = total_cache_read_tokens + excluded.total_cache_read_tokens,
13533                total_cache_creation_tokens = total_cache_creation_tokens + excluded.total_cache_creation_tokens,
13534                total_thinking_tokens = total_thinking_tokens + excluded.total_thinking_tokens,
13535                grand_total_tokens = grand_total_tokens + excluded.grand_total_tokens,
13536                total_content_chars = total_content_chars + excluded.total_content_chars,
13537                total_tool_calls = total_tool_calls + excluded.total_tool_calls,
13538                estimated_cost_usd = estimated_cost_usd + excluded.estimated_cost_usd,
13539                session_count = session_count + excluded.session_count,
13540                last_updated = excluded.last_updated",
13541            fparams![
13542                *day_id,
13543                agent.as_str(),
13544                source.as_str(),
13545                model.as_str(),
13546                delta.api_call_count,
13547                delta.user_message_count,
13548                delta.assistant_message_count,
13549                delta.tool_message_count,
13550                delta.total_input_tokens,
13551                delta.total_output_tokens,
13552                delta.total_cache_read_tokens,
13553                delta.total_cache_creation_tokens,
13554                delta.total_thinking_tokens,
13555                delta.grand_total_tokens,
13556                delta.total_content_chars,
13557                delta.total_tool_calls,
13558                delta.estimated_cost_usd,
13559                delta.session_count,
13560                now
13561            ],
13562        )?;
13563    }
13564
13565    Ok(total_affected)
13566}
13567
13568/// Batch insert message_metrics rows within a frankensqlite transaction.
13569///
13570/// Uses row-wise INSERT OR IGNORE to avoid the frankensqlite limitation where
13571/// multi-row VALUES lists fall through to INSERT...SELECT, which rejects
13572/// UPSERT/OR IGNORE conflict clauses.
13573fn franken_insert_message_metrics_batched_in_tx(
13574    tx: &FrankenTransaction<'_>,
13575    entries: &[MessageMetricsEntry],
13576) -> Result<usize> {
13577    if entries.is_empty() {
13578        return Ok(0);
13579    }
13580
13581    let mut total_inserted = 0;
13582
13583    for e in entries {
13584        let params_vec: Vec<ParamValue> = vec![
13585            ParamValue::from(e.message_id),
13586            ParamValue::from(e.created_at_ms),
13587            ParamValue::from(e.hour_id),
13588            ParamValue::from(e.day_id),
13589            ParamValue::from(e.agent_slug.clone()),
13590            ParamValue::from(e.workspace_id),
13591            ParamValue::from(e.source_id.clone()),
13592            ParamValue::from(e.role.clone()),
13593            ParamValue::from(e.content_chars),
13594            ParamValue::from(e.content_tokens_est),
13595            ParamValue::from(e.model_name.clone()),
13596            ParamValue::from(e.model_family.clone()),
13597            ParamValue::from(e.model_tier.clone()),
13598            ParamValue::from(e.provider.clone()),
13599            ParamValue::from(e.api_input_tokens),
13600            ParamValue::from(e.api_output_tokens),
13601            ParamValue::from(e.api_cache_read_tokens),
13602            ParamValue::from(e.api_cache_creation_tokens),
13603            ParamValue::from(e.api_thinking_tokens),
13604            ParamValue::from(e.api_service_tier.clone()),
13605            ParamValue::from(e.api_data_source.clone()),
13606            ParamValue::from(e.tool_call_count),
13607            ParamValue::from(e.has_tool_calls as i64),
13608            ParamValue::from(e.has_plan as i64),
13609        ];
13610
13611        let values = param_slice_to_values(&params_vec);
13612        total_inserted += tx.execute_with_params(
13613            "INSERT OR IGNORE INTO message_metrics (
13614                message_id, created_at_ms, hour_id, day_id,
13615                agent_slug, workspace_id, source_id, role,
13616                content_chars, content_tokens_est,
13617                model_name, model_family, model_tier, provider,
13618                api_input_tokens, api_output_tokens, api_cache_read_tokens,
13619                api_cache_creation_tokens, api_thinking_tokens,
13620                api_service_tier, api_data_source,
13621                tool_call_count, has_tool_calls, has_plan
13622            )
13623            VALUES(?1,?2,?3,?4,?5,?6,?7,?8,?9,?10,?11,?12,?13,?14,?15,?16,?17,?18,?19,?20,?21,?22,?23,?24)",
13624            &values,
13625        )?;
13626    }
13627
13628    Ok(total_inserted)
13629}
13630
13631/// Flush one rollup table (shared logic for hourly + daily) within a frankensqlite transaction.
13632fn franken_flush_rollup_table(
13633    tx: &FrankenTransaction<'_>,
13634    table: &str,
13635    bucket_col: &str,
13636    deltas: &HashMap<(i64, String, i64, String), UsageRollupDelta>,
13637    now: i64,
13638) -> Result<usize> {
13639    if deltas.is_empty() {
13640        return Ok(0);
13641    }
13642
13643    let mut total_affected = 0;
13644
13645    for ((bucket_id, agent, workspace_id, source), d) in deltas {
13646        let sql = format!(
13647            "INSERT INTO {table} (
13648                {bucket_col}, agent_slug, workspace_id, source_id,
13649                message_count, user_message_count, assistant_message_count,
13650                tool_call_count, plan_message_count, plan_content_tokens_est_total,
13651                plan_api_tokens_total, api_coverage_message_count,
13652                content_tokens_est_total, content_tokens_est_user, content_tokens_est_assistant,
13653                api_tokens_total, api_input_tokens_total, api_output_tokens_total,
13654                api_cache_read_tokens_total, api_cache_creation_tokens_total,
13655                api_thinking_tokens_total, last_updated
13656            )
13657            VALUES(?1,?2,?3,?4,?5,?6,?7,?8,?9,?10,?11,?12,?13,?14,?15,?16,?17,?18,?19,?20,?21,?22)
13658            ON CONFLICT({bucket_col}, agent_slug, workspace_id, source_id) DO UPDATE SET
13659                message_count = message_count + excluded.message_count,
13660                user_message_count = user_message_count + excluded.user_message_count,
13661                assistant_message_count = assistant_message_count + excluded.assistant_message_count,
13662                tool_call_count = tool_call_count + excluded.tool_call_count,
13663                plan_message_count = plan_message_count + excluded.plan_message_count,
13664                plan_content_tokens_est_total = plan_content_tokens_est_total + excluded.plan_content_tokens_est_total,
13665                plan_api_tokens_total = plan_api_tokens_total + excluded.plan_api_tokens_total,
13666                api_coverage_message_count = api_coverage_message_count + excluded.api_coverage_message_count,
13667                content_tokens_est_total = content_tokens_est_total + excluded.content_tokens_est_total,
13668                content_tokens_est_user = content_tokens_est_user + excluded.content_tokens_est_user,
13669                content_tokens_est_assistant = content_tokens_est_assistant + excluded.content_tokens_est_assistant,
13670                api_tokens_total = api_tokens_total + excluded.api_tokens_total,
13671                api_input_tokens_total = api_input_tokens_total + excluded.api_input_tokens_total,
13672                api_output_tokens_total = api_output_tokens_total + excluded.api_output_tokens_total,
13673                api_cache_read_tokens_total = api_cache_read_tokens_total + excluded.api_cache_read_tokens_total,
13674                api_cache_creation_tokens_total = api_cache_creation_tokens_total + excluded.api_cache_creation_tokens_total,
13675                api_thinking_tokens_total = api_thinking_tokens_total + excluded.api_thinking_tokens_total,
13676                last_updated = excluded.last_updated"
13677        );
13678
13679        total_affected += tx.execute_compat(
13680            &sql,
13681            fparams![
13682                *bucket_id,
13683                agent.as_str(),
13684                *workspace_id,
13685                source.as_str(),
13686                d.message_count,
13687                d.user_message_count,
13688                d.assistant_message_count,
13689                d.tool_call_count,
13690                d.plan_message_count,
13691                d.plan_content_tokens_est_total,
13692                d.plan_api_tokens_total,
13693                d.api_coverage_message_count,
13694                d.content_tokens_est_total,
13695                d.content_tokens_est_user,
13696                d.content_tokens_est_assistant,
13697                d.api_tokens_total,
13698                d.api_input_tokens_total,
13699                d.api_output_tokens_total,
13700                d.api_cache_read_tokens_total,
13701                d.api_cache_creation_tokens_total,
13702                d.api_thinking_tokens_total,
13703                now
13704            ],
13705        )?;
13706    }
13707
13708    Ok(total_affected)
13709}
13710
13711/// Flush usage_models_daily rollup within a frankensqlite transaction.
13712fn franken_flush_model_daily_rollup_table(
13713    tx: &FrankenTransaction<'_>,
13714    deltas: &HashMap<(i64, String, i64, String, String, String), UsageRollupDelta>,
13715    now: i64,
13716) -> Result<usize> {
13717    if deltas.is_empty() {
13718        return Ok(0);
13719    }
13720
13721    let mut total_affected = 0;
13722
13723    for ((day_id, agent, workspace_id, source, model_family, model_tier), d) in deltas {
13724        total_affected += tx.execute_compat(
13725            "INSERT INTO usage_models_daily (
13726                day_id, agent_slug, workspace_id, source_id, model_family, model_tier,
13727                message_count, user_message_count, assistant_message_count,
13728                tool_call_count, plan_message_count, api_coverage_message_count,
13729                content_tokens_est_total, content_tokens_est_user, content_tokens_est_assistant,
13730                api_tokens_total, api_input_tokens_total, api_output_tokens_total,
13731                api_cache_read_tokens_total, api_cache_creation_tokens_total,
13732                api_thinking_tokens_total, last_updated
13733            )
13734            VALUES(?1,?2,?3,?4,?5,?6,?7,?8,?9,?10,?11,?12,?13,?14,?15,?16,?17,?18,?19,?20,?21,?22)
13735            ON CONFLICT(day_id, agent_slug, workspace_id, source_id, model_family, model_tier) DO UPDATE SET
13736                message_count = message_count + excluded.message_count,
13737                user_message_count = user_message_count + excluded.user_message_count,
13738                assistant_message_count = assistant_message_count + excluded.assistant_message_count,
13739                tool_call_count = tool_call_count + excluded.tool_call_count,
13740                plan_message_count = plan_message_count + excluded.plan_message_count,
13741                api_coverage_message_count = api_coverage_message_count + excluded.api_coverage_message_count,
13742                content_tokens_est_total = content_tokens_est_total + excluded.content_tokens_est_total,
13743                content_tokens_est_user = content_tokens_est_user + excluded.content_tokens_est_user,
13744                content_tokens_est_assistant = content_tokens_est_assistant + excluded.content_tokens_est_assistant,
13745                api_tokens_total = api_tokens_total + excluded.api_tokens_total,
13746                api_input_tokens_total = api_input_tokens_total + excluded.api_input_tokens_total,
13747                api_output_tokens_total = api_output_tokens_total + excluded.api_output_tokens_total,
13748                api_cache_read_tokens_total = api_cache_read_tokens_total + excluded.api_cache_read_tokens_total,
13749                api_cache_creation_tokens_total = api_cache_creation_tokens_total + excluded.api_cache_creation_tokens_total,
13750                api_thinking_tokens_total = api_thinking_tokens_total + excluded.api_thinking_tokens_total,
13751                last_updated = excluded.last_updated",
13752            fparams![
13753                *day_id,
13754                agent.as_str(),
13755                *workspace_id,
13756                source.as_str(),
13757                model_family.as_str(),
13758                model_tier.as_str(),
13759                d.message_count,
13760                d.user_message_count,
13761                d.assistant_message_count,
13762                d.tool_call_count,
13763                d.plan_message_count,
13764                d.api_coverage_message_count,
13765                d.content_tokens_est_total,
13766                d.content_tokens_est_user,
13767                d.content_tokens_est_assistant,
13768                d.api_tokens_total,
13769                d.api_input_tokens_total,
13770                d.api_output_tokens_total,
13771                d.api_cache_read_tokens_total,
13772                d.api_cache_creation_tokens_total,
13773                d.api_thinking_tokens_total,
13774                now
13775            ],
13776        )?;
13777    }
13778
13779    Ok(total_affected)
13780}
13781
13782/// Flush AnalyticsRollupAggregator deltas via frankensqlite transaction.
13783fn franken_flush_analytics_rollups_in_tx(
13784    tx: &FrankenTransaction<'_>,
13785    agg: &AnalyticsRollupAggregator,
13786) -> Result<(usize, usize, usize)> {
13787    let now = FrankenStorage::now_millis();
13788
13789    let hourly_affected =
13790        franken_flush_rollup_table(tx, "usage_hourly", "hour_id", &agg.hourly, now)?;
13791    let daily_affected = franken_flush_rollup_table(tx, "usage_daily", "day_id", &agg.daily, now)?;
13792    let models_daily_affected = franken_flush_model_daily_rollup_table(tx, &agg.models_daily, now)?;
13793
13794    Ok((hourly_affected, daily_affected, models_daily_affected))
13795}
13796
13797/// Update conversation-level token summary columns via frankensqlite transaction.
13798fn franken_update_conversation_token_summaries_in_tx(
13799    tx: &FrankenTransaction<'_>,
13800    conversation_id: i64,
13801) -> Result<()> {
13802    tx.execute_compat(
13803        "UPDATE conversations SET
13804            total_input_tokens = (SELECT SUM(input_tokens) FROM token_usage WHERE conversation_id = ?1),
13805            total_output_tokens = (SELECT SUM(output_tokens) FROM token_usage WHERE conversation_id = ?1),
13806            total_cache_read_tokens = (SELECT SUM(cache_read_tokens) FROM token_usage WHERE conversation_id = ?1),
13807            total_cache_creation_tokens = (SELECT SUM(cache_creation_tokens) FROM token_usage WHERE conversation_id = ?1),
13808            grand_total_tokens = (SELECT SUM(total_tokens) FROM token_usage WHERE conversation_id = ?1),
13809            estimated_cost_usd = (SELECT SUM(estimated_cost_usd) FROM token_usage WHERE conversation_id = ?1),
13810            primary_model = (SELECT model_name FROM token_usage WHERE conversation_id = ?1
13811                             AND model_name IS NOT NULL
13812                             GROUP BY model_name ORDER BY COUNT(*) DESC LIMIT 1),
13813            api_call_count = (SELECT COUNT(*) FROM token_usage WHERE conversation_id = ?1
13814                              AND data_source = 'api'),
13815            tool_call_count = (SELECT SUM(tool_call_count) FROM token_usage WHERE conversation_id = ?1),
13816            user_message_count = (SELECT COUNT(*) FROM token_usage WHERE conversation_id = ?1
13817                                  AND role = 'user'),
13818            assistant_message_count = (SELECT COUNT(*) FROM token_usage WHERE conversation_id = ?1
13819                                       AND role IN ('assistant', 'agent'))
13820         WHERE id = ?1",
13821        fparams![conversation_id],
13822    )?;
13823    Ok(())
13824}
13825
13826impl FrankenStorage {
13827    /// Rebuild token_daily_stats from the token_usage ledger.
13828    pub fn rebuild_token_daily_stats(&self) -> Result<usize> {
13829        const CONVERSATION_BATCH_SIZE: usize = 1_000;
13830        const TOKEN_USAGE_BATCH_SIZE: usize = 10_000;
13831
13832        let total_usage_rows: i64 =
13833            self.conn
13834                .query_row_map("SELECT COUNT(*) FROM token_usage", fparams![], |row| {
13835                    row.get_typed(0)
13836                })?;
13837        tracing::info!(
13838            target: "cass::analytics",
13839            total_usage_rows,
13840            "token_daily_stats_rebuild_start"
13841        );
13842
13843        let mut tx = self.conn.transaction()?;
13844        tx.execute("DELETE FROM token_daily_stats")?;
13845
13846        let mut last_conversation_id = 0_i64;
13847        let mut rows_created = 0_usize;
13848
13849        loop {
13850            let conversation_rows = tx.query_map_collect(
13851                "SELECT c.id, c.started_at, c.source_id,
13852                        COALESCE((SELECT a.slug FROM agents a WHERE a.id = c.agent_id), 'unknown')
13853                 FROM conversations c
13854                 WHERE c.id > ?1
13855                 ORDER BY c.id
13856                 LIMIT ?2",
13857                fparams![last_conversation_id, CONVERSATION_BATCH_SIZE as i64],
13858                |row| {
13859                    Ok((
13860                        row.get_typed::<i64>(0)?,
13861                        row.get_typed::<Option<i64>>(1)?,
13862                        row.get_typed::<String>(2)?,
13863                        row.get_typed::<String>(3)?,
13864                    ))
13865                },
13866            )?;
13867            if conversation_rows.is_empty() {
13868                break;
13869            }
13870
13871            let mut aggregate = TokenStatsAggregator::new();
13872
13873            for (conversation_id, started_at, source_id, agent_slug) in conversation_rows {
13874                last_conversation_id = conversation_id;
13875                let conversation_day_id = started_at.map(Self::day_id_from_millis).unwrap_or(0);
13876                let mut last_token_usage_id = 0_i64;
13877                let mut session_model_family = String::from("unknown");
13878
13879                loop {
13880                    let usage_rows = tx.query_map_collect(
13881                        "SELECT id, day_id, role,
13882                                COALESCE(model_family, 'unknown'),
13883                                input_tokens, output_tokens, cache_read_tokens,
13884                                cache_creation_tokens, thinking_tokens,
13885                                has_tool_calls, tool_call_count,
13886                                content_chars, estimated_cost_usd
13887                         FROM token_usage
13888                         WHERE conversation_id = ?1
13889                           AND id > ?2
13890                         ORDER BY id
13891                         LIMIT ?3",
13892                        fparams![
13893                            conversation_id,
13894                            last_token_usage_id,
13895                            TOKEN_USAGE_BATCH_SIZE as i64
13896                        ],
13897                        |row| {
13898                            Ok((
13899                                row.get_typed::<i64>(0)?,
13900                                row.get_typed::<i64>(1)?,
13901                                row.get_typed::<String>(2)?,
13902                                row.get_typed::<String>(3)?,
13903                                row.get_typed::<Option<i64>>(4)?,
13904                                row.get_typed::<Option<i64>>(5)?,
13905                                row.get_typed::<Option<i64>>(6)?,
13906                                row.get_typed::<Option<i64>>(7)?,
13907                                row.get_typed::<Option<i64>>(8)?,
13908                                row.get_typed::<i64>(9)?,
13909                                row.get_typed::<i64>(10)?,
13910                                row.get_typed::<i64>(11)?,
13911                                row.get_typed::<Option<f64>>(12)?,
13912                            ))
13913                        },
13914                    )?;
13915                    if usage_rows.is_empty() {
13916                        break;
13917                    }
13918
13919                    for (
13920                        token_usage_id,
13921                        day_id,
13922                        role,
13923                        model_family,
13924                        input_tokens,
13925                        output_tokens,
13926                        cache_read_tokens,
13927                        cache_creation_tokens,
13928                        thinking_tokens,
13929                        has_tool_calls,
13930                        tool_call_count,
13931                        content_chars,
13932                        estimated_cost_usd,
13933                    ) in usage_rows
13934                    {
13935                        last_token_usage_id = token_usage_id;
13936                        if model_family != "unknown" {
13937                            session_model_family = model_family.clone();
13938                        }
13939                        let usage = crate::connectors::ExtractedTokenUsage {
13940                            model_name: None,
13941                            provider: None,
13942                            input_tokens,
13943                            output_tokens,
13944                            cache_read_tokens,
13945                            cache_creation_tokens,
13946                            thinking_tokens,
13947                            service_tier: None,
13948                            has_tool_calls: has_tool_calls != 0,
13949                            tool_call_count: u32::try_from(tool_call_count.max(0)).unwrap_or(0),
13950                            data_source: franken_agent_detection::TokenDataSource::Api,
13951                        };
13952                        aggregate.record(
13953                            &agent_slug,
13954                            &source_id,
13955                            day_id,
13956                            &model_family,
13957                            &role,
13958                            &usage,
13959                            content_chars,
13960                            estimated_cost_usd.unwrap_or(0.0),
13961                        );
13962                    }
13963                }
13964
13965                aggregate.record_session(
13966                    &agent_slug,
13967                    &source_id,
13968                    conversation_day_id,
13969                    &session_model_family,
13970                );
13971            }
13972
13973            let entries = aggregate.expand();
13974            rows_created = rows_created.saturating_add(entries.len());
13975            franken_update_token_daily_stats_batched_in_tx(&tx, &entries)?;
13976        }
13977
13978        tx.commit()?;
13979
13980        tracing::info!(
13981            target: "cass::analytics",
13982            rows_created,
13983            "token_daily_stats_rebuild_complete"
13984        );
13985
13986        Ok(rows_created)
13987    }
13988
13989    /// Rebuild analytics tables (message_metrics + rollups) from existing
13990    /// messages in the database. Does NOT re-parse raw agent session files.
13991    pub fn rebuild_analytics(&self) -> Result<AnalyticsRebuildResult> {
13992        let start = Instant::now();
13993
13994        let total_messages: i64 =
13995            self.conn
13996                .query_row_map("SELECT COUNT(*) FROM messages", fparams![], |row| {
13997                    row.get_typed(0)
13998                })?;
13999        tracing::info!(
14000            target: "cass::analytics",
14001            total_messages,
14002            "analytics_rebuild_start"
14003        );
14004
14005        let mut tx = self.conn.transaction()?;
14006
14007        tx.execute("DELETE FROM message_metrics")?;
14008        tx.execute("DELETE FROM usage_hourly")?;
14009        tx.execute("DELETE FROM usage_daily")?;
14010        tx.execute("DELETE FROM usage_models_daily")?;
14011
14012        const CHUNK_SIZE: i64 = 10_000;
14013        let mut offset: i64 = 0;
14014        let mut total_inserted: usize = 0;
14015        let mut usage_hourly_rows: usize = 0;
14016        let mut usage_daily_rows: usize = 0;
14017        let mut usage_models_daily_rows: usize = 0;
14018
14019        loop {
14020            #[allow(clippy::type_complexity)]
14021            let rows: Vec<(
14022                i64,
14023                String,
14024                String,
14025                Option<serde_json::Value>,
14026                Option<i64>,
14027                Option<i64>,
14028                String,
14029                Option<i64>,
14030                String,
14031            )> = tx.query_map_collect(
14032                // Avoid the 3-table JOIN with LIMIT/OFFSET that triggers
14033                // frankensqlite's materialization fallback (see 860acb12).
14034                // Inline the agent slug lookup as a correlated subquery and
14035                // fall back to 'unknown' for NULL agent_id, matching the
14036                // FTS / lexical rebuild paths.
14037                "SELECT m.id, m.idx, m.role, m.content, m.extra_json, m.extra_bin,
14038                        m.created_at,
14039                        c.id AS conv_id, c.started_at AS conv_started_at,
14040                        c.source_id, c.workspace_id,
14041                        COALESCE((SELECT a.slug FROM agents a WHERE a.id = c.agent_id), 'unknown') AS agent_slug
14042                 FROM messages m
14043                 JOIN conversations c ON m.conversation_id = c.id
14044                 ORDER BY m.id
14045                 LIMIT ?1 OFFSET ?2",
14046                fparams![CHUNK_SIZE, offset],
14047                |row| {
14048                    let msg_id: i64 = row.get_typed(0)?;
14049                    let role: String = row.get_typed(2)?;
14050                    let content: String = row.get_typed(3)?;
14051                    let extra_json = row
14052                        .get_typed::<Option<String>>(4)?
14053                        .and_then(|s| serde_json::from_str(&s).ok())
14054                        .or_else(|| {
14055                            row.get_typed::<Option<Vec<u8>>>(5)
14056                                .ok()
14057                                .flatten()
14058                                .and_then(|b| rmp_serde::from_slice(&b).ok())
14059                        });
14060                    let msg_ts: Option<i64> = row.get_typed(6)?;
14061                    let conv_started_at: Option<i64> = row.get_typed(8)?;
14062                    let source_id: String = row.get_typed(9)?;
14063                    let workspace_id: Option<i64> = row.get_typed(10)?;
14064                    let agent_slug: String = row.get_typed(11)?;
14065                    let effective_ts = msg_ts.or(conv_started_at).unwrap_or(0);
14066
14067                    Ok((
14068                        msg_id,
14069                        role,
14070                        content,
14071                        extra_json,
14072                        Some(effective_ts),
14073                        workspace_id,
14074                        source_id,
14075                        conv_started_at,
14076                        agent_slug,
14077                    ))
14078                },
14079            )?;
14080
14081            if rows.is_empty() {
14082                break;
14083            }
14084
14085            let chunk_len = rows.len();
14086            let mut entries = Vec::with_capacity(chunk_len);
14087            let mut rollup_agg = AnalyticsRollupAggregator::new();
14088
14089            for (
14090                msg_id,
14091                role,
14092                content,
14093                extra_json,
14094                effective_ts,
14095                workspace_id,
14096                source_id,
14097                _conv_started_at,
14098                agent_slug,
14099            ) in &rows
14100            {
14101                let ts = effective_ts.unwrap_or(0);
14102                let day_id = Self::day_id_from_millis(ts);
14103                let hour_id = Self::hour_id_from_millis(ts);
14104                let content_chars = content.len() as i64;
14105                let content_tokens_est = content_chars / 4;
14106                let extra = extra_json
14107                    .as_ref()
14108                    .cloned()
14109                    .unwrap_or(serde_json::Value::Null);
14110                let usage =
14111                    crate::connectors::extract_tokens_for_agent(agent_slug, &extra, content, role);
14112                let model_info = usage
14113                    .model_name
14114                    .as_deref()
14115                    .map(crate::connectors::normalize_model);
14116                let model_family = model_info
14117                    .as_ref()
14118                    .map(|i| i.family.clone())
14119                    .unwrap_or_else(|| "unknown".into());
14120                let model_tier = model_info
14121                    .as_ref()
14122                    .map(|i| i.tier.clone())
14123                    .unwrap_or_else(|| "unknown".into());
14124                let provider = usage
14125                    .provider
14126                    .clone()
14127                    .or_else(|| model_info.as_ref().map(|i| i.provider.clone()))
14128                    .unwrap_or_else(|| "unknown".into());
14129
14130                let entry = MessageMetricsEntry {
14131                    message_id: *msg_id,
14132                    created_at_ms: ts,
14133                    hour_id,
14134                    day_id,
14135                    agent_slug: agent_slug.clone(),
14136                    workspace_id: workspace_id.unwrap_or(0),
14137                    source_id: source_id.clone(),
14138                    role: role.clone(),
14139                    content_chars,
14140                    content_tokens_est,
14141                    model_name: usage.model_name.clone(),
14142                    model_family,
14143                    model_tier,
14144                    provider,
14145                    api_input_tokens: usage.input_tokens,
14146                    api_output_tokens: usage.output_tokens,
14147                    api_cache_read_tokens: usage.cache_read_tokens,
14148                    api_cache_creation_tokens: usage.cache_creation_tokens,
14149                    api_thinking_tokens: usage.thinking_tokens,
14150                    api_service_tier: usage.service_tier,
14151                    api_data_source: usage.data_source.as_str().to_string(),
14152                    tool_call_count: usage.tool_call_count as i64,
14153                    has_tool_calls: usage.has_tool_calls,
14154                    has_plan: has_plan_for_role(role, content),
14155                };
14156                rollup_agg.record(&entry);
14157                entries.push(entry);
14158            }
14159
14160            total_inserted += franken_insert_message_metrics_batched_in_tx(&tx, &entries)?;
14161            let (hourly, daily, models_daily) =
14162                franken_flush_analytics_rollups_in_tx(&tx, &rollup_agg)?;
14163            usage_hourly_rows += hourly;
14164            usage_daily_rows += daily;
14165            usage_models_daily_rows += models_daily;
14166            offset += chunk_len as i64;
14167
14168            tracing::debug!(
14169                target: "cass::analytics",
14170                offset,
14171                chunk = chunk_len,
14172                inserted = entries.len(),
14173                total = total_inserted,
14174                "analytics_rebuild_chunk"
14175            );
14176
14177            if (chunk_len as i64) < CHUNK_SIZE {
14178                break;
14179            }
14180        }
14181
14182        tx.commit()?;
14183
14184        let elapsed = start.elapsed();
14185        let elapsed_ms = elapsed.as_millis() as u64;
14186        let msgs_per_sec = if elapsed_ms > 0 {
14187            (total_inserted as f64) / (elapsed_ms as f64 / 1000.0)
14188        } else {
14189            0.0
14190        };
14191
14192        tracing::info!(
14193            target: "cass::analytics",
14194            message_metrics_rows = total_inserted,
14195            usage_hourly_rows,
14196            usage_daily_rows,
14197            usage_models_daily_rows,
14198            elapsed_ms,
14199            messages_per_sec = format!("{:.0}", msgs_per_sec),
14200            "analytics_rebuild_complete"
14201        );
14202
14203        Ok(AnalyticsRebuildResult {
14204            message_metrics_rows: total_inserted,
14205            usage_hourly_rows,
14206            usage_daily_rows,
14207            usage_models_daily_rows,
14208            elapsed_ms,
14209            messages_per_sec: msgs_per_sec,
14210        })
14211    }
14212
14213    /// Rebuild all daily stats from scratch.
14214    pub fn rebuild_daily_stats(&self) -> Result<DailyStatsRebuildResult> {
14215        const DAILY_STATS_REBUILD_CONVERSATION_BATCH_SIZE: usize = 1_000;
14216        const DAILY_STATS_REBUILD_MESSAGE_BATCH_SIZE: usize = 10_000;
14217
14218        let mut conversation_batch_size = rebuild_batch_size_env(
14219            "CASS_DAILY_STATS_REBUILD_CONVERSATION_BATCH_SIZE",
14220            DAILY_STATS_REBUILD_CONVERSATION_BATCH_SIZE,
14221        );
14222        let mut message_batch_size = rebuild_batch_size_env(
14223            "CASS_DAILY_STATS_REBUILD_MESSAGE_BATCH_SIZE",
14224            DAILY_STATS_REBUILD_MESSAGE_BATCH_SIZE,
14225        );
14226
14227        let total_messages: i64 =
14228            self.conn
14229                .query_row_map("SELECT COUNT(*) FROM messages", fparams![], |row| {
14230                    row.get_typed(0)
14231                })?;
14232        let message_metrics_rows: i64 =
14233            self.conn
14234                .query_row_map("SELECT COUNT(*) FROM message_metrics", fparams![], |row| {
14235                    row.get_typed(0)
14236                })?;
14237        let use_message_metrics = total_messages > 0 && total_messages == message_metrics_rows;
14238
14239        tracing::info!(
14240            target: "cass::perf::daily_stats",
14241            total_messages,
14242            message_metrics_rows,
14243            use_message_metrics,
14244            "daily_stats rebuild selected message source"
14245        );
14246
14247        let mut tx = self.conn.transaction()?;
14248        tx.execute("DELETE FROM daily_stats")?;
14249
14250        let mut last_conversation_id = 0_i64;
14251        let mut conversation_batch_count = 0_usize;
14252        let mut conversations_processed = 0_usize;
14253        let mut messages_processed = 0_usize;
14254        let mut message_batch_count = 0_usize;
14255        let mut raw_entries_flushed = 0_usize;
14256        let mut expanded_entries_flushed = 0_usize;
14257        let message_scan_sql = if use_message_metrics {
14258            "SELECT m.idx, mm.content_chars
14259             FROM messages m
14260             JOIN message_metrics mm ON mm.message_id = m.id
14261             WHERE m.conversation_id = ?1
14262               AND m.idx > ?2
14263             ORDER BY m.conversation_id, m.idx
14264             LIMIT ?3"
14265        } else {
14266            "SELECT m.idx, COALESCE(LENGTH(CAST(m.content AS BLOB)), 0)
14267             FROM messages m
14268             WHERE m.conversation_id = ?1
14269               AND m.idx > ?2
14270             ORDER BY m.conversation_id, m.idx
14271             LIMIT ?3"
14272        };
14273
14274        loop {
14275            // Avoid the 2-table JOIN with LIMIT that triggers frankensqlite's
14276            // materialization fallback (which is what the OOM retry below is
14277            // defending against — see 860acb12).  Inline agent slug via
14278            // correlated subquery and degrade NULL agent_id to 'unknown' for
14279            // consistency with the lexical/FTS rebuild paths.
14280            let conversation_rows = match self.conn.query_with_params(
14281                "SELECT c.id, c.started_at,
14282                        COALESCE((SELECT a.slug FROM agents a WHERE a.id = c.agent_id), 'unknown'),
14283                        c.source_id
14284                 FROM conversations c
14285                 WHERE c.id > ?1
14286                 ORDER BY c.id
14287                 LIMIT ?2",
14288                &params_from_iter([
14289                    ParamValue::from(last_conversation_id),
14290                    ParamValue::from(conversation_batch_size as i64),
14291                ]),
14292            ) {
14293                Ok(rows) => rows,
14294                Err(err) if is_out_of_memory_error(&err) && conversation_batch_size > 1 => {
14295                    let previous_batch_size = conversation_batch_size;
14296                    conversation_batch_size = (conversation_batch_size / 2).max(1);
14297                    tracing::warn!(
14298                        previous_batch_size,
14299                        conversation_batch_size,
14300                        last_conversation_id,
14301                        "daily_stats conversation scan ran out of memory; retrying with smaller batch"
14302                    );
14303                    continue;
14304                }
14305                Err(err) => return Err(err.into()),
14306            };
14307            if conversation_rows.is_empty() {
14308                break;
14309            }
14310
14311            let mut aggregate = StatsAggregator::new();
14312            let mut conversation_batch_meta: Vec<(i64, i64, String, String)> =
14313                Vec::with_capacity(conversation_rows.len());
14314            for row in &conversation_rows {
14315                let conversation_id: i64 = row.get_typed(0)?;
14316                let started_at: Option<i64> = row.get_typed(1)?;
14317                let agent_slug: String = row.get_typed(2)?;
14318                let source_id: String = row.get_typed(3)?;
14319                last_conversation_id = conversation_id;
14320                let day_id = started_at.map(Self::day_id_from_millis).unwrap_or(0);
14321                aggregate.record_delta(&agent_slug, &source_id, day_id, 1, 0, 0);
14322                conversation_batch_meta.push((conversation_id, day_id, agent_slug, source_id));
14323                conversations_processed += 1;
14324            }
14325
14326            conversation_batch_count += 1;
14327            raw_entries_flushed += aggregate.raw_entry_count();
14328            let entries = aggregate.expand();
14329            expanded_entries_flushed += entries.len();
14330            if !entries.is_empty() {
14331                franken_update_daily_stats_batched_in_tx(&tx, &entries)?;
14332            }
14333            if conversation_batch_count.is_multiple_of(25) {
14334                tracing::info!(
14335                    target: "cass::perf::daily_stats",
14336                    conversations_processed,
14337                    batches = conversation_batch_count,
14338                    batch_size = conversation_batch_size,
14339                    last_conversation_id,
14340                    "daily_stats rebuild conversation scan progress"
14341                );
14342            }
14343            if conversation_batch_meta.is_empty() {
14344                continue;
14345            }
14346
14347            for (conversation_id, day_id, agent_slug, source_id) in conversation_batch_meta {
14348                let mut cursor_message_idx = -1_i64;
14349                loop {
14350                    let message_rows = match self.conn.query_with_params(
14351                        message_scan_sql,
14352                        &params_from_iter([
14353                            ParamValue::from(conversation_id),
14354                            ParamValue::from(cursor_message_idx),
14355                            ParamValue::from(message_batch_size as i64),
14356                        ]),
14357                    ) {
14358                        Ok(rows) => rows,
14359                        Err(err) if is_out_of_memory_error(&err) && message_batch_size > 1 => {
14360                            let previous_batch_size = message_batch_size;
14361                            message_batch_size = (message_batch_size / 2).max(1);
14362                            tracing::warn!(
14363                                previous_batch_size,
14364                                message_batch_size,
14365                                conversation_id,
14366                                cursor_message_idx,
14367                                "daily_stats message scan ran out of memory; retrying with smaller batch"
14368                            );
14369                            continue;
14370                        }
14371                        Err(err) => return Err(err.into()),
14372                    };
14373                    if message_rows.is_empty() {
14374                        break;
14375                    }
14376
14377                    let mut aggregate = StatsAggregator::new();
14378                    for row in &message_rows {
14379                        let message_idx: i64 = row.get_typed(0)?;
14380                        let content_len: i64 = row.get_typed(1)?;
14381                        cursor_message_idx = message_idx;
14382                        aggregate.record_delta(&agent_slug, &source_id, day_id, 0, 1, content_len);
14383                        messages_processed += 1;
14384                    }
14385
14386                    message_batch_count += 1;
14387                    raw_entries_flushed += aggregate.raw_entry_count();
14388                    let entries = aggregate.expand();
14389                    expanded_entries_flushed += entries.len();
14390                    if !entries.is_empty() {
14391                        franken_update_daily_stats_batched_in_tx(&tx, &entries)?;
14392                    }
14393                    if message_batch_count.is_multiple_of(50) {
14394                        tracing::info!(
14395                            target: "cass::perf::daily_stats",
14396                            messages_processed,
14397                            batches = message_batch_count,
14398                            batch_size = message_batch_size,
14399                            source = if use_message_metrics {
14400                                "message_metrics"
14401                            } else {
14402                                "messages"
14403                            },
14404                            conversation_id,
14405                            cursor_message_idx,
14406                            "daily_stats rebuild message scan progress"
14407                        );
14408                    }
14409                }
14410            }
14411        }
14412
14413        let rows_created: i64 =
14414            tx.query_row_map("SELECT COUNT(*) FROM daily_stats", fparams![], |row| {
14415                row.get_typed(0)
14416            })?;
14417        let total_sessions: i64 = tx.query_row_map(
14418            "SELECT COALESCE(SUM(session_count), 0) FROM daily_stats WHERE agent_slug = 'all' AND source_id = 'all'",
14419            fparams![],
14420            |row| row.get_typed(0),
14421        )?;
14422
14423        tx.commit()?;
14424
14425        tracing::info!(
14426            target: "cass::perf::daily_stats",
14427            rows_created,
14428            total_sessions,
14429            conversations_processed,
14430            conversation_batches = conversation_batch_count,
14431            conversation_batch_size,
14432            message_batches = message_batch_count,
14433            message_batch_size,
14434            messages_processed,
14435            use_message_metrics,
14436            raw_entries_flushed,
14437            expanded_entries_flushed,
14438            "Daily stats rebuilt from conversations"
14439        );
14440
14441        Ok(DailyStatsRebuildResult {
14442            rows_created,
14443            total_sessions,
14444        })
14445    }
14446}
14447
14448// SqliteStorage impl block removed: SqliteStorage is now a type alias for FrankenStorage.
14449// All methods are available through FrankenStorage.
14450
14451// -------------------------------------------------------------------------
14452// IndexingCache (Opt 7.2) - N+1 Prevention for Agent/Workspace IDs
14453// -------------------------------------------------------------------------
14454
/// Per-batch memo of the database IDs already resolved for agents and workspaces.
///
/// Batch indexing asks for the same agent and workspace over and over. Going
/// through `ensure_agent` / `ensure_workspace` for every conversation is an
/// N+1 query pattern; this cache remembers the first answer (agents keyed by
/// slug, workspaces keyed by path) and keeps hit/miss counters so the benefit
/// can be reported.
///
/// The cache is plain data mutated through `&mut self` with no internal
/// locking: create one per batch, use it from one thread, then drop it.
///
/// # Usage
/// ```ignore
/// let mut cache = IndexingCache::new();
/// for conv in conversations {
///     let agent_id = cache.get_or_insert_agent(storage, &agent)?;
///     let workspace_id = cache.get_or_insert_workspace(storage, &workspace_path, None)?;
///     // ... use agent_id and workspace_id
/// }
/// ```
///
/// # Rollback
/// `CASS_SQLITE_CACHE=0` (or `false`) is the off switch, useful for debugging.
/// The flag is only reported by `IndexingCache::is_enabled`; the lookup methods
/// never read it, so callers are expected to check it and fall back to direct
/// DB calls themselves.
#[derive(Debug, Default)]
pub struct IndexingCache {
    /// Agent slug -> ID previously returned by storage.
    agent_ids: HashMap<String, i64>,
    /// Workspace path -> ID previously returned by storage.
    workspace_ids: HashMap<PathBuf, i64>,
    /// Number of lookups answered from the maps above.
    hits: u64,
    /// Number of lookups that had to be forwarded to storage.
    misses: u64,
}
14481
14482pub trait IndexingCacheStorage {
14483    fn ensure_indexing_agent(&self, agent: &Agent) -> Result<i64>;
14484    fn ensure_indexing_workspace(&self, path: &Path, display_name: Option<&str>) -> Result<i64>;
14485}
14486
14487impl IndexingCacheStorage for FrankenStorage {
14488    fn ensure_indexing_agent(&self, agent: &Agent) -> Result<i64> {
14489        self.ensure_agent(agent)
14490    }
14491
14492    fn ensure_indexing_workspace(&self, path: &Path, display_name: Option<&str>) -> Result<i64> {
14493        self.ensure_workspace(path, display_name)
14494    }
14495}
14496
14497// IndexingCacheStorage for SqliteStorage removed: SqliteStorage is a type alias for FrankenStorage.
14498
14499impl IndexingCache {
14500    /// Create a new empty cache.
14501    pub fn new() -> Self {
14502        Self {
14503            agent_ids: HashMap::new(),
14504            workspace_ids: HashMap::new(),
14505            hits: 0,
14506            misses: 0,
14507        }
14508    }
14509
14510    /// Check if caching is enabled via environment variable.
14511    /// Returns true unless CASS_SQLITE_CACHE is set to "0" or "false".
14512    pub fn is_enabled() -> bool {
14513        dotenvy::var("CASS_SQLITE_CACHE")
14514            .map(|v| v != "0" && v.to_lowercase() != "false")
14515            .unwrap_or(true)
14516    }
14517
14518    /// Get or insert an agent ID, using cache if available.
14519    ///
14520    /// Returns the cached ID if present, otherwise calls ensure_agent
14521    /// and caches the result.
14522    pub fn get_or_insert_agent<S>(&mut self, storage: &S, agent: &Agent) -> Result<i64>
14523    where
14524        S: IndexingCacheStorage + ?Sized,
14525    {
14526        if let Some(&cached) = self.agent_ids.get(&agent.slug) {
14527            self.hits += 1;
14528            return Ok(cached);
14529        }
14530
14531        self.misses += 1;
14532        let id = storage.ensure_indexing_agent(agent)?;
14533        self.agent_ids.insert(agent.slug.clone(), id);
14534        Ok(id)
14535    }
14536
14537    /// Get or insert a workspace ID, using cache if available.
14538    ///
14539    /// Returns the cached ID if present, otherwise calls ensure_workspace
14540    /// and caches the result.
14541    pub fn get_or_insert_workspace(
14542        &mut self,
14543        storage: &(impl IndexingCacheStorage + ?Sized),
14544        path: &Path,
14545        display_name: Option<&str>,
14546    ) -> Result<i64> {
14547        if let Some(&cached) = self.workspace_ids.get(path) {
14548            self.hits += 1;
14549            return Ok(cached);
14550        }
14551
14552        self.misses += 1;
14553        let id = storage.ensure_indexing_workspace(path, display_name)?;
14554        self.workspace_ids.insert(path.to_path_buf(), id);
14555        Ok(id)
14556    }
14557
14558    /// Get cache statistics: (hits, misses, hit_rate).
14559    pub fn stats(&self) -> (u64, u64, f64) {
14560        let total = self.hits + self.misses;
14561        let hit_rate = if total > 0 {
14562            self.hits as f64 / total as f64
14563        } else {
14564            0.0
14565        };
14566        (self.hits, self.misses, hit_rate)
14567    }
14568
14569    /// Clear the cache, resetting all state.
14570    pub fn clear(&mut self) {
14571        self.agent_ids.clear();
14572        self.workspace_ids.clear();
14573        self.hits = 0;
14574        self.misses = 0;
14575    }
14576
14577    /// Number of cached agents.
14578    pub fn agent_count(&self) -> usize {
14579        self.agent_ids.len()
14580    }
14581
14582    /// Number of cached workspaces.
14583    pub fn workspace_count(&self) -> usize {
14584        self.workspace_ids.len()
14585    }
14586}
14587
14588// -------------------------------------------------------------------------
14589// StatsAggregator (kzxu) - Batched Daily Stats Updates
14590// -------------------------------------------------------------------------
14591// Aggregates daily stats in memory during batch ingestion, then flushes
14592// to the database in a single batched INSERT...ON CONFLICT operation.
14593// This prevents N×4 database writes (4 permutations per conversation).
14594
/// Running change to one daily-stats row, i.e. one (day_id, agent, source) key.
///
/// All three fields are signed amounts to add to the stored counters.
#[derive(Clone, Copy, Debug, Default)]
pub struct StatsDelta {
    /// Amount to add to the row's session count.
    pub session_count_delta: i64,
    /// Amount to add to the row's message count.
    pub message_count_delta: i64,
    /// Amount to add to the row's total character count.
    pub total_chars_delta: i64,
}
14602
14603/// In-memory aggregator for batched daily stats updates.
14604///
14605/// During batch ingestion, we accumulate deltas per (day_id, agent, source) key.
14606/// After processing all conversations, call `expand()` to generate the 4
14607/// permutations per raw entry, then flush via `SqliteStorage::update_daily_stats_batched`.
14608///
14609/// # Example
14610/// ```ignore
14611/// let mut agg = StatsAggregator::new();
14612/// for conv in conversations {
14613///     agg.record(&conv.agent_slug, source_id, day_id, msg_count, char_count);
14614/// }
14615/// let entries = agg.expand();
14616/// storage.update_daily_stats_batched(&entries)?;
14617/// ```
14618#[derive(Debug, Default)]
14619pub struct StatsAggregator {
14620    /// Raw deltas keyed by (day_id, agent_slug, source_id).
14621    /// Only stores specific (non-"all") combinations.
14622    deltas: HashMap<(i64, String, String), StatsDelta>,
14623}
14624
14625impl StatsAggregator {
14626    /// Create a new empty aggregator.
14627    pub fn new() -> Self {
14628        Self {
14629            deltas: HashMap::new(),
14630        }
14631    }
14632
14633    /// Record a conversation's contribution to stats (session + messages + chars).
14634    ///
14635    /// This increments session_count by 1.
14636    ///
14637    /// # Arguments
14638    /// * `agent_slug` - The specific agent slug (not "all")
14639    /// * `source_id` - The specific source ID (not "all")
14640    /// * `day_id` - Days since 2020-01-01 (from `SqliteStorage::day_id_from_millis`)
14641    /// * `message_count` - Number of messages in the conversation
14642    /// * `total_chars` - Total character count across all messages
14643    pub fn record(
14644        &mut self,
14645        agent_slug: &str,
14646        source_id: &str,
14647        day_id: i64,
14648        message_count: i64,
14649        total_chars: i64,
14650    ) {
14651        self.record_delta(agent_slug, source_id, day_id, 1, message_count, total_chars);
14652    }
14653
14654    /// Record an arbitrary delta. Use this for append-only updates where
14655    /// `session_count_delta` may be 0 but message/char deltas are non-zero.
14656    pub fn record_delta(
14657        &mut self,
14658        agent_slug: &str,
14659        source_id: &str,
14660        day_id: i64,
14661        session_count_delta: i64,
14662        message_count_delta: i64,
14663        total_chars_delta: i64,
14664    ) {
14665        if session_count_delta == 0 && message_count_delta == 0 && total_chars_delta == 0 {
14666            return;
14667        }
14668        let key = (day_id, agent_slug.to_owned(), source_id.to_owned());
14669        let delta = self.deltas.entry(key).or_default();
14670        delta.session_count_delta += session_count_delta;
14671        delta.message_count_delta += message_count_delta;
14672        delta.total_chars_delta += total_chars_delta;
14673    }
14674
14675    /// Expand raw deltas into the 4 permutation keys:
14676    /// - (agent, source) - specific both
14677    /// - ("all", source) - all agents, specific source
14678    /// - (agent, "all") - specific agent, all sources
14679    /// - ("all", "all") - totals
14680    ///
14681    /// Returns entries sorted by (day_id, agent_slug, source_id) for deterministic batching.
14682    pub fn expand(&self) -> Vec<(i64, String, String, StatsDelta)> {
14683        let mut expanded: HashMap<(i64, String, String), StatsDelta> = HashMap::new();
14684
14685        for ((day_id, agent, source), delta) in &self.deltas {
14686            let permutations = [
14687                (agent.as_str(), source.as_str()),
14688                ("all", source.as_str()),
14689                (agent.as_str(), "all"),
14690                ("all", "all"),
14691            ];
14692
14693            // Ensure we don't double-apply deltas if agent/source is already "all".
14694            for idx in 0..permutations.len() {
14695                let (a, s) = permutations[idx];
14696                if permutations[..idx].contains(&(a, s)) {
14697                    continue;
14698                }
14699                let key = (*day_id, a.to_owned(), s.to_owned());
14700                let entry = expanded.entry(key).or_default();
14701                entry.session_count_delta += delta.session_count_delta;
14702                entry.message_count_delta += delta.message_count_delta;
14703                entry.total_chars_delta += delta.total_chars_delta;
14704            }
14705        }
14706
14707        let mut out: Vec<(i64, String, String, StatsDelta)> = expanded
14708            .into_iter()
14709            .map(|((d, a, s), delta)| (d, a, s, delta))
14710            .collect();
14711        out.sort_by(|(d1, a1, s1, _), (d2, a2, s2, _)| {
14712            d1.cmp(d2).then_with(|| a1.cmp(a2)).then_with(|| s1.cmp(s2))
14713        });
14714        out
14715    }
14716
14717    /// Check if the aggregator is empty (no data recorded).
14718    pub fn is_empty(&self) -> bool {
14719        self.deltas.is_empty()
14720    }
14721
14722    /// Get number of distinct raw (day, agent, source) combinations recorded.
14723    pub fn raw_entry_count(&self) -> usize {
14724        self.deltas.len()
14725    }
14726}
14727
14728// -------------------------------------------------------------------------
14729// TokenStatsAggregator — Batched Token Analytics Daily Stats
14730// -------------------------------------------------------------------------
14731// Mirrors StatsAggregator pattern for token-level metrics.
14732// Aggregates token usage in memory during batch ingestion, then flushes
14733// to token_daily_stats in a single batched INSERT...ON CONFLICT operation.
14734
/// Running token totals for one (day_id, agent, source, model_family) key.
///
/// Field notes describe how `TokenStatsAggregator` fills the struct.
#[derive(Clone, Debug, Default)]
pub struct TokenStatsDelta {
    /// Incremented once per recorded message, whatever its role.
    pub api_call_count: i64,
    /// Messages recorded with role `"user"`.
    pub user_message_count: i64,
    /// Messages recorded with role `"assistant"` or `"agent"`.
    pub assistant_message_count: i64,
    /// Messages recorded with role `"tool"`.
    pub tool_message_count: i64,
    /// Sum of reported input tokens (missing values count as 0).
    pub total_input_tokens: i64,
    /// Sum of reported output tokens (missing values count as 0).
    pub total_output_tokens: i64,
    /// Sum of reported cache-read tokens (missing values count as 0).
    pub total_cache_read_tokens: i64,
    /// Sum of reported cache-creation tokens (missing values count as 0).
    pub total_cache_creation_tokens: i64,
    /// Sum of reported thinking tokens (missing values count as 0).
    pub total_thinking_tokens: i64,
    /// Sum of each message's overall token total (missing values count as 0).
    pub grand_total_tokens: i64,
    /// Sum of the `content_chars` passed with each message.
    pub total_content_chars: i64,
    /// Sum of each message's tool-call count.
    pub total_tool_calls: i64,
    /// Sum of the per-message cost estimates, in USD.
    pub estimated_cost_usd: f64,
    /// Number of session bumps recorded for this key.
    pub session_count: i64,
}
14753
14754/// In-memory aggregator for batched token daily stats updates.
14755///
14756/// During batch ingestion, accumulate token deltas per (day_id, agent, source, model_family) key.
14757/// After processing, call `expand()` to generate the 5 permutation keys, then flush via
14758/// `update_token_daily_stats_batched_in_tx`.
14759#[derive(Debug, Default)]
14760pub struct TokenStatsAggregator {
14761    /// Raw deltas keyed by (day_id, agent_slug, source_id, model_family).
14762    deltas: HashMap<(i64, String, String, String), TokenStatsDelta>,
14763}
14764
14765impl TokenStatsAggregator {
14766    pub fn new() -> Self {
14767        Self {
14768            deltas: HashMap::new(),
14769        }
14770    }
14771
14772    /// Record a single message's token contribution.
14773    #[allow(clippy::too_many_arguments)]
14774    pub fn record(
14775        &mut self,
14776        agent_slug: &str,
14777        source_id: &str,
14778        day_id: i64,
14779        model_family: &str,
14780        role: &str,
14781        usage: &crate::connectors::ExtractedTokenUsage,
14782        content_chars: i64,
14783        estimated_cost_usd: f64,
14784    ) {
14785        let key = (
14786            day_id,
14787            agent_slug.to_owned(),
14788            source_id.to_owned(),
14789            model_family.to_owned(),
14790        );
14791        let delta = self.deltas.entry(key).or_default();
14792
14793        delta.api_call_count += 1;
14794        match role {
14795            "user" => delta.user_message_count += 1,
14796            "assistant" | "agent" => delta.assistant_message_count += 1,
14797            "tool" => delta.tool_message_count += 1,
14798            _ => {}
14799        }
14800
14801        delta.total_input_tokens += usage.input_tokens.unwrap_or(0);
14802        delta.total_output_tokens += usage.output_tokens.unwrap_or(0);
14803        delta.total_cache_read_tokens += usage.cache_read_tokens.unwrap_or(0);
14804        delta.total_cache_creation_tokens += usage.cache_creation_tokens.unwrap_or(0);
14805        delta.total_thinking_tokens += usage.thinking_tokens.unwrap_or(0);
14806        delta.grand_total_tokens += usage.total_tokens().unwrap_or(0);
14807        delta.total_content_chars += content_chars;
14808        delta.total_tool_calls += usage.tool_call_count as i64;
14809        delta.estimated_cost_usd += estimated_cost_usd;
14810    }
14811
14812    /// Record a session count bump for a given day/agent/source/model.
14813    pub fn record_session(
14814        &mut self,
14815        agent_slug: &str,
14816        source_id: &str,
14817        day_id: i64,
14818        model_family: &str,
14819    ) {
14820        let key = (
14821            day_id,
14822            agent_slug.to_owned(),
14823            source_id.to_owned(),
14824            model_family.to_owned(),
14825        );
14826        self.deltas.entry(key).or_default().session_count += 1;
14827    }
14828
14829    /// Expand raw deltas into 5 permutation keys for the 4-dimensional composite PK:
14830    /// - (agent, source, model)  — specific all three
14831    /// - ("all", source, model)  — all agents
14832    /// - (agent, "all", model)   — all sources
14833    /// - (agent, source, "all")  — all models
14834    /// - ("all", "all", "all")   — global total
14835    pub fn expand(&self) -> Vec<(i64, String, String, String, TokenStatsDelta)> {
14836        let mut expanded: HashMap<(i64, String, String, String), TokenStatsDelta> = HashMap::new();
14837
14838        for ((day_id, agent, source, model), delta) in &self.deltas {
14839            let permutations = [
14840                (agent.as_str(), source.as_str(), model.as_str()),
14841                ("all", source.as_str(), model.as_str()),
14842                (agent.as_str(), "all", model.as_str()),
14843                (agent.as_str(), source.as_str(), "all"),
14844                ("all", "all", "all"),
14845            ];
14846
14847            for idx in 0..permutations.len() {
14848                let (a, s, m) = permutations[idx];
14849                // Deduplicate if agent/source/model is already "all"
14850                if permutations[..idx].contains(&(a, s, m)) {
14851                    continue;
14852                }
14853                let key = (*day_id, a.to_owned(), s.to_owned(), m.to_owned());
14854                let entry = expanded.entry(key).or_default();
14855                entry.api_call_count += delta.api_call_count;
14856                entry.user_message_count += delta.user_message_count;
14857                entry.assistant_message_count += delta.assistant_message_count;
14858                entry.tool_message_count += delta.tool_message_count;
14859                entry.total_input_tokens += delta.total_input_tokens;
14860                entry.total_output_tokens += delta.total_output_tokens;
14861                entry.total_cache_read_tokens += delta.total_cache_read_tokens;
14862                entry.total_cache_creation_tokens += delta.total_cache_creation_tokens;
14863                entry.total_thinking_tokens += delta.total_thinking_tokens;
14864                entry.grand_total_tokens += delta.grand_total_tokens;
14865                entry.total_content_chars += delta.total_content_chars;
14866                entry.total_tool_calls += delta.total_tool_calls;
14867                entry.estimated_cost_usd += delta.estimated_cost_usd;
14868                entry.session_count += delta.session_count;
14869            }
14870        }
14871
14872        let mut out: Vec<(i64, String, String, String, TokenStatsDelta)> = expanded
14873            .into_iter()
14874            .map(|((d, a, s, m), delta)| (d, a, s, m, delta))
14875            .collect();
14876        out.sort_by(|(d1, a1, s1, m1, _), (d2, a2, s2, m2, _)| {
14877            d1.cmp(d2)
14878                .then_with(|| a1.cmp(a2))
14879                .then_with(|| s1.cmp(s2))
14880                .then_with(|| m1.cmp(m2))
14881        });
14882        out
14883    }
14884
14885    pub fn is_empty(&self) -> bool {
14886        self.deltas.is_empty()
14887    }
14888
14889    pub fn raw_entry_count(&self) -> usize {
14890        self.deltas.len()
14891    }
14892}
14893
14894// -------------------------------------------------------------------------
14895// AnalyticsRollupAggregator — Batched usage_hourly + usage_daily Updates
14896// -------------------------------------------------------------------------
14897// Accumulates per-message deltas in memory, then flushes to both
14898// usage_hourly and usage_daily in a single batched operation.
14899
/// Running totals for one rollup key (bucket, agent_slug, workspace_id, source_id).
///
/// Field notes describe how `AnalyticsRollupAggregator::record` fills the
/// struct. "API-sourced" means the message's `api_data_source` is `"api"`.
#[derive(Clone, Debug, Default)]
pub struct UsageRollupDelta {
    /// Every recorded message.
    pub message_count: i64,
    /// Messages with role `"user"`.
    pub user_message_count: i64,
    /// Messages with role `"assistant"` or `"agent"`.
    pub assistant_message_count: i64,
    /// Sum of each message's `tool_call_count`.
    pub tool_call_count: i64,
    /// Messages flagged `has_plan`.
    pub plan_message_count: i64,
    /// Estimated content tokens of the plan messages.
    pub plan_content_tokens_est_total: i64,
    /// API token total of the plan messages that are API-sourced.
    pub plan_api_tokens_total: i64,
    /// Number of API-sourced messages.
    pub api_coverage_message_count: i64,
    /// Estimated content tokens across all messages.
    pub content_tokens_est_total: i64,
    /// Estimated content tokens of user messages.
    pub content_tokens_est_user: i64,
    /// Estimated content tokens of assistant/agent messages.
    pub content_tokens_est_assistant: i64,
    /// Input + output + cache-read + cache-creation + thinking tokens of
    /// API-sourced messages.
    pub api_tokens_total: i64,
    /// Input tokens of API-sourced messages.
    pub api_input_tokens_total: i64,
    /// Output tokens of API-sourced messages.
    pub api_output_tokens_total: i64,
    /// Cache-read tokens of API-sourced messages.
    pub api_cache_read_tokens_total: i64,
    /// Cache-creation tokens of API-sourced messages.
    pub api_cache_creation_tokens_total: i64,
    /// Thinking tokens of API-sourced messages.
    pub api_thinking_tokens_total: i64,
}
14921
/// One `message_metrics` row waiting to be batch-inserted.
///
/// The same value is what `AnalyticsRollupAggregator::record` consumes; notes
/// below say how the rollup reads each field, where it reads it at all.
#[derive(Debug, Clone)]
pub struct MessageMetricsEntry {
    /// Identifier of the message (presumably the `messages` row ID — confirm).
    pub message_id: i64,
    /// Creation time; milliseconds per the `_ms` suffix — confirm with the producer.
    pub created_at_ms: i64,
    /// Bucket key for the hourly rollup.
    pub hour_id: i64,
    /// Bucket key for the daily and per-model daily rollups.
    pub day_id: i64,
    /// Part of every rollup key.
    pub agent_slug: String,
    /// Part of every rollup key.
    pub workspace_id: i64,
    /// Part of every rollup key.
    pub source_id: String,
    /// Message role; the rollup recognises `"user"` and `"assistant"`/`"agent"`.
    pub role: String,
    /// Content size as supplied by the producer (not read by the rollup).
    pub content_chars: i64,
    /// Estimated token count of the content.
    pub content_tokens_est: i64,
    /// Model name, when known (not read by the rollup).
    pub model_name: Option<String>,
    /// Part of the per-model daily rollup key.
    pub model_family: String,
    /// Part of the per-model daily rollup key.
    pub model_tier: String,
    /// Provider label (not read by the rollup).
    pub provider: String,
    /// API-reported input tokens; `None` counts as 0 in the rollup.
    pub api_input_tokens: Option<i64>,
    /// API-reported output tokens; `None` counts as 0 in the rollup.
    pub api_output_tokens: Option<i64>,
    /// API-reported cache-read tokens; `None` counts as 0 in the rollup.
    pub api_cache_read_tokens: Option<i64>,
    /// API-reported cache-creation tokens; `None` counts as 0 in the rollup.
    pub api_cache_creation_tokens: Option<i64>,
    /// API-reported thinking tokens; `None` counts as 0 in the rollup.
    pub api_thinking_tokens: Option<i64>,
    /// Service tier, when known (not read by the rollup).
    pub api_service_tier: Option<String>,
    /// Origin of the token figures; only `"api"` makes the rollup count the
    /// `api_*` values.
    pub api_data_source: String,
    /// Number of tool calls in the message.
    pub tool_call_count: i64,
    /// Tool-call flag (not read by the rollup).
    pub has_tool_calls: bool,
    /// Whether the message was classified as a plan.
    pub has_plan: bool,
}
14950
14951/// In-memory aggregator for batched usage_hourly and usage_daily rollup updates.
14952///
14953/// Keyed by (bucket_id, agent_slug, workspace_id, source_id).
14954/// Maintains separate hourly and daily delta maps.
14955#[derive(Debug, Default)]
14956pub struct AnalyticsRollupAggregator {
14957    hourly: HashMap<(i64, String, i64, String), UsageRollupDelta>,
14958    daily: HashMap<(i64, String, i64, String), UsageRollupDelta>,
14959    models_daily: HashMap<(i64, String, i64, String, String, String), UsageRollupDelta>,
14960}
14961
14962impl AnalyticsRollupAggregator {
14963    pub fn new() -> Self {
14964        Self::default()
14965    }
14966
14967    /// Record a single message's contribution to both hourly and daily rollups.
14968    pub fn record(&mut self, entry: &MessageMetricsEntry) {
14969        let content_est = entry.content_tokens_est;
14970        let api_total = entry.api_input_tokens.unwrap_or(0)
14971            + entry.api_output_tokens.unwrap_or(0)
14972            + entry.api_cache_read_tokens.unwrap_or(0)
14973            + entry.api_cache_creation_tokens.unwrap_or(0)
14974            + entry.api_thinking_tokens.unwrap_or(0);
14975        let is_api = entry.api_data_source == "api";
14976        let is_user = entry.role == "user";
14977        let is_assistant = entry.role == "assistant" || entry.role == "agent";
14978
14979        // Apply to both hourly and daily
14980        for (map, bucket_id) in [
14981            (&mut self.hourly, entry.hour_id),
14982            (&mut self.daily, entry.day_id),
14983        ] {
14984            let key = (
14985                bucket_id,
14986                entry.agent_slug.clone(),
14987                entry.workspace_id,
14988                entry.source_id.clone(),
14989            );
14990            let d = map.entry(key).or_default();
14991            d.message_count += 1;
14992            if is_user {
14993                d.user_message_count += 1;
14994                d.content_tokens_est_user += content_est;
14995            }
14996            if is_assistant {
14997                d.assistant_message_count += 1;
14998                d.content_tokens_est_assistant += content_est;
14999            }
15000            d.tool_call_count += entry.tool_call_count;
15001            if entry.has_plan {
15002                d.plan_message_count += 1;
15003                d.plan_content_tokens_est_total += content_est;
15004                if is_api {
15005                    d.plan_api_tokens_total += api_total;
15006                }
15007            }
15008            if is_api {
15009                d.api_coverage_message_count += 1;
15010                d.api_tokens_total += api_total;
15011                d.api_input_tokens_total += entry.api_input_tokens.unwrap_or(0);
15012                d.api_output_tokens_total += entry.api_output_tokens.unwrap_or(0);
15013                d.api_cache_read_tokens_total += entry.api_cache_read_tokens.unwrap_or(0);
15014                d.api_cache_creation_tokens_total += entry.api_cache_creation_tokens.unwrap_or(0);
15015                d.api_thinking_tokens_total += entry.api_thinking_tokens.unwrap_or(0);
15016            }
15017            d.content_tokens_est_total += content_est;
15018        }
15019
15020        let model_key = (
15021            entry.day_id,
15022            entry.agent_slug.clone(),
15023            entry.workspace_id,
15024            entry.source_id.clone(),
15025            entry.model_family.clone(),
15026            entry.model_tier.clone(),
15027        );
15028        let d = self.models_daily.entry(model_key).or_default();
15029        d.message_count += 1;
15030        if is_user {
15031            d.user_message_count += 1;
15032            d.content_tokens_est_user += content_est;
15033        }
15034        if is_assistant {
15035            d.assistant_message_count += 1;
15036            d.content_tokens_est_assistant += content_est;
15037        }
15038        d.tool_call_count += entry.tool_call_count;
15039        if entry.has_plan {
15040            d.plan_message_count += 1;
15041            d.plan_content_tokens_est_total += content_est;
15042            if is_api {
15043                d.plan_api_tokens_total += api_total;
15044            }
15045        }
15046        if is_api {
15047            d.api_coverage_message_count += 1;
15048            d.api_tokens_total += api_total;
15049            d.api_input_tokens_total += entry.api_input_tokens.unwrap_or(0);
15050            d.api_output_tokens_total += entry.api_output_tokens.unwrap_or(0);
15051            d.api_cache_read_tokens_total += entry.api_cache_read_tokens.unwrap_or(0);
15052            d.api_cache_creation_tokens_total += entry.api_cache_creation_tokens.unwrap_or(0);
15053            d.api_thinking_tokens_total += entry.api_thinking_tokens.unwrap_or(0);
15054        }
15055        d.content_tokens_est_total += content_est;
15056    }
15057
15058    pub fn is_empty(&self) -> bool {
15059        self.hourly.is_empty() && self.daily.is_empty() && self.models_daily.is_empty()
15060    }
15061
15062    pub fn hourly_entry_count(&self) -> usize {
15063        self.hourly.len()
15064    }
15065
15066    pub fn daily_entry_count(&self) -> usize {
15067        self.daily.len()
15068    }
15069
15070    pub fn models_daily_entry_count(&self) -> usize {
15071        self.models_daily.len()
15072    }
15073}
15074
15075/// Whether the current role should be considered for plan attribution.
15076///
15077/// Plan attribution v2 defaults to assistant/agent messages only.
15078fn has_plan_for_role(role: &str, content: &str) -> bool {
15079    let role = role.trim();
15080    (role.eq_ignore_ascii_case("assistant") || role.eq_ignore_ascii_case("agent"))
15081        && has_plan_heuristic(content)
15082}
15083
15084/// Heuristic to detect "plan" messages.
15085///
15086/// v2 behavior:
15087/// - Require an explicit plan marker near the top of the message.
15088/// - Require structured steps (numbered or bullets) to reduce false positives.
15089/// - Avoid classifying tool-output blobs as plans.
15090fn has_plan_heuristic(content: &str) -> bool {
15091    if content.len() < 24 {
15092        return false;
15093    }
15094
15095    let lower = content.to_lowercase();
15096
15097    // Ignore tool-output-like blobs unless they also have a strong plan header.
15098    let looks_like_tool_blob = lower.contains("```")
15099        || lower.contains("\"tool\"")
15100        || lower.contains("stdout:")
15101        || lower.contains("stderr:")
15102        || lower.contains("exit code:");
15103
15104    let mut lines: Vec<&str> = Vec::with_capacity(60);
15105    let mut in_fenced_code = false;
15106    for raw in lower.lines() {
15107        let line = raw.trim();
15108        if line.starts_with("```") {
15109            in_fenced_code = !in_fenced_code;
15110            continue;
15111        }
15112        if in_fenced_code || line.is_empty() {
15113            continue;
15114        }
15115        lines.push(line);
15116        if lines.len() >= 60 {
15117            break;
15118        }
15119    }
15120
15121    let header_pos = lines.iter().position(|line| {
15122        line.starts_with("## plan")
15123            || line.starts_with("# plan")
15124            || line.starts_with("plan:")
15125            || line.starts_with("implementation plan")
15126            || line.starts_with("next steps:")
15127            || line.starts_with("action plan:")
15128    });
15129    let preview_top = lines.iter().take(8).copied().collect::<Vec<_>>().join("\n");
15130    let header_near_top = header_pos.is_some_and(|idx| idx <= 6) || preview_top.contains("plan:");
15131
15132    if !header_near_top {
15133        return false;
15134    }
15135    if looks_like_tool_blob && header_pos.is_none() {
15136        return false;
15137    }
15138
15139    let numbered_steps = lines
15140        .iter()
15141        .filter(|line| is_numbered_step_line(line))
15142        .count();
15143    let bullet_steps = lines
15144        .iter()
15145        .filter(|line| {
15146            line.starts_with("- ")
15147                || line.starts_with("* ")
15148                || line.starts_with("+ ")
15149                || line.starts_with("- [ ] ")
15150                || line.starts_with("- [x] ")
15151        })
15152        .count();
15153
15154    numbered_steps >= 2 || (numbered_steps >= 1 && bullet_steps >= 1) || bullet_steps >= 3
15155}
15156
/// Returns true for lines shaped like `1. step` or `12) step`: optional
/// leading whitespace, one to three ASCII digits, then `". "` or `") "`.
fn is_numbered_step_line(line: &str) -> bool {
    let body = line.trim_start();
    // ASCII digits are single bytes, so the byte count is also a valid split offset.
    let digits = body.bytes().take_while(u8::is_ascii_digit).count();
    if !(1..=3).contains(&digits) {
        return false;
    }
    matches!(&body.as_bytes()[digits..], [b'.' | b')', b' ', ..])
}
15166
/// Pending token_usage row to be batch-inserted.
///
/// Plain data carrier: every field is public and the type derives only
/// `Debug` and `Clone`.
#[derive(Debug, Clone)]
pub struct TokenUsageEntry {
    // Row identity / ownership (presumably keys into the matching tables — confirm against the insert site).
    pub message_id: i64,
    pub conversation_id: i64,
    pub agent_id: i64,
    pub workspace_id: Option<i64>,
    pub source_id: String,
    // Time bucketing. NOTE(review): `day_id` presumably follows the
    // "days since 2020-01-01" convention used by `PricingEntry::effective_day_id` — confirm.
    pub timestamp_ms: i64,
    pub day_id: i64,
    // Model / provider attribution; `None` when unknown.
    pub model_name: Option<String>,
    pub model_family: Option<String>,
    pub model_tier: Option<String>,
    pub service_tier: Option<String>,
    pub provider: Option<String>,
    // Token counters; `None` when the source did not report the value.
    pub input_tokens: Option<i64>,
    pub output_tokens: Option<i64>,
    pub cache_read_tokens: Option<i64>,
    pub cache_creation_tokens: Option<i64>,
    pub thinking_tokens: Option<i64>,
    pub total_tokens: Option<i64>,
    // NOTE(review): presumably the value produced by `PricingTable::compute_cost` — confirm.
    pub estimated_cost_usd: Option<f64>,
    // Message-level descriptors.
    pub role: String,
    pub content_chars: i64,
    pub has_tool_calls: bool,
    pub tool_call_count: u32,
    pub data_source: String,
}
15195
15196// -------------------------------------------------------------------------
15197// PricingTable — In-memory cache for model_pricing lookups (bead z9fse.10)
15198// -------------------------------------------------------------------------
15199
/// One pricing row loaded from the `model_pricing` table.
///
/// Rates are expressed per million tokens: `PricingTable::compute_cost`
/// multiplies a token count by the rate and divides by 1,000,000.
#[derive(Debug, Clone)]
pub struct PricingEntry {
    /// SQL LIKE pattern matched against the model name (see `sql_like_match`).
    pub model_pattern: String,
    pub provider: String,
    /// Rate applied to non-cached input tokens.
    pub input_cost_per_mtok: f64,
    /// Rate applied to output tokens.
    pub output_cost_per_mtok: f64,
    /// Rate for cache-read tokens; when `None` those tokens add nothing to the cost.
    pub cache_read_cost_per_mtok: Option<f64>,
    /// Rate for cache-creation tokens; when `None` those tokens add nothing to the cost.
    pub cache_creation_cost_per_mtok: Option<f64>,
    /// Effective date as day_id (days since 2020-01-01).
    pub effective_day_id: i64,
}
15212
/// Diagnostics for pricing coverage during a batch operation.
///
/// Starts out all-zero via `Default`; the counters are advanced by
/// `record_priced` / `record_unpriced` and reported by `log_summary`.
#[derive(Debug, Clone, Default)]
pub struct PricingDiagnostics {
    /// Number of `record_priced` calls.
    pub priced_count: u64,
    /// Number of `record_unpriced` calls.
    pub unpriced_count: u64,
    /// Unknown model names → count. A missing model name is recorded under `"(none)"`.
    pub unknown_models: HashMap<String, u64>,
}
15221
15222impl PricingDiagnostics {
15223    fn record_priced(&mut self) {
15224        self.priced_count += 1;
15225    }
15226
15227    fn record_unpriced(&mut self, model_name: Option<&str>) {
15228        self.unpriced_count += 1;
15229        let key = model_name.unwrap_or("(none)").to_string();
15230        *self.unknown_models.entry(key).or_insert(0) += 1;
15231    }
15232
15233    /// Log a summary of pricing coverage.
15234    pub fn log_summary(&self) {
15235        let total = self.priced_count + self.unpriced_count;
15236        if total == 0 {
15237            return;
15238        }
15239        let pct = (self.priced_count as f64 / total as f64) * 100.0;
15240        tracing::info!(
15241            target: "cass::analytics::pricing",
15242            priced = self.priced_count,
15243            unpriced = self.unpriced_count,
15244            total = total,
15245            coverage_pct = format!("{pct:.1}%"),
15246            "pricing coverage"
15247        );
15248        if !self.unknown_models.is_empty() {
15249            let mut sorted: Vec<_> = self.unknown_models.iter().collect();
15250            sorted.sort_by(|a, b| b.1.cmp(a.1));
15251            for (model, count) in sorted.iter().take(5) {
15252                tracing::debug!(
15253                    target: "cass::analytics::pricing",
15254                    model = model.as_str(),
15255                    count = count,
15256                    "unknown model (no pricing)"
15257                );
15258            }
15259        }
15260    }
15261}
15262
/// In-memory pricing table loaded from `model_pricing` for fast lookups.
#[derive(Debug, Clone)]
pub struct PricingTable {
    // Rows as returned by the load query, which orders by `effective_date DESC`.
    entries: Vec<PricingEntry>,
}
15268
15269impl PricingTable {
15270    /// Load all pricing entries from the database.
15271    pub fn load(conn: &FrankenConnection) -> Result<Self> {
15272        Self::franken_load(conn)
15273    }
15274
15275    /// Load all pricing entries from a frankensqlite connection.
15276    pub fn franken_load(conn: &FrankenConnection) -> Result<Self> {
15277        let rows = conn.query(
15278            "SELECT model_pattern, provider, input_cost_per_mtok, output_cost_per_mtok,
15279                    cache_read_cost_per_mtok, cache_creation_cost_per_mtok, effective_date
15280             FROM model_pricing
15281             ORDER BY effective_date DESC",
15282        )?;
15283        let mut entries = Vec::with_capacity(rows.len());
15284        for row in &rows {
15285            let effective_date: String = row.get_typed(6)?;
15286            let effective_day_id = date_str_to_day_id(&effective_date)?;
15287            entries.push(PricingEntry {
15288                model_pattern: row.get_typed(0)?,
15289                provider: row.get_typed(1)?,
15290                input_cost_per_mtok: row.get_typed(2)?,
15291                output_cost_per_mtok: row.get_typed(3)?,
15292                cache_read_cost_per_mtok: row.get_typed(4)?,
15293                cache_creation_cost_per_mtok: row.get_typed(5)?,
15294                effective_day_id,
15295            });
15296        }
15297        Ok(Self { entries })
15298    }
15299
15300    /// Look up the best pricing entry for a given model name and date.
15301    ///
15302    /// Selection rules:
15303    /// 1. Pattern must match model_name (SQL LIKE semantics).
15304    /// 2. effective_day_id must be <= message_day_id.
15305    /// 3. Among matches, prefer the most recent effective_date.
15306    /// 4. Tie-break by pattern specificity (longest pattern wins).
15307    pub fn lookup(&self, model_name: &str, message_day_id: i64) -> Option<&PricingEntry> {
15308        let mut best: Option<&PricingEntry> = None;
15309
15310        for entry in &self.entries {
15311            if entry.effective_day_id > message_day_id {
15312                continue;
15313            }
15314            if !sql_like_match(model_name, &entry.model_pattern) {
15315                continue;
15316            }
15317
15318            match best {
15319                None => best = Some(entry),
15320                Some(current) => {
15321                    if entry.effective_day_id > current.effective_day_id
15322                        || (entry.effective_day_id == current.effective_day_id
15323                            && entry.model_pattern.len() > current.model_pattern.len())
15324                    {
15325                        best = Some(entry);
15326                    }
15327                }
15328            }
15329        }
15330
15331        best
15332    }
15333
15334    /// Compute estimated cost in USD for a set of token counts.
15335    ///
15336    /// Returns `None` if no pricing entry matches or if no token counts are available.
15337    pub fn compute_cost(
15338        &self,
15339        model_name: Option<&str>,
15340        message_day_id: i64,
15341        input_tokens: Option<i64>,
15342        output_tokens: Option<i64>,
15343        cache_read_tokens: Option<i64>,
15344        cache_creation_tokens: Option<i64>,
15345    ) -> Option<f64> {
15346        let model = model_name?;
15347        let pricing = self.lookup(model, message_day_id)?;
15348
15349        if input_tokens.is_none() && output_tokens.is_none() {
15350            return None;
15351        }
15352
15353        let mut cost = 0.0;
15354        let cache_read = cache_read_tokens.unwrap_or(0);
15355        let cache_creation = cache_creation_tokens.unwrap_or(0);
15356        // input_tokens includes cache tokens as a subset; subtract them
15357        // so we don't charge at both the full input rate AND the cache rate.
15358        let non_cache_input = input_tokens
15359            .unwrap_or(0)
15360            .saturating_sub(cache_read)
15361            .saturating_sub(cache_creation)
15362            .max(0);
15363        cost += non_cache_input as f64 * pricing.input_cost_per_mtok / 1_000_000.0;
15364        cost += output_tokens.unwrap_or(0) as f64 * pricing.output_cost_per_mtok / 1_000_000.0;
15365
15366        if let Some(cache_price) = pricing.cache_read_cost_per_mtok {
15367            cost += cache_read as f64 * cache_price / 1_000_000.0;
15368        }
15369        if let Some(cache_price) = pricing.cache_creation_cost_per_mtok {
15370            cost += cache_creation as f64 * cache_price / 1_000_000.0;
15371        }
15372
15373        Some(cost)
15374    }
15375
15376    /// Whether the pricing table has any entries.
15377    pub fn is_empty(&self) -> bool {
15378        self.entries.is_empty()
15379    }
15380}
15381
15382/// Convert "YYYY-MM-DD" date string to day_id (days since 2020-01-01),
15383/// matching the format produced by `day_id_from_millis`.
15384fn date_str_to_day_id(s: &str) -> Result<i64> {
15385    use chrono::NaiveDate;
15386    const EPOCH_2020: NaiveDate = match NaiveDate::from_ymd_opt(2020, 1, 1) {
15387        Some(d) => d,
15388        None => unreachable!(),
15389    };
15390    NaiveDate::parse_from_str(s, "%Y-%m-%d")
15391        .map(|d| (d - EPOCH_2020).num_days())
15392        .with_context(|| format!("invalid effective_date '{s}'"))
15393}
15394
/// SQL LIKE pattern matcher (ASCII case-insensitive). `%` = any sequence, `_` = any single char.
///
/// Both inputs are ASCII-lowercased before matching, so only ASCII letters
/// compare case-insensitively; non-ASCII characters must match exactly.
fn sql_like_match(value: &str, pattern: &str) -> bool {
    sql_like_match_bytes(
        value.to_ascii_lowercase().as_bytes(),
        pattern.to_ascii_lowercase().as_bytes(),
    )
}

/// Determine the byte length of the UTF-8 character starting at `b`.
///
/// `b` is expected to be a leading byte. A continuation byte (`0x80..=0xBF`)
/// is reported as length 2, so callers should only pass bytes that sit on a
/// character boundary.
fn utf8_char_len(b: u8) -> usize {
    match b {
        0x00..=0x7F => 1,
        0x80..=0xDF => 2,
        0xE0..=0xEF => 3,
        0xF0..=0xFF => 4,
    }
}

/// Case-sensitive LIKE matcher over raw bytes.
///
/// `val` and `pat` are expected to be valid UTF-8 (as produced by
/// [`sql_like_match`]): `_` consumes one whole UTF-8 character and `%`
/// absorbs zero or more whole characters.
///
/// The match is iterative with a single backtrack point (the most recent
/// `%`), so stack usage is constant and the running time is bounded by
/// `O(val.len() * pat.len())` even for patterns with many `%` wildcards.
fn sql_like_match_bytes(val: &[u8], pat: &[u8]) -> bool {
    let mut v = 0usize;
    let mut p = 0usize;
    // (pattern index just past the latest `%` run, value index up to which
    // that run has absorbed input so far).
    let mut resume: Option<(usize, usize)> = None;

    while v < val.len() {
        let advanced = match pat.get(p) {
            Some(b'%') => {
                // Consecutive `%` behave like a single one.
                while pat.get(p) == Some(&b'%') {
                    p += 1;
                }
                if p == pat.len() {
                    // A trailing `%` swallows whatever is left.
                    return true;
                }
                resume = Some((p, v));
                true
            }
            Some(b'_') => {
                // Consume one full UTF-8 character, not just one byte.
                let end = v + utf8_char_len(val[v]);
                if end <= val.len() {
                    v = end;
                    p += 1;
                    true
                } else {
                    false
                }
            }
            Some(&literal) if literal == val[v] => {
                v += 1;
                p += 1;
                true
            }
            // Literal mismatch, or pattern exhausted while input remains.
            _ => false,
        };
        if advanced {
            continue;
        }

        // Let the latest `%` absorb one more character and retry the rest of
        // the pattern from there. Earlier `%` never need revisiting: matching
        // each segment as early as possible leaves the most input for later ones.
        let Some((after_percent, absorbed_to)) = resume else {
            return false;
        };
        let next = absorbed_to + utf8_char_len(val[absorbed_to]);
        if next > val.len() {
            return false;
        }
        resume = Some((after_percent, next));
        p = after_percent;
        v = next;
    }

    // Input exhausted: only `%` may remain in the pattern.
    pat[p..].iter().all(|&b| b == b'%')
}
15452
15453fn rebuild_batch_size_env(var: &str, default: usize) -> usize {
15454    dotenvy::var(var)
15455        .ok()
15456        .and_then(|raw| raw.parse::<usize>().ok())
15457        .filter(|value| *value > 0)
15458        .unwrap_or(default)
15459}
15460
15461/// Returns true when the error chain represents a real `FrankenError::OutOfMemory`
15462/// (typed variant) or a bare "out of memory" / "not enough memory" message.
15463///
15464/// We *deliberately* do not do substring matching on the rendered chain: frankensqlite's
15465/// `FrankenError::OutOfMemory` renders as the literal "out of memory" and is also emitted
15466/// for several non-process-OOM internal conditions (VFS buffer / VDBE register allocation).
15467/// Contextual messages like "connector parse failed: not enough memory in record" must not
15468/// be promoted into the OOM-bisect/quarantine path. See `retryable_franken_anyhow` above
15469/// for the same downcast idiom.
15470fn is_out_of_memory_error<E: OutOfMemoryProbe + ?Sized>(err: &E) -> bool {
15471    err.is_out_of_memory()
15472}
15473
/// Per-error-type classification hook used by `is_out_of_memory_error`.
trait OutOfMemoryProbe {
    /// Whether this error should be treated as an out-of-memory condition.
    fn is_out_of_memory(&self) -> bool;
}
15477
15478impl OutOfMemoryProbe for anyhow::Error {
15479    fn is_out_of_memory(&self) -> bool {
15480        self.chain().any(|cause| {
15481            if cause
15482                .downcast_ref::<frankensqlite::FrankenError>()
15483                .is_some_and(|err| matches!(err, frankensqlite::FrankenError::OutOfMemory))
15484            {
15485                return true;
15486            }
15487            is_exact_out_of_memory_message(&cause.to_string())
15488        })
15489    }
15490}
15491
15492impl OutOfMemoryProbe for frankensqlite::FrankenError {
15493    fn is_out_of_memory(&self) -> bool {
15494        matches!(self, frankensqlite::FrankenError::OutOfMemory)
15495    }
15496}
15497
/// True when `message`, ignoring surrounding whitespace and ASCII case, is
/// exactly "out of memory" or "not enough memory". Longer messages that
/// merely contain one of these phrases do not match.
fn is_exact_out_of_memory_message(message: &str) -> bool {
    let bare = message.trim();
    ["out of memory", "not enough memory"]
        .iter()
        .any(|expected| bare.eq_ignore_ascii_case(expected))
}
15504
15505// Second SqliteStorage impl block removed: SqliteStorage is now a type alias for FrankenStorage.
15506// All methods (insert_conversation_tree, list_agents, list_conversations, etc.) are
15507// available through FrankenStorage.
15508
/// Daily count data for histogram display.
#[derive(Debug, Clone)]
pub struct DailyCount {
    // NOTE(review): presumably the same "days since 2020-01-01" day_id used elsewhere in this file — confirm.
    pub day_id: i64,
    // Per-day totals; the exact counting rules live in the query that fills this struct.
    pub sessions: i64,
    pub messages: i64,
    pub chars: i64,
}
15517
/// Result of an analytics rebuild operation.
#[derive(Debug, Clone)]
pub struct AnalyticsRebuildResult {
    // Row counts, one per rebuilt analytics table (field names presumably mirror the table names — confirm).
    pub message_metrics_rows: usize,
    pub usage_hourly_rows: usize,
    pub usage_daily_rows: usize,
    pub usage_models_daily_rows: usize,
    // Timing summary of the rebuild.
    pub elapsed_ms: u64,
    pub messages_per_sec: f64,
}
15528
/// Result of rebuilding daily stats.
#[derive(Debug, Clone)]
pub struct DailyStatsRebuildResult {
    // Number of daily-stats rows written by the rebuild (presumably — confirm at the producer).
    pub rows_created: i64,
    // Session total reported by the rebuild.
    pub total_sessions: i64,
}
15535
/// Result of purging archived data for a single agent.
///
/// `Default` yields a result with both counters at zero.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub struct AgentArchivePurgeResult {
    /// Number of conversations removed.
    pub conversations_deleted: usize,
    /// Number of messages removed.
    pub messages_deleted: usize,
}
15542
/// Health status of daily stats table.
#[derive(Debug, Clone)]
pub struct DailyStatsHealth {
    pub populated: bool,
    pub row_count: i64,
    // `None` presumably means no rows carry an update timestamp — confirm at the producer.
    pub oldest_update_ms: Option<i64>,
    pub conversation_count: i64,
    pub materialized_total: i64,
    // NOTE(review): presumably the gap between `conversation_count` and
    // `materialized_total`; sign convention not visible here — confirm.
    pub drift: i64,
}
15553
15554// -------------------------------------------------------------------------
15555// FTS5 Batch Insert (P2 Opt 2.1)
15556// -------------------------------------------------------------------------
15557
/// Batch size for FTS5 inserts. With 7 columns per row (rowid + 6 cols) and
/// SQLite's SQLITE_MAX_VARIABLE_NUMBER default of 999, max batch is ~142 rows
/// (999 / 7, rounded down).
/// Using 100 for safety margin and memory efficiency.
const FTS5_BATCH_SIZE: usize = 100;
15562
/// One message row as read during an FTS rebuild.
// NOTE(review): `rowid` and `message_id` are carried separately; whether they
// can differ is decided by the rebuild query, which is not visible here.
#[derive(Debug, Clone)]
struct FtsRebuildMessageRow {
    rowid: i64,
    message_id: i64,
    conversation_id: i64,
    content: String,
    created_at: Option<i64>,
}
15571
/// Conversation-level columns needed alongside a message when building FTS rows.
// NOTE(review): presumably resolved once per conversation and reused for each
// of its messages — confirm at the rebuild call site.
#[derive(Debug, Clone)]
struct FtsConversationProjection {
    title: String,
    agent_id: Option<i64>,
    workspace_id: Option<i64>,
    source_path: String,
}
15579
/// Entry for pending FTS5 insert.
///
/// Built from a message plus its conversation by [`FtsEntry::from_message`].
#[derive(Debug, Clone)]
pub struct FtsEntry {
    /// Message body text.
    pub content: String,
    /// Conversation title; empty when the conversation has none.
    pub title: String,
    /// Agent slug of the owning conversation.
    pub agent: String,
    /// Workspace path rendered lossily as a string; empty when absent.
    pub workspace: String,
    /// Conversation source path rendered lossily as a string.
    pub source_path: String,
    /// Message timestamp, falling back to the conversation start time.
    pub created_at: Option<i64>,
    pub message_id: i64,
}
15591
15592impl FtsEntry {
15593    /// Create an FTS entry from a message and conversation.
15594    pub fn from_message(message_id: i64, msg: &Message, conv: &Conversation) -> Self {
15595        FtsEntry {
15596            content: msg.content.clone(),
15597            title: conv.title.clone().unwrap_or_default(),
15598            agent: conv.agent_slug.clone(),
15599            workspace: conv
15600                .workspace
15601                .as_ref()
15602                .map(|p| p.to_string_lossy().into_owned())
15603                .unwrap_or_default(),
15604            source_path: path_to_string(&conv.source_path),
15605            created_at: msg.created_at.or(conv.started_at),
15606            message_id,
15607        }
15608    }
15609}
15610
// Flush thresholds for buffered `FtsEntry` batches.
// NOTE(review): presumably a pending batch is flushed once it reaches either
// limit (document count or accumulated content chars) — confirm at the call site.
const FTS_ENTRY_BATCH_MAX_DOCS: usize = 512;
const FTS_ENTRY_BATCH_MAX_CHARS: usize = 1024 * 1024;

/// Default batch size for the FTS rebuild INSERT (Bug #168).  When
/// `fts_messages` is empty but `messages` has 100K+ rows, a single unbounded
/// INSERT-SELECT OOMs.  This constant caps each batch so peak memory stays
/// bounded.  Override via `CASS_FTS_REBUILD_BATCH_SIZE` for tuning.
const FTS_REBUILD_BATCH_SIZE_DEFAULT: usize = 5_000;
15619
15620/// Read the FTS rebuild batch size from the environment, falling back to the
15621/// compiled-in default.
15622fn fts_rebuild_batch_size() -> usize {
15623    dotenvy::var("CASS_FTS_REBUILD_BATCH_SIZE")
15624        .ok()
15625        .and_then(|v| v.parse::<usize>().ok())
15626        .filter(|&n| n > 0)
15627        .unwrap_or(FTS_REBUILD_BATCH_SIZE_DEFAULT)
15628}
15629
15630fn flush_pending_fts_entries(
15631    storage: &FrankenStorage,
15632    tx: &FrankenTransaction<'_>,
15633    entries: &mut Vec<FtsEntry>,
15634    pending_chars: &mut usize,
15635    inserted_total: &mut usize,
15636) -> Result<()> {
15637    if entries.is_empty() {
15638        return Ok(());
15639    }
15640
15641    if storage.fts_messages_present_cached(tx) {
15642        *inserted_total += franken_batch_insert_fts(tx, entries)?;
15643    }
15644    entries.clear();
15645    *pending_chars = 0;
15646    Ok(())
15647}
15648
/// Render a path as an owned `String`, replacing any non-UTF-8 sequences
/// lossily.
fn path_to_string<P: AsRef<Path>>(p: P) -> String {
    let path: &Path = p.as_ref();
    String::from(path.to_string_lossy())
}
15652
15653fn role_str(role: &MessageRole) -> String {
15654    role_as_str(role).to_owned()
15655}
15656
15657fn role_as_str(role: &MessageRole) -> &str {
15658    match role {
15659        MessageRole::User => "user",
15660        MessageRole::Agent => "agent",
15661        MessageRole::Tool => "tool",
15662        MessageRole::System => "system",
15663        MessageRole::Other(v) => v.as_str(),
15664    }
15665}
15666
15667fn agent_kind_str(kind: AgentKind) -> String {
15668    match kind {
15669        AgentKind::Cli => "cli".into(),
15670        AgentKind::VsCode => "vscode".into(),
15671        AgentKind::Hybrid => "hybrid".into(),
15672    }
15673}
15674
15675// =============================================================================
15676// Tests (bead yln.4)
15677// =============================================================================
15678
15679#[cfg(test)]
15680mod tests {
15681    use super::*;
15682    use serial_test::serial;
15683    use tempfile::TempDir;
15684
    /// Test-only guard that restores an environment variable when dropped.
    struct EnvGuard {
        // Name of the environment variable being guarded.
        key: &'static str,
        // Value captured before the test changed it; `None` means it was unset.
        previous: Option<String>,
    }
15689
15690    impl Drop for EnvGuard {
15691        fn drop(&mut self) {
15692            if let Some(value) = &self.previous {
15693                // SAFETY: test helper restores prior process env for isolation.
15694                unsafe {
15695                    std::env::set_var(self.key, value);
15696                }
15697            } else {
15698                // SAFETY: test helper restores prior process env for isolation.
15699                unsafe {
15700                    std::env::remove_var(self.key);
15701                }
15702            }
15703        }
15704    }
15705
15706    fn set_env_var(key: &'static str, value: impl AsRef<str>) -> EnvGuard {
15707        let previous = dotenvy::var(key).ok();
15708        // SAFETY: test helper toggles a process-local env var for isolation.
15709        unsafe {
15710            std::env::set_var(key, value.as_ref());
15711        }
15712        EnvGuard { key, previous }
15713    }
15714
15715    fn unset_env_var(key: &'static str) -> EnvGuard {
15716        let previous = dotenvy::var(key).ok();
15717        // SAFETY: test helper toggles a process-local env var for isolation.
15718        unsafe {
15719            std::env::remove_var(key);
15720        }
15721        EnvGuard { key, previous }
15722    }
15723
15724    #[test]
15725    #[serial]
15726    fn storage_env_flags_are_truthy_only() {
15727        for value in ["1", "true", "TRUE", "yes", "YES", "on", "ON"] {
15728            let _guard = set_env_var("CASS_DEFER_LEXICAL_UPDATES", value);
15729            assert!(
15730                defer_storage_lexical_updates_enabled(),
15731                "{value:?} should enable the lexical defer toggle"
15732            );
15733        }
15734
15735        for value in ["0", "false", "FALSE", "no", "NO", "", "maybe"] {
15736            let _guard = set_env_var("CASS_DEFER_LEXICAL_UPDATES", value);
15737            assert!(
15738                !defer_storage_lexical_updates_enabled(),
15739                "{value:?} should not enable the lexical defer toggle"
15740            );
15741        }
15742    }
15743
    // Exercises how the analytics-defer decision combines the two env vars
    // with the default installed by `default_defer_analytics_updates_guard`.
    // Each `{ ... }` scope bounds the lifetime of the env guards created in
    // it, so the variables are restored before the next scenario runs.
    #[test]
    #[serial]
    fn analytics_defer_default_can_be_overridden_explicitly() {
        // Scenario 1: default = false, both env vars cleared.
        {
            let _defer_env = unset_env_var("CASS_DEFER_ANALYTICS_UPDATES");
            let _inline_env = unset_env_var("CASS_INLINE_ANALYTICS_UPDATES");
            let _default_guard = default_defer_analytics_updates_guard(false);
            assert!(
                !defer_analytics_updates_enabled(),
                "analytics should stay inline when neither env nor index-run default requests deferral"
            );

            // A false-like value for the defer variable must not turn deferral on.
            let _defer = set_env_var("CASS_DEFER_ANALYTICS_UPDATES", "no");
            assert!(
                !defer_analytics_updates_enabled(),
                "false-like explicit defer value must not force analytics deferral"
            );
        }

        // Scenario 2: default = true, both env vars cleared. These guards stay
        // alive until the end of the function, covering the nested scopes below.
        let _defer_env = unset_env_var("CASS_DEFER_ANALYTICS_UPDATES");
        let _inline_env = unset_env_var("CASS_INLINE_ANALYTICS_UPDATES");
        let _default_guard = default_defer_analytics_updates_guard(true);
        assert!(
            defer_analytics_updates_enabled(),
            "index-run default should defer analytics when no explicit env override is set"
        );

        // A truthy inline override wins over the deferring default.
        {
            let _inline = set_env_var("CASS_INLINE_ANALYTICS_UPDATES", "1");
            assert!(
                !defer_analytics_updates_enabled(),
                "truthy inline override should restore inline analytics writes"
            );
        }

        // A false-like inline override is ignored.
        {
            let _inline = set_env_var("CASS_INLINE_ANALYTICS_UPDATES", "no");
            assert!(
                defer_analytics_updates_enabled(),
                "false-like inline override must not accidentally force inline analytics"
            );
        }

        // A false-like defer value does not cancel the deferring default.
        {
            let _defer = set_env_var("CASS_DEFER_ANALYTICS_UPDATES", "no");
            assert!(
                defer_analytics_updates_enabled(),
                "false-like explicit defer value should leave the index-run default in effect"
            );
        }
    }
15795
15796    fn frontier_test_conversation(idx_created_at: &[(i64, Option<i64>)]) -> Conversation {
15797        Conversation {
15798            id: None,
15799            agent_slug: "codex".into(),
15800            workspace: None,
15801            external_id: Some("frontier-test".into()),
15802            title: Some("Frontier test".into()),
15803            source_path: PathBuf::from("/tmp/frontier-test.jsonl"),
15804            started_at: Some(1_700_000_000_000),
15805            ended_at: None,
15806            approx_tokens: None,
15807            metadata_json: serde_json::Value::Null,
15808            messages: idx_created_at
15809                .iter()
15810                .map(|(idx, created_at)| Message {
15811                    id: None,
15812                    idx: *idx,
15813                    role: MessageRole::User,
15814                    author: None,
15815                    created_at: *created_at,
15816                    content: format!("message-{idx}"),
15817                    extra_json: serde_json::Value::Null,
15818                    snippets: Vec::new(),
15819                })
15820                .collect(),
15821            source_id: LOCAL_SOURCE_ID.into(),
15822            origin_host: None,
15823        }
15824    }
15825
15826    #[test]
15827    fn conversation_tail_ended_at_candidate_uses_latest_known_end() {
15828        let mut later_conversation_end =
15829            frontier_test_conversation(&[(0, Some(100)), (1, Some(110))]);
15830        later_conversation_end.ended_at = Some(250);
15831        assert_eq!(
15832            conversation_tail_ended_at_candidate(&later_conversation_end),
15833            Some(250),
15834            "conversation-level ended_at can be later than the final message timestamp"
15835        );
15836
15837        let mut later_message_end = frontier_test_conversation(&[(0, Some(100)), (1, Some(300))]);
15838        later_message_end.ended_at = Some(250);
15839        assert_eq!(
15840            conversation_tail_ended_at_candidate(&later_message_end),
15841            Some(300),
15842            "message timestamps can be later than a stale conversation-level ended_at"
15843        );
15844
15845        let mut no_message_timestamps = frontier_test_conversation(&[(0, None), (1, None)]);
15846        no_message_timestamps.ended_at = Some(200);
15847        assert_eq!(
15848            conversation_tail_ended_at_candidate(&no_message_timestamps),
15849            Some(200)
15850        );
15851    }
15852
15853    #[test]
15854    fn ended_at_shortcut_splits_safe_append_tail() {
15855        let covered = frontier_test_conversation(&[(0, Some(100)), (1, Some(110)), (2, Some(120))]);
15856        assert!(
15857            collect_existing_conversation_tail_from_ended_at(&covered, 120).is_none(),
15858            "ended_at coverage alone does not prove all lower idx rows exist"
15859        );
15860
15861        let append = frontier_test_conversation(&[(0, Some(100)), (1, Some(110)), (2, Some(130))]);
15862        assert!(
15863            collect_existing_conversation_tail_from_ended_at(&append, 120).is_none(),
15864            "mixed covered-prefix plus append-tail input needs lookup to fill possible gaps"
15865        );
15866
15867        let pure_append = frontier_test_conversation(&[(2, Some(130)), (3, Some(140))]);
15868        let plan = collect_existing_conversation_tail_from_ended_at(&pure_append, 120)
15869            .expect("all-new timestamp tail can append without message lookup");
15870        assert_eq!(plan.messages.len(), 2);
15871        assert_eq!(
15872            plan.messages.iter().map(|msg| msg.idx).collect::<Vec<_>>(),
15873            vec![2, 3]
15874        );
15875        assert_eq!(
15876            plan.new_chars,
15877            ("message-2".len() + "message-3".len()) as i64
15878        );
15879
15880        let unsorted = frontier_test_conversation(&[(1, Some(110)), (0, Some(100))]);
15881        assert!(
15882            collect_existing_conversation_tail_from_ended_at(&unsorted, 120).is_none(),
15883            "out-of-order input must not use the append/no-op shortcut"
15884        );
15885
15886        let missing_timestamp = frontier_test_conversation(&[(0, Some(100)), (1, None)]);
15887        assert!(
15888            collect_existing_conversation_tail_from_ended_at(&missing_timestamp, 120).is_none(),
15889            "missing timestamps require replay-aware lookup"
15890        );
15891
15892        let covered_after_append =
15893            frontier_test_conversation(&[(0, Some(100)), (1, Some(130)), (2, Some(110))]);
15894        assert!(
15895            collect_existing_conversation_tail_from_ended_at(&covered_after_append, 120).is_none(),
15896            "covered messages after the append split mean the input is not a safe tail"
15897        );
15898
15899        let duplicate_idx = frontier_test_conversation(&[(0, Some(100)), (0, Some(130))]);
15900        assert!(
15901            collect_existing_conversation_tail_from_ended_at(&duplicate_idx, 120).is_none(),
15902            "duplicate idx values can collide with archived rows and require robust lookup"
15903        );
15904    }
15905
15906    #[test]
15907    fn idx_tail_shortcut_handles_no_timestamp_legacy_sources() {
15908        let covered = frontier_test_conversation(&[(0, None), (1, None)]);
15909        assert!(
15910            collect_existing_conversation_noop_from_idx_tail(&covered, 1).is_none(),
15911            "idx tail coverage alone does not prove all lower rows exist"
15912        );
15913
15914        let append = frontier_test_conversation(&[(0, None), (1, None), (2, None)]);
15915        assert!(
15916            collect_existing_conversation_noop_from_idx_tail(&append, 1).is_none(),
15917            "partial timestamp tail metadata is not trusted for appends"
15918        );
15919
15920        let unsorted = frontier_test_conversation(&[(1, None), (0, None), (2, None)]);
15921        assert!(
15922            collect_existing_conversation_noop_from_idx_tail(&unsorted, 1).is_none(),
15923            "out-of-order legacy messages need the robust lookup"
15924        );
15925
15926        let duplicate_tail = frontier_test_conversation(&[(0, None), (2, None), (2, None)]);
15927        assert!(
15928            collect_existing_conversation_noop_from_idx_tail(&duplicate_tail, 1).is_none(),
15929            "duplicate tail idx values can collide and require robust lookup"
15930        );
15931
15932        let duplicate_covered = frontier_test_conversation(&[(0, None), (1, None), (1, None)]);
15933        assert!(
15934            collect_existing_conversation_noop_from_idx_tail(&duplicate_covered, 1).is_none(),
15935            "duplicate covered idx values still need collision-aware lookup"
15936        );
15937    }
15938
15939    #[test]
15940    fn conversation_ended_at_shortcut_handles_stale_partial_idx_tail() {
15941        let mut covered =
15942            frontier_test_conversation(&[(0, Some(100)), (1, Some(110)), (2, Some(120))]);
15943        covered.ended_at = Some(120);
15944        assert!(
15945            collect_existing_conversation_noop_from_conversation_ended_at(&covered, 120).is_none(),
15946            "conversation ended_at coverage alone does not prove all message rows exist"
15947        );
15948
15949        let mut missing_timestamp = frontier_test_conversation(&[(0, None), (1, None), (2, None)]);
15950        missing_timestamp.ended_at = Some(120);
15951        assert!(
15952            collect_existing_conversation_noop_from_conversation_ended_at(&missing_timestamp, 120)
15953                .is_none(),
15954            "no-timestamp messages need replay-aware lookup even when conversation ended_at is unchanged"
15955        );
15956
15957        let mut newer =
15958            frontier_test_conversation(&[(0, Some(100)), (1, Some(110)), (2, Some(121))]);
15959        newer.ended_at = Some(121);
15960        assert!(
15961            collect_existing_conversation_noop_from_conversation_ended_at(&newer, 120).is_none(),
15962            "newer conversations need robust append handling"
15963        );
15964
15965        let mut unsorted = frontier_test_conversation(&[(1, Some(110)), (0, Some(100))]);
15966        unsorted.ended_at = Some(120);
15967        assert!(
15968            collect_existing_conversation_noop_from_conversation_ended_at(&unsorted, 120).is_none(),
15969            "out-of-order unchanged conversations still use the robust path"
15970        );
15971
15972        let mut duplicate =
15973            frontier_test_conversation(&[(0, Some(100)), (1, Some(110)), (1, Some(111))]);
15974        duplicate.ended_at = Some(120);
15975        assert!(
15976            collect_existing_conversation_noop_from_conversation_ended_at(&duplicate, 120)
15977                .is_none(),
15978            "duplicate covered idx values still need collision-aware lookup"
15979        );
15980    }
15981
15982    #[test]
15983    fn populated_fts_shadow_without_rowid_reload_errors_are_classified() {
15984        assert!(
15985            error_message_indicates_populated_fts_shadow_without_rowid_reload(
15986                "not implemented: reloading populated WITHOUT ROWID table `fts_messages_config` into MemDatabase is not yet supported",
15987            )
15988        );
15989        assert!(
15990            error_message_indicates_populated_fts_shadow_without_rowid_reload(
15991                "not implemented: loading populated WITHOUT ROWID table fts_messages_data is not yet supported",
15992            )
15993        );
15994        assert!(
15995            !error_message_indicates_populated_fts_shadow_without_rowid_reload(
15996                "not implemented: reloading populated WITHOUT ROWID table `user_table` into MemDatabase is not yet supported",
15997            )
15998        );
15999    }
16000
16001    #[test]
16002    fn doctor_mutation_open_guard_only_targets_canonical_archive_db() {
16003        let dir = TempDir::new().unwrap();
16004        let canonical = dir.path().join("agent_search.db");
16005        let scratch = dir.path().join("scratch.db");
16006
16007        assert_eq!(
16008            doctor_mutation_lock_path_for_db_open(&canonical),
16009            Some(dir.path().join("doctor/locks/doctor-repair.lock"))
16010        );
16011        assert_eq!(doctor_mutation_lock_path_for_db_open(&scratch), None);
16012    }
16013
16014    #[test]
16015    fn doctor_lock_metadata_pid_detection_is_exact() {
16016        let current = std::process::id();
16017
16018        assert!(doctor_lock_metadata_pid_is_current_process(&format!(
16019            "schema_version=1\npid={current}\nmode=safe_auto_run\n"
16020        )));
16021        assert!(!doctor_lock_metadata_pid_is_current_process(
16022            "schema_version=1\npid=not-a-pid\n"
16023        ));
16024        assert!(!doctor_lock_metadata_pid_is_current_process(&format!(
16025            "pid={}\n",
16026            current.saturating_add(1)
16027        )));
16028    }
16029
16030    #[test]
16031    #[cfg(not(windows))]
16032    fn doctor_storage_open_refuses_active_doctor_mutation_lock_from_other_process() {
16033        use std::io::Write as _;
16034
16035        let dir = TempDir::new().unwrap();
16036        let db_path = dir.path().join("agent_search.db");
16037        {
16038            let storage = FrankenStorage::open(&db_path).unwrap();
16039            storage.close().unwrap();
16040        }
16041
16042        let lock_path = doctor_mutation_lock_path_for_db_open(&db_path).unwrap();
16043        let mut lock_file = fs::OpenOptions::new()
16044            .create(true)
16045            .truncate(false)
16046            .read(true)
16047            .write(true)
16048            .open(&lock_path)
16049            .unwrap();
16050        fs2::FileExt::try_lock_exclusive(&lock_file).unwrap();
16051        lock_file.set_len(0).unwrap();
16052        lock_file.write_all(b"schema_version=1\npid=1\n").unwrap();
16053        lock_file.sync_all().unwrap();
16054
16055        let err =
16056            open_franken_raw_readonly_connection_with_timeout(&db_path, Duration::from_millis(25))
16057                .expect_err("active doctor mutation lock must block canonical DB opens");
16058        let message = err.to_string();
16059        assert!(
16060            message.contains("doctor mutation lock") && message.contains("active"),
16061            "error should identify the active doctor mutation lock: {message}"
16062        );
16063
16064        fs2::FileExt::unlock(&lock_file).unwrap();
16065    }
16066
16067    #[test]
16068    fn doctor_storage_open_allows_current_doctor_process_probe() {
16069        use std::io::Write as _;
16070
16071        let dir = TempDir::new().unwrap();
16072        let db_path = dir.path().join("agent_search.db");
16073        {
16074            let storage = FrankenStorage::open(&db_path).unwrap();
16075            storage.close().unwrap();
16076        }
16077
16078        let lock_path = doctor_mutation_lock_path_for_db_open(&db_path).unwrap();
16079        let mut lock_file = fs::OpenOptions::new()
16080            .create(true)
16081            .truncate(false)
16082            .read(true)
16083            .write(true)
16084            .open(&lock_path)
16085            .unwrap();
16086        fs2::FileExt::try_lock_exclusive(&lock_file).unwrap();
16087        lock_file.set_len(0).unwrap();
16088        write!(lock_file, "schema_version=1\npid={}\n", std::process::id()).unwrap();
16089        lock_file.sync_all().unwrap();
16090
16091        #[cfg(windows)]
16092        let _bypass = enter_doctor_mutation_db_open_bypass();
16093
16094        let conn =
16095            open_franken_raw_readonly_connection_with_timeout(&db_path, Duration::from_millis(25))
16096                .expect(
16097                    "doctor process must be able to run post-repair read probes under its own lock",
16098                );
16099        drop(conn);
16100
16101        fs2::FileExt::unlock(&lock_file).unwrap();
16102    }
16103
16104    #[test]
16105    fn autocommit_retain_disable_tries_compat_then_canonical_pragma() {
16106        let mut attempts = Vec::new();
16107
16108        let selected = disable_autocommit_retain(|pragma| {
16109            attempts.push(pragma);
16110            if pragma == "PRAGMA fsqlite.autocommit_retain = OFF;" {
16111                Err("compat namespace unavailable")
16112            } else {
16113                Ok(())
16114            }
16115        })
16116        .expect("canonical pragma should disable autocommit retain");
16117
16118        assert_eq!(selected, "PRAGMA autocommit_retain = OFF;");
16119        assert_eq!(attempts, AUTOCOMMIT_RETAIN_OFF_PRAGMAS);
16120    }
16121
16122    #[test]
16123    fn autocommit_retain_disable_fails_closed_when_no_pragma_works() {
16124        let mut attempts = Vec::new();
16125
16126        let err = disable_autocommit_retain(|pragma| {
16127            attempts.push(pragma);
16128            Err("unsupported pragma")
16129        })
16130        .expect_err("unsupported autocommit retain controls should fail closed");
16131
16132        assert_eq!(attempts, AUTOCOMMIT_RETAIN_OFF_PRAGMAS);
16133        let message = err.to_string();
16134        assert!(
16135            message.contains("refusing to keep a long-lived MVCC connection"),
16136            "error should force callers away from unbounded snapshot retention: {message}"
16137        );
16138        assert!(
16139            message.contains("PRAGMA fsqlite.autocommit_retain = OFF;")
16140                && message.contains("PRAGMA autocommit_retain = OFF;"),
16141            "error should preserve attempted PRAGMAs for diagnostics: {message}"
16142        );
16143    }
16144
16145    /// Open a rusqlite connection on `db_path` for the narrow purpose of
16146    /// injecting (or inspecting the raw projection of) sqlite_master
16147    /// corruption patterns in test fixtures. Frankensqlite intentionally does
16148    /// not support `PRAGMA writable_schema` writes or raw inserts to
16149    /// sqlite_master (see AGENTS.md: "PRAGMA writable_schema: Not supported for
16150    /// write operations"), so these fixtures retain rusqlite as the standard-
16151    /// SQLite interop layer. All callers are in this test module and run under
16152    /// #[cfg(test)]; no production code path touches rusqlite here.
16153    fn rusqlite_test_fixture_conn(db_path: &Path) -> rusqlite::Connection {
16154        rusqlite::Connection::open(db_path).expect("open rusqlite test fixture connection")
16155    }
16156
16157    fn seed_historical_db_direct(
16158        db_path: &Path,
16159        conversations: &[crate::model::types::Conversation],
16160    ) {
16161        if let Some(parent) = db_path.parent() {
16162            fs::create_dir_all(parent).unwrap();
16163        }
16164
16165        let conn = FrankenConnection::open(db_path.to_string_lossy().into_owned()).unwrap();
16166        conn.execute_batch(HISTORICAL_RECOVERY_CORE_SCHEMA).unwrap();
16167        conn.execute_compat(
16168            "INSERT INTO agents(id, slug, name, version, kind, created_at, updated_at)
16169             VALUES(?1, ?2, ?3, ?4, ?5, ?6, ?7)",
16170            fparams![1_i64, "codex", "Codex", "0.2.3", "cli", 0_i64, 0_i64],
16171        )
16172        .unwrap();
16173
16174        let mut next_message_id = 1_i64;
16175        for (conv_index, conv) in conversations.iter().enumerate() {
16176            let conversation_id = i64::try_from(conv_index + 1).unwrap();
16177            let workspace_id = conv.workspace.as_ref().map(|workspace| {
16178                let workspace_id = conversation_id;
16179                let workspace_path = workspace.to_string_lossy().into_owned();
16180                conn.execute_compat(
16181                    "INSERT INTO workspaces(id, path, display_name) VALUES(?1, ?2, ?3)",
16182                    fparams![
16183                        workspace_id,
16184                        workspace_path.as_str(),
16185                        workspace_path.as_str()
16186                    ],
16187                )
16188                .unwrap();
16189                workspace_id
16190            });
16191            let source_path = conv.source_path.to_string_lossy().into_owned();
16192            let metadata_json = conv.metadata_json.to_string();
16193            conn.execute_compat(
16194                "INSERT INTO conversations (
16195                    id, agent_id, workspace_id, source_id, external_id, title, source_path,
16196                    started_at, ended_at, approx_tokens, metadata_json, origin_host
16197                 ) VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9, ?10, ?11, ?12)",
16198                fparams![
16199                    conversation_id,
16200                    1_i64,
16201                    workspace_id,
16202                    conv.source_id.as_str(),
16203                    conv.external_id.as_deref(),
16204                    conv.title.as_deref(),
16205                    source_path.as_str(),
16206                    conv.started_at,
16207                    conv.ended_at,
16208                    conv.approx_tokens,
16209                    metadata_json.as_str(),
16210                    conv.origin_host.as_deref()
16211                ],
16212            )
16213            .unwrap();
16214
16215            for msg in &conv.messages {
16216                let extra_json = msg.extra_json.to_string();
16217                let role = role_str(&msg.role);
16218                conn.execute_compat(
16219                    "INSERT INTO messages(
16220                        id, conversation_id, idx, role, author, created_at, content, extra_json
16221                     ) VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8)",
16222                    fparams![
16223                        next_message_id,
16224                        conversation_id,
16225                        msg.idx,
16226                        role.as_str(),
16227                        msg.author.as_deref(),
16228                        msg.created_at,
16229                        msg.content.as_str(),
16230                        extra_json.as_str()
16231                    ],
16232                )
16233                .unwrap();
16234                next_message_id += 1;
16235            }
16236        }
16237    }
16238
16239    // =========================================================================
16240    // User data file protection tests (bead yln.4)
16241    // =========================================================================
16242
16243    #[test]
16244    fn is_user_data_file_detects_bookmarks() {
16245        assert!(is_user_data_file(Path::new("/data/bookmarks.db")));
16246        assert!(is_user_data_file(Path::new("bookmarks.db")));
16247    }
16248
16249    #[test]
16250    fn is_user_data_file_detects_tui_state() {
16251        assert!(is_user_data_file(Path::new("/data/tui_state.json")));
16252    }
16253
16254    #[test]
16255    fn is_user_data_file_detects_sources_toml() {
16256        assert!(is_user_data_file(Path::new("/config/sources.toml")));
16257    }
16258
16259    #[test]
16260    fn is_user_data_file_detects_env() {
16261        assert!(is_user_data_file(Path::new(".env")));
16262    }
16263
16264    #[test]
16265    fn is_user_data_file_rejects_other_files() {
16266        assert!(!is_user_data_file(Path::new("index.db")));
16267        assert!(!is_user_data_file(Path::new("conversations.db")));
16268        assert!(!is_user_data_file(Path::new("random.txt")));
16269    }
16270
16271    // =========================================================================
16272    // Backup creation tests (bead yln.4)
16273    // =========================================================================
16274
16275    #[test]
16276    fn create_backup_returns_none_for_nonexistent() {
16277        let dir = TempDir::new().unwrap();
16278        let db_path = dir.path().join("nonexistent.db");
16279        let result = create_backup(&db_path).unwrap();
16280        assert!(result.is_none());
16281    }
16282
16283    #[test]
16284    fn create_backup_creates_named_file() {
16285        let dir = TempDir::new().unwrap();
16286        let db_path = dir.path().join("test.db");
16287        std::fs::write(&db_path, b"test data").unwrap();
16288
16289        let backup_path = create_backup(&db_path).unwrap();
16290        assert!(backup_path.is_some());
16291        let backup = backup_path.unwrap();
16292        assert!(backup.exists());
16293        assert!(
16294            backup
16295                .file_name()
16296                .unwrap()
16297                .to_str()
16298                .unwrap()
16299                .contains("backup")
16300        );
16301    }
16302
16303    #[test]
16304    fn create_backup_paths_are_unique() {
16305        let dir = TempDir::new().unwrap();
16306        let db_path = dir.path().join("test.db");
16307        std::fs::write(&db_path, b"test data").unwrap();
16308
16309        let first = create_backup(&db_path).unwrap().unwrap();
16310        let second = create_backup(&db_path).unwrap().unwrap();
16311
16312        assert_ne!(first, second);
16313        assert!(first.exists());
16314        assert!(second.exists());
16315    }
16316
16317    #[test]
16318    fn lexical_rebuild_messages_query_uses_conversation_idx_access_path() {
16319        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
16320        use std::path::PathBuf;
16321
16322        let dir = TempDir::new().unwrap();
16323        let db_path = dir.path().join("agent_search.db");
16324        let storage = SqliteStorage::open(&db_path).unwrap();
16325
16326        let agent = Agent {
16327            id: None,
16328            slug: "claude_code".into(),
16329            name: "Claude Code".into(),
16330            version: None,
16331            kind: AgentKind::Cli,
16332        };
16333        let agent_id = storage.ensure_agent(&agent).unwrap();
16334        let conversation = Conversation {
16335            id: None,
16336            agent_slug: "claude_code".into(),
16337            workspace: Some(PathBuf::from("/tmp/workspace")),
16338            external_id: Some("conv-1".into()),
16339            title: Some("Lexical rebuild".into()),
16340            source_path: PathBuf::from("/tmp/conv-1.jsonl"),
16341            started_at: Some(1_700_000_000_000),
16342            ended_at: Some(1_700_000_000_100),
16343            approx_tokens: None,
16344            metadata_json: serde_json::Value::Null,
16345            messages: vec![
16346                Message {
16347                    id: None,
16348                    idx: 0,
16349                    role: MessageRole::User,
16350                    author: Some("user".into()),
16351                    created_at: Some(1_700_000_000_010),
16352                    content: "first".into(),
16353                    extra_json: serde_json::Value::Null,
16354                    snippets: Vec::new(),
16355                },
16356                Message {
16357                    id: None,
16358                    idx: 1,
16359                    role: MessageRole::Agent,
16360                    author: Some("assistant".into()),
16361                    created_at: Some(1_700_000_000_020),
16362                    content: "second".into(),
16363                    extra_json: serde_json::Value::Null,
16364                    snippets: Vec::new(),
16365                },
16366            ],
16367            source_id: LOCAL_SOURCE_ID.into(),
16368            origin_host: None,
16369        };
16370        storage
16371            .insert_conversation_tree(agent_id, None, &conversation)
16372            .unwrap();
16373        let conversation_id = storage
16374            .conn
16375            .query_row_map(
16376                "SELECT id FROM conversations WHERE external_id = ?1",
16377                fparams!["conv-1"],
16378                |row| row.get_typed::<i64>(0),
16379            )
16380            .unwrap();
16381
16382        let opcodes: Vec<String> = storage
16383            .conn
16384            .query_map_collect(
16385                "EXPLAIN \
16386                 SELECT id, idx, role, author, created_at, content \
16387                 FROM messages \
16388                 WHERE conversation_id = ?1 ORDER BY idx",
16389                fparams![conversation_id],
16390                |row| row.get_typed(1),
16391            )
16392            .unwrap();
16393
16394        assert!(
16395            opcodes.iter().any(|opcode| opcode == "SeekGE"),
16396            "expected lexical rebuild message fetch to seek into the conversation_id/idx access path, got {opcodes:?}"
16397        );
16398        assert!(
16399            !opcodes.iter().any(|opcode| opcode == "SorterOpen"),
16400            "expected lexical rebuild message fetch to avoid sorter temp b-trees, got {opcodes:?}"
16401        );
16402    }
16403
16404    #[test]
16405    fn schema_check_rebuild_classification_ignores_transient_errors() {
16406        assert!(!schema_check_error_requires_rebuild(
16407            &frankensqlite::FrankenError::Busy
16408        ));
16409        assert!(!schema_check_error_requires_rebuild(
16410            &frankensqlite::FrankenError::DatabaseLocked {
16411                path: PathBuf::from("/tmp/test.db"),
16412            }
16413        ));
16414        assert!(!schema_check_error_requires_rebuild(
16415            &frankensqlite::FrankenError::CannotOpen {
16416                path: PathBuf::from("/tmp/test.db"),
16417            }
16418        ));
16419        assert!(!schema_check_error_requires_rebuild(
16420            &frankensqlite::FrankenError::Io(std::io::Error::other("disk hiccup"))
16421        ));
16422    }
16423
16424    #[test]
16425    fn schema_check_rebuild_classification_keeps_corruption_errors() {
16426        assert!(schema_check_error_requires_rebuild(
16427            &frankensqlite::FrankenError::DatabaseCorrupt {
16428                detail: "bad header".to_string(),
16429            }
16430        ));
16431        assert!(schema_check_error_requires_rebuild(
16432            &frankensqlite::FrankenError::WalCorrupt {
16433                detail: "bad wal".to_string(),
16434            }
16435        ));
16436        assert!(schema_check_error_requires_rebuild(
16437            &frankensqlite::FrankenError::NotADatabase {
16438                path: PathBuf::from("/tmp/test.db"),
16439            }
16440        ));
16441        assert!(schema_check_error_requires_rebuild(
16442            &frankensqlite::FrankenError::ShortRead {
16443                expected: 4096,
16444                actual: 64,
16445            }
16446        ));
16447    }
16448
16449    #[test]
16450    fn create_backup_refuses_raw_copy_after_retryable_vacuum_errors() {
16451        let retryable_errors = [
16452            frankensqlite::FrankenError::Busy,
16453            frankensqlite::FrankenError::BusyRecovery,
16454            frankensqlite::FrankenError::BusySnapshot {
16455                conflicting_pages: "1,2".to_string(),
16456            },
16457            frankensqlite::FrankenError::DatabaseLocked {
16458                path: PathBuf::from("/tmp/test.db"),
16459            },
16460            frankensqlite::FrankenError::LockFailed {
16461                detail: "fcntl lock still held".to_string(),
16462            },
16463            frankensqlite::FrankenError::WriteConflict { page: 7, holder: 9 },
16464            frankensqlite::FrankenError::SerializationFailure { page: 11 },
16465            frankensqlite::FrankenError::Internal("database is locked".to_string()),
16466        ];
16467
16468        for err in retryable_errors {
16469            assert!(
16470                backup_vacuum_error_requires_consistent_retry(&err),
16471                "retryable VACUUM failure must not fall back to raw bundle copy: {err}"
16472            );
16473        }
16474
16475        assert!(!backup_vacuum_error_requires_consistent_retry(
16476            &frankensqlite::FrankenError::NotADatabase {
16477                path: PathBuf::from("/tmp/test.db")
16478            }
16479        ));
16480        assert!(!backup_vacuum_error_requires_consistent_retry(
16481            &frankensqlite::FrankenError::DatabaseCorrupt {
16482                detail: "bad header".to_string()
16483            }
16484        ));
16485    }
16486
16487    #[test]
16488    fn create_backup_uses_hidden_vacuum_stage_path() {
16489        let backup_path = PathBuf::from("/tmp/test.db.backup.123.456.0");
16490        let stage_path = vacuum_stage_backup_path(&backup_path);
16491        let stage_name = stage_path
16492            .file_name()
16493            .and_then(|name| name.to_str())
16494            .unwrap_or_default();
16495
16496        assert!(stage_name.starts_with('.'));
16497        assert!(stage_name.ends_with(".vacuum-in-progress"));
16498        assert!(
16499            !is_backup_root_name(stage_name, "test.db.backup."),
16500            "incomplete VACUUM output must not be discoverable as a backup root"
16501        );
16502    }
16503
16504    #[test]
16505    fn create_backup_preserves_content() {
16506        let dir = TempDir::new().unwrap();
16507        let db_path = dir.path().join("test.db");
16508        let original_content = b"test database content 12345";
16509        std::fs::write(&db_path, original_content).unwrap();
16510
16511        let backup_path = create_backup(&db_path).unwrap().unwrap();
16512        let backup_content = std::fs::read(&backup_path).unwrap();
16513        assert_eq!(backup_content, original_content);
16514    }
16515
16516    #[test]
16517    fn create_backup_copies_sidecars_when_present() {
16518        let dir = TempDir::new().unwrap();
16519        let db_path = dir.path().join("test.db");
16520        std::fs::write(&db_path, b"db").unwrap();
16521        std::fs::write(database_sidecar_path(&db_path, "-wal"), b"wal").unwrap();
16522        std::fs::write(database_sidecar_path(&db_path, "-shm"), b"shm").unwrap();
16523
16524        let backup_path = create_backup(&db_path).unwrap().unwrap();
16525
16526        assert_eq!(
16527            std::fs::read(database_sidecar_path(&backup_path, "-wal")).unwrap(),
16528            b"wal"
16529        );
16530        assert_eq!(
16531            std::fs::read(database_sidecar_path(&backup_path, "-shm")).unwrap(),
16532            b"shm"
16533        );
16534    }
16535
16536    #[test]
16537    #[cfg(unix)]
16538    fn create_backup_rejects_symlink_root_during_raw_fallback() {
16539        use std::os::unix::fs::symlink;
16540
16541        let dir = TempDir::new().unwrap();
16542        let outside_db = dir.path().join("outside.db");
16543        let db_path = dir.path().join("test.db");
16544        std::fs::write(&outside_db, b"not sqlite").unwrap();
16545        symlink(&outside_db, &db_path).unwrap();
16546
16547        let err = create_backup(&db_path).unwrap_err();
16548
16549        assert!(
16550            err.to_string().contains("bundle symlink"),
16551            "unexpected error: {err:#}"
16552        );
16553        assert_eq!(std::fs::read(&outside_db).unwrap(), b"not sqlite");
16554        let backup_roots: Vec<_> = std::fs::read_dir(dir.path())
16555            .unwrap()
16556            .filter_map(|entry| entry.ok())
16557            .map(|entry| entry.file_name().to_string_lossy().into_owned())
16558            .filter(|name| name.starts_with("test.db.backup."))
16559            .collect();
16560        assert!(
16561            backup_roots.is_empty(),
16562            "symlinked backup source must not publish backup roots: {backup_roots:?}"
16563        );
16564    }
16565
16566    #[test]
16567    #[cfg(unix)]
16568    fn create_backup_rejects_symlink_sidecar_without_partial_backup() {
16569        use std::os::unix::fs::symlink;
16570
16571        let dir = TempDir::new().unwrap();
16572        let db_path = dir.path().join("test.db");
16573        let outside_wal = dir.path().join("outside.wal");
16574        let wal_path = database_sidecar_path(&db_path, "-wal");
16575        std::fs::write(&db_path, b"not sqlite").unwrap();
16576        std::fs::write(&outside_wal, b"outside wal").unwrap();
16577        symlink(&outside_wal, &wal_path).unwrap();
16578
16579        let err = create_backup(&db_path).unwrap_err();
16580
16581        assert!(
16582            err.to_string().contains("bundle symlink"),
16583            "unexpected error: {err:#}"
16584        );
16585        assert_eq!(std::fs::read(&outside_wal).unwrap(), b"outside wal");
16586        let backup_roots: Vec<_> = std::fs::read_dir(dir.path())
16587            .unwrap()
16588            .filter_map(|entry| entry.ok())
16589            .map(|entry| entry.file_name().to_string_lossy().into_owned())
16590            .filter(|name| name.starts_with("test.db.backup."))
16591            .collect();
16592        assert!(
16593            backup_roots.is_empty(),
16594            "sidecar preflight failure must not leave a partial backup root: {backup_roots:?}"
16595        );
16596    }
16597
16598    // =========================================================================
16599    // Backup cleanup tests (bead yln.4)
16600    // =========================================================================
16601
16602    #[test]
16603    fn cleanup_old_backups_keeps_recent() {
16604        let dir = TempDir::new().unwrap();
16605        let db_path = dir.path().join("test.db");
16606
16607        // Create 5 backup files with different timestamps
16608        for i in 0..5 {
16609            let backup_name = format!("test.db.backup.{}", 1000 + i);
16610            std::fs::write(dir.path().join(&backup_name), format!("backup {i}")).unwrap();
16611        }
16612
16613        cleanup_old_backups(&db_path, 3).unwrap();
16614
16615        // Count remaining backup files
16616        let backups: Vec<_> = std::fs::read_dir(dir.path())
16617            .unwrap()
16618            .filter_map(|e| e.ok())
16619            .filter(|e| e.file_name().to_str().unwrap_or("").contains("backup"))
16620            .collect();
16621
16622        assert_eq!(backups.len(), 3);
16623    }
16624
16625    #[test]
16626    fn cleanup_old_backups_ignores_wal_and_shm_sidecars() {
16627        let dir = TempDir::new().unwrap();
16628        let db_path = dir.path().join("test.db");
16629
16630        for i in 0..3 {
16631            let backup_name = format!("test.db.backup.{}", 1000 + i);
16632            let backup_path = dir.path().join(&backup_name);
16633            std::fs::write(&backup_path, format!("backup {i}")).unwrap();
16634            std::fs::write(format!("{}-wal", backup_path.display()), b"wal").unwrap();
16635            std::fs::write(format!("{}-shm", backup_path.display()), b"shm").unwrap();
16636            std::thread::sleep(std::time::Duration::from_millis(20));
16637        }
16638
16639        cleanup_old_backups(&db_path, 2).unwrap();
16640
16641        let mut roots = Vec::new();
16642        let mut wals = Vec::new();
16643        let mut shms = Vec::new();
16644        for entry in std::fs::read_dir(dir.path())
16645            .unwrap()
16646            .filter_map(|e| e.ok())
16647        {
16648            let name = entry.file_name().to_string_lossy().into_owned();
16649            if name.ends_with("-wal") {
16650                wals.push(name);
16651            } else if name.ends_with("-shm") {
16652                shms.push(name);
16653            } else if name.contains("backup") {
16654                roots.push(name);
16655            }
16656        }
16657
16658        assert_eq!(roots.len(), 2, "should keep two backup roots");
16659        assert_eq!(
16660            wals.len(),
16661            2,
16662            "should keep WAL sidecars only for retained backups"
16663        );
16664        assert_eq!(
16665            shms.len(),
16666            2,
16667            "should keep SHM sidecars only for retained backups"
16668        );
16669    }
16670
16671    #[test]
16672    fn move_database_bundle_moves_database_and_sidecars() {
16673        let dir = TempDir::new().unwrap();
16674        let db_path = dir.path().join("test.db");
16675        let backup_path = dir.path().join("test.db.corrupt");
16676
16677        std::fs::write(&db_path, b"db").unwrap();
16678        std::fs::write(database_sidecar_path(&db_path, "-wal"), b"wal").unwrap();
16679        std::fs::write(database_sidecar_path(&db_path, "-shm"), b"shm").unwrap();
16680
16681        let moved = move_database_bundle(&db_path, &backup_path).unwrap();
16682        assert_eq!(
16683            moved,
16684            DatabaseBundleMoveResult {
16685                database: true,
16686                wal: true,
16687                shm: true
16688            }
16689        );
16690        assert!(moved.moved_any());
16691
16692        assert!(!db_path.exists());
16693        assert!(!database_sidecar_path(&db_path, "-wal").exists());
16694        assert!(!database_sidecar_path(&db_path, "-shm").exists());
16695
16696        assert_eq!(std::fs::read(&backup_path).unwrap(), b"db");
16697        assert_eq!(
16698            std::fs::read(database_sidecar_path(&backup_path, "-wal")).unwrap(),
16699            b"wal"
16700        );
16701        assert_eq!(
16702            std::fs::read(database_sidecar_path(&backup_path, "-shm")).unwrap(),
16703            b"shm"
16704        );
16705    }
16706
16707    #[test]
16708    fn move_database_bundle_preserves_orphan_sidecars_without_main_db() {
16709        let dir = TempDir::new().unwrap();
16710        let db_path = dir.path().join("test.db");
16711        let backup_path = dir.path().join("test.db.corrupt");
16712
16713        std::fs::write(database_sidecar_path(&db_path, "-wal"), b"wal").unwrap();
16714        std::fs::write(database_sidecar_path(&db_path, "-shm"), b"shm").unwrap();
16715
16716        let moved = move_database_bundle(&db_path, &backup_path).unwrap();
16717        assert_eq!(
16718            moved,
16719            DatabaseBundleMoveResult {
16720                database: false,
16721                wal: true,
16722                shm: true
16723            }
16724        );
16725        assert!(moved.moved_any());
16726        assert!(!db_path.exists());
16727        assert!(!database_sidecar_path(&db_path, "-wal").exists());
16728        assert!(!database_sidecar_path(&db_path, "-shm").exists());
16729        assert_eq!(
16730            std::fs::read(database_sidecar_path(&backup_path, "-wal")).unwrap(),
16731            b"wal"
16732        );
16733        assert_eq!(
16734            std::fs::read(database_sidecar_path(&backup_path, "-shm")).unwrap(),
16735            b"shm"
16736        );
16737    }
16738
16739    #[test]
16740    #[cfg(unix)]
16741    fn move_database_bundle_moves_dangling_symlink_database_root() {
16742        use std::os::unix::fs::symlink;
16743
16744        let dir = TempDir::new().unwrap();
16745        let db_path = dir.path().join("test.db");
16746        let backup_path = dir.path().join("test.db.corrupt");
16747        let missing_target = dir.path().join("missing-target.db");
16748
16749        symlink(&missing_target, &db_path).unwrap();
16750
16751        let moved = move_database_bundle(&db_path, &backup_path).unwrap();
16752
16753        assert_eq!(
16754            moved,
16755            DatabaseBundleMoveResult {
16756                database: true,
16757                wal: false,
16758                shm: false
16759            }
16760        );
16761        assert!(std::fs::symlink_metadata(&db_path).is_err());
16762        assert!(
16763            std::fs::symlink_metadata(&backup_path)
16764                .unwrap()
16765                .file_type()
16766                .is_symlink()
16767        );
16768        assert!(!missing_target.exists());
16769    }
16770
16771    #[test]
16772    #[cfg(unix)]
16773    fn move_database_bundle_moves_dangling_symlink_sidecars_without_main_db() {
16774        use std::os::unix::fs::symlink;
16775
16776        let dir = TempDir::new().unwrap();
16777        let db_path = dir.path().join("test.db");
16778        let backup_path = dir.path().join("test.db.corrupt");
16779        let missing_wal_target = dir.path().join("missing-wal");
16780        let missing_shm_target = dir.path().join("missing-shm");
16781        let wal_path = database_sidecar_path(&db_path, "-wal");
16782        let shm_path = database_sidecar_path(&db_path, "-shm");
16783
16784        symlink(&missing_wal_target, &wal_path).unwrap();
16785        symlink(&missing_shm_target, &shm_path).unwrap();
16786
16787        let moved = move_database_bundle(&db_path, &backup_path).unwrap();
16788
16789        assert_eq!(
16790            moved,
16791            DatabaseBundleMoveResult {
16792                database: false,
16793                wal: true,
16794                shm: true
16795            }
16796        );
16797        assert!(std::fs::symlink_metadata(&wal_path).is_err());
16798        assert!(std::fs::symlink_metadata(&shm_path).is_err());
16799        assert!(
16800            std::fs::symlink_metadata(database_sidecar_path(&backup_path, "-wal"))
16801                .unwrap()
16802                .file_type()
16803                .is_symlink()
16804        );
16805        assert!(
16806            std::fs::symlink_metadata(database_sidecar_path(&backup_path, "-shm"))
16807                .unwrap()
16808                .file_type()
16809                .is_symlink()
16810        );
16811        assert!(!missing_wal_target.exists());
16812        assert!(!missing_shm_target.exists());
16813    }
16814
16815    #[test]
16816    fn copy_database_bundle_copies_database_and_sidecars() {
16817        let dir = TempDir::new().unwrap();
16818        let db_path = dir.path().join("test.db");
16819        let copied_path = dir.path().join("copy.db");
16820
16821        std::fs::write(&db_path, b"db").unwrap();
16822        std::fs::write(database_sidecar_path(&db_path, "-wal"), b"wal").unwrap();
16823        std::fs::write(database_sidecar_path(&db_path, "-shm"), b"shm").unwrap();
16824
16825        copy_database_bundle(&db_path, &copied_path).unwrap();
16826
16827        assert_eq!(std::fs::read(&copied_path).unwrap(), b"db");
16828        assert_eq!(
16829            std::fs::read(database_sidecar_path(&copied_path, "-wal")).unwrap(),
16830            b"wal"
16831        );
16832        assert_eq!(
16833            std::fs::read(database_sidecar_path(&copied_path, "-shm")).unwrap(),
16834            b"shm"
16835        );
16836        assert_eq!(std::fs::read(&db_path).unwrap(), b"db");
16837    }
16838
16839    #[test]
16840    fn copy_database_bundle_creates_destination_parent() {
16841        let dir = TempDir::new().unwrap();
16842        let db_path = dir.path().join("test.db");
16843        let copied_path = dir.path().join("nested/copies/copy.db");
16844
16845        std::fs::write(&db_path, b"db").unwrap();
16846        std::fs::write(database_sidecar_path(&db_path, "-wal"), b"wal").unwrap();
16847
16848        copy_database_bundle(&db_path, &copied_path).unwrap();
16849
16850        assert!(copied_path.parent().unwrap().is_dir());
16851        assert_eq!(std::fs::read(&copied_path).unwrap(), b"db");
16852        assert_eq!(
16853            std::fs::read(database_sidecar_path(&copied_path, "-wal")).unwrap(),
16854            b"wal"
16855        );
16856    }
16857
16858    #[test]
16859    #[cfg(unix)]
16860    fn copy_database_bundle_rejects_symlink_source_root() {
16861        use std::os::unix::fs::symlink;
16862
16863        let dir = TempDir::new().unwrap();
16864        let outside_db = dir.path().join("outside.db");
16865        let db_path = dir.path().join("test.db");
16866        let copied_path = dir.path().join("copy.db");
16867
16868        std::fs::write(&outside_db, b"outside").unwrap();
16869        symlink(&outside_db, &db_path).unwrap();
16870
16871        let err = copy_database_bundle(&db_path, &copied_path).unwrap_err();
16872
16873        assert!(
16874            err.to_string().contains("bundle symlink"),
16875            "unexpected error: {err:#}"
16876        );
16877        assert!(!copied_path.exists());
16878        assert_eq!(std::fs::read(&outside_db).unwrap(), b"outside");
16879    }
16880
16881    #[test]
16882    #[cfg(unix)]
16883    fn copy_database_bundle_rejects_symlink_sidecar() {
16884        use std::os::unix::fs::symlink;
16885
16886        let dir = TempDir::new().unwrap();
16887        let db_path = dir.path().join("test.db");
16888        let copied_path = dir.path().join("copy.db");
16889        let outside_wal = dir.path().join("outside.wal");
16890        let wal_path = database_sidecar_path(&db_path, "-wal");
16891
16892        std::fs::write(&db_path, b"db").unwrap();
16893        std::fs::write(&outside_wal, b"outside wal").unwrap();
16894        symlink(&outside_wal, &wal_path).unwrap();
16895
16896        let err = copy_database_bundle(&db_path, &copied_path).unwrap_err();
16897
16898        assert!(
16899            err.to_string().contains("bundle symlink"),
16900            "unexpected error: {err:#}"
16901        );
16902        assert_eq!(std::fs::read(&outside_wal).unwrap(), b"outside wal");
16903        assert!(!copied_path.exists());
16904        assert!(!database_sidecar_path(&copied_path, "-wal").exists());
16905    }
16906
16907    #[test]
16908    fn move_database_bundle_creates_destination_parent_and_moves_sidecars() {
16909        let dir = TempDir::new().unwrap();
16910        let db_path = dir.path().join("test.db");
16911        let backup_path = dir.path().join("nested/backups/test.db.corrupt");
16912
16913        std::fs::write(&db_path, b"db").unwrap();
16914        std::fs::write(database_sidecar_path(&db_path, "-wal"), b"wal").unwrap();
16915        std::fs::write(database_sidecar_path(&db_path, "-shm"), b"shm").unwrap();
16916
16917        let moved = move_database_bundle(&db_path, &backup_path).unwrap();
16918        assert_eq!(
16919            moved,
16920            DatabaseBundleMoveResult {
16921                database: true,
16922                wal: true,
16923                shm: true
16924            }
16925        );
16926        assert!(backup_path.parent().unwrap().is_dir());
16927        assert_eq!(std::fs::read(&backup_path).unwrap(), b"db");
16928        assert_eq!(
16929            std::fs::read(database_sidecar_path(&backup_path, "-wal")).unwrap(),
16930            b"wal"
16931        );
16932        assert_eq!(
16933            std::fs::read(database_sidecar_path(&backup_path, "-shm")).unwrap(),
16934            b"shm"
16935        );
16936    }
16937
16938    #[test]
16939    fn remove_database_files_removes_orphan_sidecars_without_main_db() {
16940        let dir = TempDir::new().unwrap();
16941        let db_path = dir.path().join("test.db");
16942
16943        std::fs::write(database_sidecar_path(&db_path, "-wal"), b"wal").unwrap();
16944        std::fs::write(database_sidecar_path(&db_path, "-shm"), b"shm").unwrap();
16945
16946        remove_database_files(&db_path).unwrap();
16947
16948        assert!(!db_path.exists());
16949        assert!(!database_sidecar_path(&db_path, "-wal").exists());
16950        assert!(!database_sidecar_path(&db_path, "-shm").exists());
16951    }
16952
16953    #[test]
16954    fn cleanup_old_backups_ignores_backup_named_directories() {
16955        let dir = TempDir::new().unwrap();
16956        let db_path = dir.path().join("test.db");
16957
16958        for i in 0..3 {
16959            let backup_name = format!("test.db.backup.{}", 1000 + i);
16960            std::fs::write(dir.path().join(&backup_name), format!("backup {i}")).unwrap();
16961        }
16962        std::fs::create_dir(dir.path().join("test.db.backup.directory")).unwrap();
16963
16964        cleanup_old_backups(&db_path, 2).unwrap();
16965
16966        let mut backup_files = Vec::new();
16967        let mut backup_dirs = Vec::new();
16968        for entry in std::fs::read_dir(dir.path())
16969            .unwrap()
16970            .filter_map(|e| e.ok())
16971        {
16972            let name = entry.file_name().to_string_lossy().into_owned();
16973            if !name.starts_with("test.db.backup.") {
16974                continue;
16975            }
16976            if entry.path().is_dir() {
16977                backup_dirs.push(name);
16978            } else {
16979                backup_files.push(name);
16980            }
16981        }
16982
16983        assert_eq!(
16984            backup_files.len(),
16985            2,
16986            "only real backup files count toward retention"
16987        );
16988        assert_eq!(
16989            backup_dirs.len(),
16990            1,
16991            "backup-named directories should be ignored"
16992        );
16993    }
16994
16995    // =========================================================================
16996    // Storage open/create tests (bead yln.4)
16997    // =========================================================================
16998
16999    #[test]
17000    fn open_creates_new_database() {
17001        let dir = TempDir::new().unwrap();
17002        let db_path = dir.path().join("new.db");
17003        assert!(!db_path.exists());
17004
17005        let storage = SqliteStorage::open(&db_path).unwrap();
17006        assert!(db_path.exists());
17007        storage.close().unwrap();
17008    }
17009
17010    #[test]
17011    fn open_readonly_fails_for_nonexistent() {
17012        let dir = TempDir::new().unwrap();
17013        let db_path = dir.path().join("nonexistent.db");
17014        let result = SqliteStorage::open_readonly(&db_path);
17015        assert!(result.is_err());
17016    }
17017
17018    #[test]
17019    fn open_readonly_succeeds_for_existing() {
17020        let dir = TempDir::new().unwrap();
17021        let db_path = dir.path().join("existing.db");
17022
17023        // Create first
17024        let _storage = SqliteStorage::open(&db_path).unwrap();
17025        drop(_storage);
17026
17027        // Now open readonly
17028        let storage = SqliteStorage::open_readonly(&db_path).unwrap();
17029        assert!(storage.schema_version().is_ok());
17030    }
17031
17032    #[test]
17033    fn reopen_existing_current_schema_is_idempotent() {
17034        let dir = TempDir::new().unwrap();
17035        let db_path = dir.path().join("existing.db");
17036
17037        // First open creates and migrates to current schema.
17038        {
17039            let storage = SqliteStorage::open(&db_path).unwrap();
17040            assert_eq!(storage.schema_version().unwrap(), CURRENT_SCHEMA_VERSION);
17041        }
17042
17043        // Re-open should not fail on current schema.
17044        let reopened = SqliteStorage::open(&db_path).unwrap();
17045        assert_eq!(
17046            reopened.schema_version().unwrap(),
17047            CURRENT_SCHEMA_VERSION,
17048            "reopening current schema DB should be idempotent"
17049        );
17050    }
17051
17052    #[test]
17053    fn open_or_rebuild_current_schema_does_not_trigger_rebuild() {
17054        let dir = TempDir::new().unwrap();
17055        let db_path = dir.path().join("existing.db");
17056
17057        // Create DB at current schema.
17058        {
17059            let storage = SqliteStorage::open(&db_path).unwrap();
17060            assert_eq!(storage.schema_version().unwrap(), CURRENT_SCHEMA_VERSION);
17061        }
17062
17063        // Should open normally, not require rebuild.
17064        let reopened = SqliteStorage::open_or_rebuild(&db_path)
17065            .expect("current schema DB should open without rebuild");
17066        assert_eq!(reopened.schema_version().unwrap(), CURRENT_SCHEMA_VERSION);
17067    }
17068
17069    #[test]
17070    fn open_or_rebuild_does_not_treat_non_database_paths_as_corruption() {
17071        let dir = TempDir::new().unwrap();
17072        let db_path = dir.path().join("db_dir");
17073        std::fs::create_dir(&db_path).unwrap();
17074
17075        let result = SqliteStorage::open_or_rebuild(&db_path);
17076
17077        assert!(
17078            matches!(
17079                result,
17080                Err(MigrationError::Database(_)) | Err(MigrationError::Io(_))
17081            ),
17082            "non-database path should report the underlying open error without rebuild"
17083        );
17084
17085        assert!(
17086            db_path.is_dir(),
17087            "non-database directory must be left in place"
17088        );
17089    }
17090
17091    // =========================================================================
17092    // Schema version tests (bead yln.4)
17093    // =========================================================================
17094
17095    #[test]
17096    fn schema_version_returns_current() {
17097        let dir = TempDir::new().unwrap();
17098        let db_path = dir.path().join("test.db");
17099        let storage = SqliteStorage::open(&db_path).unwrap();
17100        let version = storage.schema_version().unwrap();
17101        assert!(version >= 5, "Schema version should be at least 5");
17102    }
17103
17104    // =========================================================================
17105    // Current analytics/schema smoke test (bead z9fse.11)
17106    // =========================================================================
17107
17108    #[test]
17109    fn migration_v13_creates_analytics_tables() {
17110        let dir = TempDir::new().unwrap();
17111        let db_path = dir.path().join("test.db");
17112        let storage = SqliteStorage::open(&db_path).unwrap();
17113
17114        // Schema version should be current.
17115        let version = storage.schema_version().unwrap();
17116        assert_eq!(
17117            version, CURRENT_SCHEMA_VERSION,
17118            "Schema version must match CURRENT_SCHEMA_VERSION after migration"
17119        );
17120
17121        let conn = storage.raw();
17122
17123        // Helper: collect column names from PRAGMA table_info
17124        fn col_names(conn: &FrankenConnection, table: &str) -> Vec<String> {
17125            conn.query_map_collect(
17126                &format!("PRAGMA table_info({})", table),
17127                fparams![],
17128                |row: &FrankenRow| row.get_typed(1),
17129            )
17130            .unwrap()
17131        }
17132
17133        // Helper: collect index names from PRAGMA index_list
17134        fn idx_names(conn: &FrankenConnection, table: &str) -> Vec<String> {
17135            conn.query_map_collect(
17136                &format!("PRAGMA index_list({})", table),
17137                fparams![],
17138                |row: &FrankenRow| row.get_typed(1),
17139            )
17140            .unwrap()
17141        }
17142
17143        // Verify message_metrics table exists with expected columns
17144        let mm_cols = col_names(conn, "message_metrics");
17145        for expected in &[
17146            "message_id",
17147            "hour_id",
17148            "day_id",
17149            "content_tokens_est",
17150            "model_name",
17151            "model_family",
17152            "model_tier",
17153            "provider",
17154            "api_input_tokens",
17155            "has_plan",
17156            "agent_slug",
17157            "role",
17158            "api_data_source",
17159        ] {
17160            assert!(
17161                mm_cols.contains(&expected.to_string()),
17162                "message_metrics missing column: {expected}"
17163            );
17164        }
17165
17166        // Verify usage_hourly table
17167        let uh_cols = col_names(conn, "usage_hourly");
17168        for expected in &[
17169            "hour_id",
17170            "plan_message_count",
17171            "plan_content_tokens_est_total",
17172            "plan_api_tokens_total",
17173            "api_coverage_message_count",
17174            "content_tokens_est_user",
17175            "api_thinking_tokens_total",
17176        ] {
17177            assert!(
17178                uh_cols.contains(&expected.to_string()),
17179                "usage_hourly missing column: {expected}"
17180            );
17181        }
17182
17183        // Verify usage_daily table
17184        let ud_cols = col_names(conn, "usage_daily");
17185        for expected in &[
17186            "day_id",
17187            "plan_content_tokens_est_total",
17188            "plan_api_tokens_total",
17189            "api_thinking_tokens_total",
17190            "content_tokens_est_assistant",
17191            "message_count",
17192        ] {
17193            assert!(
17194                ud_cols.contains(&expected.to_string()),
17195                "usage_daily missing column: {expected}"
17196            );
17197        }
17198
17199        // Verify usage_models_daily table
17200        let umd_cols = col_names(conn, "usage_models_daily");
17201        for expected in &[
17202            "day_id",
17203            "model_family",
17204            "model_tier",
17205            "message_count",
17206            "api_tokens_total",
17207            "api_coverage_message_count",
17208        ] {
17209            assert!(
17210                umd_cols.contains(&expected.to_string()),
17211                "usage_models_daily missing column: {expected}"
17212            );
17213        }
17214
17215        // Verify indexes on message_metrics
17216        let mm_idxs = idx_names(conn, "message_metrics");
17217        assert!(
17218            mm_idxs.iter().any(|n| n.contains("idx_mm_hour")),
17219            "message_metrics must have hour index"
17220        );
17221        assert!(
17222            mm_idxs.iter().any(|n| n.contains("idx_mm_agent_day")),
17223            "message_metrics must have agent+day index"
17224        );
17225        assert!(
17226            mm_idxs
17227                .iter()
17228                .any(|n| n.contains("idx_mm_model_family_day")),
17229            "message_metrics must have model_family+day index"
17230        );
17231
17232        // Verify indexes on usage_hourly
17233        let uh_idxs = idx_names(conn, "usage_hourly");
17234        assert!(
17235            uh_idxs.iter().any(|n| n.contains("idx_uh_agent")),
17236            "usage_hourly must have agent index"
17237        );
17238
17239        // Verify indexes on usage_daily
17240        let ud_idxs = idx_names(conn, "usage_daily");
17241        assert!(
17242            ud_idxs.iter().any(|n| n.contains("idx_ud_agent")),
17243            "usage_daily must have agent index"
17244        );
17245
17246        // Verify indexes on usage_models_daily
17247        let umd_idxs = idx_names(conn, "usage_models_daily");
17248        assert!(
17249            umd_idxs.iter().any(|n| n.contains("idx_umd_model_day")),
17250            "usage_models_daily must have model+day index"
17251        );
17252
17253        let conversation_cols = col_names(conn, "conversations");
17254        assert!(
17255            conversation_cols.contains(&"last_message_idx".to_string())
17256                && conversation_cols.contains(&"last_message_created_at".to_string()),
17257            "fresh schema must include V15 tail columns without ALTER TABLE on conversations"
17258        );
17259        let fts_schema_rows: i64 = conn
17260            .query_row_map(
17261                "SELECT COUNT(*) FROM sqlite_master WHERE name = 'fts_messages'",
17262                fparams![],
17263                |row: &FrankenRow| row.get_typed(0),
17264            )
17265            .unwrap();
17266        assert_eq!(
17267            fts_schema_rows, 0,
17268            "fresh schema should not create and immediately drop derived fts_messages"
17269        );
17270        let integrity: Vec<String> = conn
17271            .query_map_collect("PRAGMA integrity_check;", fparams![], |row: &FrankenRow| {
17272                row.get_typed(0)
17273            })
17274            .unwrap();
17275        assert_eq!(
17276            integrity,
17277            vec!["ok".to_string()],
17278            "fresh schema must pass SQLite integrity_check"
17279        );
17280    }
17281
17282    #[test]
17283    fn hour_id_round_trip() {
17284        // 2026-02-06 12:00:00 UTC
17285        let ts_ms = 1_770_508_800_000_i64;
17286        let hour_id = SqliteStorage::hour_id_from_millis(ts_ms);
17287        let day_id = SqliteStorage::day_id_from_millis(ts_ms);
17288
17289        // hour_id should be 24x day_id (approximately)
17290        assert_eq!(hour_id / 24, day_id, "hour_id/24 should equal day_id");
17291
17292        // Round-trip: millis_from_hour_id should give start of that hour
17293        let back = SqliteStorage::millis_from_hour_id(hour_id);
17294        assert!(
17295            back <= ts_ms && ts_ms - back < 3_600_000,
17296            "Round-trip should land within the same hour"
17297        );
17298    }
17299
17300    #[test]
17301    fn day_and_hour_ids_floor_negative_millis() {
17302        // One millisecond before the Unix epoch should still floor into the
17303        // previous second/hour/day rather than truncating toward zero.
17304        let ts_ms = -1_i64;
17305        let expected_secs = -1_i64;
17306        let epoch_2020_secs = 1_577_836_800_i64;
17307
17308        assert_eq!(
17309            SqliteStorage::day_id_from_millis(ts_ms),
17310            (expected_secs - epoch_2020_secs).div_euclid(86_400)
17311        );
17312        assert_eq!(
17313            SqliteStorage::hour_id_from_millis(ts_ms),
17314            (expected_secs - epoch_2020_secs).div_euclid(3_600)
17315        );
17316    }
17317
17318    #[test]
17319    fn migration_v13_from_v10() {
17320        let dir = TempDir::new().unwrap();
17321        let db_path = dir.path().join("test.db");
17322
17323        // Open at v10 first by faking it
17324        {
17325            let conn = FrankenConnection::open(db_path.to_string_lossy().into_owned()).unwrap();
17326            conn.execute_batch("PRAGMA journal_mode=WAL;").unwrap();
17327            conn.execute_batch(
17328                "CREATE TABLE IF NOT EXISTS meta (key TEXT PRIMARY KEY, value TEXT);",
17329            )
17330            .unwrap();
17331            conn.execute("INSERT OR REPLACE INTO meta(key, value) VALUES('schema_version', '10')")
17332                .unwrap();
17333            // Apply V1-V10 so schema is correct. Keep each historical DDL batch
17334            // in autocommit mode; the fixture is testing cass migration
17335            // transition behavior, not frankensqlite's handling of a giant
17336            // synthetic legacy-DDL transaction.
17337            conn.execute_batch(MIGRATION_V1).unwrap();
17338            conn.execute_batch(MIGRATION_V2).unwrap();
17339            conn.execute_batch(MIGRATION_V4).unwrap();
17340            conn.execute_batch(MIGRATION_V5).unwrap();
17341            conn.execute_batch(MIGRATION_V6).unwrap();
17342            conn.execute_batch(MIGRATION_V7).unwrap();
17343            conn.execute_batch(MIGRATION_V8).unwrap();
17344            conn.execute_batch(MIGRATION_V9).unwrap();
17345            conn.execute_batch(MIGRATION_V10).unwrap();
17346            conn.execute("UPDATE meta SET value = '10' WHERE key = 'schema_version'")
17347                .unwrap();
17348        }
17349        materialize_fresh_fts_schema_via_rusqlite(&db_path).unwrap();
17350
17351        // Now open with SqliteStorage — should auto-migrate to current schema
17352        let storage = SqliteStorage::open(&db_path).unwrap();
17353        let version = storage.schema_version().unwrap();
17354        assert_eq!(
17355            version, CURRENT_SCHEMA_VERSION,
17356            "Should have migrated from v10 to the current schema"
17357        );
17358
17359        // Verify new tables exist
17360        let count: i64 = storage
17361            .raw()
17362            .query_row_map(
17363                "SELECT COUNT(*) FROM sqlite_master WHERE type='table' AND name IN ('message_metrics', 'usage_hourly', 'usage_daily', 'usage_models_daily')",
17364                &[],
17365                |row: &FrankenRow| row.get_typed::<i64>(0),
17366            )
17367            .unwrap();
17368        assert_eq!(count, 4, "All 4 analytics tables should exist");
17369    }
17370
17371    // =========================================================================
17372    // Analytics ingest integration test (bead z9fse.2)
17373    // =========================================================================
17374
17375    #[test]
17376    fn analytics_ingest_populates_metrics_and_rollups() {
17377        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
17378        use std::path::PathBuf;
17379
17380        let dir = TempDir::new().unwrap();
17381        let db_path = dir.path().join("test.db");
17382        let storage = SqliteStorage::open(&db_path).unwrap();
17383
17384        // Register agent + workspace
17385        let agent = Agent {
17386            id: None,
17387            slug: "claude_code".into(),
17388            name: "Claude Code".into(),
17389            version: Some("1.0".into()),
17390            kind: AgentKind::Cli,
17391        };
17392        let agent_id = storage.ensure_agent(&agent).unwrap();
17393
17394        // Create a synthetic conversation with 3 messages at a known timestamp
17395        // 2026-02-06 10:30:00 UTC → day_id = 2228, hour_id = 53472
17396        let ts_ms = 1_770_551_400_000_i64;
17397        let expected_day = SqliteStorage::day_id_from_millis(ts_ms);
17398        let expected_hour = SqliteStorage::hour_id_from_millis(ts_ms);
17399
17400        // Include a JSON usage block on the assistant message (like Claude Code data)
17401        let usage_json = serde_json::json!({
17402            "message": {
17403                "model": "claude-opus-4-6",
17404                "usage": {
17405                    "input_tokens": 100,
17406                    "output_tokens": 50,
17407                    "cache_read_input_tokens": 200,
17408                    "cache_creation_input_tokens": 30,
17409                    "service_tier": "standard"
17410                }
17411            }
17412        });
17413
17414        let conv = Conversation {
17415            id: None,
17416            agent_slug: "claude_code".into(),
17417            workspace: None,
17418            external_id: Some("test-conv-1".into()),
17419            title: Some("Test conversation".into()),
17420            source_path: PathBuf::from("/tmp/test.jsonl"),
17421            started_at: Some(ts_ms),
17422            ended_at: Some(ts_ms + 60_000),
17423            approx_tokens: None,
17424            metadata_json: serde_json::Value::Null,
17425            messages: vec![
17426                Message {
17427                    id: None,
17428                    idx: 0,
17429                    role: MessageRole::User,
17430                    author: None,
17431                    created_at: Some(ts_ms),
17432                    content: "Hello, can you help me with a plan?".into(),
17433                    extra_json: serde_json::Value::Null,
17434                    snippets: vec![],
17435                },
17436                Message {
17437                    id: None,
17438                    idx: 1,
17439                    role: MessageRole::Agent,
17440                    author: None,
17441                    created_at: Some(ts_ms + 30_000),
17442                    content: "## Plan\n\n1. First step\n2. Second step\n3. Third step".into(),
17443                    extra_json: usage_json,
17444                    snippets: vec![],
17445                },
17446                Message {
17447                    id: None,
17448                    idx: 2,
17449                    role: MessageRole::User,
17450                    author: None,
17451                    created_at: Some(ts_ms + 60_000),
17452                    content: "Great, let's proceed!".into(),
17453                    extra_json: serde_json::Value::Null,
17454                    snippets: vec![],
17455                },
17456            ],
17457            source_id: "local".into(),
17458            origin_host: None,
17459        };
17460
17461        let outcomes = storage
17462            .insert_conversations_batched(&[(agent_id, None, &conv)])
17463            .unwrap();
17464        assert_eq!(outcomes.len(), 1);
17465        assert_eq!(outcomes[0].inserted_indices.len(), 3);
17466
17467        let conn = storage.raw();
17468
17469        // Verify message_metrics rows
17470        let mm_count: i64 = conn
17471            .query_row_map("SELECT COUNT(*) FROM message_metrics", &[], |row| {
17472                row.get_typed::<i64>(0)
17473            })
17474            .unwrap();
17475        assert_eq!(mm_count, 3, "Should have 3 message_metrics rows");
17476
17477        // Verify hour_id and day_id are correct
17478        #[allow(clippy::type_complexity)]
17479        let rows: Vec<(i64, i64, String, i64, i64, String, String, String, String)> = conn
17480            .query_map_collect(
17481                "SELECT hour_id, day_id, role, content_tokens_est, has_plan, api_data_source, model_family, model_tier, provider FROM message_metrics ORDER BY message_id",
17482                fparams![],
17483                |row: &FrankenRow| {
17484                    Ok((
17485                        row.get_typed(0)?,
17486                        row.get_typed(1)?,
17487                        row.get_typed(2)?,
17488                        row.get_typed(3)?,
17489                        row.get_typed(4)?,
17490                        row.get_typed(5)?,
17491                        row.get_typed(6)?,
17492                        row.get_typed(7)?,
17493                        row.get_typed(8)?,
17494                    ))
17495                },
17496            )
17497            .unwrap();
17498
17499        assert_eq!(rows.len(), 3);
17500        // All messages in the same hour/day
17501        assert_eq!(rows[0].0, expected_hour);
17502        assert_eq!(rows[0].1, expected_day);
17503        // First message is user
17504        assert_eq!(rows[0].2, "user");
17505        // Second message (assistant) should have has_plan=1 (contains "## Plan" + numbered steps)
17506        assert_eq!(
17507            rows[1].4, 1,
17508            "Assistant message with plan should have has_plan=1"
17509        );
17510        // Second message should have api data source
17511        assert_eq!(
17512            rows[1].5, "api",
17513            "Claude Code assistant message should have api data source"
17514        );
17515        // First and third (user) messages should be estimated
17516        assert_eq!(rows[0].5, "estimated");
17517        assert_eq!(rows[2].5, "estimated");
17518        assert_eq!(rows[1].6, "claude");
17519        assert_eq!(rows[1].7, "opus");
17520        assert_eq!(rows[1].8, "anthropic");
17521        assert_eq!(rows[0].6, "unknown");
17522        // content_tokens_est = chars / 4
17523        let user_chars = "Hello, can you help me with a plan?".len() as i64;
17524        assert_eq!(rows[0].3, user_chars / 4);
17525
17526        // Verify usage_hourly rollup
17527        let (uh_msg, uh_user, uh_asst, uh_plan, uh_plan_content, uh_plan_api, uh_api_cov): (
17528            i64,
17529            i64,
17530            i64,
17531            i64,
17532            i64,
17533            i64,
17534            i64,
17535        ) = conn
17536            .query_row_map(
17537                "SELECT message_count, user_message_count, assistant_message_count, plan_message_count,
17538                        plan_content_tokens_est_total, plan_api_tokens_total, api_coverage_message_count
17539                 FROM usage_hourly WHERE hour_id = ?",
17540                fparams![expected_hour],
17541                |row: &FrankenRow| {
17542                    Ok((
17543                        row.get_typed(0)?,
17544                        row.get_typed(1)?,
17545                        row.get_typed(2)?,
17546                        row.get_typed(3)?,
17547                        row.get_typed(4)?,
17548                        row.get_typed(5)?,
17549                        row.get_typed(6)?,
17550                    ))
17551                },
17552            )
17553            .unwrap();
17554        assert_eq!(uh_msg, 3, "Hourly rollup should have 3 messages");
17555        assert_eq!(uh_user, 2, "Hourly rollup should have 2 user messages");
17556        assert_eq!(uh_asst, 1, "Hourly rollup should have 1 assistant message");
17557        assert_eq!(uh_plan, 1, "Hourly rollup should have 1 plan message");
17558        assert!(
17559            uh_plan_content > 0,
17560            "Hourly rollup should include plan content tokens"
17561        );
17562        assert!(
17563            uh_plan_api > 0,
17564            "Hourly rollup should include plan API tokens"
17565        );
17566        assert_eq!(
17567            uh_api_cov, 1,
17568            "Hourly rollup should have 1 API-covered message"
17569        );
17570
17571        // Verify usage_daily rollup matches hourly (same day)
17572        let (ud_msg, ud_api_cov): (i64, i64) = conn
17573            .query_row_map(
17574                "SELECT message_count, api_coverage_message_count FROM usage_daily WHERE day_id = ?",
17575                fparams![expected_day],
17576                |row: &FrankenRow| Ok((row.get_typed(0)?, row.get_typed(1)?)),
17577            )
17578            .unwrap();
17579        assert_eq!(ud_msg, 3, "Daily rollup should match hourly");
17580        assert_eq!(
17581            ud_api_cov, 1,
17582            "Daily api_coverage should be 1 (only assistant msg has real API data)"
17583        );
17584
17585        // Verify the API input tokens from message_metrics (only API-sourced)
17586        let api_only_input: i64 = conn
17587            .query_row_map(
17588                "SELECT COALESCE(SUM(api_input_tokens), 0) FROM message_metrics WHERE day_id = ? AND api_data_source = 'api'",
17589                fparams![expected_day],
17590                |row: &FrankenRow| row.get_typed::<i64>(0),
17591            )
17592            .unwrap();
17593        assert_eq!(
17594            api_only_input, 100,
17595            "Only API-sourced input tokens should be 100"
17596        );
17597
17598        // Verify rollups match summed message_metrics
17599        let mm_total_content_est: i64 = conn
17600            .query_row_map(
17601                "SELECT SUM(content_tokens_est) FROM message_metrics WHERE day_id = ?",
17602                fparams![expected_day],
17603                |row| row.get_typed::<i64>(0),
17604            )
17605            .unwrap();
17606        let mm_plan_content_est: i64 = conn
17607            .query_row_map(
17608                "SELECT COALESCE(SUM(content_tokens_est), 0) FROM message_metrics WHERE day_id = ? AND has_plan = 1",
17609                fparams![expected_day],
17610                |row: &FrankenRow| row.get_typed::<i64>(0),
17611            )
17612            .unwrap();
17613        let mm_plan_api_total: i64 = conn
17614            .query_row_map(
17615                "SELECT COALESCE(SUM(COALESCE(api_input_tokens, 0) + COALESCE(api_output_tokens, 0) + COALESCE(api_cache_read_tokens, 0) + COALESCE(api_cache_creation_tokens, 0) + COALESCE(api_thinking_tokens, 0)), 0)
17616                 FROM message_metrics WHERE day_id = ? AND has_plan = 1 AND api_data_source = 'api'",
17617                fparams![expected_day],
17618                |row: &FrankenRow| row.get_typed::<i64>(0),
17619            )
17620            .unwrap();
17621        let ud_content_est: i64 = conn
17622            .query_row_map(
17623                "SELECT content_tokens_est_total FROM usage_daily WHERE day_id = ?",
17624                fparams![expected_day],
17625                |row| row.get_typed::<i64>(0),
17626            )
17627            .unwrap();
17628        let (ud_plan_content_est, ud_plan_api_total): (i64, i64) = conn
17629            .query_row_map(
17630                "SELECT plan_content_tokens_est_total, plan_api_tokens_total FROM usage_daily WHERE day_id = ?",
17631                fparams![expected_day],
17632                |row: &FrankenRow| Ok((row.get_typed(0)?, row.get_typed(1)?)),
17633            )
17634            .unwrap();
17635        assert_eq!(
17636            mm_total_content_est, ud_content_est,
17637            "Daily rollup content_tokens_est_total must equal SUM of message_metrics"
17638        );
17639        assert_eq!(
17640            mm_plan_content_est, ud_plan_content_est,
17641            "Daily rollup plan_content_tokens_est_total must equal planned message_metrics content sum"
17642        );
17643        assert_eq!(
17644            mm_plan_api_total, ud_plan_api_total,
17645            "Daily rollup plan_api_tokens_total must equal planned message_metrics API token sum"
17646        );
17647
17648        // Verify model rollup rows
17649        let (claude_msg, claude_user, claude_asst, claude_api_total, claude_api_cov): (
17650            i64,
17651            i64,
17652            i64,
17653            i64,
17654            i64,
17655        ) = conn
17656            .query_row_map(
17657                "SELECT message_count, user_message_count, assistant_message_count, api_tokens_total, api_coverage_message_count
17658                 FROM usage_models_daily
17659                 WHERE day_id = ? AND model_family = 'claude' AND model_tier = 'opus'",
17660                fparams![expected_day],
17661                |row: &FrankenRow| Ok((row.get_typed(0)?, row.get_typed(1)?, row.get_typed(2)?, row.get_typed(3)?, row.get_typed(4)?)),
17662            )
17663            .unwrap();
17664        assert_eq!(claude_msg, 1);
17665        assert_eq!(claude_user, 0);
17666        assert_eq!(claude_asst, 1);
17667        assert_eq!(claude_api_total, 380);
17668        assert_eq!(claude_api_cov, 1);
17669
17670        let unknown_msg: i64 = conn
17671            .query_row_map(
17672                "SELECT message_count FROM usage_models_daily
17673                 WHERE day_id = ? AND model_family = 'unknown' AND model_tier = 'unknown'",
17674                fparams![expected_day],
17675                |row| row.get_typed(0),
17676            )
17677            .unwrap();
17678        assert_eq!(
17679            unknown_msg, 2,
17680            "user messages should land in unknown model bucket"
17681        );
17682    }
17683
17684    #[test]
17685    fn has_plan_heuristic_detects_plans() {
17686        assert!(has_plan_heuristic(
17687            "## Plan\n\n1. First step\n2. Second step"
17688        ));
17689        assert!(has_plan_heuristic(
17690            "# Plan\nHere is what we will do:\n1. Step one\n2. Step two"
17691        ));
17692        assert!(has_plan_heuristic(
17693            "Plan:\n- Gather baseline\n- Implement changes\n- Validate with tests"
17694        ));
17695        assert!(has_plan_heuristic(
17696            "Next steps:\n1. Update schema\n2. Rebuild rollups"
17697        ));
17698        assert!(!has_plan_heuristic("Hello world"));
17699        assert!(!has_plan_heuristic("Short"));
17700        assert!(!has_plan_heuristic(
17701            "This is a regular message without plans"
17702        ));
17703        assert!(!has_plan_heuristic(
17704            "```json\n{\"tool\":\"shell\",\"stdout\":\"1. install\\n2. run\"}\n```"
17705        ));
17706    }
17707
17708    #[test]
17709    fn has_plan_for_role_only_counts_assistant_messages() {
17710        let plan_text = "## Plan\n1. First\n2. Second";
17711        assert!(has_plan_for_role("assistant", plan_text));
17712        assert!(has_plan_for_role("agent", plan_text));
17713        assert!(has_plan_for_role("Assistant", plan_text));
17714        assert!(!has_plan_for_role("user", plan_text));
17715        assert!(!has_plan_for_role("tool", plan_text));
17716    }
17717
17718    #[test]
17719    fn api_rollups_require_api_data_source() {
17720        let mut agg = AnalyticsRollupAggregator::new();
17721
17722        let estimated_plan = MessageMetricsEntry {
17723            message_id: 1,
17724            created_at_ms: 0,
17725            hour_id: 1,
17726            day_id: 1,
17727            agent_slug: "codex".into(),
17728            workspace_id: 0,
17729            source_id: "local".into(),
17730            role: "assistant".into(),
17731            content_chars: 120,
17732            content_tokens_est: 30,
17733            model_name: None,
17734            model_family: "unknown".into(),
17735            model_tier: "unknown".into(),
17736            provider: "unknown".into(),
17737            api_input_tokens: Some(100),
17738            api_output_tokens: Some(50),
17739            api_cache_read_tokens: Some(0),
17740            api_cache_creation_tokens: Some(0),
17741            api_thinking_tokens: Some(0),
17742            api_service_tier: None,
17743            api_data_source: "estimated".into(),
17744            tool_call_count: 0,
17745            has_tool_calls: false,
17746            has_plan: true,
17747        };
17748        agg.record(&estimated_plan);
17749
17750        let api_plan = MessageMetricsEntry {
17751            message_id: 2,
17752            created_at_ms: 0,
17753            hour_id: 1,
17754            day_id: 1,
17755            agent_slug: "codex".into(),
17756            workspace_id: 0,
17757            source_id: "local".into(),
17758            role: "assistant".into(),
17759            content_chars: 80,
17760            content_tokens_est: 20,
17761            model_name: None,
17762            model_family: "unknown".into(),
17763            model_tier: "unknown".into(),
17764            provider: "unknown".into(),
17765            api_input_tokens: Some(40),
17766            api_output_tokens: Some(10),
17767            api_cache_read_tokens: Some(0),
17768            api_cache_creation_tokens: Some(0),
17769            api_thinking_tokens: Some(0),
17770            api_service_tier: None,
17771            api_data_source: "api".into(),
17772            tool_call_count: 0,
17773            has_tool_calls: false,
17774            has_plan: true,
17775        };
17776        agg.record(&api_plan);
17777
17778        let key = (1_i64, "codex".to_string(), 0_i64, "local".to_string());
17779        let hourly = agg.hourly.get(&key).expect("hourly rollup key must exist");
17780        let daily = agg.daily.get(&key).expect("daily rollup key must exist");
17781        let model_key = (
17782            1_i64,
17783            "codex".to_string(),
17784            0_i64,
17785            "local".to_string(),
17786            "unknown".to_string(),
17787            "unknown".to_string(),
17788        );
17789        let models_daily = agg
17790            .models_daily
17791            .get(&model_key)
17792            .expect("model rollup key must exist");
17793
17794        // Content rollup includes both plan messages.
17795        assert_eq!(hourly.plan_message_count, 2);
17796        assert_eq!(hourly.plan_content_tokens_est_total, 50);
17797        // API plan tokens must include only api_data_source='api' rows.
17798        assert_eq!(hourly.plan_api_tokens_total, 50);
17799        assert_eq!(daily.plan_api_tokens_total, 50);
17800        assert_eq!(models_daily.plan_api_tokens_total, 50);
17801        // Overall API totals must also exclude estimated rows.
17802        assert_eq!(hourly.api_tokens_total, 50);
17803        assert_eq!(hourly.api_input_tokens_total, 40);
17804        assert_eq!(hourly.api_output_tokens_total, 10);
17805        assert_eq!(hourly.api_coverage_message_count, 1);
17806        assert_eq!(daily.api_tokens_total, 50);
17807        assert_eq!(models_daily.api_tokens_total, 50);
17808    }
17809
17810    #[test]
17811    fn has_plan_heuristic_curated_corpus_thresholds() {
17812        // Cross-agent-style positives.
17813        let positives = [
17814            "## Plan\n1. Inspect current schema\n2. Add migration\n3. Verify rebuild",
17815            "Plan:\n1) Reproduce\n2) Patch\n3) Add tests",
17816            "Implementation plan:\n- Parse inputs\n- Update rollups\n- Run checks",
17817            "Next steps:\n1. Reserve file\n2. Implement\n3. Report status",
17818            "# Plan\n1. Gather requirements\n2. Ship changes",
17819            "Action plan:\n- Identify root cause\n- Fix it\n- Validate",
17820        ];
17821
17822        // Typical false positives we want to avoid.
17823        let negatives = [
17824            "The plan is to move fast and fix things later.",
17825            "```json\n{\"tool\":\"shell\",\"stdout\":\"1. ls\\n2. cat\"}\n```",
17826            "stdout:\n1. Build started\n2. Build finished\nexit code: 0",
17827            "I can help with that request. Let me know if you want details.",
17828            "Here is a list:\n- apples\n- oranges",
17829            "Status update: completed tasks and blockers below.",
17830        ];
17831
17832        let tp = positives
17833            .iter()
17834            .filter(|msg| has_plan_heuristic(msg))
17835            .count();
17836        let fp = negatives
17837            .iter()
17838            .filter(|msg| has_plan_heuristic(msg))
17839            .count();
17840
17841        let recall = tp as f64 / positives.len() as f64;
17842        let false_positive_rate = fp as f64 / negatives.len() as f64;
17843
17844        assert!(
17845            recall >= 0.80,
17846            "plan heuristic recall too low: got {recall:.2}"
17847        );
17848        assert!(
17849            false_positive_rate <= 0.20,
17850            "plan heuristic false-positive rate too high: got {false_positive_rate:.2}"
17851        );
17852    }
17853
17854    #[test]
17855    fn rebuild_analytics_repopulates_from_messages() {
17856        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
17857        use std::path::PathBuf;
17858
17859        let dir = TempDir::new().unwrap();
17860        let db_path = dir.path().join("test.db");
17861        let storage = SqliteStorage::open(&db_path).unwrap();
17862
17863        // Register agent
17864        let agent = Agent {
17865            id: None,
17866            slug: "claude_code".into(),
17867            name: "Claude Code".into(),
17868            version: Some("1.0".into()),
17869            kind: AgentKind::Cli,
17870        };
17871        let agent_id = storage.ensure_agent(&agent).unwrap();
17872
17873        // 2026-02-06 10:30:00 UTC
17874        let ts_ms = 1_770_551_400_000_i64;
17875        let expected_day = SqliteStorage::day_id_from_millis(ts_ms);
17876        let expected_hour = SqliteStorage::hour_id_from_millis(ts_ms);
17877
17878        let usage_json = serde_json::json!({
17879            "message": {
17880                "model": "claude-opus-4-6",
17881                "usage": {
17882                    "input_tokens": 100,
17883                    "output_tokens": 50,
17884                    "cache_read_input_tokens": 200,
17885                    "cache_creation_input_tokens": 30,
17886                    "service_tier": "standard"
17887                }
17888            }
17889        });
17890
17891        let conv = Conversation {
17892            id: None,
17893            agent_slug: "claude_code".into(),
17894            workspace: None,
17895            external_id: Some("test-rebuild-1".into()),
17896            title: Some("Test conversation".into()),
17897            source_path: PathBuf::from("/tmp/test.jsonl"),
17898            started_at: Some(ts_ms),
17899            ended_at: Some(ts_ms + 60_000),
17900            approx_tokens: None,
17901            metadata_json: serde_json::Value::Null,
17902            messages: vec![
17903                Message {
17904                    id: None,
17905                    idx: 0,
17906                    role: MessageRole::User,
17907                    author: None,
17908                    created_at: Some(ts_ms),
17909                    content: "Hello, can you help me with a plan?".into(),
17910                    extra_json: serde_json::Value::Null,
17911                    snippets: vec![],
17912                },
17913                Message {
17914                    id: None,
17915                    idx: 1,
17916                    role: MessageRole::Agent,
17917                    author: None,
17918                    created_at: Some(ts_ms + 30_000),
17919                    content: "## Plan\n\n1. First step\n2. Second step\n3. Third step".into(),
17920                    extra_json: usage_json,
17921                    snippets: vec![],
17922                },
17923                Message {
17924                    id: None,
17925                    idx: 2,
17926                    role: MessageRole::User,
17927                    author: None,
17928                    created_at: Some(ts_ms + 60_000),
17929                    content: "Great, let's proceed!".into(),
17930                    extra_json: serde_json::Value::Null,
17931                    snippets: vec![],
17932                },
17933            ],
17934            source_id: "local".into(),
17935            origin_host: None,
17936        };
17937
17938        storage
17939            .insert_conversations_batched(&[(agent_id, None, &conv)])
17940            .unwrap();
17941
17942        // Save original analytics state
17943        let conn = storage.raw();
17944        let orig_mm: i64 = conn
17945            .query_row_map("SELECT COUNT(*) FROM message_metrics", &[], |row| {
17946                row.get_typed(0)
17947            })
17948            .unwrap();
17949        let orig_hourly: i64 = conn
17950            .query_row_map("SELECT COUNT(*) FROM usage_hourly", &[], |row| {
17951                row.get_typed(0)
17952            })
17953            .unwrap();
17954        let orig_daily: i64 = conn
17955            .query_row_map("SELECT COUNT(*) FROM usage_daily", &[], |row| {
17956                row.get_typed(0)
17957            })
17958            .unwrap();
17959        let orig_models_daily: i64 = conn
17960            .query_row_map("SELECT COUNT(*) FROM usage_models_daily", &[], |row| {
17961                row.get_typed(0)
17962            })
17963            .unwrap();
17964        let orig_api_input: i64 = conn
17965            .query_row_map(
17966                "SELECT COALESCE(SUM(api_input_tokens), 0) FROM message_metrics WHERE api_data_source = 'api'",
17967                &[],
17968                |row: &FrankenRow| row.get_typed(0),
17969            )
17970            .unwrap();
17971
17972        assert_eq!(orig_mm, 3);
17973        assert!(orig_hourly > 0);
17974        assert!(orig_daily > 0);
17975        assert!(orig_models_daily > 0);
17976
17977        // Destroy analytics tables (simulate corruption)
17978        conn.execute("DELETE FROM message_metrics").unwrap();
17979        conn.execute("DELETE FROM usage_hourly").unwrap();
17980        conn.execute("DELETE FROM usage_daily").unwrap();
17981        conn.execute("DELETE FROM usage_models_daily").unwrap();
17982
17983        // Verify they're empty
17984        let zero: i64 = conn
17985            .query_row_map("SELECT COUNT(*) FROM message_metrics", &[], |row| {
17986                row.get_typed(0)
17987            })
17988            .unwrap();
17989        assert_eq!(zero, 0);
17990
17991        // Rebuild analytics
17992        let result = storage.rebuild_analytics().unwrap();
17993
17994        assert_eq!(result.message_metrics_rows, 3);
17995        assert!(result.usage_hourly_rows > 0);
17996        assert!(result.usage_daily_rows > 0);
17997        assert!(result.usage_models_daily_rows > 0);
17998        assert!(
17999            result.elapsed_ms < 10_000,
18000            "Rebuild should be fast for 3 msgs"
18001        );
18002
18003        // Verify rebuilt data matches
18004        let conn = storage.raw();
18005        let rebuilt_mm: i64 = conn
18006            .query_row_map("SELECT COUNT(*) FROM message_metrics", &[], |row| {
18007                row.get_typed(0)
18008            })
18009            .unwrap();
18010        assert_eq!(
18011            rebuilt_mm, orig_mm,
18012            "Rebuilt message_metrics count should match"
18013        );
18014
18015        let rebuilt_hourly: i64 = conn
18016            .query_row_map("SELECT COUNT(*) FROM usage_hourly", &[], |row| {
18017                row.get_typed(0)
18018            })
18019            .unwrap();
18020        assert_eq!(
18021            rebuilt_hourly, orig_hourly,
18022            "Rebuilt hourly rows should match"
18023        );
18024
18025        let rebuilt_daily: i64 = conn
18026            .query_row_map("SELECT COUNT(*) FROM usage_daily", &[], |row| {
18027                row.get_typed(0)
18028            })
18029            .unwrap();
18030        assert_eq!(rebuilt_daily, orig_daily, "Rebuilt daily rows should match");
18031
18032        let rebuilt_models_daily: i64 = conn
18033            .query_row_map("SELECT COUNT(*) FROM usage_models_daily", &[], |row| {
18034                row.get_typed(0)
18035            })
18036            .unwrap();
18037        assert_eq!(
18038            rebuilt_models_daily, orig_models_daily,
18039            "Rebuilt model rollup rows should match"
18040        );
18041
18042        // Verify API token data preserved through rebuild
18043        let rebuilt_api_input: i64 = conn
18044            .query_row_map(
18045                "SELECT COALESCE(SUM(api_input_tokens), 0) FROM message_metrics WHERE api_data_source = 'api'",
18046                &[],
18047                |row: &FrankenRow| row.get_typed(0),
18048            )
18049            .unwrap();
18050        assert_eq!(
18051            rebuilt_api_input, orig_api_input,
18052            "Rebuilt API input tokens should match original"
18053        );
18054
18055        // Verify rollups have correct data
18056        let (uh_msg, uh_user, uh_asst, uh_plan, uh_plan_content, uh_plan_api): (
18057            i64,
18058            i64,
18059            i64,
18060            i64,
18061            i64,
18062            i64,
18063        ) = conn
18064            .query_row_map(
18065                "SELECT message_count, user_message_count, assistant_message_count, plan_message_count,
18066                        plan_content_tokens_est_total, plan_api_tokens_total
18067                 FROM usage_hourly WHERE hour_id = ?",
18068                fparams![expected_hour],
18069                |row: &FrankenRow| {
18070                    Ok((
18071                        row.get_typed(0)?,
18072                        row.get_typed(1)?,
18073                        row.get_typed(2)?,
18074                        row.get_typed(3)?,
18075                        row.get_typed(4)?,
18076                        row.get_typed(5)?,
18077                    ))
18078                },
18079            )
18080            .unwrap();
18081        assert_eq!(uh_msg, 3);
18082        assert_eq!(uh_user, 2);
18083        assert_eq!(uh_asst, 1);
18084        assert_eq!(uh_plan, 1);
18085        assert!(uh_plan_content > 0);
18086        assert!(uh_plan_api > 0);
18087
18088        let ud_msg: i64 = conn
18089            .query_row_map(
18090                "SELECT message_count FROM usage_daily WHERE day_id = ?",
18091                fparams![expected_day],
18092                |row| row.get_typed(0),
18093            )
18094            .unwrap();
18095        assert_eq!(ud_msg, 3);
18096    }
18097
18098    #[test]
18099    fn insert_conversations_batched_flushes_large_fts_batches() {
18100        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
18101        use std::path::PathBuf;
18102
18103        let dir = TempDir::new().unwrap();
18104        let db_path = dir.path().join("test.db");
18105        let storage = SqliteStorage::open(&db_path).unwrap();
18106        // V14 drops fts_messages during migration; cass normally recreates it
18107        // during startup via `ensure_search_fallback_fts_consistency`. Tests
18108        // that inspect fts_messages directly need to run the same repair pass
18109        // to exercise the "insert flushes FTS" contract.
18110        storage
18111            .ensure_search_fallback_fts_consistency()
18112            .expect("ensure FTS consistency before insert");
18113
18114        let agent = Agent {
18115            id: None,
18116            slug: "codex".into(),
18117            name: "Codex".into(),
18118            version: Some("0.2.3".into()),
18119            kind: AgentKind::Cli,
18120        };
18121        let agent_id = storage.ensure_agent(&agent).unwrap();
18122
18123        let content = "y".repeat((FTS_ENTRY_BATCH_MAX_CHARS / 2) + 1);
18124        let messages: Vec<_> = (0_i64..2)
18125            .map(|i| Message {
18126                id: None,
18127                idx: i,
18128                role: MessageRole::Agent,
18129                author: None,
18130                created_at: Some(1_700_000_000_000 + i),
18131                content: format!("{i}-{content}"),
18132                extra_json: serde_json::Value::Null,
18133                snippets: Vec::new(),
18134            })
18135            .collect();
18136        let conv = Conversation {
18137            id: None,
18138            agent_slug: "codex".into(),
18139            workspace: Some(PathBuf::from("/tmp/workspace")),
18140            external_id: Some("fts-large-batch".into()),
18141            title: Some("FTS Large Batch".into()),
18142            source_path: PathBuf::from("/tmp/rollout.jsonl"),
18143            started_at: Some(1_700_000_000_000),
18144            ended_at: Some(1_700_000_000_999),
18145            approx_tokens: None,
18146            metadata_json: serde_json::Value::Null,
18147            messages,
18148            source_id: "local".into(),
18149            origin_host: None,
18150        };
18151
18152        let outcomes = storage
18153            .insert_conversations_batched(&[(agent_id, None, &conv)])
18154            .unwrap();
18155        assert_eq!(outcomes.len(), 1);
18156        assert_eq!(outcomes[0].inserted_indices.len(), conv.messages.len());
18157
18158        let message_count: i64 = storage
18159            .conn
18160            .query_row_map("SELECT COUNT(*) FROM messages", fparams![], |row| {
18161                row.get_typed(0)
18162            })
18163            .unwrap();
18164        let fts_count: i64 = storage
18165            .conn
18166            .query_row_map("SELECT COUNT(*) FROM fts_messages", fparams![], |row| {
18167                row.get_typed(0)
18168            })
18169            .unwrap();
18170
18171        assert_eq!(message_count, conv.messages.len() as i64);
18172        assert_eq!(fts_count, conv.messages.len() as i64);
18173    }
18174
18175    fn make_profiled_storage_remote_conversation(
18176        external_id: i64,
18177        msg_count: usize,
18178    ) -> Conversation {
18179        Conversation {
18180            id: None,
18181            agent_slug: "codex".into(),
18182            workspace: Some(PathBuf::from("/ws/profiled-storage-remote")),
18183            external_id: Some(format!("profiled-storage-remote-{external_id}")),
18184            title: Some(format!(
18185                "Profiled storage remote conversation {external_id}"
18186            )),
18187            source_path: PathBuf::from(format!("/log/profiled-storage-remote-{external_id}.jsonl")),
18188            started_at: Some(10_000 + external_id * 100),
18189            ended_at: Some(10_000 + external_id * 100 + msg_count as i64),
18190            approx_tokens: Some(msg_count as i64 * 32),
18191            metadata_json: serde_json::json!({ "bench": true }),
18192            messages: (0..msg_count)
18193                .map(|idx| Message {
18194                    id: None,
18195                    idx: idx as i64,
18196                    role: if idx % 2 == 0 {
18197                        MessageRole::User
18198                    } else {
18199                        MessageRole::Agent
18200                    },
18201                    author: Some("tester".into()),
18202                    created_at: Some(20_000 + external_id * 100 + idx as i64),
18203                    content: format!(
18204                        "profiled storage remote content ext={external_id} idx={idx} {}",
18205                        "x".repeat(64)
18206                    ),
18207                    extra_json: serde_json::json!({ "idx": idx }),
18208                    snippets: Vec::new(),
18209                })
18210                .collect(),
18211            source_id: "profiled-storage-remote-source".into(),
18212            origin_host: Some("builder-profile".into()),
18213        }
18214    }
18215
18216    fn make_profiled_append_remote_merge_conversation(
18217        external_id: i64,
18218        msg_count: usize,
18219    ) -> Conversation {
18220        let base_ts = 100_000 + external_id * 1_000;
18221        Conversation {
18222            id: None,
18223            agent_slug: "codex".into(),
18224            workspace: Some(PathBuf::from("/ws/profiled-append-remote")),
18225            external_id: Some(format!("profiled-append-remote-{external_id}")),
18226            title: Some(format!("Profiled append remote conversation {external_id}")),
18227            source_path: PathBuf::from(format!("/log/profiled-append-remote-{external_id}.jsonl")),
18228            started_at: Some(base_ts),
18229            ended_at: Some(base_ts + msg_count as i64),
18230            approx_tokens: Some(msg_count as i64 * 50),
18231            metadata_json: serde_json::json!({ "bench": true }),
18232            messages: (0..msg_count)
18233                .map(|idx| Message {
18234                    id: None,
18235                    idx: idx as i64,
18236                    role: if idx % 2 == 0 {
18237                        MessageRole::User
18238                    } else {
18239                        MessageRole::Agent
18240                    },
18241                    author: Some(format!("model-{}", external_id % 5)),
18242                    created_at: Some(base_ts + idx as i64),
18243                    content: format!(
18244                        "Profiled append remote conversation {} message {}: Lorem ipsum dolor sit amet, consectetur adipiscing elit.",
18245                        external_id, idx
18246                    ),
18247                    extra_json: serde_json::json!({ "bench": true }),
18248                    snippets: Vec::new(),
18249                })
18250                .collect(),
18251            source_id: "profiled-append-remote-source".into(),
18252            origin_host: Some("builder-profile".into()),
18253        }
18254    }
18255
18256    #[test]
18257    fn insert_conversation_tree_batched_new_message_ids_match_snippet_rows() {
18258        let dir = TempDir::new().unwrap();
18259        let db_path = dir.path().join("batched-message-ids.db");
18260        let storage = SqliteStorage::open(&db_path).unwrap();
18261        let agent_id = storage
18262            .ensure_agent(&Agent {
18263                id: None,
18264                slug: "codex".into(),
18265                name: "Codex".into(),
18266                version: None,
18267                kind: AgentKind::Cli,
18268            })
18269            .unwrap();
18270        let workspace_id = storage
18271            .ensure_workspace(&PathBuf::from("/ws/profiled-storage-remote"), None)
18272            .unwrap();
18273        let mut conv = make_profiled_storage_remote_conversation(42, 5);
18274        for (idx, msg) in conv.messages.iter_mut().enumerate() {
18275            msg.snippets.push(Snippet {
18276                id: None,
18277                file_path: Some(PathBuf::from(format!("src/file_{idx}.rs"))),
18278                start_line: Some((idx + 1) as i64),
18279                end_line: Some((idx + 2) as i64),
18280                language: Some("rust".into()),
18281                snippet_text: Some(format!("fn snippet_{idx}() {{}}")),
18282            });
18283        }
18284        let outcome = storage
18285            .insert_conversation_tree(agent_id, Some(workspace_id), &conv)
18286            .unwrap();
18287
18288        let message_count: i64 = storage
18289            .conn
18290            .query_row_map(
18291                "SELECT COUNT(*) FROM messages WHERE conversation_id = ?1",
18292                fparams![outcome.conversation_id],
18293                |row| row.get_typed(0),
18294            )
18295            .unwrap();
18296        let joined_snippet_count: i64 = storage
18297            .conn
18298            .query_row_map(
18299                "SELECT COUNT(*)
18300                 FROM snippets s
18301                 JOIN messages m ON s.message_id = m.id
18302                 WHERE m.conversation_id = ?1",
18303                fparams![outcome.conversation_id],
18304                |row| row.get_typed(0),
18305            )
18306            .unwrap();
18307
18308        assert_eq!(message_count, conv.messages.len() as i64);
18309        assert_eq!(joined_snippet_count, conv.messages.len() as i64);
18310    }
18311
18312    #[test]
18313    fn insert_conversation_tree_batched_appended_message_ids_match_snippet_rows() {
18314        let dir = TempDir::new().unwrap();
18315        let db_path = dir.path().join("batched-append-message-ids.db");
18316        let storage = SqliteStorage::open(&db_path).unwrap();
18317        let agent_id = storage
18318            .ensure_agent(&Agent {
18319                id: None,
18320                slug: "codex".into(),
18321                name: "Codex".into(),
18322                version: None,
18323                kind: AgentKind::Cli,
18324            })
18325            .unwrap();
18326        let workspace_id = storage
18327            .ensure_workspace(&PathBuf::from("/ws/profiled-storage-remote"), None)
18328            .unwrap();
18329
18330        let mut initial = make_profiled_storage_remote_conversation(77, 2);
18331        for (idx, msg) in initial.messages.iter_mut().enumerate() {
18332            msg.snippets.push(Snippet {
18333                id: None,
18334                file_path: Some(PathBuf::from(format!("src/append_initial_{idx}.rs"))),
18335                start_line: Some((idx + 1) as i64),
18336                end_line: Some((idx + 2) as i64),
18337                language: Some("rust".into()),
18338                snippet_text: Some(format!("fn append_initial_{idx}() {{}}")),
18339            });
18340        }
18341        let first = storage
18342            .insert_conversation_tree(agent_id, Some(workspace_id), &initial)
18343            .unwrap();
18344        assert_eq!(first.inserted_indices, vec![0, 1]);
18345
18346        let mut appended = make_profiled_storage_remote_conversation(77, 5);
18347        for (idx, msg) in appended.messages.iter_mut().enumerate() {
18348            msg.snippets.push(Snippet {
18349                id: None,
18350                file_path: Some(PathBuf::from(format!("src/append_full_{idx}.rs"))),
18351                start_line: Some((idx + 10) as i64),
18352                end_line: Some((idx + 11) as i64),
18353                language: Some("rust".into()),
18354                snippet_text: Some(format!("fn append_full_{idx}() {{}}")),
18355            });
18356        }
18357        let second = storage
18358            .insert_conversation_tree(agent_id, Some(workspace_id), &appended)
18359            .unwrap();
18360        assert_eq!(second.conversation_id, first.conversation_id);
18361        assert_eq!(second.inserted_indices, vec![2, 3, 4]);
18362
18363        let message_count: i64 = storage
18364            .conn
18365            .query_row_map(
18366                "SELECT COUNT(*) FROM messages WHERE conversation_id = ?1",
18367                fparams![first.conversation_id],
18368                |row| row.get_typed(0),
18369            )
18370            .unwrap();
18371        let joined_snippets: Vec<(i64, String)> = storage
18372            .conn
18373            .query_map_collect(
18374                "SELECT m.idx, s.file_path
18375                 FROM snippets s
18376                 JOIN messages m ON s.message_id = m.id
18377                 WHERE m.conversation_id = ?1
18378                 ORDER BY m.idx, s.id",
18379                fparams![first.conversation_id],
18380                |row| Ok((row.get_typed(0)?, row.get_typed(1)?)),
18381            )
18382            .unwrap();
18383
18384        assert_eq!(message_count, 5);
18385        assert_eq!(
18386            joined_snippets,
18387            vec![
18388                (0, "src/append_initial_0.rs".to_string()),
18389                (1, "src/append_initial_1.rs".to_string()),
18390                (2, "src/append_full_2.rs".to_string()),
18391                (3, "src/append_full_3.rs".to_string()),
18392                (4, "src/append_full_4.rs".to_string()),
18393            ]
18394        );
18395    }
18396
18397    #[test]
18398    fn insert_conversation_tree_rehydrates_external_lookup_after_manual_clear() {
18399        let dir = TempDir::new().unwrap();
18400        let db_path = dir.path().join("external-lookup-rehydrate.db");
18401        let storage = SqliteStorage::open(&db_path).unwrap();
18402        let agent_id = storage
18403            .ensure_agent(&Agent {
18404                id: None,
18405                slug: "codex".into(),
18406                name: "Codex".into(),
18407                version: None,
18408                kind: AgentKind::Cli,
18409            })
18410            .unwrap();
18411        let workspace_id = storage
18412            .ensure_workspace(&PathBuf::from("/ws/profiled-storage-remote"), None)
18413            .unwrap();
18414
18415        let initial = make_profiled_storage_remote_conversation(88, 2);
18416        let first = storage
18417            .insert_conversation_tree(agent_id, Some(workspace_id), &initial)
18418            .unwrap();
18419        let external_id = initial.external_id.as_deref().unwrap();
18420        let lookup_key =
18421            conversation_external_lookup_key(&initial.source_id, agent_id, external_id);
18422        let lookup_id: i64 = storage
18423            .conn
18424            .query_row_map(
18425                "SELECT conversation_id
18426                 FROM conversation_external_tail_lookup
18427                 WHERE lookup_key = ?1",
18428                fparams![lookup_key.as_str()],
18429                |row| row.get_typed(0),
18430            )
18431            .unwrap();
18432        assert_eq!(lookup_id, first.conversation_id);
18433
18434        storage
18435            .conn
18436            .execute_compat(
18437                "DELETE FROM conversation_external_tail_lookup WHERE lookup_key = ?1",
18438                fparams![lookup_key.as_str()],
18439            )
18440            .unwrap();
18441
18442        let appended = make_profiled_storage_remote_conversation(88, 4);
18443        let second = storage
18444            .insert_conversation_tree(agent_id, Some(workspace_id), &appended)
18445            .unwrap();
18446        assert_eq!(second.conversation_id, first.conversation_id);
18447        assert_eq!(second.inserted_indices, vec![2, 3]);
18448
18449        let conversation_count: i64 = storage
18450            .conn
18451            .query_row_map(
18452                "SELECT COUNT(*)
18453                 FROM conversations
18454                 WHERE source_id = ?1 AND agent_id = ?2 AND external_id = ?3",
18455                fparams![initial.source_id.as_str(), agent_id, external_id],
18456                |row| row.get_typed(0),
18457            )
18458            .unwrap();
18459        let restored_lookup: (i64, Option<i64>, Option<i64>, Option<i64>) = storage
18460            .conn
18461            .query_row_map(
18462                "SELECT conversation_id, ended_at, last_message_idx, last_message_created_at
18463                 FROM conversation_external_tail_lookup
18464                 WHERE lookup_key = ?1",
18465                fparams![lookup_key.as_str()],
18466                |row| {
18467                    Ok((
18468                        row.get_typed(0)?,
18469                        row.get_typed(1)?,
18470                        row.get_typed(2)?,
18471                        row.get_typed(3)?,
18472                    ))
18473                },
18474            )
18475            .unwrap();
18476        let tail_state: (Option<i64>, Option<i64>, Option<i64>) = storage
18477            .conn
18478            .query_row_map(
18479                "SELECT ended_at, last_message_idx, last_message_created_at
18480                 FROM conversation_tail_state
18481                 WHERE conversation_id = ?1",
18482                fparams![first.conversation_id],
18483                |row| Ok((row.get_typed(0)?, row.get_typed(1)?, row.get_typed(2)?)),
18484            )
18485            .unwrap();
18486        assert_eq!(conversation_count, 1);
18487        assert_eq!(
18488            restored_lookup,
18489            (
18490                first.conversation_id,
18491                tail_state.0,
18492                tail_state.1,
18493                tail_state.2
18494            )
18495        );
18496        assert_eq!(
18497            tail_state,
18498            (
18499                appended.messages[3].created_at,
18500                Some(3),
18501                appended.messages[3].created_at
18502            )
18503        );
18504    }
18505
18506    #[test]
18507    fn insert_conversation_tree_recreates_daily_stats_after_manual_clear() {
18508        let dir = TempDir::new().unwrap();
18509        let db_path = dir.path().join("test.db");
18510        let storage = SqliteStorage::open(&db_path).unwrap();
18511        let agent_id = storage
18512            .ensure_agent(&Agent {
18513                id: None,
18514                slug: "codex".into(),
18515                name: "Codex".into(),
18516                version: None,
18517                kind: AgentKind::Cli,
18518            })
18519            .unwrap();
18520        let workspace = PathBuf::from("/ws/profiled-storage-remote");
18521        let workspace_id = storage.ensure_workspace(&workspace, None).unwrap();
18522
18523        storage
18524            .insert_conversation_tree(
18525                agent_id,
18526                Some(workspace_id),
18527                &make_profiled_storage_remote_conversation(0, 3),
18528            )
18529            .unwrap();
18530        storage.conn.execute("DELETE FROM daily_stats").unwrap();
18531
18532        storage
18533            .insert_conversation_tree(
18534                agent_id,
18535                Some(workspace_id),
18536                &make_profiled_storage_remote_conversation(1, 2),
18537            )
18538            .unwrap();
18539
18540        let row_count: i64 = storage
18541            .conn
18542            .query_row_map("SELECT COUNT(*) FROM daily_stats", fparams![], |row| {
18543                row.get_typed(0)
18544            })
18545            .unwrap();
18546        let (session_count, message_count): (i64, i64) = storage
18547            .conn
18548            .query_row_map(
18549                "SELECT session_count, message_count
18550                 FROM daily_stats
18551                 WHERE agent_slug = 'all' AND source_id = 'all'",
18552                fparams![],
18553                |row| Ok((row.get_typed(0)?, row.get_typed(1)?)),
18554            )
18555            .unwrap();
18556
18557        assert_eq!(row_count, 4);
18558        assert_eq!(session_count, 1);
18559        assert_eq!(message_count, 2);
18560    }
18561
18562    #[test]
18563    #[serial]
18564    fn insert_conversation_tree_stage_profile_tracks_steady_state_remote_reuse() {
18565        let _defer_guard = set_env_var("CASS_DEFER_LEXICAL_UPDATES", "0");
18566
18567        for &(msg_count, iterations) in &[(5usize, 80usize), (20, 50), (50, 24)] {
18568            let dir = TempDir::new().unwrap();
18569            let db_path = dir.path().join(format!("profile-{msg_count}.db"));
18570            let storage = SqliteStorage::open(&db_path).unwrap();
18571            let agent_id = storage
18572                .ensure_agent(&Agent {
18573                    id: None,
18574                    slug: "codex".into(),
18575                    name: "Codex".into(),
18576                    version: None,
18577                    kind: AgentKind::Cli,
18578                })
18579                .unwrap();
18580            let workspace = PathBuf::from("/ws/profiled-storage-remote");
18581            let workspace_id = storage.ensure_workspace(&workspace, None).unwrap();
18582
18583            storage
18584                .insert_conversation_tree(
18585                    agent_id,
18586                    Some(workspace_id),
18587                    &make_profiled_storage_remote_conversation(0, msg_count),
18588                )
18589                .unwrap();
18590
18591            let mut profile = InsertConversationTreePerfProfile::default();
18592            for external_id in 1..=iterations {
18593                storage
18594                    .insert_conversation_tree_with_profile(
18595                        agent_id,
18596                        Some(workspace_id),
18597                        &make_profiled_storage_remote_conversation(external_id as i64, msg_count),
18598                        &mut profile,
18599                    )
18600                    .unwrap();
18601            }
18602
18603            let accounted_duration = profile.source_duration
18604                + profile.tx_open_duration
18605                + profile.existing_lookup_duration
18606                + profile.conversation_row_duration
18607                + profile.message_insert_duration
18608                + profile.snippet_insert_duration
18609                + profile.fts_entry_duration
18610                + profile.fts_flush_duration
18611                + profile.analytics_duration
18612                + profile.commit_duration;
18613            assert_eq!(profile.invocations, iterations);
18614            assert_eq!(profile.messages, iterations * msg_count);
18615            assert_eq!(profile.inserted_messages, iterations * msg_count);
18616            assert!(
18617                profile.total_duration >= accounted_duration,
18618                "accounted stage durations cannot exceed total duration"
18619            );
18620
18621            profile.log_summary(&format!("remote_reuse_{msg_count}_msgs"));
18622        }
18623    }
18624
18625    #[test]
18626    #[serial]
18627    fn insert_conversation_tree_stage_profile_tracks_append_remote_source_merge() {
18628        let _defer_guard = set_env_var("CASS_DEFER_LEXICAL_UPDATES", "0");
18629
18630        for &(msg_count, iterations) in &[(5usize, 80usize), (20, 50), (50, 24)] {
18631            let dir = TempDir::new().unwrap();
18632            let db_path = dir.path().join(format!("append-profile-{msg_count}.db"));
18633            let storage = SqliteStorage::open(&db_path).unwrap();
18634            let agent_id = storage
18635                .ensure_agent(&Agent {
18636                    id: None,
18637                    slug: "codex".into(),
18638                    name: "Codex".into(),
18639                    version: None,
18640                    kind: AgentKind::Cli,
18641                })
18642                .unwrap();
18643            let workspace = PathBuf::from("/ws/profiled-append-remote");
18644            let workspace_id = storage.ensure_workspace(&workspace, None).unwrap();
18645
18646            for external_id in 0..iterations {
18647                storage
18648                    .insert_conversation_tree(
18649                        agent_id,
18650                        Some(workspace_id),
18651                        &make_profiled_append_remote_merge_conversation(
18652                            external_id as i64,
18653                            msg_count,
18654                        ),
18655                    )
18656                    .unwrap();
18657            }
18658
18659            let mut profile = InsertConversationTreePerfProfile::default();
18660            for external_id in 0..iterations {
18661                storage
18662                    .append_existing_conversation_with_profile(
18663                        agent_id,
18664                        Some(workspace_id),
18665                        &make_profiled_append_remote_merge_conversation(
18666                            external_id as i64,
18667                            msg_count * 2,
18668                        ),
18669                        &mut profile,
18670                    )
18671                    .unwrap();
18672            }
18673
18674            let accounted_duration = profile.source_duration
18675                + profile.tx_open_duration
18676                + profile.existing_lookup_duration
18677                + profile.existing_idx_lookup_duration
18678                + profile.existing_replay_lookup_duration
18679                + profile.dedupe_filter_duration
18680                + profile.conversation_row_duration
18681                + profile.message_insert_duration
18682                + profile.snippet_insert_duration
18683                + profile.fts_entry_duration
18684                + profile.fts_flush_duration
18685                + profile.analytics_duration
18686                + profile.commit_duration;
18687            assert_eq!(profile.invocations, iterations);
18688            assert_eq!(profile.messages, iterations * msg_count * 2);
18689            assert_eq!(profile.inserted_messages, iterations * msg_count);
18690            assert!(
18691                profile.total_duration >= accounted_duration,
18692                "accounted append stage durations cannot exceed total duration"
18693            );
18694
18695            profile.log_summary(&format!("append_remote_merge_{msg_count}_msgs"));
18696        }
18697    }
18698
18699    #[test]
18700    fn rebuild_daily_stats_recomputes_materialized_totals_without_monolithic_group_by() {
18701        let dir = TempDir::new().unwrap();
18702        let db_path = dir.path().join("test.db");
18703        let storage = SqliteStorage::open(&db_path).unwrap();
18704        let started_at = 1_700_000_000_000_i64;
18705        let day_id = FrankenStorage::day_id_from_millis(started_at);
18706        let hour_id = FrankenStorage::hour_id_from_millis(started_at);
18707
18708        storage
18709            .conn
18710            .execute_compat(
18711                "INSERT INTO agents (id, slug, name, version, kind, created_at, updated_at)
18712                 VALUES (?1, ?2, ?3, NULL, ?4, 0, 0)",
18713                fparams![1_i64, "codex", "Codex", "cli"],
18714            )
18715            .unwrap();
18716        storage
18717            .conn
18718            .execute_compat(
18719                "INSERT INTO agents (id, slug, name, version, kind, created_at, updated_at)
18720                 VALUES (?1, ?2, ?3, NULL, ?4, 0, 0)",
18721                fparams![2_i64, "claude", "Claude", "cli"],
18722            )
18723            .unwrap();
18724
18725        storage
18726            .conn
18727            .execute_compat(
18728                "INSERT INTO conversations (
18729                    id, agent_id, workspace_id, source_id, external_id, title, source_path,
18730                    started_at, ended_at, approx_tokens, metadata_json, origin_host, metadata_bin
18731                 ) VALUES (?1, ?2, NULL, ?3, ?4, ?5, ?6, ?7, ?8, NULL, ?9, NULL, NULL)",
18732                fparams![
18733                    1_i64,
18734                    1_i64,
18735                    LOCAL_SOURCE_ID,
18736                    "daily-a",
18737                    "Daily A",
18738                    "/tmp/daily-a.jsonl",
18739                    started_at,
18740                    started_at + 200,
18741                    "{}"
18742                ],
18743            )
18744            .unwrap();
18745        storage
18746            .conn
18747            .execute_compat(
18748                "INSERT INTO conversations (
18749                    id, agent_id, workspace_id, source_id, external_id, title, source_path,
18750                    started_at, ended_at, approx_tokens, metadata_json, origin_host, metadata_bin
18751                 ) VALUES (?1, ?2, NULL, ?3, ?4, ?5, ?6, ?7, ?8, NULL, ?9, NULL, NULL)",
18752                fparams![
18753                    2_i64,
18754                    2_i64,
18755                    LOCAL_SOURCE_ID,
18756                    "daily-b",
18757                    "Daily B",
18758                    "/tmp/daily-b.jsonl",
18759                    started_at,
18760                    started_at + 300,
18761                    "{}"
18762                ],
18763            )
18764            .unwrap();
18765
18766        storage
18767            .conn
18768            .execute_compat(
18769                "INSERT INTO messages (
18770                    id, conversation_id, idx, role, author, created_at, content, extra_json, extra_bin
18771                 ) VALUES (?1, ?2, ?3, ?4, NULL, ?5, ?6, NULL, NULL)",
18772                fparams![1_i64, 1_i64, 0_i64, "user", started_at, "hello"],
18773            )
18774            .unwrap();
18775        storage
18776            .conn
18777            .execute_compat(
18778                "INSERT INTO messages (
18779                    id, conversation_id, idx, role, author, created_at, content, extra_json, extra_bin
18780                 ) VALUES (?1, ?2, ?3, ?4, NULL, ?5, ?6, NULL, NULL)",
18781                fparams![2_i64, 1_i64, 1_i64, "assistant", started_at + 100, "response"],
18782            )
18783            .unwrap();
18784        storage
18785            .conn
18786            .execute_compat(
18787                "INSERT INTO messages (
18788                    id, conversation_id, idx, role, author, created_at, content, extra_json, extra_bin
18789                 ) VALUES (?1, ?2, ?3, ?4, NULL, ?5, ?6, NULL, NULL)",
18790                fparams![3_i64, 2_i64, 0_i64, "user", started_at + 50, "abc"],
18791            )
18792            .unwrap();
18793
18794        for (message_id, agent_slug, role, content_len) in [
18795            (1_i64, "codex", "user", 5_i64),
18796            (2_i64, "codex", "assistant", 8_i64),
18797            (3_i64, "claude", "user", 3_i64),
18798        ] {
18799            storage
18800                .conn
18801                .execute_compat(
18802                    "INSERT INTO message_metrics (
18803                        message_id, created_at_ms, hour_id, day_id, agent_slug, workspace_id, source_id,
18804                        role, content_chars, content_tokens_est, api_input_tokens, api_output_tokens,
18805                        api_cache_read_tokens, api_cache_creation_tokens, api_thinking_tokens,
18806                        api_service_tier, api_data_source, tool_call_count, has_tool_calls, has_plan,
18807                        model_name, model_family, model_tier, provider
18808                     ) VALUES (
18809                        ?1, ?2, ?3, ?4, ?5, ?6, ?7,
18810                        ?8, ?9, ?10, ?11, ?12,
18811                        ?13, ?14, ?15,
18812                        ?16, ?17, ?18, ?19, ?20,
18813                        ?21, ?22, ?23, ?24
18814                     )",
18815                    fparams![
18816                        message_id,
18817                        started_at,
18818                        hour_id,
18819                        day_id,
18820                        agent_slug,
18821                        0_i64,
18822                        LOCAL_SOURCE_ID,
18823                        role,
18824                        content_len,
18825                        content_len / 4,
18826                        0_i64,
18827                        0_i64,
18828                        0_i64,
18829                        0_i64,
18830                        0_i64,
18831                        "",
18832                        "estimated",
18833                        0_i64,
18834                        0_i64,
18835                        0_i64,
18836                        "",
18837                        "unknown",
18838                        "unknown",
18839                        "unknown"
18840                    ],
18841                )
18842                .unwrap();
18843        }
18844
18845        storage.conn.execute("DELETE FROM daily_stats").unwrap();
18846
18847        let rebuilt = storage.rebuild_daily_stats().unwrap();
18848        assert_eq!(rebuilt.total_sessions, 2);
18849
18850        let health = storage.daily_stats_health().unwrap();
18851        assert_eq!(health.conversation_count, 2);
18852        assert_eq!(health.materialized_total, 2);
18853        assert_eq!(health.drift, 0);
18854
18855        let total_messages: i64 = storage
18856            .conn
18857            .query_row_map(
18858                "SELECT message_count FROM daily_stats WHERE agent_slug = 'all' AND source_id = 'all'",
18859                fparams![],
18860                |row| row.get_typed(0),
18861            )
18862            .unwrap();
18863        assert_eq!(total_messages, 3);
18864    }
18865
18866    #[test]
18867    fn rebuild_daily_stats_preserves_byte_counts_with_message_metrics() {
18868        let dir = TempDir::new().unwrap();
18869        let db_path = dir.path().join("test.db");
18870        let storage = SqliteStorage::open(&db_path).unwrap();
18871
18872        let content = "ASCII🙂é漢字";
18873        let expected_bytes = content.len() as i64;
18874        let started_at = 1_704_067_200_000_i64;
18875        let day_id = FrankenStorage::day_id_from_millis(started_at);
18876        let hour_id = FrankenStorage::hour_id_from_millis(started_at);
18877
18878        storage
18879            .conn
18880            .execute_compat(
18881                "INSERT INTO agents (id, slug, name, version, kind, created_at, updated_at)
18882                 VALUES (?1, ?2, ?3, NULL, ?4, 0, 0)",
18883                fparams![1_i64, "tester", "Tester", "cli"],
18884            )
18885            .unwrap();
18886        storage
18887            .conn
18888            .execute_compat(
18889                "INSERT INTO conversations (
18890                    id, agent_id, workspace_id, source_id, external_id, title, source_path,
18891                    started_at, ended_at, approx_tokens, metadata_json, origin_host, metadata_bin
18892                 ) VALUES (?1, ?2, NULL, ?3, ?4, ?5, ?6, ?7, NULL, NULL, ?8, NULL, NULL)",
18893                fparams![
18894                    1_i64,
18895                    1_i64,
18896                    LOCAL_SOURCE_ID,
18897                    "unicode-metrics",
18898                    "Unicode Metrics",
18899                    "/tmp/unicode-metrics.jsonl",
18900                    started_at,
18901                    "{}"
18902                ],
18903            )
18904            .unwrap();
18905        storage
18906            .conn
18907            .execute_compat(
18908                "INSERT INTO messages (
18909                    id, conversation_id, idx, role, author, created_at, content, extra_json, extra_bin
18910                 ) VALUES (?1, ?2, ?3, ?4, NULL, ?5, ?6, NULL, NULL)",
18911                fparams![1_i64, 1_i64, 0_i64, "user", started_at, content],
18912            )
18913            .unwrap();
18914        storage
18915            .conn
18916            .execute_compat(
18917                "INSERT INTO message_metrics (
18918                    message_id, created_at_ms, hour_id, day_id, agent_slug, workspace_id, source_id,
18919                    role, content_chars, content_tokens_est, api_input_tokens, api_output_tokens,
18920                    api_cache_read_tokens, api_cache_creation_tokens, api_thinking_tokens,
18921                    api_service_tier, api_data_source, tool_call_count, has_tool_calls, has_plan,
18922                    model_name, model_family, model_tier, provider
18923                 ) VALUES (
18924                    ?1, ?2, ?3, ?4, ?5, ?6, ?7,
18925                    ?8, ?9, ?10, ?11, ?12,
18926                    ?13, ?14, ?15,
18927                    ?16, ?17, ?18, ?19, ?20,
18928                    ?21, ?22, ?23, ?24
18929                 )",
18930                fparams![
18931                    1_i64,
18932                    started_at,
18933                    hour_id,
18934                    day_id,
18935                    "tester",
18936                    0_i64,
18937                    LOCAL_SOURCE_ID,
18938                    "user",
18939                    expected_bytes,
18940                    expected_bytes / 4,
18941                    0_i64,
18942                    0_i64,
18943                    0_i64,
18944                    0_i64,
18945                    0_i64,
18946                    "",
18947                    "estimated",
18948                    0_i64,
18949                    0_i64,
18950                    0_i64,
18951                    "",
18952                    "unknown",
18953                    "unknown",
18954                    "unknown"
18955                ],
18956            )
18957            .unwrap();
18958
18959        let mut tx = storage.conn.transaction().unwrap();
18960        franken_update_daily_stats_in_tx(
18961            &storage,
18962            &tx,
18963            "tester",
18964            LOCAL_SOURCE_ID,
18965            Some(started_at),
18966            StatsDelta {
18967                session_count_delta: 1,
18968                message_count_delta: 1,
18969                total_chars_delta: expected_bytes,
18970            },
18971        )
18972        .unwrap();
18973        tx.commit().unwrap();
18974
18975        let inline_total: i64 = storage
18976            .conn
18977            .query_row_map(
18978                "SELECT total_chars FROM daily_stats WHERE agent_slug = 'all' AND source_id = 'all'",
18979                fparams![],
18980                |row| row.get_typed(0),
18981            )
18982            .unwrap();
18983        assert_eq!(inline_total, expected_bytes);
18984
18985        storage.conn.execute("DELETE FROM daily_stats").unwrap();
18986
18987        let rebuilt = storage.rebuild_daily_stats().unwrap();
18988        assert_eq!(rebuilt.total_sessions, 1);
18989
18990        let rebuilt_total: i64 = storage
18991            .conn
18992            .query_row_map(
18993                "SELECT total_chars FROM daily_stats WHERE agent_slug = 'all' AND source_id = 'all'",
18994                fparams![],
18995                |row| row.get_typed(0),
18996            )
18997            .unwrap();
18998        assert_eq!(rebuilt_total, expected_bytes);
18999    }
19000
19001    #[test]
19002    fn rebuild_daily_stats_raw_fallback_preserves_byte_counts() {
19003        let dir = TempDir::new().unwrap();
19004        let db_path = dir.path().join("test.db");
19005        let storage = SqliteStorage::open(&db_path).unwrap();
19006
19007        let content = "fallback🙂é漢字";
19008        let expected_bytes = content.len() as i64;
19009        let started_at = 1_704_067_200_000_i64;
19010        storage
19011            .conn
19012            .execute_compat(
19013                "INSERT INTO agents (id, slug, name, version, kind, created_at, updated_at)
19014                 VALUES (?1, ?2, ?3, NULL, ?4, 0, 0)",
19015                fparams![1_i64, "tester", "Tester", "cli"],
19016            )
19017            .unwrap();
19018        storage
19019            .conn
19020            .execute_compat(
19021                "INSERT INTO conversations (
19022                    id, agent_id, workspace_id, source_id, external_id, title, source_path,
19023                    started_at, ended_at, approx_tokens, metadata_json, origin_host, metadata_bin
19024                 ) VALUES (?1, ?2, NULL, ?3, ?4, ?5, ?6, ?7, NULL, NULL, ?8, NULL, NULL)",
19025                fparams![
19026                    1_i64,
19027                    1_i64,
19028                    LOCAL_SOURCE_ID,
19029                    "unicode-fallback",
19030                    "Unicode Fallback",
19031                    "/tmp/unicode-fallback.jsonl",
19032                    started_at,
19033                    "{}"
19034                ],
19035            )
19036            .unwrap();
19037        storage
19038            .conn
19039            .execute_compat(
19040                "INSERT INTO messages (
19041                    id, conversation_id, idx, role, author, created_at, content, extra_json, extra_bin
19042                 ) VALUES (?1, ?2, ?3, ?4, NULL, ?5, ?6, NULL, NULL)",
19043                fparams![1_i64, 1_i64, 0_i64, "assistant", started_at, content],
19044            )
19045            .unwrap();
19046
19047        let mut tx = storage.conn.transaction().unwrap();
19048        franken_update_daily_stats_in_tx(
19049            &storage,
19050            &tx,
19051            "tester",
19052            LOCAL_SOURCE_ID,
19053            Some(started_at),
19054            StatsDelta {
19055                session_count_delta: 1,
19056                message_count_delta: 1,
19057                total_chars_delta: expected_bytes,
19058            },
19059        )
19060        .unwrap();
19061        tx.commit().unwrap();
19062
19063        storage.conn.execute("DELETE FROM daily_stats").unwrap();
19064
19065        let rebuilt = storage.rebuild_daily_stats().unwrap();
19066        assert_eq!(rebuilt.total_sessions, 1);
19067
19068        let rebuilt_total: i64 = storage
19069            .conn
19070            .query_row_map(
19071                "SELECT total_chars FROM daily_stats WHERE agent_slug = 'all' AND source_id = 'all'",
19072                fparams![],
19073                |row| row.get_typed(0),
19074            )
19075            .unwrap();
19076        assert_eq!(rebuilt_total, expected_bytes);
19077    }
19078
19079    #[test]
19080    fn insert_conversations_batched_appends_duplicate_external_id() {
19081        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
19082        use std::path::PathBuf;
19083
19084        let dir = TempDir::new().unwrap();
19085        let db_path = dir.path().join("test.db");
19086        let storage = SqliteStorage::open(&db_path).unwrap();
19087
19088        let agent = Agent {
19089            id: None,
19090            slug: "codex".into(),
19091            name: "Codex".into(),
19092            version: Some("0.2.3".into()),
19093            kind: AgentKind::Cli,
19094        };
19095        let agent_id = storage.ensure_agent(&agent).unwrap();
19096
19097        let base_conv = |messages: Vec<Message>| Conversation {
19098            id: None,
19099            agent_slug: "codex".into(),
19100            workspace: Some(PathBuf::from("/tmp/workspace")),
19101            external_id: Some("shared-session".into()),
19102            title: Some("Shared Session".into()),
19103            source_path: PathBuf::from("/tmp/rollout.jsonl"),
19104            started_at: Some(1_700_000_000_000),
19105            ended_at: Some(1_700_000_000_999),
19106            approx_tokens: None,
19107            metadata_json: serde_json::Value::Null,
19108            messages,
19109            source_id: "local".into(),
19110            origin_host: None,
19111        };
19112
19113        let conv_a = base_conv(vec![
19114            Message {
19115                id: None,
19116                idx: 0,
19117                role: MessageRole::User,
19118                author: None,
19119                created_at: Some(1_700_000_000_000),
19120                content: "first".into(),
19121                extra_json: serde_json::Value::Null,
19122                snippets: Vec::new(),
19123            },
19124            Message {
19125                id: None,
19126                idx: 1,
19127                role: MessageRole::Agent,
19128                author: None,
19129                created_at: Some(1_700_000_000_100),
19130                content: "second".into(),
19131                extra_json: serde_json::Value::Null,
19132                snippets: Vec::new(),
19133            },
19134        ]);
19135        let conv_b = base_conv(vec![
19136            Message {
19137                id: None,
19138                idx: 0,
19139                role: MessageRole::User,
19140                author: None,
19141                created_at: Some(1_700_000_000_000),
19142                content: "first".into(),
19143                extra_json: serde_json::Value::Null,
19144                snippets: Vec::new(),
19145            },
19146            Message {
19147                id: None,
19148                idx: 1,
19149                role: MessageRole::Agent,
19150                author: None,
19151                created_at: Some(1_700_000_000_100),
19152                content: "second".into(),
19153                extra_json: serde_json::Value::Null,
19154                snippets: Vec::new(),
19155            },
19156            Message {
19157                id: None,
19158                idx: 2,
19159                role: MessageRole::User,
19160                author: None,
19161                created_at: Some(1_700_000_000_200),
19162                content: "third".into(),
19163                extra_json: serde_json::Value::Null,
19164                snippets: Vec::new(),
19165            },
19166            Message {
19167                id: None,
19168                idx: 3,
19169                role: MessageRole::Agent,
19170                author: None,
19171                created_at: Some(1_700_000_000_300),
19172                content: "fourth".into(),
19173                extra_json: serde_json::Value::Null,
19174                snippets: Vec::new(),
19175            },
19176        ]);
19177
19178        let outcomes = storage
19179            .insert_conversations_batched(&[(agent_id, None, &conv_a), (agent_id, None, &conv_b)])
19180            .unwrap();
19181        assert_eq!(outcomes.len(), 2);
19182        assert_eq!(outcomes[0].inserted_indices, vec![0, 1]);
19183        assert_eq!(outcomes[1].inserted_indices, vec![2, 3]);
19184        assert_eq!(outcomes[0].conversation_id, outcomes[1].conversation_id);
19185
19186        let conversation_count: i64 = storage
19187            .conn
19188            .query_row_map("SELECT COUNT(*) FROM conversations", fparams![], |row| {
19189                row.get_typed(0)
19190            })
19191            .unwrap();
19192        let conversation_count_not_indexed: i64 = storage
19193            .conn
19194            .query_row_map(
19195                "SELECT COUNT(*) FROM conversations NOT INDEXED",
19196                fparams![],
19197                |row| row.get_typed(0),
19198            )
19199            .unwrap();
19200        let conversation_count_source_index: i64 = storage
19201            .conn
19202            .query_row_map(
19203                "SELECT COUNT(*) FROM conversations INDEXED BY idx_conversations_source_id",
19204                fparams![],
19205                |row| row.get_typed(0),
19206            )
19207            .unwrap();
19208        let message_count: i64 = storage
19209            .conn
19210            .query_row_map("SELECT COUNT(*) FROM messages", fparams![], |row| {
19211                row.get_typed(0)
19212            })
19213            .unwrap();
19214        let reopened_storage = SqliteStorage::open(&db_path).unwrap();
19215        let reopened_conversation_count: i64 = reopened_storage
19216            .conn
19217            .query_row_map("SELECT COUNT(*) FROM conversations", fparams![], |row| {
19218                row.get_typed(0)
19219            })
19220            .unwrap();
19221        let reopened_conversation_count_not_indexed: i64 = reopened_storage
19222            .conn
19223            .query_row_map(
19224                "SELECT COUNT(*) FROM conversations NOT INDEXED",
19225                fparams![],
19226                |row| row.get_typed(0),
19227            )
19228            .unwrap();
19229        let reopened_conversation_ids: Vec<i64> = reopened_storage
19230            .conn
19231            .query_map_collect(
19232                "SELECT id FROM conversations ORDER BY id",
19233                fparams![],
19234                |row| row.get_typed(0),
19235            )
19236            .unwrap();
19237        let reopened_conversation_ids_not_indexed: Vec<i64> = reopened_storage
19238            .conn
19239            .query_map_collect(
19240                "SELECT id FROM conversations NOT INDEXED ORDER BY id",
19241                fparams![],
19242                |row| row.get_typed(0),
19243            )
19244            .unwrap();
19245        let reopened_conversation_ids_source_index: Vec<i64> = reopened_storage
19246            .conn
19247            .query_map_collect(
19248                "SELECT id FROM conversations INDEXED BY idx_conversations_source_id ORDER BY id",
19249                fparams![],
19250                |row| row.get_typed(0),
19251            )
19252            .unwrap();
19253
19254        assert_eq!(reopened_conversation_ids, vec![outcomes[0].conversation_id]);
19255        assert_eq!(
19256            reopened_conversation_ids_not_indexed,
19257            vec![outcomes[0].conversation_id]
19258        );
19259        assert_eq!(
19260            reopened_conversation_ids_source_index,
19261            vec![outcomes[0].conversation_id]
19262        );
19263        assert_eq!(reopened_conversation_count, 1);
19264        assert_eq!(reopened_conversation_count_not_indexed, 1);
19265        assert_eq!(conversation_count_not_indexed, 1);
19266        assert_eq!(conversation_count_source_index, 1);
19267        assert_eq!(conversation_count, 1);
19268        assert_eq!(message_count, 4);
19269    }
19270
19271    #[test]
19272    fn franken_insert_conversation_or_get_existing_recovers_unique_conflict() {
19273        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
19274        use std::path::PathBuf;
19275
19276        let dir = TempDir::new().unwrap();
19277        let db_path = dir.path().join("test.db");
19278        let storage = SqliteStorage::open(&db_path).unwrap();
19279
19280        let agent = Agent {
19281            id: None,
19282            slug: "codex".into(),
19283            name: "Codex".into(),
19284            version: Some("0.2.3".into()),
19285            kind: AgentKind::Cli,
19286        };
19287        let agent_id = storage.ensure_agent(&agent).unwrap();
19288
19289        let conv = Conversation {
19290            id: None,
19291            agent_slug: "codex".into(),
19292            workspace: Some(PathBuf::from("/tmp/workspace")),
19293            external_id: Some("recover-duplicate".into()),
19294            title: Some("Recover Duplicate".into()),
19295            source_path: PathBuf::from("/tmp/rollout.jsonl"),
19296            started_at: Some(1_700_000_000_000),
19297            ended_at: Some(1_700_000_000_100),
19298            approx_tokens: None,
19299            metadata_json: serde_json::Value::Null,
19300            messages: vec![Message {
19301                id: None,
19302                idx: 0,
19303                role: MessageRole::User,
19304                author: None,
19305                created_at: Some(1_700_000_000_000),
19306                content: "hello".into(),
19307                extra_json: serde_json::Value::Null,
19308                snippets: Vec::new(),
19309            }],
19310            source_id: "local".into(),
19311            origin_host: None,
19312        };
19313
19314        let tx = storage.conn.transaction().unwrap();
19315        let inserted_id = franken_insert_conversation(&tx, agent_id, None, &conv)
19316            .unwrap()
19317            .expect("first insert should succeed");
19318
19319        let conversation_key = conversation_merge_key(agent_id, &conv);
19320        let resolved = franken_insert_conversation_or_get_existing_after_miss(
19321            &tx,
19322            agent_id,
19323            None,
19324            &conv,
19325            &conversation_key,
19326        )
19327        .unwrap();
19328
19329        assert!(
19330            matches!(
19331                resolved,
19332                ConversationInsertStatus::Existing(existing_id)
19333                    if existing_id.cmp(&inserted_id).is_eq()
19334            ),
19335            "expected existing conversation id {inserted_id}"
19336        );
19337
19338        let conversation_count: i64 = tx
19339            .query_row_map("SELECT COUNT(*) FROM conversations", fparams![], |row| {
19340                row.get_typed(0)
19341            })
19342            .unwrap();
19343        assert_eq!(conversation_count, 1);
19344    }
19345
19346    #[test]
19347    fn insert_conversations_batched_merges_duplicate_external_id_with_gaps() {
19348        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
19349        use std::path::PathBuf;
19350
19351        let dir = TempDir::new().unwrap();
19352        let db_path = dir.path().join("test.db");
19353        let storage = SqliteStorage::open(&db_path).unwrap();
19354
19355        let agent = Agent {
19356            id: None,
19357            slug: "codex".into(),
19358            name: "Codex".into(),
19359            version: Some("0.2.3".into()),
19360            kind: AgentKind::Cli,
19361        };
19362        let agent_id = storage.ensure_agent(&agent).unwrap();
19363
19364        let base_conv = |messages: Vec<Message>| Conversation {
19365            id: None,
19366            agent_slug: "codex".into(),
19367            workspace: Some(PathBuf::from("/tmp/workspace")),
19368            external_id: Some("shared-session-gap".into()),
19369            title: Some("Shared Session Gap".into()),
19370            source_path: PathBuf::from("/tmp/rollout.jsonl"),
19371            started_at: Some(1_700_000_000_000),
19372            ended_at: Some(1_700_000_000_999),
19373            approx_tokens: None,
19374            metadata_json: serde_json::Value::Null,
19375            messages,
19376            source_id: "local".into(),
19377            origin_host: None,
19378        };
19379
19380        let conv_a = base_conv(vec![
19381            Message {
19382                id: None,
19383                idx: 2,
19384                role: MessageRole::User,
19385                author: None,
19386                created_at: Some(1_700_000_000_200),
19387                content: "third".into(),
19388                extra_json: serde_json::Value::Null,
19389                snippets: Vec::new(),
19390            },
19391            Message {
19392                id: None,
19393                idx: 3,
19394                role: MessageRole::Agent,
19395                author: None,
19396                created_at: Some(1_700_000_000_300),
19397                content: "fourth".into(),
19398                extra_json: serde_json::Value::Null,
19399                snippets: Vec::new(),
19400            },
19401        ]);
19402        let conv_b = base_conv(vec![
19403            Message {
19404                id: None,
19405                idx: 0,
19406                role: MessageRole::User,
19407                author: None,
19408                created_at: Some(1_700_000_000_000),
19409                content: "first".into(),
19410                extra_json: serde_json::Value::Null,
19411                snippets: Vec::new(),
19412            },
19413            Message {
19414                id: None,
19415                idx: 1,
19416                role: MessageRole::Agent,
19417                author: None,
19418                created_at: Some(1_700_000_000_100),
19419                content: "second".into(),
19420                extra_json: serde_json::Value::Null,
19421                snippets: Vec::new(),
19422            },
19423            Message {
19424                id: None,
19425                idx: 3,
19426                role: MessageRole::Agent,
19427                author: None,
19428                created_at: Some(1_700_000_000_300),
19429                content: "fourth".into(),
19430                extra_json: serde_json::Value::Null,
19431                snippets: Vec::new(),
19432            },
19433        ]);
19434
19435        let outcomes = storage
19436            .insert_conversations_batched(&[(agent_id, None, &conv_a), (agent_id, None, &conv_b)])
19437            .unwrap();
19438        assert_eq!(outcomes.len(), 2);
19439        assert_eq!(outcomes[0].inserted_indices, vec![2, 3]);
19440        assert_eq!(outcomes[1].inserted_indices, vec![0, 1]);
19441        assert_eq!(outcomes[0].conversation_id, outcomes[1].conversation_id);
19442
19443        let stored_indices: Vec<i64> = storage
19444            .conn
19445            .query_map_collect("SELECT idx FROM messages ORDER BY idx", fparams![], |row| {
19446                row.get_typed(0)
19447            })
19448            .unwrap();
19449        assert_eq!(stored_indices, vec![0, 1, 2, 3]);
19450    }
19451
19452    #[test]
19453    fn insert_conversations_batched_refreshes_partial_pending_message_lookup() {
19454        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
19455        use std::path::PathBuf;
19456
19457        let dir = TempDir::new().unwrap();
19458        let db_path = dir.path().join("test.db");
19459        let storage = SqliteStorage::open(&db_path).unwrap();
19460
19461        let agent = Agent {
19462            id: None,
19463            slug: "codex".into(),
19464            name: "Codex".into(),
19465            version: Some("0.2.3".into()),
19466            kind: AgentKind::Cli,
19467        };
19468        let agent_id = storage.ensure_agent(&agent).unwrap();
19469
19470        let make_message = |idx: i64, content: &str| Message {
19471            id: None,
19472            idx,
19473            role: if idx == 0 {
19474                MessageRole::User
19475            } else {
19476                MessageRole::Agent
19477            },
19478            author: None,
19479            created_at: Some(1_700_000_000_000 + idx),
19480            content: content.into(),
19481            extra_json: serde_json::Value::Null,
19482            snippets: Vec::new(),
19483        };
19484
19485        let base_conv = |messages: Vec<Message>| Conversation {
19486            id: None,
19487            agent_slug: "codex".into(),
19488            workspace: Some(PathBuf::from("/tmp/workspace")),
19489            external_id: Some("partial-cache-session".into()),
19490            title: Some("Partial cache session".into()),
19491            source_path: PathBuf::from("/tmp/partial-cache.jsonl"),
19492            started_at: Some(1_700_000_000_000),
19493            ended_at: Some(1_700_000_000_100),
19494            approx_tokens: None,
19495            metadata_json: serde_json::Value::Null,
19496            messages,
19497            source_id: "local".into(),
19498            origin_host: None,
19499        };
19500
19501        let canonical = base_conv(vec![
19502            make_message(0, "canonical zero"),
19503            make_message(20, "canonical twenty"),
19504        ]);
19505        storage
19506            .insert_conversation_tree(agent_id, None, &canonical)
19507            .unwrap();
19508
19509        let exact_prefix = base_conv(vec![make_message(0, "canonical zero")]);
19510        let conflicting_tail = base_conv(vec![make_message(20, "conflicting twenty")]);
19511
19512        let outcomes = storage
19513            .insert_conversations_batched(&[
19514                (agent_id, None, &exact_prefix),
19515                (agent_id, None, &conflicting_tail),
19516            ])
19517            .unwrap();
19518
19519        assert_eq!(outcomes.len(), 2);
19520        assert!(outcomes[0].inserted_indices.is_empty());
19521        assert!(
19522            outcomes[1].inserted_indices.is_empty(),
19523            "the second batch item must refresh the partial pending lookup and retain the canonical idx=20 row"
19524        );
19525
19526        let stored_messages: Vec<(i64, String)> = storage
19527            .conn
19528            .query_map_collect(
19529                "SELECT idx, content FROM messages ORDER BY idx",
19530                fparams![],
19531                |row| Ok((row.get_typed(0)?, row.get_typed(1)?)),
19532            )
19533            .unwrap();
19534        assert_eq!(
19535            stored_messages,
19536            vec![
19537                (0, "canonical zero".to_string()),
19538                (20, "canonical twenty".to_string()),
19539            ]
19540        );
19541    }
19542
19543    #[test]
19544    fn insert_conversations_batched_reprocessing_conversation_is_idempotent() {
19545        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
19546        use std::path::PathBuf;
19547
19548        const MESSAGE_COUNT: i64 = 64;
19549
19550        let dir = TempDir::new().unwrap();
19551        let db_path = dir.path().join("test.db");
19552        let storage = SqliteStorage::open(&db_path).unwrap();
19553
19554        let agent = Agent {
19555            id: None,
19556            slug: "codex".into(),
19557            name: "Codex".into(),
19558            version: Some("0.2.3".into()),
19559            kind: AgentKind::Cli,
19560        };
19561        let agent_id = storage.ensure_agent(&agent).unwrap();
19562
19563        let messages: Vec<Message> = (0..MESSAGE_COUNT)
19564            .map(|idx| Message {
19565                id: None,
19566                idx,
19567                role: if idx % 2 == 0 {
19568                    MessageRole::User
19569                } else {
19570                    MessageRole::Agent
19571                },
19572                author: None,
19573                created_at: Some(1_700_000_000_000 + idx),
19574                content: format!("message {idx}"),
19575                extra_json: serde_json::Value::Null,
19576                snippets: Vec::new(),
19577            })
19578            .collect();
19579
19580        let conversation = Conversation {
19581            id: None,
19582            agent_slug: "codex".into(),
19583            workspace: Some(PathBuf::from("/tmp/workspace")),
19584            external_id: Some("large-reprocess-session".into()),
19585            title: Some("Large Reprocess Session".into()),
19586            source_path: PathBuf::from("/tmp/large-reprocess-session.jsonl"),
19587            started_at: Some(1_700_000_000_000),
19588            ended_at: Some(1_700_000_000_000 + MESSAGE_COUNT - 1),
19589            approx_tokens: None,
19590            metadata_json: serde_json::Value::Null,
19591            messages,
19592            source_id: "local".into(),
19593            origin_host: None,
19594        };
19595
19596        let first = storage
19597            .insert_conversations_batched(&[(agent_id, None, &conversation)])
19598            .unwrap();
19599        let second = storage
19600            .insert_conversations_batched(&[(agent_id, None, &conversation)])
19601            .unwrap();
19602
19603        assert_eq!(first.len(), 1);
19604        assert_eq!(second.len(), 1);
19605        assert_eq!(first[0].inserted_indices.len(), MESSAGE_COUNT as usize);
19606        assert!(
19607            second[0].inserted_indices.is_empty(),
19608            "full reprocessing of a large conversation must not attempt duplicate idx inserts"
19609        );
19610        assert_eq!(first[0].conversation_id, second[0].conversation_id);
19611
19612        let conversation_count: i64 = storage
19613            .conn
19614            .query_row_map("SELECT COUNT(*) FROM conversations", fparams![], |row| {
19615                row.get_typed(0)
19616            })
19617            .unwrap();
19618        let message_count: i64 = storage
19619            .conn
19620            .query_row_map("SELECT COUNT(*) FROM messages", fparams![], |row| {
19621                row.get_typed(0)
19622            })
19623            .unwrap();
19624
19625        assert_eq!(conversation_count, 1);
19626        assert_eq!(message_count, MESSAGE_COUNT);
19627    }
19628
19629    #[test]
19630    fn parallel_insert_conversation_tree_keeps_unique_external_ids_distinct() {
19631        use crate::connectors::{NormalizedConversation, NormalizedMessage};
19632        use crate::indexer::persist::map_to_internal;
19633        use crate::model::types::{Agent, AgentKind};
19634        use frankensqlite::compat::{ConnectionExt, RowExt};
19635        use rand::RngExt;
19636        use rayon::prelude::*;
19637
19638        fn retryable_franken_error(err: &anyhow::Error) -> bool {
19639            err.downcast_ref::<frankensqlite::FrankenError>()
19640                .or_else(|| {
19641                    err.root_cause()
19642                        .downcast_ref::<frankensqlite::FrankenError>()
19643                })
19644                .is_some_and(|inner| {
19645                    matches!(
19646                        inner,
19647                        frankensqlite::FrankenError::Busy
19648                            | frankensqlite::FrankenError::BusyRecovery
19649                            | frankensqlite::FrankenError::BusySnapshot { .. }
19650                            | frankensqlite::FrankenError::WriteConflict { .. }
19651                            | frankensqlite::FrankenError::SerializationFailure { .. }
19652                    )
19653                })
19654        }
19655
19656        fn with_retry<F, T>(mut f: F) -> anyhow::Result<T>
19657        where
19658            F: FnMut() -> anyhow::Result<T>,
19659        {
19660            let mut rng = rand::rng();
19661            let mut backoff_ms = 4_u64;
19662            for attempt in 0..=24 {
19663                match f() {
19664                    Ok(value) => return Ok(value),
19665                    Err(err) if attempt < 24 && retryable_franken_error(&err) => {
19666                        let sleep_ms = backoff_ms + rng.random_range(0..=backoff_ms);
19667                        std::thread::sleep(Duration::from_millis(sleep_ms));
19668                        backoff_ms = (backoff_ms * 2).min(512);
19669                    }
19670                    Err(err) => return Err(err),
19671                }
19672            }
19673            unreachable!("retry loop must return on success or final failure")
19674        }
19675
19676        let dir = TempDir::new().unwrap();
19677        let db_path = dir.path().join("parallel_insert_conversation_tree.db");
19678        let seed = FrankenStorage::open(&db_path).unwrap();
19679        drop(seed);
19680
19681        let conversations: Vec<NormalizedConversation> = (0..10)
19682            .map(|i| NormalizedConversation {
19683                agent_slug: format!("agent-{}", i % 3),
19684                external_id: Some(format!("conv-{i}")),
19685                title: Some(format!("Conversation {i}")),
19686                workspace: Some(PathBuf::from(format!("/ws/{i}"))),
19687                source_path: PathBuf::from(format!("/log/{i}.jsonl")),
19688                started_at: Some(1_000 + i * 100),
19689                ended_at: Some(1_000 + i * 100 + 50),
19690                metadata: serde_json::json!({}),
19691                messages: (0..3)
19692                    .map(|j| NormalizedMessage {
19693                        idx: j,
19694                        role: if j % 2 == 0 { "user" } else { "assistant" }.to_string(),
19695                        author: Some("tester".into()),
19696                        created_at: Some(1_000 + i * 100 + j * 10),
19697                        content: format!("parallel-distinct-test conv={i} msg={j}"),
19698                        extra: serde_json::json!({}),
19699                        snippets: vec![],
19700                        invocations: Vec::new(),
19701                    })
19702                    .collect(),
19703            })
19704            .collect();
19705
19706        let mut outcomes: Vec<(String, i64, Vec<i64>)> = conversations
19707            .par_chunks(3)
19708            .map(|chunk| {
19709                let storage = FrankenStorage::open_writer(&db_path).unwrap();
19710                let mut agent_cache: HashMap<String, i64> = HashMap::new();
19711                let mut workspace_cache: HashMap<PathBuf, i64> = HashMap::new();
19712                let mut chunk_outcomes = Vec::with_capacity(chunk.len());
19713
19714                for conv in chunk {
19715                    let agent_slug = conv.agent_slug.clone();
19716                    let workspace = conv.workspace.clone();
19717                    let external_id = conv.external_id.clone().expect("external id");
19718                    let internal = map_to_internal(conv);
19719                    let outcome = with_retry(|| {
19720                        let agent_id = if let Some(id) = agent_cache.get(&agent_slug) {
19721                            *id
19722                        } else {
19723                            let agent = Agent {
19724                                id: None,
19725                                slug: agent_slug.clone(),
19726                                name: agent_slug.clone(),
19727                                version: None,
19728                                kind: AgentKind::Cli,
19729                            };
19730                            let id = storage.ensure_agent(&agent)?;
19731                            agent_cache.insert(agent_slug.clone(), id);
19732                            id
19733                        };
19734                        let workspace_id = if let Some(path) = &workspace {
19735                            if let Some(id) = workspace_cache.get(path) {
19736                                Some(*id)
19737                            } else {
19738                                let id = storage.ensure_workspace(path, None)?;
19739                                workspace_cache.insert(path.clone(), id);
19740                                Some(id)
19741                            }
19742                        } else {
19743                            None
19744                        };
19745                        storage.insert_conversation_tree(agent_id, workspace_id, &internal)
19746                    })
19747                    .unwrap();
19748                    chunk_outcomes.push((
19749                        external_id,
19750                        outcome.conversation_id,
19751                        outcome.inserted_indices,
19752                    ));
19753                }
19754
19755                storage.close().unwrap();
19756                chunk_outcomes
19757            })
19758            .flatten()
19759            .collect();
19760        outcomes.sort_by(|left, right| left.0.cmp(&right.0));
19761
19762        assert!(
19763            outcomes
19764                .iter()
19765                .all(|(_, _, inserted_indices)| inserted_indices == &vec![0, 1, 2]),
19766            "unique external ids must not be routed through the existing-conversation merge path: {outcomes:?}"
19767        );
19768
19769        let distinct_ids: HashSet<i64> = outcomes
19770            .iter()
19771            .map(|(_, conversation_id, _)| *conversation_id)
19772            .collect();
19773        assert_eq!(
19774            distinct_ids.len(),
19775            conversations.len(),
19776            "unique external ids must produce distinct conversation ids: {outcomes:?}"
19777        );
19778
19779        let reader = FrankenStorage::open(&db_path).unwrap();
19780        let stored_rows: Vec<(i64, String)> = reader
19781            .raw()
19782            .query_map_collect(
19783                "SELECT id, external_id FROM conversations ORDER BY id",
19784                &[],
19785                |row| Ok((row.get_typed(0)?, row.get_typed(1)?)),
19786            )
19787            .unwrap();
19788        let stored_count: i64 = reader
19789            .raw()
19790            .query_row_map("SELECT COUNT(*) FROM conversations", &[], |row| {
19791                row.get_typed(0)
19792            })
19793            .unwrap();
19794
19795        assert_eq!(
19796            stored_count as usize,
19797            conversations.len(),
19798            "parallel distinct inserts must persist one row per external id; rows={stored_rows:?}; outcomes={outcomes:?}"
19799        );
19800        assert_eq!(
19801            stored_rows.len(),
19802            conversations.len(),
19803            "parallel distinct inserts must remain visible after reopening; rows={stored_rows:?}; outcomes={outcomes:?}"
19804        );
19805    }
19806
19807    #[test]
19808    fn insert_conversation_tree_merges_duplicate_external_id_with_gaps() {
19809        use crate::connectors::{NormalizedConversation, NormalizedMessage};
19810        use crate::indexer::persist::map_to_internal;
19811        use crate::model::types::{Agent, AgentKind};
19812        use std::path::PathBuf;
19813
19814        let dir = TempDir::new().unwrap();
19815        let db_path = dir.path().join("test.db");
19816        let storage = SqliteStorage::open(&db_path).unwrap();
19817
19818        let agent = Agent {
19819            id: None,
19820            slug: "codex".into(),
19821            name: "Codex".into(),
19822            version: Some("0.2.3".into()),
19823            kind: AgentKind::Cli,
19824        };
19825        let agent_id = storage.ensure_agent(&agent).unwrap();
19826
19827        let base_conv = |messages: Vec<NormalizedMessage>| NormalizedConversation {
19828            agent_slug: "codex".into(),
19829            workspace: Some(PathBuf::from("/tmp/workspace")),
19830            external_id: Some("tree-gap-session".into()),
19831            title: Some("Tree Gap Session".into()),
19832            source_path: PathBuf::from("/tmp/tree.jsonl"),
19833            started_at: Some(1_700_000_000_000),
19834            ended_at: Some(1_700_000_000_999),
19835            metadata: serde_json::Value::Null,
19836            messages,
19837        };
19838
19839        let conv_a = map_to_internal(&base_conv(vec![
19840            NormalizedMessage {
19841                idx: 2,
19842                role: "user".into(),
19843                author: None,
19844                created_at: Some(1_700_000_000_200),
19845                content: "third".into(),
19846                extra: serde_json::Value::Null,
19847                snippets: Vec::new(),
19848                invocations: Vec::new(),
19849            },
19850            NormalizedMessage {
19851                idx: 3,
19852                role: "assistant".into(),
19853                author: None,
19854                created_at: Some(1_700_000_000_300),
19855                content: "fourth".into(),
19856                extra: serde_json::Value::Null,
19857                snippets: Vec::new(),
19858                invocations: Vec::new(),
19859            },
19860        ]));
19861        let conv_b = map_to_internal(&base_conv(vec![
19862            NormalizedMessage {
19863                idx: 0,
19864                role: "user".into(),
19865                author: None,
19866                created_at: Some(1_700_000_000_000),
19867                content: "first".into(),
19868                extra: serde_json::Value::Null,
19869                snippets: Vec::new(),
19870                invocations: Vec::new(),
19871            },
19872            NormalizedMessage {
19873                idx: 1,
19874                role: "assistant".into(),
19875                author: None,
19876                created_at: Some(1_700_000_000_100),
19877                content: "second".into(),
19878                extra: serde_json::Value::Null,
19879                snippets: Vec::new(),
19880                invocations: Vec::new(),
19881            },
19882            NormalizedMessage {
19883                idx: 3,
19884                role: "assistant".into(),
19885                author: None,
19886                created_at: Some(1_700_000_000_300),
19887                content: "fourth".into(),
19888                extra: serde_json::Value::Null,
19889                snippets: Vec::new(),
19890                invocations: Vec::new(),
19891            },
19892        ]));
19893
19894        let first = storage
19895            .insert_conversation_tree(agent_id, None, &conv_a)
19896            .unwrap();
19897        let second = storage
19898            .insert_conversation_tree(agent_id, None, &conv_b)
19899            .unwrap();
19900
19901        assert_eq!(first.inserted_indices, vec![2, 3]);
19902        assert_eq!(second.inserted_indices, vec![0, 1]);
19903        assert_eq!(first.conversation_id, second.conversation_id);
19904
19905        let stored_indices: Vec<i64> = storage
19906            .conn
19907            .query_map_collect("SELECT idx FROM messages ORDER BY idx", fparams![], |row| {
19908                row.get_typed(0)
19909            })
19910            .unwrap();
19911        assert_eq!(stored_indices, vec![0, 1, 2, 3]);
19912    }
19913
19914    #[test]
19915    fn insert_conversation_tree_skips_duplicate_message_indices_for_new_conversation() {
19916        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
19917        use std::path::PathBuf;
19918
19919        let dir = TempDir::new().unwrap();
19920        let db_path = dir.path().join("test.db");
19921        let storage = SqliteStorage::open(&db_path).unwrap();
19922
19923        let agent = Agent {
19924            id: None,
19925            slug: "codex".into(),
19926            name: "Codex".into(),
19927            version: Some("0.2.3".into()),
19928            kind: AgentKind::Cli,
19929        };
19930        let agent_id = storage.ensure_agent(&agent).unwrap();
19931
19932        let conversation = Conversation {
19933            id: None,
19934            agent_slug: "codex".into(),
19935            workspace: Some(PathBuf::from("/tmp/workspace")),
19936            external_id: Some("duplicate-new-session".into()),
19937            title: Some("Duplicate New Session".into()),
19938            source_path: PathBuf::from("/tmp/duplicate-new-session.jsonl"),
19939            started_at: Some(1_700_000_000_000),
19940            ended_at: Some(1_700_000_000_999),
19941            approx_tokens: None,
19942            metadata_json: serde_json::Value::Null,
19943            messages: vec![
19944                Message {
19945                    id: None,
19946                    idx: 0,
19947                    role: MessageRole::User,
19948                    author: None,
19949                    created_at: Some(1_700_000_000_000),
19950                    content: "first canonical".into(),
19951                    extra_json: serde_json::Value::Null,
19952                    snippets: Vec::new(),
19953                },
19954                Message {
19955                    id: None,
19956                    idx: 0,
19957                    role: MessageRole::User,
19958                    author: None,
19959                    created_at: Some(1_700_000_000_001),
19960                    content: "duplicate idx should be skipped".into(),
19961                    extra_json: serde_json::Value::Null,
19962                    snippets: Vec::new(),
19963                },
19964                Message {
19965                    id: None,
19966                    idx: 1,
19967                    role: MessageRole::Agent,
19968                    author: None,
19969                    created_at: Some(1_700_000_000_100),
19970                    content: "second".into(),
19971                    extra_json: serde_json::Value::Null,
19972                    snippets: Vec::new(),
19973                },
19974            ],
19975            source_id: "local".into(),
19976            origin_host: None,
19977        };
19978
19979        let outcome = storage
19980            .insert_conversation_tree(agent_id, None, &conversation)
19981            .unwrap();
19982
19983        assert_eq!(outcome.inserted_indices, vec![0, 1]);
19984
19985        let stored_messages: Vec<(i64, String)> = storage
19986            .conn
19987            .query_map_collect(
19988                "SELECT idx, content FROM messages ORDER BY idx",
19989                fparams![],
19990                |row| Ok((row.get_typed(0)?, row.get_typed(1)?)),
19991            )
19992            .unwrap();
19993        assert_eq!(
19994            stored_messages,
19995            vec![
19996                (0, "first canonical".to_string()),
19997                (1, "second".to_string())
19998            ]
19999        );
20000    }
20001
20002    #[test]
20003    fn insert_conversation_tree_merges_duplicate_source_path_without_external_id() {
20004        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
20005        use std::path::PathBuf;
20006
20007        let dir = TempDir::new().unwrap();
20008        let db_path = dir.path().join("test.db");
20009        let storage = SqliteStorage::open(&db_path).unwrap();
20010
20011        let agent = Agent {
20012            id: None,
20013            slug: "codex".into(),
20014            name: "Codex".into(),
20015            version: Some("0.2.3".into()),
20016            kind: AgentKind::Cli,
20017        };
20018        let agent_id = storage.ensure_agent(&agent).unwrap();
20019
20020        let base_conv = |messages: Vec<Message>| Conversation {
20021            id: None,
20022            agent_slug: "codex".into(),
20023            workspace: Some(PathBuf::from("/tmp/workspace")),
20024            external_id: None,
20025            title: Some("Source Path Merge".into()),
20026            source_path: PathBuf::from("/tmp/shared-session.jsonl"),
20027            started_at: Some(1_700_000_000_000),
20028            ended_at: Some(1_700_000_000_999),
20029            approx_tokens: None,
20030            metadata_json: serde_json::Value::Null,
20031            messages,
20032            source_id: "local".into(),
20033            origin_host: None,
20034        };
20035
20036        let first = storage
20037            .insert_conversation_tree(
20038                agent_id,
20039                None,
20040                &base_conv(vec![
20041                    Message {
20042                        id: None,
20043                        idx: 0,
20044                        role: MessageRole::User,
20045                        author: None,
20046                        created_at: Some(1_700_000_000_000),
20047                        content: "first".into(),
20048                        extra_json: serde_json::Value::Null,
20049                        snippets: Vec::new(),
20050                    },
20051                    Message {
20052                        id: None,
20053                        idx: 1,
20054                        role: MessageRole::Agent,
20055                        author: None,
20056                        created_at: Some(1_700_000_000_100),
20057                        content: "second".into(),
20058                        extra_json: serde_json::Value::Null,
20059                        snippets: Vec::new(),
20060                    },
20061                ]),
20062            )
20063            .unwrap();
20064
20065        let second = storage
20066            .insert_conversation_tree(
20067                agent_id,
20068                None,
20069                &base_conv(vec![
20070                    Message {
20071                        id: None,
20072                        idx: 1,
20073                        role: MessageRole::Agent,
20074                        author: None,
20075                        created_at: Some(1_700_000_000_100),
20076                        content: "second".into(),
20077                        extra_json: serde_json::Value::Null,
20078                        snippets: Vec::new(),
20079                    },
20080                    Message {
20081                        id: None,
20082                        idx: 2,
20083                        role: MessageRole::User,
20084                        author: None,
20085                        created_at: Some(1_700_000_000_200),
20086                        content: "third".into(),
20087                        extra_json: serde_json::Value::Null,
20088                        snippets: Vec::new(),
20089                    },
20090                ]),
20091            )
20092            .unwrap();
20093
20094        assert_eq!(first.conversation_id, second.conversation_id);
20095        assert_eq!(first.inserted_indices, vec![0, 1]);
20096        assert_eq!(second.inserted_indices, vec![2]);
20097
20098        let stored_indices: Vec<i64> = storage
20099            .conn
20100            .query_map_collect("SELECT idx FROM messages ORDER BY idx", fparams![], |row| {
20101                row.get_typed(0)
20102            })
20103            .unwrap();
20104        assert_eq!(stored_indices, vec![0, 1, 2]);
20105    }
20106
20107    #[test]
20108    fn insert_conversation_tree_merges_source_path_duplicates_with_start_drift() {
20109        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
20110        use std::path::PathBuf;
20111
20112        let dir = TempDir::new().unwrap();
20113        let db_path = dir.path().join("test.db");
20114        let storage = SqliteStorage::open(&db_path).unwrap();
20115
20116        let agent = Agent {
20117            id: None,
20118            slug: "codex".into(),
20119            name: "Codex".into(),
20120            version: Some("0.2.3".into()),
20121            kind: AgentKind::Cli,
20122        };
20123        let agent_id = storage.ensure_agent(&agent).unwrap();
20124
20125        let base_conv = |started_at: Option<i64>, messages: Vec<Message>| Conversation {
20126            id: None,
20127            agent_slug: "codex".into(),
20128            workspace: Some(PathBuf::from("/tmp/workspace")),
20129            external_id: None,
20130            title: Some("Drift Merge".into()),
20131            source_path: PathBuf::from("/tmp/drift-session.jsonl"),
20132            started_at,
20133            ended_at: Some(1_700_000_000_999),
20134            approx_tokens: None,
20135            metadata_json: serde_json::Value::Null,
20136            messages,
20137            source_id: "local".into(),
20138            origin_host: None,
20139        };
20140
20141        let first = storage
20142            .insert_conversation_tree(
20143                agent_id,
20144                None,
20145                &base_conv(
20146                    Some(1_700_000_000_000),
20147                    vec![
20148                        Message {
20149                            id: None,
20150                            idx: 0,
20151                            role: MessageRole::User,
20152                            author: None,
20153                            created_at: Some(1_700_000_000_000),
20154                            content: "first".into(),
20155                            extra_json: serde_json::Value::Null,
20156                            snippets: Vec::new(),
20157                        },
20158                        Message {
20159                            id: None,
20160                            idx: 1,
20161                            role: MessageRole::Agent,
20162                            author: None,
20163                            created_at: Some(1_700_000_000_100),
20164                            content: "second".into(),
20165                            extra_json: serde_json::Value::Null,
20166                            snippets: Vec::new(),
20167                        },
20168                    ],
20169                ),
20170            )
20171            .unwrap();
20172
20173        let second = storage
20174            .insert_conversation_tree(
20175                agent_id,
20176                None,
20177                &base_conv(
20178                    Some(1_700_000_004_000),
20179                    vec![
20180                        Message {
20181                            id: None,
20182                            idx: 1,
20183                            role: MessageRole::Agent,
20184                            author: None,
20185                            created_at: Some(1_700_000_000_100),
20186                            content: "second".into(),
20187                            extra_json: serde_json::Value::Null,
20188                            snippets: Vec::new(),
20189                        },
20190                        Message {
20191                            id: None,
20192                            idx: 2,
20193                            role: MessageRole::User,
20194                            author: None,
20195                            created_at: Some(1_700_000_004_200),
20196                            content: "third".into(),
20197                            extra_json: serde_json::Value::Null,
20198                            snippets: Vec::new(),
20199                        },
20200                    ],
20201                ),
20202            )
20203            .unwrap();
20204
20205        assert_eq!(first.conversation_id, second.conversation_id);
20206        assert_eq!(second.inserted_indices, vec![2]);
20207    }
20208
20209    #[test]
20210    fn insert_conversation_tree_keeps_single_message_overlap_sessions_separate() {
20211        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
20212        use std::path::PathBuf;
20213
20214        let dir = TempDir::new().unwrap();
20215        let db_path = dir.path().join("test.db");
20216        let storage = SqliteStorage::open(&db_path).unwrap();
20217
20218        let agent = Agent {
20219            id: None,
20220            slug: "codex".into(),
20221            name: "Codex".into(),
20222            version: Some("0.2.3".into()),
20223            kind: AgentKind::Cli,
20224        };
20225        let agent_id = storage.ensure_agent(&agent).unwrap();
20226
20227        let make_conv = |started_at: i64, idx: i64, content: &str| Conversation {
20228            id: None,
20229            agent_slug: "codex".into(),
20230            workspace: Some(PathBuf::from("/tmp/workspace")),
20231            external_id: None,
20232            title: Some("Partial overlap".into()),
20233            source_path: PathBuf::from("/tmp/reused-session.jsonl"),
20234            started_at: Some(started_at),
20235            ended_at: Some(started_at + 500),
20236            approx_tokens: None,
20237            metadata_json: serde_json::Value::Null,
20238            messages: vec![Message {
20239                id: None,
20240                idx,
20241                role: MessageRole::User,
20242                author: None,
20243                created_at: Some(started_at),
20244                content: content.into(),
20245                extra_json: serde_json::Value::Null,
20246                snippets: Vec::new(),
20247            }],
20248            source_id: "local".into(),
20249            origin_host: None,
20250        };
20251
20252        storage
20253            .insert_conversation_tree(
20254                agent_id,
20255                None,
20256                &Conversation {
20257                    messages: vec![
20258                        Message {
20259                            id: None,
20260                            idx: 0,
20261                            role: MessageRole::User,
20262                            author: None,
20263                            created_at: Some(1_700_000_000_000),
20264                            content: "shared opener".into(),
20265                            extra_json: serde_json::Value::Null,
20266                            snippets: Vec::new(),
20267                        },
20268                        Message {
20269                            id: None,
20270                            idx: 1,
20271                            role: MessageRole::Agent,
20272                            author: None,
20273                            created_at: Some(1_700_000_000_100),
20274                            content: "first session unique".into(),
20275                            extra_json: serde_json::Value::Null,
20276                            snippets: Vec::new(),
20277                        },
20278                    ],
20279                    ..make_conv(1_700_000_000_000, 0, "unused")
20280                },
20281            )
20282            .unwrap();
20283        storage
20284            .insert_conversation_tree(
20285                agent_id,
20286                None,
20287                &make_conv(1_700_000_900_000, 0, "shared opener"),
20288            )
20289            .unwrap();
20290
20291        let conversation_count: i64 = storage
20292            .conn
20293            .query_row_map("SELECT COUNT(*) FROM conversations", fparams![], |row| {
20294                row.get_typed(0)
20295            })
20296            .unwrap();
20297        assert_eq!(conversation_count, 2);
20298    }
20299
20300    #[test]
20301    fn insert_conversation_tree_keeps_distinct_source_path_sessions_separate() {
20302        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
20303        use std::path::PathBuf;
20304
20305        let dir = TempDir::new().unwrap();
20306        let db_path = dir.path().join("test.db");
20307        let storage = SqliteStorage::open(&db_path).unwrap();
20308
20309        let agent = Agent {
20310            id: None,
20311            slug: "codex".into(),
20312            name: "Codex".into(),
20313            version: Some("0.2.3".into()),
20314            kind: AgentKind::Cli,
20315        };
20316        let agent_id = storage.ensure_agent(&agent).unwrap();
20317
20318        let make_conv = |started_at: i64, created_at: i64, content: &str| Conversation {
20319            id: None,
20320            agent_slug: "codex".into(),
20321            workspace: Some(PathBuf::from("/tmp/workspace")),
20322            external_id: None,
20323            title: Some("Same Path Different Session".into()),
20324            source_path: PathBuf::from("/tmp/reused-session.jsonl"),
20325            started_at: Some(started_at),
20326            ended_at: Some(started_at + 500),
20327            approx_tokens: None,
20328            metadata_json: serde_json::Value::Null,
20329            messages: vec![Message {
20330                id: None,
20331                idx: 0,
20332                role: MessageRole::User,
20333                author: None,
20334                created_at: Some(created_at),
20335                content: content.into(),
20336                extra_json: serde_json::Value::Null,
20337                snippets: Vec::new(),
20338            }],
20339            source_id: "local".into(),
20340            origin_host: None,
20341        };
20342
20343        storage
20344            .insert_conversation_tree(
20345                agent_id,
20346                None,
20347                &make_conv(1_700_000_000_000, 1_700_000_000_000, "first session"),
20348            )
20349            .unwrap();
20350        storage
20351            .insert_conversation_tree(
20352                agent_id,
20353                None,
20354                &make_conv(1_700_000_900_000, 1_700_000_900_000, "second session"),
20355            )
20356            .unwrap();
20357
20358        let conversation_count: i64 = storage
20359            .conn
20360            .query_row_map("SELECT COUNT(*) FROM conversations", fparams![], |row| {
20361                row.get_typed(0)
20362            })
20363            .unwrap();
20364        assert_eq!(conversation_count, 2);
20365    }
20366
20367    #[test]
20368    fn insert_conversation_tree_merges_replay_equivalent_messages_with_shifted_idx() {
20369        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
20370        use std::path::PathBuf;
20371
20372        let dir = TempDir::new().unwrap();
20373        let db_path = dir.path().join("test.db");
20374        let storage = SqliteStorage::open(&db_path).unwrap();
20375
20376        let agent = Agent {
20377            id: None,
20378            slug: "codex".into(),
20379            name: "Codex".into(),
20380            version: Some("0.2.3".into()),
20381            kind: AgentKind::Cli,
20382        };
20383        let agent_id = storage.ensure_agent(&agent).unwrap();
20384
20385        let make_conv = |started_at: i64, messages: Vec<Message>| Conversation {
20386            id: None,
20387            agent_slug: "codex".into(),
20388            workspace: Some(PathBuf::from("/tmp/workspace")),
20389            external_id: None,
20390            title: Some("Shifted replay".into()),
20391            source_path: PathBuf::from("/tmp/replay-session.jsonl"),
20392            started_at: Some(started_at),
20393            ended_at: Some(started_at + 500),
20394            approx_tokens: None,
20395            metadata_json: serde_json::Value::Null,
20396            messages,
20397            source_id: "local".into(),
20398            origin_host: None,
20399        };
20400
20401        let first = storage
20402            .insert_conversation_tree(
20403                agent_id,
20404                None,
20405                &make_conv(
20406                    1_700_000_000_000,
20407                    vec![
20408                        Message {
20409                            id: None,
20410                            idx: 0,
20411                            role: MessageRole::User,
20412                            author: None,
20413                            created_at: Some(1_700_000_000_000),
20414                            content: "first".into(),
20415                            extra_json: serde_json::Value::Null,
20416                            snippets: Vec::new(),
20417                        },
20418                        Message {
20419                            id: None,
20420                            idx: 1,
20421                            role: MessageRole::Agent,
20422                            author: None,
20423                            created_at: Some(1_700_000_000_100),
20424                            content: "second".into(),
20425                            extra_json: serde_json::Value::Null,
20426                            snippets: Vec::new(),
20427                        },
20428                    ],
20429                ),
20430            )
20431            .unwrap();
20432
20433        let second = storage
20434            .insert_conversation_tree(
20435                agent_id,
20436                None,
20437                &make_conv(
20438                    1_700_000_900_000,
20439                    vec![
20440                        Message {
20441                            id: None,
20442                            idx: 10,
20443                            role: MessageRole::User,
20444                            author: None,
20445                            created_at: Some(1_700_000_000_000),
20446                            content: "first".into(),
20447                            extra_json: serde_json::Value::Null,
20448                            snippets: Vec::new(),
20449                        },
20450                        Message {
20451                            id: None,
20452                            idx: 11,
20453                            role: MessageRole::Agent,
20454                            author: None,
20455                            created_at: Some(1_700_000_000_100),
20456                            content: "second".into(),
20457                            extra_json: serde_json::Value::Null,
20458                            snippets: Vec::new(),
20459                        },
20460                        Message {
20461                            id: None,
20462                            idx: 12,
20463                            role: MessageRole::User,
20464                            author: None,
20465                            created_at: Some(1_700_000_000_200),
20466                            content: "third".into(),
20467                            extra_json: serde_json::Value::Null,
20468                            snippets: Vec::new(),
20469                        },
20470                    ],
20471                ),
20472            )
20473            .unwrap();
20474
20475        assert_eq!(first.conversation_id, second.conversation_id);
20476        assert_eq!(second.inserted_indices, vec![12]);
20477
20478        let stored_indices: Vec<i64> = storage
20479            .conn
20480            .query_map_collect(
20481                "SELECT idx FROM messages WHERE conversation_id = ?1 ORDER BY idx",
20482                fparams![first.conversation_id],
20483                |row| row.get_typed(0),
20484            )
20485            .unwrap();
20486        assert_eq!(stored_indices, vec![0, 1, 12]);
20487    }
20488
20489    #[test]
20490    fn salvage_historical_databases_imports_backups_once_and_merges_overlap() {
20491        use crate::model::types::{Conversation, Message, MessageRole};
20492        use std::path::PathBuf;
20493
20494        fn base_conv(source_path: &str, messages: Vec<Message>) -> Conversation {
20495            Conversation {
20496                id: None,
20497                agent_slug: "codex".into(),
20498                workspace: Some(PathBuf::from("/tmp/workspace")),
20499                external_id: None,
20500                title: Some("Recovered".into()),
20501                source_path: PathBuf::from(source_path),
20502                started_at: Some(1_700_000_000_000),
20503                ended_at: Some(1_700_000_000_999),
20504                approx_tokens: None,
20505                metadata_json: serde_json::Value::Null,
20506                messages,
20507                source_id: "local".into(),
20508                origin_host: None,
20509            }
20510        }
20511
20512        let dir = TempDir::new().unwrap();
20513        let canonical_db = dir.path().join("agent_search.db");
20514        let storage = SqliteStorage::open(&canonical_db).unwrap();
20515
20516        let overlapping_a = base_conv(
20517            "/tmp/shared-history.jsonl",
20518            vec![
20519                Message {
20520                    id: None,
20521                    idx: 0,
20522                    role: MessageRole::User,
20523                    author: None,
20524                    created_at: Some(1_700_000_000_000),
20525                    content: "first".into(),
20526                    extra_json: serde_json::Value::Null,
20527                    snippets: Vec::new(),
20528                },
20529                Message {
20530                    id: None,
20531                    idx: 1,
20532                    role: MessageRole::Agent,
20533                    author: None,
20534                    created_at: Some(1_700_000_000_100),
20535                    content: "second".into(),
20536                    extra_json: serde_json::Value::Null,
20537                    snippets: Vec::new(),
20538                },
20539            ],
20540        );
20541        let overlapping_b = base_conv(
20542            "/tmp/shared-history.jsonl",
20543            vec![
20544                Message {
20545                    id: None,
20546                    idx: 1,
20547                    role: MessageRole::Agent,
20548                    author: None,
20549                    created_at: Some(1_700_000_000_100),
20550                    content: "second".into(),
20551                    extra_json: serde_json::Value::Null,
20552                    snippets: Vec::new(),
20553                },
20554                Message {
20555                    id: None,
20556                    idx: 2,
20557                    role: MessageRole::User,
20558                    author: None,
20559                    created_at: Some(1_700_000_000_200),
20560                    content: "third".into(),
20561                    extra_json: serde_json::Value::Null,
20562                    snippets: Vec::new(),
20563                },
20564            ],
20565        );
20566        let unique = Conversation {
20567            source_path: PathBuf::from("/tmp/unique-history.jsonl"),
20568            messages: vec![Message {
20569                id: None,
20570                idx: 0,
20571                role: MessageRole::User,
20572                author: None,
20573                created_at: Some(1_700_000_001_000),
20574                content: "unique".into(),
20575                extra_json: serde_json::Value::Null,
20576                snippets: Vec::new(),
20577            }],
20578            started_at: Some(1_700_000_001_000),
20579            ended_at: Some(1_700_000_001_100),
20580            ..base_conv("/tmp/unique-history.jsonl", Vec::new())
20581        };
20582
20583        seed_historical_db_direct(
20584            &dir.path()
20585                .join("backups/agent_search.db.20260322T020200.bak"),
20586            std::slice::from_ref(&overlapping_a),
20587        );
20588        seed_historical_db_direct(
20589            &dir.path().join("agent_search.corrupt.20260324_212907"),
20590            &[overlapping_b, unique],
20591        );
20592
20593        let first = storage.salvage_historical_databases(&canonical_db).unwrap();
20594        assert_eq!(first.bundles_considered, 2);
20595        assert_eq!(first.bundles_imported, 2);
20596        assert_eq!(first.messages_imported, 4);
20597
20598        let conversations = storage.list_conversations(10, 0).unwrap();
20599        assert_eq!(conversations.len(), 2);
20600
20601        let shared_id = conversations
20602            .iter()
20603            .find(|conv| conv.source_path == std::path::Path::new("/tmp/shared-history.jsonl"))
20604            .and_then(|conv| conv.id)
20605            .unwrap();
20606        let shared_indices: Vec<i64> = storage
20607            .fetch_messages(shared_id)
20608            .unwrap()
20609            .into_iter()
20610            .map(|msg| msg.idx)
20611            .collect();
20612        assert_eq!(shared_indices, vec![0, 1, 2]);
20613
20614        let second = storage.salvage_historical_databases(&canonical_db).unwrap();
20615        assert_eq!(second.bundles_imported, 0);
20616        assert_eq!(second.messages_imported, 0);
20617    }
20618
20619    #[test]
20620    fn salvage_historical_databases_normalizes_host_only_remote_provenance() {
20621        use crate::model::types::{Conversation, Message, MessageRole};
20622        use std::path::PathBuf;
20623
20624        let dir = TempDir::new().unwrap();
20625        let canonical_db = dir.path().join("agent_search.db");
20626        let storage = SqliteStorage::open(&canonical_db).unwrap();
20627
20628        let host_only_remote = Conversation {
20629            id: None,
20630            agent_slug: "codex".into(),
20631            workspace: Some(PathBuf::from("/tmp/workspace")),
20632            external_id: None,
20633            title: Some("Recovered Host Only Remote".into()),
20634            source_path: PathBuf::from("/tmp/host-only-history.jsonl"),
20635            started_at: Some(1_700_000_000_000),
20636            ended_at: Some(1_700_000_000_999),
20637            approx_tokens: None,
20638            metadata_json: serde_json::Value::Null,
20639            messages: vec![Message {
20640                id: None,
20641                idx: 0,
20642                role: MessageRole::User,
20643                author: None,
20644                created_at: Some(1_700_000_000_000),
20645                content: "host-only remote".into(),
20646                extra_json: serde_json::Value::Null,
20647                snippets: Vec::new(),
20648            }],
20649            source_id: "   ".into(),
20650            origin_host: Some("builder-5".into()),
20651        };
20652
20653        let historical_db = dir
20654            .path()
20655            .join("backups/agent_search.db.20260322T020200.bak");
20656        seed_historical_db_direct(&historical_db, std::slice::from_ref(&host_only_remote));
20657
20658        let historical_conn =
20659            FrankenConnection::open(historical_db.to_string_lossy().into_owned()).unwrap();
20660        historical_conn
20661            .execute_compat(
20662                "INSERT INTO sources(id, kind, host_label, created_at, updated_at) VALUES(?1, ?2, ?3, ?4, ?5)",
20663                fparams!["   ", "ssh", "builder-5", 0_i64, 0_i64],
20664            )
20665            .unwrap();
20666        historical_conn
20667            .execute_compat(
20668                "UPDATE conversations SET source_id = ?1, origin_host = ?2 WHERE source_path = ?3",
20669                fparams!["   ", "builder-5", "/tmp/host-only-history.jsonl"],
20670            )
20671            .unwrap();
20672        historical_conn
20673            .execute_compat("DELETE FROM sources WHERE id = ?1", fparams!["builder-5"])
20674            .unwrap();
20675        drop(historical_conn);
20676
20677        let first = storage.salvage_historical_databases(&canonical_db).unwrap();
20678        assert_eq!(first.bundles_imported, 1);
20679        assert_eq!(first.messages_imported, 1);
20680
20681        let source_ids = storage.get_source_ids().unwrap();
20682        assert_eq!(source_ids, vec!["builder-5".to_string()]);
20683
20684        let conversations = storage.list_conversations(10, 0).unwrap();
20685        assert_eq!(conversations.len(), 1);
20686        assert_eq!(conversations[0].source_id, "builder-5");
20687        assert_eq!(conversations[0].origin_host.as_deref(), Some("builder-5"));
20688    }
20689
20690    #[test]
20691    fn historical_salvage_retry_splits_single_conversation_until_it_fits() {
20692        use crate::model::types::{Conversation, Message, MessageRole};
20693        use std::path::PathBuf;
20694
20695        let mut attempts: Vec<Vec<usize>> = Vec::new();
20696        let entry = HistoricalBatchEntry {
20697            source_row_id: 77,
20698            agent_id: 1,
20699            workspace_id: None,
20700            conversation: Conversation {
20701                id: None,
20702                agent_slug: "gemini".into(),
20703                workspace: Some(PathBuf::from("/tmp/workspace")),
20704                external_id: Some("conv-77".into()),
20705                title: Some("Large recovered conversation".into()),
20706                source_path: PathBuf::from("/tmp/history.jsonl"),
20707                started_at: Some(1_700_000_000_000),
20708                ended_at: Some(1_700_000_000_999),
20709                approx_tokens: None,
20710                metadata_json: serde_json::Value::Null,
20711                messages: (0..4)
20712                    .map(|idx| Message {
20713                        id: None,
20714                        idx,
20715                        role: MessageRole::User,
20716                        author: None,
20717                        created_at: Some(1_700_000_000_000 + idx),
20718                        content: format!("message-{idx}"),
20719                        extra_json: serde_json::Value::Null,
20720                        snippets: Vec::new(),
20721                    })
20722                    .collect(),
20723                source_id: LOCAL_SOURCE_ID.into(),
20724                origin_host: None,
20725            },
20726        };
20727
20728        let totals = SqliteStorage::import_historical_batch_with_retry(
20729            std::slice::from_ref(&entry),
20730            &mut |batch| {
20731                attempts.push(
20732                    batch
20733                        .iter()
20734                        .map(|entry| entry.conversation.messages.len())
20735                        .collect(),
20736                );
20737                let total_messages: usize = batch
20738                    .iter()
20739                    .map(|entry| entry.conversation.messages.len())
20740                    .sum();
20741                if total_messages > 1 {
20742                    Err(anyhow!("out of memory"))
20743                } else {
20744                    Ok(HistoricalBatchImportTotals {
20745                        inserted_source_rows: batch.len(),
20746                        inserted_messages: total_messages,
20747                    })
20748                }
20749            },
20750        )
20751        .unwrap();
20752
20753        assert_eq!(
20754            totals,
20755            HistoricalBatchImportTotals {
20756                inserted_source_rows: 1,
20757                inserted_messages: 4,
20758            }
20759        );
20760        assert_eq!(attempts.first().cloned(), Some(vec![4]));
20761        assert!(
20762            attempts.iter().filter(|sizes| sizes == &&vec![1]).count() >= 4,
20763            "expected recursive fallback to reach one-message slices"
20764        );
20765    }
20766
20767    #[test]
20768    fn salvage_historical_databases_resumes_from_progress_checkpoint() {
20769        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
20770        use std::path::PathBuf;
20771
20772        fn make_conv(source_path: &str, idx_seed: i64) -> Conversation {
20773            Conversation {
20774                id: None,
20775                agent_slug: "codex".into(),
20776                workspace: Some(PathBuf::from("/tmp/workspace")),
20777                external_id: Some(format!("conv-{idx_seed}")),
20778                title: Some(format!("Recovered {idx_seed}")),
20779                source_path: PathBuf::from(source_path),
20780                started_at: Some(1_700_000_000_000 + idx_seed),
20781                ended_at: Some(1_700_000_000_100 + idx_seed),
20782                approx_tokens: None,
20783                metadata_json: serde_json::Value::Null,
20784                messages: vec![Message {
20785                    id: None,
20786                    idx: 0,
20787                    role: MessageRole::User,
20788                    author: None,
20789                    created_at: Some(1_700_000_000_000 + idx_seed),
20790                    content: format!("message-{idx_seed}"),
20791                    extra_json: serde_json::Value::Null,
20792                    snippets: Vec::new(),
20793                }],
20794                source_id: LOCAL_SOURCE_ID.into(),
20795                origin_host: None,
20796            }
20797        }
20798
20799        let dir = TempDir::new().unwrap();
20800        let canonical_db = dir.path().join("agent_search.db");
20801        let backup_db = dir
20802            .path()
20803            .join("backups/agent_search.db.20260322T020200.bak");
20804        let storage = SqliteStorage::open(&canonical_db).unwrap();
20805        let conv_a = make_conv("/tmp/one.jsonl", 1);
20806        let conv_b = make_conv("/tmp/two.jsonl", 2);
20807        let conv_c = make_conv("/tmp/three.jsonl", 3);
20808        seed_historical_db_direct(
20809            &backup_db,
20810            &[conv_a.clone(), conv_b.clone(), conv_c.clone()],
20811        );
20812
20813        let agent = Agent {
20814            id: None,
20815            slug: "codex".into(),
20816            name: "Codex".into(),
20817            version: Some("0.2.3".into()),
20818            kind: AgentKind::Cli,
20819        };
20820        let agent_id = storage.ensure_agent(&agent).unwrap();
20821        storage
20822            .insert_conversation_tree(agent_id, None, &conv_a)
20823            .unwrap();
20824
20825        let bundle = discover_historical_database_bundles(&canonical_db)
20826            .into_iter()
20827            .find(|bundle| bundle.root_path == backup_db)
20828            .unwrap();
20829        let first_row_id: i64 = FrankenConnection::open(backup_db.to_string_lossy().into_owned())
20830            .unwrap()
20831            .query_row_map(
20832                "SELECT id FROM conversations WHERE source_path = ?1",
20833                fparams!["/tmp/one.jsonl"],
20834                |row| row.get_typed(0),
20835            )
20836            .unwrap();
20837        storage
20838            .record_historical_bundle_progress(&bundle, "direct-readonly", first_row_id, 50, 99)
20839            .unwrap();
20840
20841        let outcome = storage.salvage_historical_databases(&canonical_db).unwrap();
20842        assert_eq!(outcome.bundles_imported, 1);
20843        assert_eq!(outcome.conversations_imported, 52);
20844        assert_eq!(outcome.messages_imported, 101);
20845        assert_eq!(storage.list_conversations(10, 0).unwrap().len(), 3);
20846
20847        let progress_key = SqliteStorage::historical_bundle_progress_key(&bundle);
20848        let progress_left: Option<String> = storage
20849            .conn
20850            .query_row_map(
20851                "SELECT value FROM meta WHERE key = ?1",
20852                fparams![progress_key.as_str()],
20853                |row| row.get_typed(0),
20854            )
20855            .optional()
20856            .unwrap();
20857        assert!(
20858            progress_left.is_none(),
20859            "completed salvage should clear bundle progress"
20860        );
20861
20862        let second = storage.salvage_historical_databases(&canonical_db).unwrap();
20863        assert_eq!(second.bundles_imported, 0);
20864        assert_eq!(second.messages_imported, 0);
20865    }
20866
20867    #[test]
20868    fn salvage_historical_databases_skips_bundle_when_checkpoint_covers_backup() {
20869        // Regression for issue #247 (coding_agent_session_search-r8pcy): a bundle
20870        // whose progress checkpoint already covers the backup's entire conversation
20871        // row-id space (daemon OOM-killed after the last batch committed but before
20872        // the completion ledger marker landed) must be ledgered + skipped, not
20873        // re-scanned O(n) with imported=0 every batch.
20874        use crate::model::types::{Conversation, Message, MessageRole};
20875        use std::path::PathBuf;
20876
20877        fn make_conv(source_path: &str, idx_seed: i64) -> Conversation {
20878            Conversation {
20879                id: None,
20880                agent_slug: "codex".into(),
20881                workspace: Some(PathBuf::from("/tmp/workspace")),
20882                external_id: Some(format!("conv-{idx_seed}")),
20883                title: Some(format!("Recovered {idx_seed}")),
20884                source_path: PathBuf::from(source_path),
20885                started_at: Some(1_700_000_000_000 + idx_seed),
20886                ended_at: Some(1_700_000_000_100 + idx_seed),
20887                approx_tokens: None,
20888                metadata_json: serde_json::Value::Null,
20889                messages: vec![Message {
20890                    id: None,
20891                    idx: 0,
20892                    role: MessageRole::User,
20893                    author: None,
20894                    created_at: Some(1_700_000_000_000 + idx_seed),
20895                    content: format!("message-{idx_seed}"),
20896                    extra_json: serde_json::Value::Null,
20897                    snippets: Vec::new(),
20898                }],
20899                source_id: LOCAL_SOURCE_ID.into(),
20900                origin_host: None,
20901            }
20902        }
20903
20904        let dir = TempDir::new().unwrap();
20905        let canonical_db = dir.path().join("agent_search.db");
20906        let backup_db = dir
20907            .path()
20908            .join("backups/agent_search.db.20260322T020200.bak");
20909        let storage = SqliteStorage::open(&canonical_db).unwrap();
20910        seed_historical_db_direct(
20911            &backup_db,
20912            &[
20913                make_conv("/tmp/one.jsonl", 1),
20914                make_conv("/tmp/two.jsonl", 2),
20915                make_conv("/tmp/three.jsonl", 3),
20916            ],
20917        );
20918
20919        let bundle = discover_historical_database_bundles(&canonical_db)
20920            .into_iter()
20921            .find(|bundle| bundle.root_path == backup_db)
20922            .unwrap();
20923
20924        // Checkpoint high-water mark == backup's max conversation id.
20925        let backup_max_id: i64 = FrankenConnection::open(backup_db.to_string_lossy().into_owned())
20926            .unwrap()
20927            .query_row_map(
20928                "SELECT COALESCE(MAX(id), 0) FROM conversations",
20929                fparams![],
20930                |row| row.get_typed(0),
20931            )
20932            .unwrap();
20933        assert!(backup_max_id > 0, "seeded backup should have conversations");
20934        storage
20935            .record_historical_bundle_progress(&bundle, "direct-readonly", backup_max_id, 3, 3)
20936            .unwrap();
20937
20938        let outcome = storage.salvage_historical_databases(&canonical_db).unwrap();
20939        assert_eq!(
20940            outcome.bundles_imported, 0,
20941            "fully-checkpointed bundle must not be re-scanned"
20942        );
20943        assert_eq!(outcome.conversations_imported, 0);
20944        assert_eq!(outcome.messages_imported, 0);
20945        assert_eq!(
20946            storage.list_conversations(10, 0).unwrap().len(),
20947            0,
20948            "skip path must not import anything"
20949        );
20950        assert!(
20951            storage.historical_bundle_already_imported(&bundle).unwrap(),
20952            "skipped bundle must be ledgered as salvaged so future runs short-circuit"
20953        );
20954
20955        let progress_key = SqliteStorage::historical_bundle_progress_key(&bundle);
20956        let progress_left: Option<String> = storage
20957            .conn
20958            .query_row_map(
20959                "SELECT value FROM meta WHERE key = ?1",
20960                fparams![progress_key.as_str()],
20961                |row| row.get_typed(0),
20962            )
20963            .optional()
20964            .unwrap();
20965        assert!(
20966            progress_left.is_none(),
20967            "skip path must clear the bundle progress checkpoint"
20968        );
20969    }
20970
20971    #[test]
20972    fn list_conversations_for_lexical_rebuild_uses_stable_id_order() {
20973        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
20974        use std::path::PathBuf;
20975
20976        let dir = TempDir::new().unwrap();
20977        let db_path = dir.path().join("agent_search.db");
20978        let storage = SqliteStorage::open(&db_path).unwrap();
20979        let agent = Agent {
20980            id: None,
20981            slug: "codex".into(),
20982            name: "Codex".into(),
20983            version: Some("0.2.3".into()),
20984            kind: AgentKind::Cli,
20985        };
20986        let agent_id = storage.ensure_agent(&agent).unwrap();
20987
20988        let make_conv = |source_path: &str, started_at: i64| Conversation {
20989            id: None,
20990            agent_slug: "codex".into(),
20991            workspace: Some(PathBuf::from("/tmp/workspace")),
20992            external_id: Some(source_path.to_string()),
20993            title: Some(source_path.to_string()),
20994            source_path: PathBuf::from(source_path),
20995            started_at: Some(started_at),
20996            ended_at: Some(started_at + 1),
20997            approx_tokens: None,
20998            metadata_json: serde_json::Value::Null,
20999            messages: vec![Message {
21000                id: None,
21001                idx: 0,
21002                role: MessageRole::User,
21003                author: None,
21004                created_at: Some(started_at),
21005                content: format!("message for {source_path}"),
21006                extra_json: serde_json::Value::Null,
21007                snippets: Vec::new(),
21008            }],
21009            source_id: LOCAL_SOURCE_ID.into(),
21010            origin_host: None,
21011        };
21012
21013        let conv_a = make_conv("/tmp/a.jsonl", 3_000);
21014        let conv_b = make_conv("/tmp/b.jsonl", 1_000);
21015        let conv_c = make_conv("/tmp/c.jsonl", 2_000);
21016
21017        storage
21018            .insert_conversation_tree(agent_id, None, &conv_a)
21019            .unwrap();
21020        storage
21021            .insert_conversation_tree(agent_id, None, &conv_b)
21022            .unwrap();
21023        storage
21024            .insert_conversation_tree(agent_id, None, &conv_c)
21025            .unwrap();
21026
21027        let user_order: Vec<PathBuf> = storage
21028            .list_conversations(10, 0)
21029            .unwrap()
21030            .into_iter()
21031            .map(|conv| conv.source_path)
21032            .collect();
21033        assert_eq!(
21034            user_order,
21035            vec![
21036                PathBuf::from("/tmp/a.jsonl"),
21037                PathBuf::from("/tmp/c.jsonl"),
21038                PathBuf::from("/tmp/b.jsonl"),
21039            ]
21040        );
21041
21042        let (agent_slugs, workspace_paths) = storage.build_lexical_rebuild_lookups().unwrap();
21043        let rebuild_order: Vec<PathBuf> = storage
21044            .list_conversations_for_lexical_rebuild_after_id(10, 0, &agent_slugs, &workspace_paths)
21045            .unwrap()
21046            .into_iter()
21047            .map(|conv| conv.source_path)
21048            .collect();
21049        assert_eq!(
21050            rebuild_order,
21051            vec![
21052                PathBuf::from("/tmp/a.jsonl"),
21053                PathBuf::from("/tmp/b.jsonl"),
21054                PathBuf::from("/tmp/c.jsonl"),
21055            ]
21056        );
21057
21058        let first_page = storage
21059            .list_conversations_for_lexical_rebuild_after_id(2, 0, &agent_slugs, &workspace_paths)
21060            .unwrap();
21061        let first_page_paths: Vec<PathBuf> = first_page
21062            .iter()
21063            .map(|conv| conv.source_path.clone())
21064            .collect();
21065        assert_eq!(
21066            first_page_paths,
21067            vec![PathBuf::from("/tmp/a.jsonl"), PathBuf::from("/tmp/b.jsonl")]
21068        );
21069
21070        let second_page = storage
21071            .list_conversations_for_lexical_rebuild_after_id(
21072                2,
21073                first_page
21074                    .last()
21075                    .and_then(|conv| conv.id)
21076                    .expect("first page should include an id"),
21077                &agent_slugs,
21078                &workspace_paths,
21079            )
21080            .unwrap();
21081        let second_page_paths: Vec<PathBuf> = second_page
21082            .iter()
21083            .map(|conv| conv.source_path.clone())
21084            .collect();
21085        assert_eq!(second_page_paths, vec![PathBuf::from("/tmp/c.jsonl")]);
21086
21087        let bounded_page = storage
21088            .list_conversations_for_lexical_rebuild_after_id_through_id(
21089                10,
21090                0,
21091                first_page
21092                    .last()
21093                    .and_then(|conv| conv.id)
21094                    .expect("first page should include an id"),
21095                &agent_slugs,
21096                &workspace_paths,
21097            )
21098            .unwrap();
21099        let bounded_paths: Vec<PathBuf> = bounded_page
21100            .iter()
21101            .map(|conv| conv.source_path.clone())
21102            .collect();
21103        assert_eq!(
21104            bounded_paths,
21105            vec![PathBuf::from("/tmp/a.jsonl"), PathBuf::from("/tmp/b.jsonl")]
21106        );
21107    }
21108
21109    #[test]
21110    fn keyset_traversal_handles_sparse_holey_conversation_ids() {
21111        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
21112        use std::path::PathBuf;
21113
21114        let dir = TempDir::new().unwrap();
21115        let db_path = dir.path().join("agent_search.db");
21116        let storage = SqliteStorage::open(&db_path).unwrap();
21117        let agent = Agent {
21118            id: None,
21119            slug: "codex".into(),
21120            name: "Codex".into(),
21121            version: Some("0.2.3".into()),
21122            kind: AgentKind::Cli,
21123        };
21124        let agent_id = storage.ensure_agent(&agent).unwrap();
21125
21126        let make_conv = |label: &str, ts: i64| Conversation {
21127            id: None,
21128            agent_slug: "codex".into(),
21129            workspace: Some(PathBuf::from("/tmp/workspace")),
21130            external_id: Some(label.to_string()),
21131            title: Some(label.to_string()),
21132            source_path: PathBuf::from(format!("/tmp/{label}.jsonl")),
21133            started_at: Some(ts),
21134            ended_at: Some(ts + 1),
21135            approx_tokens: None,
21136            metadata_json: serde_json::Value::Null,
21137            messages: vec![Message {
21138                id: None,
21139                idx: 0,
21140                role: MessageRole::User,
21141                author: None,
21142                created_at: Some(ts),
21143                content: format!("msg for {label}"),
21144                extra_json: serde_json::Value::Null,
21145                snippets: Vec::new(),
21146            }],
21147            source_id: LOCAL_SOURCE_ID.into(),
21148            origin_host: None,
21149        };
21150
21151        for i in 0..6 {
21152            storage
21153                .insert_conversation_tree(
21154                    agent_id,
21155                    None,
21156                    &make_conv(&format!("conv-{i}"), 1000 + i),
21157                )
21158                .unwrap();
21159        }
21160
21161        storage.conn.execute("PRAGMA foreign_keys = OFF").unwrap();
21162        storage
21163            .conn
21164            .execute_compat("DELETE FROM conversations WHERE id IN (2, 4)", fparams![])
21165            .unwrap();
21166        storage
21167            .conn
21168            .execute_compat(
21169                "DELETE FROM messages WHERE conversation_id IN (2, 4)",
21170                fparams![],
21171            )
21172            .unwrap();
21173        storage.conn.execute("PRAGMA foreign_keys = ON").unwrap();
21174
21175        let (agent_slugs, workspace_paths) = storage.build_lexical_rebuild_lookups().unwrap();
21176
21177        let page1 = storage
21178            .list_conversations_for_lexical_rebuild_after_id(2, 0, &agent_slugs, &workspace_paths)
21179            .unwrap();
21180        assert_eq!(page1.len(), 2);
21181        let page1_ids: Vec<i64> = page1.iter().map(|c| c.id.unwrap()).collect();
21182        assert_eq!(page1_ids, vec![1, 3]);
21183
21184        let page2 = storage
21185            .list_conversations_for_lexical_rebuild_after_id(
21186                2,
21187                *page1_ids.last().unwrap(),
21188                &agent_slugs,
21189                &workspace_paths,
21190            )
21191            .unwrap();
21192        assert_eq!(page2.len(), 2);
21193        let page2_ids: Vec<i64> = page2.iter().map(|c| c.id.unwrap()).collect();
21194        assert_eq!(page2_ids, vec![5, 6]);
21195
21196        let page3 = storage
21197            .list_conversations_for_lexical_rebuild_after_id(
21198                2,
21199                *page2_ids.last().unwrap(),
21200                &agent_slugs,
21201                &workspace_paths,
21202            )
21203            .unwrap();
21204        assert!(page3.is_empty());
21205
21206        let all_ids: Vec<i64> = page1_ids.iter().chain(page2_ids.iter()).copied().collect();
21207        assert_eq!(all_ids, vec![1, 3, 5, 6]);
21208    }
21209
21210    #[test]
21211    fn keyset_traversal_through_id_with_sparse_ranges() {
21212        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
21213        use std::path::PathBuf;
21214
21215        let dir = TempDir::new().unwrap();
21216        let db_path = dir.path().join("agent_search.db");
21217        let storage = SqliteStorage::open(&db_path).unwrap();
21218        let agent = Agent {
21219            id: None,
21220            slug: "codex".into(),
21221            name: "Codex".into(),
21222            version: Some("0.2.3".into()),
21223            kind: AgentKind::Cli,
21224        };
21225        let agent_id = storage.ensure_agent(&agent).unwrap();
21226
21227        let make_conv = |label: &str, ts: i64| Conversation {
21228            id: None,
21229            agent_slug: "codex".into(),
21230            workspace: Some(PathBuf::from("/tmp/workspace")),
21231            external_id: Some(label.to_string()),
21232            title: Some(label.to_string()),
21233            source_path: PathBuf::from(format!("/tmp/{label}.jsonl")),
21234            started_at: Some(ts),
21235            ended_at: Some(ts + 1),
21236            approx_tokens: None,
21237            metadata_json: serde_json::Value::Null,
21238            messages: vec![Message {
21239                id: None,
21240                idx: 0,
21241                role: MessageRole::User,
21242                author: None,
21243                created_at: Some(ts),
21244                content: format!("msg for {label}"),
21245                extra_json: serde_json::Value::Null,
21246                snippets: Vec::new(),
21247            }],
21248            source_id: LOCAL_SOURCE_ID.into(),
21249            origin_host: None,
21250        };
21251
21252        for i in 0..10 {
21253            storage
21254                .insert_conversation_tree(
21255                    agent_id,
21256                    None,
21257                    &make_conv(&format!("conv-{i}"), 1000 + i),
21258                )
21259                .unwrap();
21260        }
21261
21262        storage.conn.execute("PRAGMA foreign_keys = OFF").unwrap();
21263        storage
21264            .conn
21265            .execute_compat(
21266                "DELETE FROM conversations WHERE id IN (3, 5, 7, 8)",
21267                fparams![],
21268            )
21269            .unwrap();
21270        storage
21271            .conn
21272            .execute_compat(
21273                "DELETE FROM messages WHERE conversation_id IN (3, 5, 7, 8)",
21274                fparams![],
21275            )
21276            .unwrap();
21277        storage.conn.execute("PRAGMA foreign_keys = ON").unwrap();
21278
21279        let (agent_slugs, workspace_paths) = storage.build_lexical_rebuild_lookups().unwrap();
21280
21281        let through_5 = storage
21282            .list_conversations_for_lexical_rebuild_after_id_through_id(
21283                100,
21284                0,
21285                5,
21286                &agent_slugs,
21287                &workspace_paths,
21288            )
21289            .unwrap();
21290        let through_5_ids: Vec<i64> = through_5.iter().map(|c| c.id.unwrap()).collect();
21291        assert_eq!(through_5_ids, vec![1, 2, 4]);
21292
21293        let after_4_through_10 = storage
21294            .list_conversations_for_lexical_rebuild_after_id_through_id(
21295                100,
21296                4,
21297                10,
21298                &agent_slugs,
21299                &workspace_paths,
21300            )
21301            .unwrap();
21302        let ids: Vec<i64> = after_4_through_10.iter().map(|c| c.id.unwrap()).collect();
21303        assert_eq!(ids, vec![6, 9, 10]);
21304
21305        let after_10 = storage
21306            .list_conversations_for_lexical_rebuild_after_id_through_id(
21307                100,
21308                10,
21309                20,
21310                &agent_slugs,
21311                &workspace_paths,
21312            )
21313            .unwrap();
21314        assert!(after_10.is_empty());
21315    }
21316
21317    #[test]
21318    fn list_conversation_footprints_for_lexical_rebuild_estimates_bytes_and_keeps_empty_conversations()
21319     {
21320        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
21321        use std::path::PathBuf;
21322
21323        let dir = TempDir::new().unwrap();
21324        let db_path = dir.path().join("agent_search.db");
21325        let storage = SqliteStorage::open(&db_path).unwrap();
21326        let agent = Agent {
21327            id: None,
21328            slug: "codex".into(),
21329            name: "Codex".into(),
21330            version: Some("0.2.3".into()),
21331            kind: AgentKind::Cli,
21332        };
21333        let agent_id = storage.ensure_agent(&agent).unwrap();
21334
21335        let insert = |external_id: &str, base_ts: i64, messages: Vec<Message>| {
21336            storage
21337                .insert_conversation_tree(
21338                    agent_id,
21339                    None,
21340                    &Conversation {
21341                        id: None,
21342                        agent_slug: "codex".into(),
21343                        workspace: Some(PathBuf::from("/tmp/workspace")),
21344                        external_id: Some(external_id.to_string()),
21345                        title: Some(external_id.to_string()),
21346                        source_path: PathBuf::from(format!("/tmp/{external_id}.jsonl")),
21347                        started_at: Some(base_ts),
21348                        ended_at: Some(base_ts + 100),
21349                        approx_tokens: None,
21350                        metadata_json: serde_json::Value::Null,
21351                        messages,
21352                        source_id: LOCAL_SOURCE_ID.into(),
21353                        origin_host: None,
21354                    },
21355                )
21356                .unwrap()
21357                .conversation_id
21358        };
21359
21360        let ascii_id = insert(
21361            "footprint-ascii",
21362            1_700_000_000_000,
21363            vec![
21364                Message {
21365                    id: None,
21366                    idx: 0,
21367                    role: MessageRole::User,
21368                    author: None,
21369                    created_at: Some(1_700_000_000_001),
21370                    content: "abc".into(),
21371                    extra_json: serde_json::Value::Null,
21372                    snippets: Vec::new(),
21373                },
21374                Message {
21375                    id: None,
21376                    idx: 1,
21377                    role: MessageRole::Agent,
21378                    author: None,
21379                    created_at: Some(1_700_000_000_002),
21380                    content: "defg".into(),
21381                    extra_json: serde_json::Value::Null,
21382                    snippets: Vec::new(),
21383                },
21384            ],
21385        );
21386        let empty_id = insert("footprint-empty", 1_700_000_001_000, Vec::new());
21387        let utf8_id = insert(
21388            "footprint-utf8",
21389            1_700_000_002_000,
21390            vec![Message {
21391                id: None,
21392                idx: 0,
21393                role: MessageRole::Tool,
21394                author: None,
21395                created_at: Some(1_700_000_002_001),
21396                content: "hé🙂".into(),
21397                extra_json: serde_json::Value::Null,
21398                snippets: Vec::new(),
21399            }],
21400        );
21401        let sparse_id = insert(
21402            "footprint-sparse",
21403            1_700_000_003_000,
21404            vec![Message {
21405                id: None,
21406                idx: 10,
21407                role: MessageRole::User,
21408                author: None,
21409                created_at: Some(1_700_000_003_010),
21410                content: "sparse".into(),
21411                extra_json: serde_json::Value::Null,
21412                snippets: Vec::new(),
21413            }],
21414        );
21415        storage
21416            .conn
21417            .execute_compat(
21418                "DELETE FROM conversation_tail_state WHERE conversation_id = ?1",
21419                fparams![utf8_id],
21420            )
21421            .unwrap();
21422
21423        let footprints = storage
21424            .list_conversation_footprints_for_lexical_rebuild()
21425            .unwrap();
21426        assert_eq!(
21427            footprints,
21428            vec![
21429                LexicalRebuildConversationFootprintRow {
21430                    conversation_id: ascii_id,
21431                    message_count: 2,
21432                    message_bytes: 2 * LEXICAL_REBUILD_PLANNER_ESTIMATED_BYTES_PER_MESSAGE,
21433                },
21434                LexicalRebuildConversationFootprintRow {
21435                    conversation_id: empty_id,
21436                    message_count: 0,
21437                    message_bytes: 0,
21438                },
21439                LexicalRebuildConversationFootprintRow {
21440                    conversation_id: utf8_id,
21441                    message_count: 1,
21442                    message_bytes: LEXICAL_REBUILD_PLANNER_ESTIMATED_BYTES_PER_MESSAGE,
21443                },
21444                LexicalRebuildConversationFootprintRow {
21445                    conversation_id: sparse_id,
21446                    message_count: 11,
21447                    message_bytes: 11 * LEXICAL_REBUILD_PLANNER_ESTIMATED_BYTES_PER_MESSAGE,
21448                },
21449            ]
21450        );
21451    }
21452
21453    #[test]
21454    fn list_conversation_footprints_for_lexical_rebuild_falls_back_for_missing_tail_cache() {
21455        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
21456        use std::path::PathBuf;
21457
21458        let dir = TempDir::new().unwrap();
21459        let db_path = dir.path().join("agent_search.db");
21460        let storage = SqliteStorage::open(&db_path).unwrap();
21461        let agent = Agent {
21462            id: None,
21463            slug: "codex".into(),
21464            name: "Codex".into(),
21465            version: Some("0.2.3".into()),
21466            kind: AgentKind::Cli,
21467        };
21468        let agent_id = storage.ensure_agent(&agent).unwrap();
21469        let conversation_id = storage
21470            .insert_conversation_tree(
21471                agent_id,
21472                None,
21473                &Conversation {
21474                    id: None,
21475                    agent_slug: "codex".into(),
21476                    workspace: Some(PathBuf::from("/tmp/workspace")),
21477                    external_id: Some("footprint-missing-tail".to_string()),
21478                    title: Some("footprint-missing-tail".to_string()),
21479                    source_path: PathBuf::from("/tmp/footprint-missing-tail.jsonl"),
21480                    started_at: Some(1_700_000_000_000),
21481                    ended_at: Some(1_700_000_000_100),
21482                    approx_tokens: None,
21483                    metadata_json: serde_json::Value::Null,
21484                    messages: vec![Message {
21485                        id: None,
21486                        idx: 10,
21487                        role: MessageRole::User,
21488                        author: None,
21489                        created_at: Some(1_700_000_000_010),
21490                        content: "legacy sparse tail".into(),
21491                        extra_json: serde_json::Value::Null,
21492                        snippets: Vec::new(),
21493                    }],
21494                    source_id: LOCAL_SOURCE_ID.into(),
21495                    origin_host: None,
21496                },
21497            )
21498            .unwrap()
21499            .conversation_id;
21500
21501        storage
21502            .conn
21503            .execute_compat(
21504                "UPDATE conversations
21505                 SET last_message_idx = NULL, last_message_created_at = NULL
21506                 WHERE id = ?1",
21507                fparams![conversation_id],
21508            )
21509            .unwrap();
21510        storage
21511            .conn
21512            .execute_compat(
21513                "DELETE FROM conversation_tail_state WHERE conversation_id = ?1",
21514                fparams![conversation_id],
21515            )
21516            .unwrap();
21517
21518        let footprints = storage
21519            .list_conversation_footprints_for_lexical_rebuild()
21520            .unwrap();
21521
21522        assert_eq!(
21523            footprints,
21524            vec![LexicalRebuildConversationFootprintRow {
21525                conversation_id,
21526                message_count: 11,
21527                message_bytes: 11 * LEXICAL_REBUILD_PLANNER_ESTIMATED_BYTES_PER_MESSAGE,
21528            }],
21529            "missing tail-cache metadata should fall back to messages MAX(idx) instead of treating legacy conversations as empty"
21530        );
21531    }
21532
21533    #[test]
21534    fn list_conversation_footprints_for_lexical_rebuild_raises_stale_low_tail_cache() {
21535        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
21536        use std::path::PathBuf;
21537
21538        let dir = TempDir::new().unwrap();
21539        let db_path = dir.path().join("agent_search.db");
21540        let storage = SqliteStorage::open(&db_path).unwrap();
21541        let agent = Agent {
21542            id: None,
21543            slug: "codex".into(),
21544            name: "Codex".into(),
21545            version: Some("0.2.3".into()),
21546            kind: AgentKind::Cli,
21547        };
21548        let agent_id = storage.ensure_agent(&agent).unwrap();
21549        let conversation_id = storage
21550            .insert_conversation_tree(
21551                agent_id,
21552                None,
21553                &Conversation {
21554                    id: None,
21555                    agent_slug: "codex".into(),
21556                    workspace: Some(PathBuf::from("/tmp/workspace")),
21557                    external_id: Some("footprint-stale-tail".to_string()),
21558                    title: Some("footprint-stale-tail".to_string()),
21559                    source_path: PathBuf::from("/tmp/footprint-stale-tail.jsonl"),
21560                    started_at: Some(1_700_000_000_000),
21561                    ended_at: Some(1_700_000_000_100),
21562                    approx_tokens: None,
21563                    metadata_json: serde_json::Value::Null,
21564                    messages: (0..3)
21565                        .map(|idx| Message {
21566                            id: None,
21567                            idx,
21568                            role: MessageRole::User,
21569                            author: None,
21570                            created_at: Some(1_700_000_000_010 + idx),
21571                            content: format!("message {idx}"),
21572                            extra_json: serde_json::Value::Null,
21573                            snippets: Vec::new(),
21574                        })
21575                        .collect(),
21576                    source_id: LOCAL_SOURCE_ID.into(),
21577                    origin_host: None,
21578                },
21579            )
21580            .unwrap()
21581            .conversation_id;
21582
21583        storage
21584            .conn
21585            .execute_compat(
21586                "UPDATE conversations
21587                 SET last_message_idx = 0, last_message_created_at = 1700000000010
21588                 WHERE id = ?1",
21589                fparams![conversation_id],
21590            )
21591            .unwrap();
21592        storage
21593            .conn
21594            .execute_compat(
21595                "UPDATE conversation_tail_state
21596                 SET last_message_idx = 0, last_message_created_at = 1700000000010
21597                 WHERE conversation_id = ?1",
21598                fparams![conversation_id],
21599            )
21600            .unwrap();
21601
21602        let footprints = storage
21603            .list_conversation_footprints_for_lexical_rebuild()
21604            .unwrap();
21605
21606        assert_eq!(
21607            footprints,
21608            vec![LexicalRebuildConversationFootprintRow {
21609                conversation_id,
21610                message_count: 3,
21611                message_bytes: 3 * LEXICAL_REBUILD_PLANNER_ESTIMATED_BYTES_PER_MESSAGE,
21612            }],
21613            "stale-low tail caches must not under-plan lexical shards and trip doc>plan invariants"
21614        );
21615    }
21616
21617    #[test]
21618    fn list_conversation_footprints_for_lexical_rebuild_tolerates_missing_tail_state_table() {
21619        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
21620        use std::path::PathBuf;
21621
21622        let dir = TempDir::new().unwrap();
21623        let db_path = dir.path().join("agent_search.db");
21624        let storage = SqliteStorage::open(&db_path).unwrap();
21625        let agent = Agent {
21626            id: None,
21627            slug: "codex".into(),
21628            name: "Codex".into(),
21629            version: Some("0.2.3".into()),
21630            kind: AgentKind::Cli,
21631        };
21632        let agent_id = storage.ensure_agent(&agent).unwrap();
21633        let conversation_id = storage
21634            .insert_conversation_tree(
21635                agent_id,
21636                None,
21637                &Conversation {
21638                    id: None,
21639                    agent_slug: "codex".into(),
21640                    workspace: Some(PathBuf::from("/tmp/workspace")),
21641                    external_id: Some("footprint-missing-tail-table".to_string()),
21642                    title: Some("footprint-missing-tail-table".to_string()),
21643                    source_path: PathBuf::from("/tmp/footprint-missing-tail-table.jsonl"),
21644                    started_at: Some(1_700_000_000_000),
21645                    ended_at: Some(1_700_000_000_100),
21646                    approx_tokens: None,
21647                    metadata_json: serde_json::Value::Null,
21648                    messages: vec![Message {
21649                        id: None,
21650                        idx: 10,
21651                        role: MessageRole::User,
21652                        author: None,
21653                        created_at: Some(1_700_000_000_010),
21654                        content: "legacy sparse tail without hot table".into(),
21655                        extra_json: serde_json::Value::Null,
21656                        snippets: Vec::new(),
21657                    }],
21658                    source_id: LOCAL_SOURCE_ID.into(),
21659                    origin_host: None,
21660                },
21661            )
21662            .unwrap()
21663            .conversation_id;
21664
21665        storage
21666            .conn
21667            .execute_compat(
21668                "UPDATE conversations
21669                 SET last_message_idx = NULL, last_message_created_at = NULL
21670                 WHERE id = ?1",
21671                fparams![conversation_id],
21672            )
21673            .unwrap();
21674        storage
21675            .conn
21676            .execute_compat("DROP TABLE conversation_tail_state", fparams![])
21677            .unwrap();
21678
21679        let footprints = storage
21680            .list_conversation_footprints_for_lexical_rebuild()
21681            .unwrap();
21682
21683        assert_eq!(
21684            footprints,
21685            vec![LexicalRebuildConversationFootprintRow {
21686                conversation_id,
21687                message_count: 11,
21688                message_bytes: 11 * LEXICAL_REBUILD_PLANNER_ESTIMATED_BYTES_PER_MESSAGE,
21689            }],
21690            "read-only lexical self-heal must tolerate pre-tail-cache databases and use messages MAX(idx)"
21691        );
21692    }
21693
21694    #[test]
21695    fn list_conversation_footprints_for_lexical_rebuild_tolerates_legacy_search_demo_fixture() {
21696        let fixture_db = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
21697            .join("tests")
21698            .join("fixtures")
21699            .join("search_demo_data")
21700            .join("agent_search.db");
21701        let storage = FrankenStorage::open_readonly(&fixture_db).unwrap();
21702
21703        let footprints = storage
21704            .list_conversation_footprints_for_lexical_rebuild()
21705            .unwrap();
21706
21707        assert!(
21708            !footprints.is_empty(),
21709            "search self-heal should be able to plan a lexical rebuild from the legacy search demo fixture"
21710        );
21711        assert!(
21712            footprints
21713                .iter()
21714                .all(|footprint| footprint.message_count > 0),
21715            "legacy fixture conversations should derive message counts from messages when tail caches are absent"
21716        );
21717    }
21718
21719    #[test]
21720    fn lexical_rebuild_listing_normalizes_host_only_remote_source_from_blank_source_id() {
21721        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
21722        use std::path::PathBuf;
21723
21724        let dir = TempDir::new().unwrap();
21725        let db_path = dir.path().join("agent_search.db");
21726        let storage = SqliteStorage::open(&db_path).unwrap();
21727        let agent = Agent {
21728            id: None,
21729            slug: "codex".into(),
21730            name: "Codex".into(),
21731            version: Some("0.2.3".into()),
21732            kind: AgentKind::Cli,
21733        };
21734        let agent_id = storage.ensure_agent(&agent).unwrap();
21735        let conversation = Conversation {
21736            id: None,
21737            agent_slug: "codex".into(),
21738            workspace: Some(PathBuf::from("/tmp/workspace")),
21739            external_id: Some("legacy-blank-source".into()),
21740            title: Some("Legacy blank source".into()),
21741            source_path: PathBuf::from("/tmp/legacy-blank-source.jsonl"),
21742            started_at: Some(1_700_000_000_000),
21743            ended_at: Some(1_700_000_000_100),
21744            approx_tokens: None,
21745            metadata_json: serde_json::Value::Null,
21746            messages: vec![Message {
21747                id: None,
21748                idx: 0,
21749                role: MessageRole::User,
21750                author: None,
21751                created_at: Some(1_700_000_000_000),
21752                content: "hello".into(),
21753                extra_json: serde_json::Value::Null,
21754                snippets: Vec::new(),
21755            }],
21756            source_id: LOCAL_SOURCE_ID.into(),
21757            origin_host: None,
21758        };
21759
21760        let conversation_id = storage
21761            .insert_conversation_tree(agent_id, None, &conversation)
21762            .unwrap()
21763            .conversation_id;
21764        storage.conn.execute("PRAGMA foreign_keys = OFF").unwrap();
21765        storage
21766            .conn
21767            .execute_compat(
21768                "UPDATE conversations SET source_id = ?1, origin_host = ?2 WHERE id = ?3",
21769                fparams!["   ", "dev@laptop", conversation_id],
21770            )
21771            .unwrap();
21772        storage.conn.execute("PRAGMA foreign_keys = ON").unwrap();
21773
21774        let listed = storage.list_conversations(10, 0).unwrap();
21775        assert_eq!(listed.len(), 1);
21776        assert_eq!(listed[0].source_id, "dev@laptop");
21777        assert_eq!(listed[0].origin_host.as_deref(), Some("dev@laptop"));
21778
21779        let (agent_slugs, workspace_paths) = storage.build_lexical_rebuild_lookups().unwrap();
21780        let rebuild_listed = storage
21781            .list_conversations_for_lexical_rebuild_after_id(10, 0, &agent_slugs, &workspace_paths)
21782            .unwrap();
21783        assert_eq!(rebuild_listed.len(), 1);
21784        assert_eq!(rebuild_listed[0].source_id, "dev@laptop");
21785        assert_eq!(rebuild_listed[0].origin_host.as_deref(), Some("dev@laptop"));
21786    }
21787
21788    #[test]
21789    fn seed_canonical_from_best_historical_bundle_copies_data_and_resets_runtime_meta() {
21790        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
21791        use std::path::PathBuf;
21792
21793        let dir = TempDir::new().unwrap();
21794        let canonical_db = dir.path().join("agent_search.db");
21795        let source_db = dir
21796            .path()
21797            .join("backups/agent_search.db.20260322T020200.bak");
21798
21799        fs::create_dir_all(source_db.parent().unwrap()).unwrap();
21800
21801        let source = SqliteStorage::open(&source_db).unwrap();
21802        let agent = Agent {
21803            id: None,
21804            slug: "codex".into(),
21805            name: "Codex".into(),
21806            version: Some("0.2.3".into()),
21807            kind: AgentKind::Cli,
21808        };
21809        let agent_id = source.ensure_agent(&agent).unwrap();
21810        let conversation = Conversation {
21811            id: None,
21812            agent_slug: "codex".into(),
21813            workspace: Some(PathBuf::from("/tmp/workspace")),
21814            external_id: Some("seed-conv".into()),
21815            title: Some("Historical seed".into()),
21816            source_path: PathBuf::from("/tmp/historical-seed.jsonl"),
21817            started_at: Some(1_700_000_000_000),
21818            ended_at: Some(1_700_000_000_100),
21819            approx_tokens: Some(42),
21820            metadata_json: serde_json::json!({"seed": true}),
21821            messages: vec![Message {
21822                id: None,
21823                idx: 0,
21824                role: MessageRole::Agent,
21825                author: Some("assistant".into()),
21826                created_at: Some(1_700_000_000_050),
21827                content: "seeded message".into(),
21828                extra_json: serde_json::json!({"usage": {"total_tokens": 12}}),
21829                snippets: Vec::new(),
21830            }],
21831            source_id: LOCAL_SOURCE_ID.into(),
21832            origin_host: None,
21833        };
21834        source
21835            .insert_conversation_tree(agent_id, None, &conversation)
21836            .unwrap();
21837        source.set_last_scan_ts(123).unwrap();
21838        source.set_last_indexed_at(456).unwrap();
21839        source.set_last_embedded_message_id(789).unwrap();
21840        source
21841            .conn
21842            .execute_compat(
21843                "INSERT OR REPLACE INTO meta(key, value) VALUES(?1, ?2)",
21844                fparams!["historical_bundle_salvaged:stale", "{\"stale\":true}"],
21845            )
21846            .unwrap();
21847        drop(source);
21848
21849        #[cfg(not(windows))]
21850        {
21851            // Legacy "duplicate FTS" fixture reconstruction.
21852            //
21853            // Post-V14 migration cass drops the V13-era fts_messages virtual table
21854            // and recreates it lazily, so a freshly-opened canonical DB has zero
21855            // fts_messages entries in sqlite_master. To reproduce the historical
21856            // failure mode this test exercises — a legacy v13 bundle with a
21857            // duplicated CREATE VIRTUAL TABLE row — we have to inject *both*
21858            // entries: the original V13-era contentless row and the buggy duplicate
21859            // row. Before V14 existed the original was already present after
21860            // migration and only the duplicate needed manual injection.
21861            let legacy_v13_fts_sql = "CREATE VIRTUAL TABLE fts_messages USING fts5(content, title, agent, workspace, source_path, created_at UNINDEXED, content='', tokenize='porter')";
21862            let duplicate_legacy_fts_sql = "CREATE VIRTUAL TABLE fts_messages USING fts5(content, title, agent, workspace, source_path, created_at UNINDEXED, message_id UNINDEXED, tokenize='porter')";
21863            let legacy = rusqlite_test_fixture_conn(&source_db);
21864            legacy
21865                .execute_batch(
21866                    "UPDATE meta SET value = '13' WHERE key = 'schema_version';
21867                     DELETE FROM _schema_migrations WHERE version = 14;
21868                     PRAGMA writable_schema = ON;",
21869                )
21870                .unwrap();
21871            legacy
21872                .execute(
21873                    "DELETE FROM meta WHERE key = ?1",
21874                    [FTS_FRANKEN_REBUILD_META_KEY],
21875                )
21876                .unwrap();
21877            // Inject the V13 original first.
21878            legacy
21879                .execute(
21880                    "INSERT INTO sqlite_master(type, name, tbl_name, rootpage, sql)
21881                     VALUES('table', 'fts_messages', 'fts_messages', 0, ?1)",
21882                    [legacy_v13_fts_sql],
21883                )
21884                .unwrap();
21885            // Then the duplicate that's the real subject of the fixup logic.
21886            legacy
21887                .execute(
21888                    "INSERT INTO sqlite_master(type, name, tbl_name, rootpage, sql)
21889                     VALUES('table', 'fts_messages', 'fts_messages', 0, ?1)",
21890                    [duplicate_legacy_fts_sql],
21891                )
21892                .unwrap();
21893            legacy
21894                .execute_batch("PRAGMA writable_schema = OFF;")
21895                .unwrap();
21896            drop(legacy);
21897
21898            // Verify fixture with rusqlite+writable_schema to see raw
21899            // sqlite_master rows (frankensqlite deduplicates schema entries).
21900            {
21901                let verify = rusqlite_test_fixture_conn(&source_db);
21902                verify
21903                    .execute_batch("PRAGMA writable_schema = ON;")
21904                    .unwrap();
21905                let fts_entries: i64 = verify
21906                    .query_row(
21907                        "SELECT COUNT(*) FROM sqlite_master WHERE name = 'fts_messages'",
21908                        [],
21909                        |row| row.get(0),
21910                    )
21911                    .unwrap();
21912                assert_eq!(
21913                    fts_entries, 2,
21914                    "test fixture should reproduce the duplicate legacy fts_messages rows"
21915                );
21916                let msg_count: i64 = verify
21917                    .query_row("SELECT COUNT(*) FROM messages", [], |row| row.get(0))
21918                    .unwrap();
21919                assert_eq!(msg_count, 1);
21920            }
21921        }
21922
21923        let fresh = SqliteStorage::open(&canonical_db).unwrap();
21924        drop(fresh);
21925
21926        let outcome = seed_canonical_from_best_historical_bundle(&canonical_db)
21927            .unwrap()
21928            .unwrap();
21929        assert_eq!(outcome.bundles_imported, 1);
21930        assert_eq!(outcome.conversations_imported, 1);
21931        assert_eq!(outcome.messages_imported, 1);
21932
21933        let readonly = open_franken_with_flags(
21934            &canonical_db.to_string_lossy(),
21935            FrankenOpenFlags::SQLITE_OPEN_READ_ONLY,
21936        )
21937        .unwrap();
21938        let readonly_message_count: i64 = readonly
21939            .query_row_map("SELECT COUNT(*) FROM messages", fparams![], |row| {
21940                row.get_typed(0)
21941            })
21942            .unwrap();
21943        assert_eq!(readonly_message_count, 1);
21944
21945        let seeded = SqliteStorage::open(&canonical_db).unwrap();
21946        assert_eq!(
21947            seeded
21948                .count_sessions_in_range(None, None, None, None)
21949                .unwrap()
21950                .0,
21951            1
21952        );
21953        let message_count: i64 = seeded
21954            .conn
21955            .query_row_map("SELECT COUNT(*) FROM messages", fparams![], |row| {
21956                row.get_typed(0)
21957            })
21958            .unwrap();
21959        assert_eq!(message_count, 1);
21960        assert_eq!(seeded.get_last_scan_ts().unwrap(), None);
21961        assert_eq!(seeded.get_last_embedded_message_id().unwrap(), None);
21962
21963        let last_indexed: Option<String> = seeded
21964            .conn
21965            .query_row_map(
21966                "SELECT value FROM meta WHERE key = 'last_indexed_at'",
21967                fparams![],
21968                |row| row.get_typed(0),
21969            )
21970            .optional()
21971            .unwrap();
21972        assert!(last_indexed.is_none());
21973
21974        let salvage_keys: Vec<String> = seeded
21975            .conn
21976            .query_map_collect(
21977                "SELECT key FROM meta WHERE key LIKE 'historical_bundle_salvaged:%' ORDER BY key",
21978                fparams![],
21979                |row| row.get_typed(0),
21980            )
21981            .unwrap();
21982        assert_eq!(salvage_keys.len(), 1);
21983
21984        let reopened_readonly = open_franken_with_flags(
21985            &canonical_db.to_string_lossy(),
21986            FrankenOpenFlags::SQLITE_OPEN_READ_ONLY,
21987        )
21988        .unwrap();
21989        let reopened_fts_entries: i64 = reopened_readonly
21990            .query_row_map(
21991                "SELECT COUNT(*) FROM sqlite_master WHERE name = 'fts_messages'",
21992                fparams![],
21993                |row| row.get_typed(0),
21994            )
21995            .unwrap();
21996        assert_eq!(
21997            reopened_fts_entries, 1,
21998            "seeded canonical db should keep a single stock-SQLite fts_messages schema row"
21999        );
22000        let reopened_message_count: i64 = reopened_readonly
22001            .query_row_map("SELECT COUNT(*) FROM messages", fparams![], |row| {
22002                row.get_typed(0)
22003            })
22004            .unwrap();
22005        assert_eq!(reopened_message_count, 1);
22006
22007        let franken_seeded = FrankenStorage::open(&canonical_db).unwrap();
22008        assert_eq!(
22009            franken_seeded.schema_version().unwrap(),
22010            CURRENT_SCHEMA_VERSION
22011        );
22012        // Post-V14 fts_messages is recreated lazily. `FrankenStorage::open`
22013        // alone doesn't re-register the virtual table for the frankensqlite
22014        // query engine — the consistency pass does, and this is exactly what
22015        // normal cass startup runs before the first search. Invoke it
22016        // explicitly so the query below exercises the expected post-repair
22017        // state rather than the between-steps state.
22018        franken_seeded
22019            .ensure_search_fallback_fts_consistency()
22020            .expect("ensure FTS consistency after seed");
22021        let post_franken_schema_rows: i64 = franken_seeded
22022            .raw()
22023            .query_row_map(
22024                "SELECT COUNT(*) FROM sqlite_master WHERE name = 'fts_messages'",
22025                fparams![],
22026                |row| row.get_typed(0),
22027            )
22028            .unwrap();
22029        assert_eq!(post_franken_schema_rows, 1);
22030        let fts_probe = franken_seeded
22031            .raw()
22032            .query("SELECT COUNT(*) FROM fts_messages");
22033        assert!(
22034            fts_probe.is_ok(),
22035            "expected post-seed FTS to be queryable, got {fts_probe:?}"
22036        );
22037    }
22038
22039    #[test]
22040    fn failed_baseline_seed_preserves_existing_canonical_bundle() {
22041        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
22042        use std::path::PathBuf;
22043
22044        let dir = TempDir::new().unwrap();
22045        let canonical_db = dir.path().join("agent_search.db");
22046        let source_db = dir
22047            .path()
22048            .join("backups/agent_search.db.20260325T120000Z.bad-seed.bak");
22049
22050        fs::create_dir_all(source_db.parent().unwrap()).unwrap();
22051
22052        let canonical = SqliteStorage::open(&canonical_db).unwrap();
22053        canonical
22054            .conn
22055            .execute_compat(
22056                "INSERT OR REPLACE INTO meta(key, value) VALUES(?1, ?2)",
22057                fparams!["sentinel", "keep-me"],
22058            )
22059            .unwrap();
22060        drop(canonical);
22061
22062        let source = SqliteStorage::open(&source_db).unwrap();
22063        let agent = Agent {
22064            id: None,
22065            slug: "codex".into(),
22066            name: "Codex".into(),
22067            version: Some("0.2.3".into()),
22068            kind: AgentKind::Cli,
22069        };
22070        let agent_id = source.ensure_agent(&agent).unwrap();
22071        let conversation = Conversation {
22072            id: None,
22073            agent_slug: "codex".into(),
22074            workspace: Some(PathBuf::from("/tmp/workspace")),
22075            external_id: Some("bad-seed-conv".into()),
22076            title: Some("Bad seed".into()),
22077            source_path: PathBuf::from("/tmp/bad-seed.jsonl"),
22078            started_at: Some(1_700_000_000_000),
22079            ended_at: Some(1_700_000_000_100),
22080            approx_tokens: Some(42),
22081            metadata_json: serde_json::json!({"seed": "bad"}),
22082            messages: vec![Message {
22083                id: None,
22084                idx: 0,
22085                role: MessageRole::Agent,
22086                author: Some("assistant".into()),
22087                created_at: Some(1_700_000_000_050),
22088                content: "this seed should fail".into(),
22089                extra_json: serde_json::Value::Null,
22090                snippets: Vec::new(),
22091            }],
22092            source_id: LOCAL_SOURCE_ID.into(),
22093            origin_host: None,
22094        };
22095        source
22096            .insert_conversation_tree(agent_id, None, &conversation)
22097            .unwrap();
22098        drop(source);
22099
22100        let legacy = FrankenConnection::open(source_db.to_string_lossy().into_owned()).unwrap();
22101        legacy
22102            .execute("UPDATE meta SET value = '12' WHERE key = 'schema_version'")
22103            .unwrap();
22104        drop(legacy);
22105
22106        let err = seed_canonical_from_best_historical_bundle(&canonical_db).unwrap_err();
22107        assert!(
22108            err.to_string()
22109                .contains("schema_version 12 is too old for baseline import"),
22110            "unexpected seed error: {err:#}"
22111        );
22112
22113        let reopened = SqliteStorage::open(&canonical_db).unwrap();
22114        let sentinel: Option<String> = reopened
22115            .conn
22116            .query_row_map(
22117                "SELECT value FROM meta WHERE key = 'sentinel'",
22118                fparams![],
22119                |row| row.get_typed(0),
22120            )
22121            .optional()
22122            .unwrap();
22123        assert_eq!(sentinel.as_deref(), Some("keep-me"));
22124
22125        let conversation_count: i64 = reopened
22126            .conn
22127            .query_row_map("SELECT COUNT(*) FROM conversations", fparams![], |row| {
22128                row.get_typed(0)
22129            })
22130            .unwrap();
22131        assert_eq!(conversation_count, 0);
22132
22133        let readonly = open_franken_with_flags(
22134            &canonical_db.to_string_lossy(),
22135            FrankenOpenFlags::SQLITE_OPEN_READ_ONLY,
22136        )
22137        .unwrap();
22138        let readonly_conversation_count: i64 = readonly
22139            .query_row_map("SELECT COUNT(*) FROM conversations", fparams![], |row| {
22140                row.get_typed(0)
22141            })
22142            .unwrap();
22143        assert_eq!(readonly_conversation_count, 0);
22144    }
22145
22146    #[test]
22147    fn fetch_messages_for_lexical_rebuild_skips_extra_json() {
22148        let dir = TempDir::new().unwrap();
22149        let db_path = dir.path().join("test.db");
22150        let storage = SqliteStorage::open(&db_path).unwrap();
22151
22152        let agent = Agent {
22153            id: None,
22154            slug: "codex".into(),
22155            name: "Codex".into(),
22156            version: Some("0.2.3".into()),
22157            kind: AgentKind::Cli,
22158        };
22159        let agent_id = storage.ensure_agent(&agent).unwrap();
22160
22161        let conversation = Conversation {
22162            id: None,
22163            agent_slug: "codex".into(),
22164            workspace: Some(PathBuf::from("/tmp/workspace")),
22165            external_id: Some("lexical-rebuild-test".into()),
22166            title: Some("Lexical rebuild".into()),
22167            source_path: PathBuf::from("/tmp/lexical-rebuild.jsonl"),
22168            started_at: Some(1_700_000_000_000),
22169            ended_at: Some(1_700_000_000_100),
22170            approx_tokens: Some(42),
22171            metadata_json: serde_json::Value::Null,
22172            messages: vec![Message {
22173                id: None,
22174                idx: 0,
22175                role: MessageRole::Agent,
22176                author: Some("assistant".into()),
22177                created_at: Some(1_700_000_000_050),
22178                content: "indexed text".into(),
22179                extra_json: serde_json::json!({
22180                    "usage": { "total_tokens": 1234 },
22181                    "irrelevant_blob": "still preserved in canonical storage"
22182                }),
22183                snippets: Vec::new(),
22184            }],
22185            source_id: LOCAL_SOURCE_ID.into(),
22186            origin_host: None,
22187        };
22188
22189        let inserted = storage
22190            .insert_conversation_tree(agent_id, None, &conversation)
22191            .unwrap();
22192        let conversation_id = inserted.conversation_id;
22193
22194        let stored = storage.fetch_messages(conversation_id).unwrap();
22195        assert_eq!(stored.len(), 1);
22196        assert!(!stored[0].extra_json.is_null());
22197
22198        let lexical = storage
22199            .fetch_messages_for_lexical_rebuild(conversation_id)
22200            .unwrap();
22201        assert_eq!(lexical.len(), 1);
22202        assert_eq!(lexical[0].content, "indexed text");
22203        assert_eq!(lexical[0].author.as_deref(), Some("assistant"));
22204        assert!(lexical[0].extra_json.is_null());
22205    }
22206
22207    #[test]
22208    fn fetch_messages_for_lexical_rebuild_batch_groups_and_orders_messages() {
22209        let dir = TempDir::new().unwrap();
22210        let db_path = dir.path().join("test.db");
22211        let storage = SqliteStorage::open(&db_path).unwrap();
22212
22213        let agent = Agent {
22214            id: None,
22215            slug: "codex".into(),
22216            name: "Codex".into(),
22217            version: Some("0.2.3".into()),
22218            kind: AgentKind::Cli,
22219        };
22220        let agent_id = storage.ensure_agent(&agent).unwrap();
22221
22222        let first = Conversation {
22223            id: None,
22224            agent_slug: "codex".into(),
22225            workspace: Some(PathBuf::from("/tmp/workspace")),
22226            external_id: Some("lexical-batch-1".into()),
22227            title: Some("Lexical batch 1".into()),
22228            source_path: PathBuf::from("/tmp/lexical-batch-1.jsonl"),
22229            started_at: Some(1_700_000_000_000),
22230            ended_at: Some(1_700_000_000_100),
22231            approx_tokens: Some(42),
22232            metadata_json: serde_json::Value::Null,
22233            messages: vec![
22234                Message {
22235                    id: None,
22236                    idx: 0,
22237                    role: MessageRole::User,
22238                    author: Some("user".into()),
22239                    created_at: Some(1_700_000_000_010),
22240                    content: "first-a".into(),
22241                    extra_json: serde_json::json!({"opaque": true}),
22242                    snippets: Vec::new(),
22243                },
22244                Message {
22245                    id: None,
22246                    idx: 1,
22247                    role: MessageRole::Agent,
22248                    author: Some("assistant".into()),
22249                    created_at: Some(1_700_000_000_020),
22250                    content: "first-b".into(),
22251                    extra_json: serde_json::json!({"opaque": true}),
22252                    snippets: Vec::new(),
22253                },
22254            ],
22255            source_id: LOCAL_SOURCE_ID.into(),
22256            origin_host: None,
22257        };
22258
22259        let second = Conversation {
22260            id: None,
22261            agent_slug: "codex".into(),
22262            workspace: Some(PathBuf::from("/tmp/workspace")),
22263            external_id: Some("lexical-batch-2".into()),
22264            title: Some("Lexical batch 2".into()),
22265            source_path: PathBuf::from("/tmp/lexical-batch-2.jsonl"),
22266            started_at: Some(1_700_000_000_200),
22267            ended_at: Some(1_700_000_000_300),
22268            approx_tokens: Some(84),
22269            metadata_json: serde_json::Value::Null,
22270            messages: vec![Message {
22271                id: None,
22272                idx: 0,
22273                role: MessageRole::Tool,
22274                author: Some("tool".into()),
22275                created_at: Some(1_700_000_000_210),
22276                content: "second-a".into(),
22277                extra_json: serde_json::json!({"opaque": true}),
22278                snippets: Vec::new(),
22279            }],
22280            source_id: LOCAL_SOURCE_ID.into(),
22281            origin_host: None,
22282        };
22283        let third = Conversation {
22284            external_id: Some("lexical-batch-3".into()),
22285            title: Some("Lexical batch 3".into()),
22286            source_path: PathBuf::from("/tmp/lexical-batch-3.jsonl"),
22287            messages: vec![Message {
22288                id: None,
22289                idx: 0,
22290                role: MessageRole::System,
22291                author: Some("system".into()),
22292                created_at: Some(1_700_000_000_410),
22293                content: "third-a".into(),
22294                extra_json: serde_json::json!({"opaque": true}),
22295                snippets: Vec::new(),
22296            }],
22297            ..second.clone()
22298        };
22299
22300        let first_id = storage
22301            .insert_conversation_tree(agent_id, None, &first)
22302            .unwrap()
22303            .conversation_id;
22304        let second_id = storage
22305            .insert_conversation_tree(agent_id, None, &second)
22306            .unwrap()
22307            .conversation_id;
22308        let third_id = storage
22309            .insert_conversation_tree(agent_id, None, &third)
22310            .unwrap()
22311            .conversation_id;
22312
22313        let lexical = storage
22314            .fetch_messages_for_lexical_rebuild_batch(&[third_id, first_id], None, None)
22315            .unwrap();
22316
22317        let first_messages = lexical.get(&first_id).expect("first conversation");
22318        assert_eq!(first_messages.len(), 2);
22319        assert_eq!(first_messages[0].content, "first-a");
22320        assert_eq!(first_messages[1].content, "first-b");
22321        assert!(
22322            first_messages
22323                .iter()
22324                .all(|message| message.extra_json.is_null())
22325        );
22326
22327        assert!(
22328            !lexical.contains_key(&second_id),
22329            "batch fetch must exclude conversations not requested by the caller"
22330        );
22331
22332        let third_messages = lexical.get(&third_id).expect("third conversation");
22333        assert_eq!(third_messages.len(), 1);
22334        assert_eq!(third_messages[0].content, "third-a");
22335        assert!(third_messages[0].extra_json.is_null());
22336    }
22337
22338    #[test]
22339    fn fetch_messages_for_lexical_rebuild_batch_enforces_content_byte_guardrail() {
22340        let dir = TempDir::new().unwrap();
22341        let db_path = dir.path().join("test.db");
22342        let storage = SqliteStorage::open(&db_path).unwrap();
22343
22344        let agent = Agent {
22345            id: None,
22346            slug: "codex".into(),
22347            name: "Codex".into(),
22348            version: Some("0.2.3".into()),
22349            kind: AgentKind::Cli,
22350        };
22351        let agent_id = storage.ensure_agent(&agent).unwrap();
22352
22353        let conversation = Conversation {
22354            id: None,
22355            agent_slug: "codex".into(),
22356            workspace: Some(PathBuf::from("/tmp/workspace")),
22357            external_id: Some("lexical-batch-guard".into()),
22358            title: Some("Lexical batch guard".into()),
22359            source_path: PathBuf::from("/tmp/lexical-batch-guard.jsonl"),
22360            started_at: Some(1_700_000_000_000),
22361            ended_at: Some(1_700_000_000_100),
22362            approx_tokens: Some(42),
22363            metadata_json: serde_json::Value::Null,
22364            messages: vec![
22365                Message {
22366                    id: None,
22367                    idx: 0,
22368                    role: MessageRole::User,
22369                    author: Some("user".into()),
22370                    created_at: Some(1_700_000_000_010),
22371                    content: "123456".into(),
22372                    extra_json: serde_json::Value::Null,
22373                    snippets: Vec::new(),
22374                },
22375                Message {
22376                    id: None,
22377                    idx: 1,
22378                    role: MessageRole::Agent,
22379                    author: Some("assistant".into()),
22380                    created_at: Some(1_700_000_000_020),
22381                    content: "abcdef".into(),
22382                    extra_json: serde_json::Value::Null,
22383                    snippets: Vec::new(),
22384                },
22385            ],
22386            source_id: LOCAL_SOURCE_ID.into(),
22387            origin_host: None,
22388        };
22389
22390        let conversation_id = storage
22391            .insert_conversation_tree(agent_id, None, &conversation)
22392            .unwrap()
22393            .conversation_id;
22394
22395        let error = storage
22396            .fetch_messages_for_lexical_rebuild_batch(&[conversation_id], Some(10), Some(8))
22397            .expect_err("guardrail should reject oversized batch content");
22398
22399        let message = format!("{error:#}");
22400        assert!(
22401            message.contains("content-byte guardrail"),
22402            "expected guardrail reason in error, got {message}"
22403        );
22404    }
22405
22406    #[test]
22407    fn fetch_messages_handles_manual_rows_inserted_via_raw_connection() {
22408        let dir = TempDir::new().unwrap();
22409        let db_path = dir.path().join("manual-rows.db");
22410        let storage = FrankenStorage::open(&db_path).unwrap();
22411        let conn = storage.raw();
22412
22413        conn.execute(
22414            "INSERT INTO agents (id, slug, name, kind, created_at, updated_at)
22415             VALUES (1, 'claude_code', 'Claude Code', 'local', 0, 0)",
22416        )
22417        .unwrap();
22418        conn.execute(
22419            "INSERT INTO conversations
22420             (id, agent_id, external_id, title, source_path, source_id, started_at)
22421             VALUES (1, 1, 'manual-ext', 'Manual Session', '/tmp/manual.jsonl', 'local', 200)",
22422        )
22423        .unwrap();
22424        conn.execute(
22425            "INSERT INTO messages
22426             (id, conversation_id, idx, role, author, created_at, content, extra_json, extra_bin)
22427             VALUES (1, 1, 0, 'user', 'tester', 1700000000000, 'manual body', '{\"k\":1}', NULL)",
22428        )
22429        .unwrap();
22430
22431        let lexical = storage.fetch_messages_for_lexical_rebuild(1).unwrap();
22432        assert_eq!(lexical.len(), 1);
22433        assert_eq!(lexical[0].content, "manual body");
22434
22435        let full = storage.fetch_messages(1).unwrap();
22436        assert_eq!(full.len(), 1);
22437        assert_eq!(full[0].content, "manual body");
22438        assert_eq!(full[0].author.as_deref(), Some("tester"));
22439        assert_eq!(full[0].extra_json, serde_json::json!({ "k": 1 }));
22440    }
22441
22442    #[test]
22443    fn lexical_rebuild_batch_messages_query_avoids_sorter_temp_btrees() {
22444        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
22445        use std::path::PathBuf;
22446
22447        let dir = TempDir::new().unwrap();
22448        let db_path = dir.path().join("agent_search.db");
22449        let storage = SqliteStorage::open(&db_path).unwrap();
22450
22451        let agent = Agent {
22452            id: None,
22453            slug: "claude_code".into(),
22454            name: "Claude Code".into(),
22455            version: None,
22456            kind: AgentKind::Cli,
22457        };
22458        let agent_id = storage.ensure_agent(&agent).unwrap();
22459
22460        for (external_id, base_ts) in [
22461            ("conv-1", 1_700_000_000_000_i64),
22462            ("conv-2", 1_700_000_001_000_i64),
22463        ] {
22464            let conversation = Conversation {
22465                id: None,
22466                agent_slug: "claude_code".into(),
22467                workspace: Some(PathBuf::from("/tmp/workspace")),
22468                external_id: Some(external_id.to_string()),
22469                title: Some("Lexical rebuild".into()),
22470                source_path: PathBuf::from(format!("/tmp/{external_id}.jsonl")),
22471                started_at: Some(base_ts),
22472                ended_at: Some(base_ts + 100),
22473                approx_tokens: None,
22474                metadata_json: serde_json::Value::Null,
22475                messages: vec![
22476                    Message {
22477                        id: None,
22478                        idx: 0,
22479                        role: MessageRole::User,
22480                        author: Some("user".into()),
22481                        created_at: Some(base_ts + 10),
22482                        content: format!("{external_id}-first"),
22483                        extra_json: serde_json::Value::Null,
22484                        snippets: Vec::new(),
22485                    },
22486                    Message {
22487                        id: None,
22488                        idx: 1,
22489                        role: MessageRole::Agent,
22490                        author: Some("assistant".into()),
22491                        created_at: Some(base_ts + 20),
22492                        content: format!("{external_id}-second"),
22493                        extra_json: serde_json::Value::Null,
22494                        snippets: Vec::new(),
22495                    },
22496                ],
22497                source_id: LOCAL_SOURCE_ID.into(),
22498                origin_host: None,
22499            };
22500            storage
22501                .insert_conversation_tree(agent_id, None, &conversation)
22502                .unwrap();
22503        }
22504
22505        let conversation_ids: Vec<i64> = storage
22506            .conn
22507            .query_map_collect(
22508                "SELECT id FROM conversations ORDER BY id",
22509                fparams![],
22510                |row| row.get_typed(0),
22511            )
22512            .unwrap();
22513        assert_eq!(conversation_ids.len(), 2);
22514
22515        let plan_details: Vec<String> = storage
22516            .conn
22517            .query_map_collect(
22518                "EXPLAIN QUERY PLAN \
22519                 SELECT conversation_id, id, idx, role, author, created_at, content \
22520                 FROM messages \
22521                 WHERE conversation_id IN (?1, ?2) \
22522                 ORDER BY conversation_id ASC, idx ASC",
22523                fparams![conversation_ids[0], conversation_ids[1]],
22524                |row| row.get_typed(3),
22525            )
22526            .unwrap();
22527
22528        assert!(
22529            plan_details
22530                .iter()
22531                .any(|detail| detail.contains("sqlite_autoindex_messages_1")),
22532            "expected batched lexical rebuild fetch to use the conversation_id/idx composite index, got {plan_details:?}"
22533        );
22534        assert!(
22535            !plan_details
22536                .iter()
22537                .any(|detail| detail.contains("TEMP B-TREE")),
22538            "expected batched lexical rebuild fetch to avoid sorter temp b-trees, got {plan_details:?}"
22539        );
22540    }
22541
22542    #[test]
22543    fn stream_messages_for_lexical_rebuild_groups_and_orders_messages() {
22544        let dir = TempDir::new().unwrap();
22545        let db_path = dir.path().join("test.db");
22546        let storage = SqliteStorage::open(&db_path).unwrap();
22547
22548        let agent = Agent {
22549            id: None,
22550            slug: "codex".into(),
22551            name: "Codex".into(),
22552            version: Some("0.2.3".into()),
22553            kind: AgentKind::Cli,
22554        };
22555        let agent_id = storage.ensure_agent(&agent).unwrap();
22556
22557        let first = Conversation {
22558            id: None,
22559            agent_slug: "codex".into(),
22560            workspace: Some(PathBuf::from("/tmp/workspace")),
22561            external_id: Some("lexical-stream-1".into()),
22562            title: Some("Lexical stream 1".into()),
22563            source_path: PathBuf::from("/tmp/lexical-stream-1.jsonl"),
22564            started_at: Some(1_700_000_000_000),
22565            ended_at: Some(1_700_000_000_100),
22566            approx_tokens: Some(42),
22567            metadata_json: serde_json::Value::Null,
22568            messages: vec![
22569                Message {
22570                    id: None,
22571                    idx: 0,
22572                    role: MessageRole::User,
22573                    author: Some("user".into()),
22574                    created_at: Some(1_700_000_000_010),
22575                    content: "first-a".into(),
22576                    extra_json: serde_json::json!({"opaque": true}),
22577                    snippets: Vec::new(),
22578                },
22579                Message {
22580                    id: None,
22581                    idx: 1,
22582                    role: MessageRole::Agent,
22583                    author: Some("assistant".into()),
22584                    created_at: Some(1_700_000_000_020),
22585                    content: "first-b".into(),
22586                    extra_json: serde_json::json!({"opaque": true}),
22587                    snippets: Vec::new(),
22588                },
22589            ],
22590            source_id: LOCAL_SOURCE_ID.into(),
22591            origin_host: None,
22592        };
22593
22594        let second = Conversation {
22595            id: None,
22596            agent_slug: "codex".into(),
22597            workspace: Some(PathBuf::from("/tmp/workspace")),
22598            external_id: Some("lexical-stream-2".into()),
22599            title: Some("Lexical stream 2".into()),
22600            source_path: PathBuf::from("/tmp/lexical-stream-2.jsonl"),
22601            started_at: Some(1_700_000_000_200),
22602            ended_at: Some(1_700_000_000_300),
22603            approx_tokens: Some(84),
22604            metadata_json: serde_json::Value::Null,
22605            messages: vec![Message {
22606                id: None,
22607                idx: 0,
22608                role: MessageRole::Tool,
22609                author: Some("tool".into()),
22610                created_at: Some(1_700_000_000_210),
22611                content: "second-a".into(),
22612                extra_json: serde_json::json!({"opaque": true}),
22613                snippets: Vec::new(),
22614            }],
22615            source_id: LOCAL_SOURCE_ID.into(),
22616            origin_host: None,
22617        };
22618
22619        let first_id = storage
22620            .insert_conversation_tree(agent_id, None, &first)
22621            .unwrap()
22622            .conversation_id;
22623        let second_id = storage
22624            .insert_conversation_tree(agent_id, None, &second)
22625            .unwrap()
22626            .conversation_id;
22627
22628        let mut streamed = Vec::new();
22629        storage
22630            .stream_messages_for_lexical_rebuild_from_conversation_id(first_id, |row| {
22631                streamed.push((
22632                    row.conversation_id,
22633                    row.idx,
22634                    row.role,
22635                    row.author,
22636                    row.content,
22637                ));
22638                Ok(())
22639            })
22640            .unwrap();
22641
22642        assert_eq!(
22643            streamed,
22644            vec![
22645                (
22646                    first_id,
22647                    0,
22648                    "user".to_string(),
22649                    Some("user".to_string()),
22650                    "first-a".to_string(),
22651                ),
22652                (
22653                    first_id,
22654                    1,
22655                    "agent".to_string(),
22656                    Some("assistant".to_string()),
22657                    "first-b".to_string(),
22658                ),
22659                (
22660                    second_id,
22661                    0,
22662                    "tool".to_string(),
22663                    Some("tool".to_string()),
22664                    "second-a".to_string(),
22665                ),
22666            ]
22667        );
22668    }
22669
22670    #[test]
22671    fn stream_messages_for_lexical_rebuild_between_conversation_ids_respects_upper_bound() {
22672        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
22673        use std::path::PathBuf;
22674
22675        let dir = TempDir::new().unwrap();
22676        let db_path = dir.path().join("agent_search.db");
22677        let storage = SqliteStorage::open(&db_path).unwrap();
22678
22679        let agent = Agent {
22680            id: None,
22681            slug: "claude_code".into(),
22682            name: "Claude Code".into(),
22683            version: Some("1.2.3".into()),
22684            kind: AgentKind::Cli,
22685        };
22686        let agent_id = storage.ensure_agent(&agent).unwrap();
22687
22688        let first = Conversation {
22689            id: None,
22690            agent_slug: "claude_code".into(),
22691            workspace: Some(PathBuf::from("/tmp/workspace")),
22692            external_id: Some("lexical-range-1".into()),
22693            title: Some("Lexical range 1".into()),
22694            source_path: PathBuf::from("/tmp/lexical-range-1.jsonl"),
22695            started_at: Some(1_700_000_000_000),
22696            ended_at: Some(1_700_000_000_100),
22697            approx_tokens: Some(42),
22698            metadata_json: serde_json::Value::Null,
22699            messages: vec![Message {
22700                id: None,
22701                idx: 0,
22702                role: MessageRole::User,
22703                author: Some("user".into()),
22704                created_at: Some(1_700_000_000_010),
22705                content: "first-only".into(),
22706                extra_json: serde_json::json!({"opaque": true}),
22707                snippets: Vec::new(),
22708            }],
22709            source_id: LOCAL_SOURCE_ID.into(),
22710            origin_host: None,
22711        };
22712
22713        let second = Conversation {
22714            id: None,
22715            agent_slug: "claude_code".into(),
22716            workspace: Some(PathBuf::from("/tmp/workspace")),
22717            external_id: Some("lexical-range-2".into()),
22718            title: Some("Lexical range 2".into()),
22719            source_path: PathBuf::from("/tmp/lexical-range-2.jsonl"),
22720            started_at: Some(1_700_000_000_200),
22721            ended_at: Some(1_700_000_000_300),
22722            approx_tokens: Some(84),
22723            metadata_json: serde_json::Value::Null,
22724            messages: vec![Message {
22725                id: None,
22726                idx: 0,
22727                role: MessageRole::Tool,
22728                author: Some("tool".into()),
22729                created_at: Some(1_700_000_000_210),
22730                content: "second-should-not-appear".into(),
22731                extra_json: serde_json::json!({"opaque": true}),
22732                snippets: Vec::new(),
22733            }],
22734            source_id: LOCAL_SOURCE_ID.into(),
22735            origin_host: None,
22736        };
22737
22738        let first_id = storage
22739            .insert_conversation_tree(agent_id, None, &first)
22740            .unwrap()
22741            .conversation_id;
22742        let second_id = storage
22743            .insert_conversation_tree(agent_id, None, &second)
22744            .unwrap()
22745            .conversation_id;
22746
22747        let mut streamed = Vec::new();
22748        storage
22749            .stream_messages_for_lexical_rebuild_between_conversation_ids(
22750                first_id,
22751                first_id,
22752                |row| {
22753                    streamed.push((row.conversation_id, row.idx, row.content));
22754                    Ok(())
22755                },
22756            )
22757            .unwrap();
22758
22759        assert_eq!(streamed, vec![(first_id, 0, "first-only".to_string())]);
22760        assert!(
22761            streamed
22762                .iter()
22763                .all(|(conversation_id, _, _)| *conversation_id != second_id),
22764            "upper bound should exclude later conversation ids"
22765        );
22766    }
22767
22768    #[test]
22769    fn stream_messages_for_lexical_rebuild_between_conversation_ids_handles_mixed_ranges() {
22770        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
22771        use std::path::PathBuf;
22772
22773        let dir = TempDir::new().unwrap();
22774        let db_path = dir.path().join("agent_search.db");
22775        let storage = SqliteStorage::open(&db_path).unwrap();
22776
22777        let claude_agent_id = storage
22778            .ensure_agent(&Agent {
22779                id: None,
22780                slug: "claude_code".into(),
22781                name: "Claude Code".into(),
22782                version: None,
22783                kind: AgentKind::Cli,
22784            })
22785            .unwrap();
22786        let aider_agent_id = storage
22787            .ensure_agent(&Agent {
22788                id: None,
22789                slug: "aider".into(),
22790                name: "Aider".into(),
22791                version: None,
22792                kind: AgentKind::Cli,
22793            })
22794            .unwrap();
22795
22796        type MessageSpec = (i64, MessageRole, Option<String>, Option<i64>, String);
22797
22798        let mut expected = Vec::new();
22799        let mut first_conversation_id = None;
22800        let mut last_conversation_id = None;
22801        let mut insert_conversation =
22802            |agent_id: i64,
22803             external_id: &str,
22804             title: &str,
22805             source_path: &str,
22806             started_at: i64,
22807             message_specs: Vec<MessageSpec>| {
22808                let conversation = Conversation {
22809                    id: None,
22810                    agent_slug: if agent_id == aider_agent_id {
22811                        "aider".into()
22812                    } else {
22813                        "claude_code".into()
22814                    },
22815                    workspace: Some(PathBuf::from("/tmp/workspace")),
22816                    external_id: Some(external_id.to_string()),
22817                    title: Some(title.to_string()),
22818                    source_path: PathBuf::from(source_path),
22819                    started_at: Some(started_at),
22820                    ended_at: Some(started_at + 100),
22821                    approx_tokens: None,
22822                    metadata_json: serde_json::Value::Null,
22823                    messages: message_specs
22824                        .iter()
22825                        .map(|(idx, role, author, created_at, content)| Message {
22826                            id: None,
22827                            idx: *idx,
22828                            role: role.clone(),
22829                            author: author.clone(),
22830                            created_at: *created_at,
22831                            content: content.clone(),
22832                            extra_json: serde_json::Value::Null,
22833                            snippets: Vec::new(),
22834                        })
22835                        .collect(),
22836                    source_id: LOCAL_SOURCE_ID.into(),
22837                    origin_host: None,
22838                };
22839                let conversation_id = storage
22840                    .insert_conversation_tree(agent_id, None, &conversation)
22841                    .unwrap()
22842                    .conversation_id;
22843                if first_conversation_id.is_none() {
22844                    first_conversation_id = Some(conversation_id);
22845                }
22846                last_conversation_id = Some(conversation_id);
22847                expected.extend(message_specs.into_iter().map(
22848                    |(idx, role, author, created_at, content)| {
22849                        (
22850                            conversation_id,
22851                            idx,
22852                            match role {
22853                                MessageRole::User => "user".to_string(),
22854                                MessageRole::Agent => "agent".to_string(),
22855                                MessageRole::Tool => "tool".to_string(),
22856                                MessageRole::System => "system".to_string(),
22857                                MessageRole::Other(other) => other,
22858                            },
22859                            author,
22860                            created_at,
22861                            content,
22862                        )
22863                    },
22864                ));
22865            };
22866
22867        for (label, base_ts) in [
22868            ("alpha", 1_700_000_000_000_i64),
22869            ("beta", 1_700_000_001_000_i64),
22870            ("gamma", 1_700_000_002_000_i64),
22871            ("delta", 1_700_000_003_000_i64),
22872            ("epsilon", 1_700_000_004_000_i64),
22873        ] {
22874            insert_conversation(
22875                claude_agent_id,
22876                &format!("lexical-{label}"),
22877                &format!("Lexical {label}"),
22878                &format!("/tmp/{label}.jsonl"),
22879                base_ts,
22880                vec![
22881                    (
22882                        0,
22883                        MessageRole::User,
22884                        None,
22885                        Some(base_ts + 10),
22886                        format!("{label}_content"),
22887                    ),
22888                    (
22889                        1,
22890                        MessageRole::Agent,
22891                        None,
22892                        Some(base_ts + 20),
22893                        format!("{label}_content_response"),
22894                    ),
22895                ],
22896            );
22897        }
22898
22899        insert_conversation(
22900            aider_agent_id,
22901            "lexical-aider-history",
22902            "Aider Chat: coding_agent_session_search",
22903            "/tmp/.aider.chat.history.md",
22904            1_764_619_673_394,
22905            vec![
22906                (
22907                    0,
22908                    MessageRole::System,
22909                    Some("system".to_string()),
22910                    None,
22911                    "# aider chat started at 2025-12-01 20:07:47".to_string(),
22912                ),
22913                (
22914                    1,
22915                    MessageRole::User,
22916                    Some("user".to_string()),
22917                    None,
22918                    "/tmp/workspace/.venv/bin/aider --no-git --message hello world".to_string(),
22919                ),
22920            ],
22921        );
22922        insert_conversation(
22923            aider_agent_id,
22924            "lexical-aider-fixture",
22925            "Aider Chat: aider",
22926            "/tmp/tests/fixtures/aider/.aider.chat.history.md",
22927            1_764_621_401_399,
22928            vec![
22929                (
22930                    0,
22931                    MessageRole::User,
22932                    Some("user".to_string()),
22933                    None,
22934                    "/add src/main.rs".to_string(),
22935                ),
22936                (
22937                    1,
22938                    MessageRole::Agent,
22939                    Some("assistant".to_string()),
22940                    None,
22941                    "Added src/main.rs to the chat.
22942
22943#### /add src/main.rs"
22944                        .to_string(),
22945                ),
22946                (
22947                    2,
22948                    MessageRole::User,
22949                    Some("user".to_string()),
22950                    None,
22951                    "Please refactor.".to_string(),
22952                ),
22953                (
22954                    3,
22955                    MessageRole::Agent,
22956                    Some("assistant".to_string()),
22957                    None,
22958                    "Sure, here is the code.".to_string(),
22959                ),
22960            ],
22961        );
22962
22963        let mut streamed = Vec::new();
22964        storage
22965            .stream_messages_for_lexical_rebuild_between_conversation_ids(
22966                first_conversation_id.unwrap(),
22967                last_conversation_id.unwrap(),
22968                |row| {
22969                    streamed.push((
22970                        row.conversation_id,
22971                        row.idx,
22972                        row.role,
22973                        row.author,
22974                        row.created_at,
22975                        row.content,
22976                    ));
22977                    Ok(())
22978                },
22979            )
22980            .unwrap();
22981
22982        assert_eq!(streamed, expected);
22983    }
22984
22985    #[test]
22986    fn lexical_rebuild_stream_queries_use_rowid_and_per_conversation_probes() {
22987        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
22988        use std::path::PathBuf;
22989
22990        let dir = TempDir::new().unwrap();
22991        let db_path = dir.path().join("agent_search.db");
22992        let storage = SqliteStorage::open(&db_path).unwrap();
22993
22994        let agent = Agent {
22995            id: None,
22996            slug: "claude_code".into(),
22997            name: "Claude Code".into(),
22998            version: None,
22999            kind: AgentKind::Cli,
23000        };
23001        let agent_id = storage.ensure_agent(&agent).unwrap();
23002
23003        for (external_id, base_ts) in [
23004            ("conv-1", 1_700_000_000_000_i64),
23005            ("conv-2", 1_700_000_001_000_i64),
23006        ] {
23007            let conversation = Conversation {
23008                id: None,
23009                agent_slug: "claude_code".into(),
23010                workspace: Some(PathBuf::from("/tmp/workspace")),
23011                external_id: Some(external_id.to_string()),
23012                title: Some("Lexical rebuild".into()),
23013                source_path: PathBuf::from(format!("/tmp/{external_id}.jsonl")),
23014                started_at: Some(base_ts),
23015                ended_at: Some(base_ts + 100),
23016                approx_tokens: None,
23017                metadata_json: serde_json::Value::Null,
23018                messages: vec![
23019                    Message {
23020                        id: None,
23021                        idx: 0,
23022                        role: MessageRole::User,
23023                        author: Some("user".into()),
23024                        created_at: Some(base_ts + 10),
23025                        content: format!("{external_id}-first"),
23026                        extra_json: serde_json::Value::Null,
23027                        snippets: Vec::new(),
23028                    },
23029                    Message {
23030                        id: None,
23031                        idx: 1,
23032                        role: MessageRole::Agent,
23033                        author: Some("assistant".into()),
23034                        created_at: Some(base_ts + 20),
23035                        content: format!("{external_id}-second"),
23036                        extra_json: serde_json::Value::Null,
23037                        snippets: Vec::new(),
23038                    },
23039                ],
23040                source_id: LOCAL_SOURCE_ID.into(),
23041                origin_host: None,
23042            };
23043            storage
23044                .insert_conversation_tree(agent_id, None, &conversation)
23045                .unwrap();
23046        }
23047
23048        let first_id: i64 = storage
23049            .conn
23050            .query_row_map(
23051                "SELECT id FROM conversations ORDER BY id LIMIT 1",
23052                fparams![],
23053                |row| row.get_typed(0),
23054            )
23055            .unwrap();
23056        let last_id: i64 = storage
23057            .conn
23058            .query_row_map(
23059                "SELECT id FROM conversations ORDER BY id DESC LIMIT 1",
23060                fparams![],
23061                |row| row.get_typed(0),
23062            )
23063            .unwrap();
23064
23065        let conversation_plan_details: Vec<String> = storage
23066            .conn
23067            .query_map_collect(
23068                "EXPLAIN QUERY PLAN                  SELECT id FROM conversations                  WHERE id >= ?1 AND id <= ?2                  ORDER BY id ASC",
23069                fparams![first_id, last_id],
23070                |row| row.get_typed(3),
23071            )
23072            .unwrap();
23073        assert!(
23074            !conversation_plan_details
23075                .iter()
23076                .any(|detail| detail.contains("TEMP B-TREE")),
23077            "expected streamed lexical rebuild conversation listing to avoid sorter temp b-trees, got {conversation_plan_details:?}"
23078        );
23079
23080        let message_plan_details: Vec<String> = storage
23081            .conn
23082            .query_map_collect(
23083                "EXPLAIN QUERY PLAN                  SELECT id, idx, role, author, created_at, content                  FROM messages INDEXED BY sqlite_autoindex_messages_1                  WHERE conversation_id = ?1                  ORDER BY idx",
23084                fparams![first_id],
23085                |row| row.get_typed(3),
23086            )
23087            .unwrap();
23088        assert!(
23089            message_plan_details
23090                .iter()
23091                .any(|detail| detail.contains("sqlite_autoindex_messages_1")
23092                    || detail.contains("idx_messages_conv_idx")),
23093            "expected per-conversation lexical rebuild fetch to use the conversation_id/idx index, got {message_plan_details:?}"
23094        );
23095        assert!(
23096            !message_plan_details
23097                .iter()
23098                .any(|detail| detail.contains("TEMP B-TREE")),
23099            "expected per-conversation lexical rebuild fetch to avoid sorter temp b-trees, got {message_plan_details:?}"
23100        );
23101    }
23102
23103    #[test]
23104    fn discover_historical_database_bundles_prefers_larger_archives_first() {
23105        let dir = TempDir::new().unwrap();
23106        let canonical_db = dir.path().join("agent_search.db");
23107        fs::write(&canonical_db, b"canonical").unwrap();
23108
23109        let smaller = dir.path().join("agent_search.corrupt.small");
23110        fs::write(&smaller, vec![0_u8; 32]).unwrap();
23111
23112        let backups_dir = dir.path().join("backups");
23113        fs::create_dir_all(&backups_dir).unwrap();
23114        let larger = backups_dir.join("agent_search.db.20260322T020200.bak");
23115        fs::write(&larger, vec![0_u8; 128]).unwrap();
23116
23117        let bundles = discover_historical_database_bundles(&canonical_db);
23118        let ordered_paths: Vec<PathBuf> =
23119            bundles.into_iter().map(|bundle| bundle.root_path).collect();
23120
23121        assert_eq!(ordered_paths, vec![larger, smaller]);
23122    }
23123
23124    #[test]
23125    fn discover_historical_database_bundles_prefers_queryable_direct_bundles_first() {
23126        let dir = TempDir::new().unwrap();
23127        let canonical_db = dir.path().join("agent_search.db");
23128        fs::write(&canonical_db, b"canonical").unwrap();
23129
23130        let larger_corrupt = dir.path().join("agent_search.corrupt.20260324_212907");
23131        fs::write(&larger_corrupt, vec![0_u8; 4096]).unwrap();
23132
23133        let backups_dir = dir.path().join("backups");
23134        fs::create_dir_all(&backups_dir).unwrap();
23135        let smaller_healthy = backups_dir.join("agent_search.db.20260322T020200.bak");
23136        let conn = FrankenConnection::open(smaller_healthy.to_string_lossy().into_owned()).unwrap();
23137        conn.execute_batch(
23138            "CREATE TABLE conversations (id INTEGER PRIMARY KEY, source_path TEXT);
23139             CREATE TABLE messages (
23140                 id INTEGER PRIMARY KEY,
23141                 conversation_id INTEGER NOT NULL,
23142                 idx INTEGER NOT NULL,
23143                 content TEXT
23144             );
23145             INSERT INTO conversations(id, source_path) VALUES (1, '/tmp/history.jsonl');
23146             INSERT INTO messages(id, conversation_id, idx, content)
23147             VALUES (1, 1, 0, 'seed');",
23148        )
23149        .unwrap();
23150        drop(conn);
23151
23152        let bundles = discover_historical_database_bundles(&canonical_db);
23153        let ordered_paths: Vec<PathBuf> = bundles
23154            .iter()
23155            .map(|bundle| bundle.root_path.clone())
23156            .collect();
23157
23158        assert_eq!(ordered_paths, vec![smaller_healthy, larger_corrupt]);
23159        assert!(bundles[0].supports_direct_readonly);
23160        assert!(!bundles[1].supports_direct_readonly);
23161    }
23162
23163    #[test]
23164    fn salvage_historical_databases_skips_unreadable_quarantined_bundles() {
23165        let dir = TempDir::new().unwrap();
23166        let canonical_db = dir.path().join("agent_search.db");
23167        let storage = SqliteStorage::open(&canonical_db).unwrap();
23168
23169        let quarantined = dir.path().join("agent_search.corrupt.20260324_212907");
23170        fs::write(&quarantined, b"not a sqlite database").unwrap();
23171
23172        let discovered: Vec<PathBuf> = discover_historical_database_bundles(&canonical_db)
23173            .into_iter()
23174            .map(|bundle| bundle.root_path)
23175            .collect();
23176        assert_eq!(discovered, vec![quarantined]);
23177
23178        let outcome = storage.salvage_historical_databases(&canonical_db).unwrap();
23179        assert_eq!(outcome.bundles_considered, 1);
23180        assert_eq!(outcome.bundles_imported, 0);
23181        assert_eq!(outcome.conversations_imported, 0);
23182        assert_eq!(outcome.messages_imported, 0);
23183        assert!(storage.list_conversations(10, 0).unwrap().is_empty());
23184    }
23185
23186    #[test]
23187    fn discover_historical_database_bundles_includes_repair_lab_and_snapshots_named_roots() {
23188        let dir = TempDir::new().unwrap();
23189        let canonical_db = dir.path().join("agent_search.db");
23190        fs::write(&canonical_db, b"canonical").unwrap();
23191
23192        let repair_lab_dir = dir.path().join("repair-lab").join("live-copy");
23193        fs::create_dir_all(&repair_lab_dir).unwrap();
23194        let repair_lab_db = repair_lab_dir.join("agent_search.db");
23195        fs::write(&repair_lab_db, vec![0_u8; 96]).unwrap();
23196        fs::write(
23197            repair_lab_dir.join("agent_search.rebuild-test.db"),
23198            vec![0_u8; 192],
23199        )
23200        .unwrap();
23201
23202        let snapshots_dir = dir.path().join("snapshots").join("20260324T013201Z");
23203        fs::create_dir_all(&snapshots_dir).unwrap();
23204        let snapshot_db = snapshots_dir.join("agent_search.db");
23205        fs::write(&snapshot_db, vec![0_u8; 64]).unwrap();
23206
23207        let bundles = discover_historical_database_bundles(&canonical_db);
23208        let ordered_paths: Vec<PathBuf> =
23209            bundles.into_iter().map(|bundle| bundle.root_path).collect();
23210
23211        assert!(ordered_paths.contains(&repair_lab_db));
23212        assert!(ordered_paths.contains(&snapshot_db));
23213        assert!(
23214            !ordered_paths
23215                .iter()
23216                .any(|path| path.file_name().and_then(|name| name.to_str())
23217                    == Some("agent_search.rebuild-test.db"))
23218        );
23219    }
23220
23221    #[test]
23222    fn discover_historical_database_bundles_prefers_healthy_backup_over_replay_priority() {
23223        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
23224
23225        let dir = TempDir::new().unwrap();
23226        let canonical_db = dir.path().join("agent_search.db");
23227        fs::write(&canonical_db, b"canonical").unwrap();
23228
23229        let replay_dir = dir
23230            .path()
23231            .join("repair-lab")
23232            .join("replay-20260324T070101Z");
23233        fs::create_dir_all(&replay_dir).unwrap();
23234        let replay_db = replay_dir.join("agent_search.db");
23235        let replay_storage = SqliteStorage::open(&replay_db).unwrap();
23236        let agent = Agent {
23237            id: None,
23238            slug: "codex".into(),
23239            name: "Codex".into(),
23240            version: Some("0.2.3".into()),
23241            kind: AgentKind::Cli,
23242        };
23243        let agent_id = replay_storage.ensure_agent(&agent).unwrap();
23244        let conversation = Conversation {
23245            id: None,
23246            agent_slug: "codex".into(),
23247            workspace: Some(PathBuf::from("/tmp/workspace")),
23248            external_id: Some("replay-conv".into()),
23249            title: Some("Replay bundle".into()),
23250            source_path: PathBuf::from("/tmp/replay.jsonl"),
23251            started_at: Some(1_700_000_000_000),
23252            ended_at: Some(1_700_000_000_100),
23253            approx_tokens: Some(42),
23254            metadata_json: serde_json::Value::Null,
23255            messages: vec![Message {
23256                id: None,
23257                idx: 0,
23258                role: MessageRole::Agent,
23259                author: Some("assistant".into()),
23260                created_at: Some(1_700_000_000_050),
23261                content: "replay message".into(),
23262                extra_json: serde_json::Value::Null,
23263                snippets: Vec::new(),
23264            }],
23265            source_id: LOCAL_SOURCE_ID.into(),
23266            origin_host: None,
23267        };
23268        replay_storage
23269            .insert_conversation_tree(agent_id, None, &conversation)
23270            .unwrap();
23271        drop(replay_storage);
23272
23273        let replay_legacy = rusqlite_test_fixture_conn(&replay_db);
23274        replay_legacy
23275            .execute_batch(
23276                "UPDATE meta SET value = '13' WHERE key = 'schema_version';
23277                 DELETE FROM _schema_migrations WHERE version = 14;
23278                 PRAGMA writable_schema = ON;",
23279            )
23280            .unwrap();
23281        replay_legacy
23282            .execute(
23283                "DELETE FROM meta WHERE key = ?1",
23284                [FTS_FRANKEN_REBUILD_META_KEY],
23285            )
23286            .unwrap();
23287        #[cfg(not(windows))]
23288        {
23289            let duplicate_legacy_fts_sql = "CREATE VIRTUAL TABLE fts_messages USING fts5(content, title, agent, workspace, source_path, created_at UNINDEXED, message_id UNINDEXED, tokenize='porter')";
23290            replay_legacy
23291                .execute(
23292                    "INSERT INTO sqlite_master(type, name, tbl_name, rootpage, sql)
23293                     VALUES('table', 'fts_messages', 'fts_messages', 0, ?1)",
23294                    [duplicate_legacy_fts_sql],
23295                )
23296                .unwrap();
23297        }
23298        replay_legacy
23299            .execute_batch("PRAGMA writable_schema = OFF;")
23300            .unwrap();
23301        drop(replay_legacy);
23302
23303        let backups_dir = dir.path().join("backups");
23304        fs::create_dir_all(&backups_dir).unwrap();
23305        let clean_backup = backups_dir.join("agent_search.db.20260322T020200.bak");
23306        let clean_storage = SqliteStorage::open(&clean_backup).unwrap();
23307        let clean_agent_id = clean_storage.ensure_agent(&agent).unwrap();
23308        clean_storage
23309            .insert_conversation_tree(clean_agent_id, None, &conversation)
23310            .unwrap();
23311        drop(clean_storage);
23312
23313        let bundles = discover_historical_database_bundles(&canonical_db);
23314        let ordered_paths: Vec<PathBuf> = bundles
23315            .iter()
23316            .map(|bundle| bundle.root_path.clone())
23317            .collect();
23318
23319        assert_eq!(ordered_paths[0], clean_backup);
23320        assert_eq!(ordered_paths[1], replay_db);
23321        assert_eq!(
23322            bundles[0].probe.schema_version,
23323            Some(CURRENT_SCHEMA_VERSION)
23324        );
23325        // Post-V14 cass drops the fts_messages virtual table during migration
23326        // and recreates it lazily on first open, so a freshly-migrated "clean"
23327        // backup has zero fts_messages rows in sqlite_master. The bundle is
23328        // still ranked as healthy by `bundle_health_rank` because 0 rows is a
23329        // legitimate lazy-FTS state (see comment there).
23330        assert_eq!(bundles[0].probe.fts_schema_rows, Some(0));
23331        // `fts_queryable` mirrors a direct rusqlite probe; with 0 sqlite_master
23332        // rows the table isn't queryable until lazy repair runs.
23333        assert!(!bundles[0].probe.fts_queryable);
23334        assert_eq!(bundles[1].probe.schema_version, Some(13));
23335        // The replay bundle had V14 run (dropping fts_messages → 0 rows), then
23336        // the test rolls meta.schema_version back to 13 and deletes the V14
23337        // marker. On Unix CI we also inject a duplicate sqlite_master row to
23338        // exercise the malformed-bundle probe path that depends on sqlite3.
23339        let expected_fts_schema_rows = if cfg!(windows) { Some(0) } else { Some(1) };
23340        assert_eq!(bundles[1].probe.fts_schema_rows, expected_fts_schema_rows);
23341    }
23342
23343    #[test]
23344    fn ensure_fts_consistency_via_rusqlite_catches_up_missing_rows() {
23345        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
23346
23347        let dir = TempDir::new().unwrap();
23348        let db_path = dir.path().join("fts-catchup.db");
23349        let storage = SqliteStorage::open(&db_path).unwrap();
23350        let agent = Agent {
23351            id: None,
23352            slug: "codex".into(),
23353            name: "Codex".into(),
23354            version: Some("0.2.3".into()),
23355            kind: AgentKind::Cli,
23356        };
23357        let agent_id = storage.ensure_agent(&agent).unwrap();
23358        let conversation = Conversation {
23359            id: None,
23360            agent_slug: "codex".into(),
23361            workspace: Some(PathBuf::from("/tmp/workspace")),
23362            external_id: Some("fts-catchup".into()),
23363            title: Some("FTS catchup".into()),
23364            source_path: PathBuf::from("/tmp/fts-catchup.jsonl"),
23365            started_at: Some(1_700_000_000_000),
23366            ended_at: Some(1_700_000_000_100),
23367            approx_tokens: Some(42),
23368            metadata_json: serde_json::Value::Null,
23369            messages: vec![Message {
23370                id: None,
23371                idx: 0,
23372                role: MessageRole::User,
23373                author: Some("user".into()),
23374                created_at: Some(1_700_000_000_050),
23375                content: "initial message".into(),
23376                extra_json: serde_json::Value::Null,
23377                snippets: Vec::new(),
23378            }],
23379            source_id: LOCAL_SOURCE_ID.into(),
23380            origin_host: None,
23381        };
23382        storage
23383            .insert_conversation_tree(agent_id, None, &conversation)
23384            .unwrap();
23385        drop(storage);
23386
23387        rebuild_fts_via_rusqlite(&db_path).unwrap();
23388
23389        let conn = FrankenConnection::open(db_path.to_string_lossy().into_owned()).unwrap();
23390        let conversation_id: i64 = conn
23391            .query_row_map("SELECT id FROM conversations LIMIT 1", fparams![], |row| {
23392                row.get_typed(0)
23393            })
23394            .unwrap();
23395        conn.execute_compat(
23396            "INSERT INTO messages(id, conversation_id, idx, role, author, created_at, content, extra_json, extra_bin)
23397             VALUES(2, ?1, 1, 'assistant', 'assistant', 1700000000060, 'authentication catchup', NULL, NULL)",
23398            fparams![conversation_id],
23399        )
23400        .unwrap();
23401        drop(conn);
23402
23403        let repair = ensure_fts_consistency_via_rusqlite(&db_path).unwrap();
23404        assert_eq!(
23405            repair,
23406            FtsConsistencyRepair::IncrementalCatchUp {
23407                inserted_rows: 1,
23408                total_rows: 2
23409            }
23410        );
23411
23412        let conn = FrankenConnection::open(db_path.to_string_lossy().into_owned()).unwrap();
23413        let auth_rows: i64 = conn
23414            .query_row_map(
23415                "SELECT COUNT(*) FROM fts_messages WHERE rowid = 2",
23416                fparams![],
23417                |row| row.get_typed(0),
23418            )
23419            .unwrap();
23420        assert_eq!(auth_rows, 1);
23421    }
23422
23423    #[test]
23424    fn rebuild_fts_via_rusqlite_cleans_duplicate_legacy_schema_rows() {
23425        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
23426
23427        let dir = TempDir::new().unwrap();
23428        let db_path = dir.path().join("fts-duplicate-rebuild.db");
23429
23430        let storage = SqliteStorage::open(&db_path).unwrap();
23431        let agent = Agent {
23432            id: None,
23433            slug: "codex".into(),
23434            name: "Codex".into(),
23435            version: Some("0.2.3".into()),
23436            kind: AgentKind::Cli,
23437        };
23438        let agent_id = storage.ensure_agent(&agent).unwrap();
23439        let conversation = Conversation {
23440            id: None,
23441            agent_slug: "codex".into(),
23442            workspace: Some(PathBuf::from("/ws")),
23443            external_id: Some("retro".into()),
23444            title: Some("retro".into()),
23445            source_path: PathBuf::from("/tmp/retro.jsonl"),
23446            started_at: Some(42),
23447            ended_at: Some(42),
23448            approx_tokens: None,
23449            metadata_json: serde_json::Value::Null,
23450            messages: vec![Message {
23451                id: None,
23452                idx: 0,
23453                role: MessageRole::User,
23454                author: None,
23455                created_at: Some(42),
23456                content: "retro investigation".into(),
23457                extra_json: serde_json::Value::Null,
23458                snippets: Vec::new(),
23459            }],
23460            source_id: LOCAL_SOURCE_ID.into(),
23461            origin_host: None,
23462        };
23463        storage
23464            .insert_conversation_tree(agent_id, None, &conversation)
23465            .unwrap();
23466        drop(storage);
23467        materialize_fresh_fts_schema_via_rusqlite(&db_path).unwrap();
23468
23469        let conn = rusqlite_test_fixture_conn(&db_path);
23470        conn.execute_batch("PRAGMA writable_schema = ON;").unwrap();
23471        conn.execute(
23472            "INSERT INTO sqlite_master(type, name, tbl_name, rootpage, sql)
23473             VALUES('table', 'fts_messages', 'fts_messages', 0, ?1)",
23474            ["CREATE VIRTUAL TABLE fts_messages USING fts5(content, title, agent, workspace, source_path, created_at UNINDEXED, message_id UNINDEXED, tokenize='porter')"],
23475        )
23476        .unwrap();
23477        conn.execute_batch("PRAGMA writable_schema = OFF;").unwrap();
23478        let duplicate_rows: i64 = conn
23479            .query_row(
23480                "SELECT COUNT(*) FROM sqlite_master WHERE name = 'fts_messages'",
23481                [],
23482                |row| row.get(0),
23483            )
23484            .unwrap();
23485        assert_eq!(duplicate_rows, 2);
23486        drop(conn);
23487
23488        let inserted = rebuild_fts_via_rusqlite(&db_path).unwrap();
23489        assert_eq!(inserted, 1);
23490
23491        let conn = FrankenConnection::open(db_path.to_string_lossy().into_owned()).unwrap();
23492        let schema_rows = franken_fts_schema_rows(&conn).unwrap();
23493        assert_eq!(
23494            schema_rows, 1,
23495            "DROP TABLE should leave one clean FTS schema"
23496        );
23497        let match_count: i64 = conn
23498            .query_row_map("SELECT COUNT(*) FROM fts_messages", fparams![], |row| {
23499                row.get_typed(0)
23500            })
23501            .unwrap();
23502        assert_eq!(match_count, 1);
23503    }
23504
23505    // =========================================================================
23506    // Agent storage tests (bead yln.4)
23507    // =========================================================================
23508
23509    #[test]
23510    fn ensure_agent_creates_new() {
23511        let dir = TempDir::new().unwrap();
23512        let db_path = dir.path().join("test.db");
23513        let storage = SqliteStorage::open(&db_path).unwrap();
23514
23515        let agent = Agent {
23516            id: None,
23517            slug: "test_agent".into(),
23518            name: "Test Agent".into(),
23519            version: Some("1.0".into()),
23520            kind: AgentKind::Cli,
23521        };
23522
23523        let id = storage.ensure_agent(&agent).unwrap();
23524        assert!(id > 0);
23525    }
23526
23527    #[test]
23528    fn ensure_agent_returns_existing_id() {
23529        let dir = TempDir::new().unwrap();
23530        let db_path = dir.path().join("test.db");
23531        let storage = SqliteStorage::open(&db_path).unwrap();
23532
23533        let agent = Agent {
23534            id: None,
23535            slug: "codex".into(),
23536            name: "Codex".into(),
23537            version: None,
23538            kind: AgentKind::Cli,
23539        };
23540
23541        let id1 = storage.ensure_agent(&agent).unwrap();
23542        let id2 = storage.ensure_agent(&agent).unwrap();
23543        assert_eq!(id1, id2);
23544    }
23545
23546    #[test]
23547    fn ensure_agent_unchanged_preserves_updated_at() {
23548        let dir = TempDir::new().unwrap();
23549        let db_path = dir.path().join("test.db");
23550        let storage = SqliteStorage::open(&db_path).unwrap();
23551
23552        let agent = Agent {
23553            id: None,
23554            slug: "codex".into(),
23555            name: "Codex".into(),
23556            version: Some("1.0".into()),
23557            kind: AgentKind::Cli,
23558        };
23559
23560        storage.ensure_agent(&agent).unwrap();
23561        let initial_updated_at: i64 = storage
23562            .conn
23563            .query_row_map(
23564                "SELECT updated_at FROM agents WHERE slug = ?1",
23565                fparams![agent.slug.as_str()],
23566                |row| row.get_typed(0),
23567            )
23568            .unwrap();
23569        std::thread::sleep(std::time::Duration::from_millis(5));
23570
23571        storage.ensure_agent(&agent).unwrap();
23572        let fetched_updated_at: i64 = storage
23573            .conn
23574            .query_row_map(
23575                "SELECT updated_at FROM agents WHERE slug = ?1",
23576                fparams![agent.slug.as_str()],
23577                |row| row.get_typed(0),
23578            )
23579            .unwrap();
23580
23581        assert_eq!(fetched_updated_at, initial_updated_at);
23582    }
23583
23584    #[test]
23585    fn ensure_agent_changed_metadata_updates_cached_slug() {
23586        let dir = TempDir::new().unwrap();
23587        let db_path = dir.path().join("test.db");
23588        let storage = SqliteStorage::open(&db_path).unwrap();
23589
23590        let mut agent = Agent {
23591            id: None,
23592            slug: "codex".into(),
23593            name: "Codex".into(),
23594            version: Some("1.0".into()),
23595            kind: AgentKind::Cli,
23596        };
23597
23598        let id1 = storage.ensure_agent(&agent).unwrap();
23599        agent.name = "Codex CLI".into();
23600        agent.version = Some("1.1".into());
23601        let id2 = storage.ensure_agent(&agent).unwrap();
23602
23603        let fetched: (String, Option<String>) = storage
23604            .conn
23605            .query_row_map(
23606                "SELECT name, version FROM agents WHERE slug = ?1",
23607                fparams![agent.slug.as_str()],
23608                |row| Ok((row.get_typed(0)?, row.get_typed(1)?)),
23609            )
23610            .unwrap();
23611
23612        assert_eq!(id1, id2);
23613        assert_eq!(fetched, ("Codex CLI".into(), Some("1.1".into())));
23614    }
23615
23616    #[test]
23617    fn list_agents_returns_inserted() {
23618        let dir = TempDir::new().unwrap();
23619        let db_path = dir.path().join("test.db");
23620        let storage = SqliteStorage::open(&db_path).unwrap();
23621
23622        let agent = Agent {
23623            id: None,
23624            slug: "new_agent".into(),
23625            name: "New Agent".into(),
23626            version: None,
23627            kind: AgentKind::VsCode,
23628        };
23629        storage.ensure_agent(&agent).unwrap();
23630
23631        let agents = storage.list_agents().unwrap();
23632        assert!(agents.iter().any(|a| a.slug == "new_agent"));
23633    }
23634
23635    // =========================================================================
23636    // Workspace storage tests (bead yln.4)
23637    // =========================================================================
23638
23639    #[test]
23640    fn ensure_workspace_creates_new() {
23641        let dir = TempDir::new().unwrap();
23642        let db_path = dir.path().join("test.db");
23643        let storage = SqliteStorage::open(&db_path).unwrap();
23644
23645        let id = storage
23646            .ensure_workspace(Path::new("/home/user/project"), Some("My Project"))
23647            .unwrap();
23648        assert!(id > 0);
23649    }
23650
23651    #[test]
23652    fn ensure_workspace_returns_existing() {
23653        let dir = TempDir::new().unwrap();
23654        let db_path = dir.path().join("test.db");
23655        let storage = SqliteStorage::open(&db_path).unwrap();
23656
23657        let path = Path::new("/home/user/myproject");
23658        let id1 = storage.ensure_workspace(path, None).unwrap();
23659        let id2 = storage.ensure_workspace(path, None).unwrap();
23660        assert_eq!(id1, id2);
23661    }
23662
23663    #[test]
23664    fn ensure_workspace_changed_display_name_updates_cached_path() {
23665        let dir = TempDir::new().unwrap();
23666        let db_path = dir.path().join("test.db");
23667        let storage = SqliteStorage::open(&db_path).unwrap();
23668
23669        let path = Path::new("/home/user/myproject");
23670        let id1 = storage.ensure_workspace(path, Some("Before")).unwrap();
23671        let id2 = storage.ensure_workspace(path, Some("After")).unwrap();
23672
23673        let display_name: Option<String> = storage
23674            .conn
23675            .query_row_map(
23676                "SELECT display_name FROM workspaces WHERE path = ?1",
23677                fparams![path.to_string_lossy().as_ref()],
23678                |row| row.get_typed(0),
23679            )
23680            .unwrap();
23681
23682        assert_eq!(id1, id2);
23683        assert_eq!(display_name.as_deref(), Some("After"));
23684    }
23685
23686    #[test]
23687    fn list_workspaces_returns_inserted() {
23688        let dir = TempDir::new().unwrap();
23689        let db_path = dir.path().join("test.db");
23690        let storage = SqliteStorage::open(&db_path).unwrap();
23691
23692        storage
23693            .ensure_workspace(Path::new("/test/workspace"), Some("Test WS"))
23694            .unwrap();
23695
23696        let workspaces = storage.list_workspaces().unwrap();
23697        assert!(
23698            workspaces
23699                .iter()
23700                .any(|w| w.path.to_str() == Some("/test/workspace"))
23701        );
23702    }
23703
23704    // =========================================================================
23705    // Source storage tests (bead yln.4)
23706    // =========================================================================
23707
23708    #[test]
23709    fn upsert_source_creates_new() {
23710        let dir = TempDir::new().unwrap();
23711        let db_path = dir.path().join("test.db");
23712        let storage = SqliteStorage::open(&db_path).unwrap();
23713
23714        let source = Source {
23715            id: "test-laptop".into(),
23716            kind: SourceKind::Ssh,
23717            host_label: Some("test.local".into()),
23718            machine_id: Some("test-machine-id".into()),
23719            platform: None,
23720            config_json: None,
23721            created_at: Some(SqliteStorage::now_millis()),
23722            updated_at: None,
23723        };
23724
23725        storage.upsert_source(&source).unwrap();
23726        let fetched = storage.get_source("test-laptop").unwrap();
23727        assert!(fetched.is_some());
23728        assert_eq!(fetched.unwrap().host_label, Some("test.local".into()));
23729    }
23730
23731    #[test]
23732    fn upsert_source_updates_existing() {
23733        let dir = TempDir::new().unwrap();
23734        let db_path = dir.path().join("test.db");
23735        let storage = SqliteStorage::open(&db_path).unwrap();
23736
23737        let source1 = Source {
23738            id: "my-source".into(),
23739            kind: SourceKind::Ssh,
23740            host_label: Some("Original Label".into()),
23741            machine_id: None,
23742            platform: None,
23743            config_json: None,
23744            created_at: Some(SqliteStorage::now_millis()),
23745            updated_at: None,
23746        };
23747        storage.upsert_source(&source1).unwrap();
23748
23749        let source2 = Source {
23750            id: "my-source".into(),
23751            kind: SourceKind::Ssh,
23752            host_label: Some("Updated Label".into()),
23753            machine_id: None,
23754            platform: Some("linux".into()),
23755            config_json: None,
23756            created_at: Some(SqliteStorage::now_millis()),
23757            updated_at: Some(SqliteStorage::now_millis()),
23758        };
23759        storage.upsert_source(&source2).unwrap();
23760
23761        let fetched = storage.get_source("my-source").unwrap().unwrap();
23762        assert_eq!(fetched.host_label, Some("Updated Label".into()));
23763        assert!(fetched.platform.is_some());
23764    }
23765
23766    #[test]
23767    fn upsert_source_unchanged_preserves_updated_at() {
23768        let dir = TempDir::new().unwrap();
23769        let db_path = dir.path().join("test.db");
23770        let storage = SqliteStorage::open(&db_path).unwrap();
23771
23772        let source = Source {
23773            id: "stable-source".into(),
23774            kind: SourceKind::Ssh,
23775            host_label: Some("builder.local".into()),
23776            machine_id: None,
23777            platform: Some("linux".into()),
23778            config_json: Some(serde_json::json!({"role": "bench"})),
23779            created_at: None,
23780            updated_at: None,
23781        };
23782
23783        storage.upsert_source(&source).unwrap();
23784        let initial = storage.get_source("stable-source").unwrap().unwrap();
23785        std::thread::sleep(std::time::Duration::from_millis(5));
23786
23787        storage.upsert_source(&source).unwrap();
23788        let fetched = storage.get_source("stable-source").unwrap().unwrap();
23789
23790        assert_eq!(fetched.created_at, initial.created_at);
23791        assert_eq!(fetched.updated_at, initial.updated_at);
23792        assert_eq!(fetched.host_label, initial.host_label);
23793        assert_eq!(fetched.platform, initial.platform);
23794        assert_eq!(fetched.config_json, initial.config_json);
23795    }
23796
23797    #[test]
23798    fn ensure_source_for_conversation_recreates_remote_source_after_delete() {
23799        let dir = TempDir::new().unwrap();
23800        let db_path = dir.path().join("test.db");
23801        let storage = SqliteStorage::open(&db_path).unwrap();
23802
23803        let conversation = Conversation {
23804            id: None,
23805            agent_slug: "codex".into(),
23806            workspace: Some(PathBuf::from("/ws/cache-recreate")),
23807            external_id: Some("cache-recreate".into()),
23808            title: Some("Cache Recreate".into()),
23809            source_path: PathBuf::from("/log/cache-recreate.jsonl"),
23810            started_at: Some(1_700_000_000_000),
23811            ended_at: Some(1_700_000_000_001),
23812            approx_tokens: Some(16),
23813            metadata_json: serde_json::json!({}),
23814            messages: vec![Message {
23815                id: None,
23816                idx: 0,
23817                role: MessageRole::User,
23818                author: Some("tester".into()),
23819                created_at: Some(1_700_000_000_000),
23820                content: "cache recreate".into(),
23821                extra_json: serde_json::json!({}),
23822                snippets: Vec::new(),
23823            }],
23824            source_id: "cache-remote-source".into(),
23825            origin_host: Some("builder-cache".into()),
23826        };
23827
23828        storage
23829            .ensure_source_for_conversation(&conversation)
23830            .unwrap();
23831        assert!(storage.get_source("cache-remote-source").unwrap().is_some());
23832
23833        let deleted = storage.delete_source("cache-remote-source", false).unwrap();
23834        assert!(deleted);
23835        assert!(storage.get_source("cache-remote-source").unwrap().is_none());
23836
23837        storage
23838            .ensure_source_for_conversation(&conversation)
23839            .unwrap();
23840        let recreated = storage.get_source("cache-remote-source").unwrap();
23841        assert!(recreated.is_some());
23842        assert_eq!(
23843            recreated.unwrap().host_label.as_deref(),
23844            Some("builder-cache")
23845        );
23846    }
23847
23848    #[test]
23849    fn delete_source_removes_entry() {
23850        let dir = TempDir::new().unwrap();
23851        let db_path = dir.path().join("test.db");
23852        let storage = SqliteStorage::open(&db_path).unwrap();
23853
23854        let source = Source {
23855            id: "to-delete".into(),
23856            kind: SourceKind::Local,
23857            host_label: None,
23858            machine_id: None,
23859            platform: None,
23860            config_json: None,
23861            created_at: Some(SqliteStorage::now_millis()),
23862            updated_at: None,
23863        };
23864        storage.upsert_source(&source).unwrap();
23865
23866        let deleted = storage.delete_source("to-delete", false).unwrap();
23867        assert!(deleted);
23868
23869        let fetched = storage.get_source("to-delete").unwrap();
23870        assert!(fetched.is_none());
23871    }
23872
23873    #[test]
23874    fn delete_source_cannot_delete_local() {
23875        let dir = TempDir::new().unwrap();
23876        let db_path = dir.path().join("test.db");
23877        let storage = SqliteStorage::open(&db_path).unwrap();
23878
23879        let result = storage.delete_source(LOCAL_SOURCE_ID, false);
23880        assert!(result.is_err());
23881    }
23882
23883    #[test]
23884    fn list_sources_includes_local() {
23885        let dir = TempDir::new().unwrap();
23886        let db_path = dir.path().join("test.db");
23887        let storage = SqliteStorage::open(&db_path).unwrap();
23888
23889        let sources = storage.list_sources().unwrap();
23890        assert!(sources.iter().any(|s| s.id == LOCAL_SOURCE_ID));
23891    }
23892
23893    #[test]
23894    fn insert_conversation_tree_blank_local_source_normalizes_to_local_id() {
23895        let dir = TempDir::new().unwrap();
23896        let db_path = dir.path().join("test.db");
23897        let storage = SqliteStorage::open(&db_path).unwrap();
23898
23899        let agent_id = storage
23900            .ensure_agent(&Agent {
23901                id: None,
23902                slug: "codex".into(),
23903                name: "Codex".into(),
23904                version: None,
23905                kind: AgentKind::Cli,
23906            })
23907            .unwrap();
23908
23909        let conversation = Conversation {
23910            id: None,
23911            agent_slug: "codex".into(),
23912            workspace: None,
23913            external_id: Some("blank-local-source".into()),
23914            title: Some("Blank local source".into()),
23915            source_path: dir.path().join("blank-local.jsonl"),
23916            started_at: Some(1_700_000_000_000),
23917            ended_at: Some(1_700_000_000_001),
23918            approx_tokens: None,
23919            metadata_json: serde_json::Value::Null,
23920            messages: vec![Message {
23921                id: None,
23922                idx: 0,
23923                role: MessageRole::User,
23924                author: None,
23925                created_at: Some(1_700_000_000_000),
23926                content: "hello".into(),
23927                extra_json: serde_json::Value::Null,
23928                snippets: Vec::new(),
23929            }],
23930            source_id: "   ".into(),
23931            origin_host: None,
23932        };
23933
23934        storage
23935            .insert_conversation_tree(agent_id, None, &conversation)
23936            .unwrap();
23937
23938        assert!(storage.get_source("   ").unwrap().is_none());
23939        let source = storage
23940            .get_source(LOCAL_SOURCE_ID)
23941            .unwrap()
23942            .expect("local source row should exist");
23943        assert_eq!(source.kind, SourceKind::Local);
23944        assert_eq!(source.host_label, None);
23945
23946        let conversations = storage.list_conversations(10, 0).unwrap();
23947        assert_eq!(conversations.len(), 1);
23948        assert_eq!(conversations[0].source_id, LOCAL_SOURCE_ID);
23949        assert_eq!(conversations[0].origin_host, None);
23950    }
23951
23952    #[test]
23953    fn repeated_local_inserts_do_not_touch_bootstrap_source_row() {
23954        let dir = TempDir::new().unwrap();
23955        let db_path = dir.path().join("test.db");
23956        let storage = SqliteStorage::open(&db_path).unwrap();
23957
23958        let agent_id = storage
23959            .ensure_agent(&Agent {
23960                id: None,
23961                slug: "codex".into(),
23962                name: "Codex".into(),
23963                version: None,
23964                kind: AgentKind::Cli,
23965            })
23966            .unwrap();
23967
23968        let bootstrap_updated_at: i64 = storage
23969            .conn
23970            .query_row_map(
23971                "SELECT updated_at FROM sources WHERE id = ?1",
23972                fparams![LOCAL_SOURCE_ID],
23973                |row| row.get_typed(0),
23974            )
23975            .unwrap();
23976
23977        let make_conversation = |external_id: &str, suffix: &str| Conversation {
23978            id: None,
23979            agent_slug: "codex".into(),
23980            workspace: None,
23981            external_id: Some(external_id.into()),
23982            title: Some(format!("Local source {suffix}")),
23983            source_path: dir.path().join(format!("local-{suffix}.jsonl")),
23984            started_at: Some(1_700_000_000_000),
23985            ended_at: Some(1_700_000_000_001),
23986            approx_tokens: None,
23987            metadata_json: serde_json::Value::Null,
23988            messages: vec![Message {
23989                id: None,
23990                idx: 0,
23991                role: MessageRole::User,
23992                author: None,
23993                created_at: Some(1_700_000_000_000),
23994                content: format!("hello-{suffix}"),
23995                extra_json: serde_json::Value::Null,
23996                snippets: Vec::new(),
23997            }],
23998            source_id: LOCAL_SOURCE_ID.into(),
23999            origin_host: None,
24000        };
24001
24002        std::thread::sleep(std::time::Duration::from_millis(5));
24003        storage
24004            .insert_conversation_tree(agent_id, None, &make_conversation("local-source-1", "one"))
24005            .unwrap();
24006        let after_first_insert: i64 = storage
24007            .conn
24008            .query_row_map(
24009                "SELECT updated_at FROM sources WHERE id = ?1",
24010                fparams![LOCAL_SOURCE_ID],
24011                |row| row.get_typed(0),
24012            )
24013            .unwrap();
24014
24015        std::thread::sleep(std::time::Duration::from_millis(5));
24016        storage
24017            .insert_conversation_tree(agent_id, None, &make_conversation("local-source-2", "two"))
24018            .unwrap();
24019        let after_second_insert: i64 = storage
24020            .conn
24021            .query_row_map(
24022                "SELECT updated_at FROM sources WHERE id = ?1",
24023                fparams![LOCAL_SOURCE_ID],
24024                |row| row.get_typed(0),
24025            )
24026            .unwrap();
24027
24028        assert_eq!(after_first_insert, bootstrap_updated_at);
24029        assert_eq!(after_second_insert, bootstrap_updated_at);
24030    }
24031
24032    #[test]
24033    fn insert_conversation_tree_blank_remote_source_normalizes_to_origin_host() {
24034        let dir = TempDir::new().unwrap();
24035        let db_path = dir.path().join("test.db");
24036        let storage = SqliteStorage::open(&db_path).unwrap();
24037
24038        let agent_id = storage
24039            .ensure_agent(&Agent {
24040                id: None,
24041                slug: "codex".into(),
24042                name: "Codex".into(),
24043                version: None,
24044                kind: AgentKind::Cli,
24045            })
24046            .unwrap();
24047
24048        let conversation = Conversation {
24049            id: None,
24050            agent_slug: "codex".into(),
24051            workspace: None,
24052            external_id: Some("blank-remote-source".into()),
24053            title: Some("Blank remote source".into()),
24054            source_path: dir.path().join("blank-remote.jsonl"),
24055            started_at: Some(1_700_000_000_000),
24056            ended_at: Some(1_700_000_000_001),
24057            approx_tokens: None,
24058            metadata_json: serde_json::Value::Null,
24059            messages: vec![Message {
24060                id: None,
24061                idx: 0,
24062                role: MessageRole::User,
24063                author: None,
24064                created_at: Some(1_700_000_000_000),
24065                content: "hello".into(),
24066                extra_json: serde_json::Value::Null,
24067                snippets: Vec::new(),
24068            }],
24069            source_id: "   ".into(),
24070            origin_host: Some("user@work-laptop".into()),
24071        };
24072
24073        storage
24074            .insert_conversation_tree(agent_id, None, &conversation)
24075            .unwrap();
24076
24077        assert!(storage.get_source("   ").unwrap().is_none());
24078        let source = storage
24079            .get_source("user@work-laptop")
24080            .unwrap()
24081            .expect("normalized remote source row should exist");
24082        assert_eq!(source.kind, SourceKind::Ssh);
24083        assert_eq!(source.host_label.as_deref(), Some("user@work-laptop"));
24084
24085        let conversations = storage.list_conversations(10, 0).unwrap();
24086        assert_eq!(conversations.len(), 1);
24087        assert_eq!(conversations[0].source_id, "user@work-laptop");
24088        assert_eq!(
24089            conversations[0].origin_host.as_deref(),
24090            Some("user@work-laptop")
24091        );
24092    }
24093
24094    #[test]
24095    fn insert_conversations_batched_normalizes_host_only_remote_source_id() {
24096        let dir = TempDir::new().unwrap();
24097        let db_path = dir.path().join("test.db");
24098        let storage = SqliteStorage::open(&db_path).unwrap();
24099
24100        let agent_id = storage
24101            .ensure_agent(&Agent {
24102                id: None,
24103                slug: "codex".into(),
24104                name: "Codex".into(),
24105                version: None,
24106                kind: AgentKind::Cli,
24107            })
24108            .unwrap();
24109
24110        let conversation = Conversation {
24111            id: None,
24112            agent_slug: "codex".into(),
24113            workspace: None,
24114            external_id: Some("batched-blank-remote-source".into()),
24115            title: Some("Batched blank remote source".into()),
24116            source_path: dir.path().join("batched-blank-remote.jsonl"),
24117            started_at: Some(1_700_000_000_000),
24118            ended_at: Some(1_700_000_000_001),
24119            approx_tokens: None,
24120            metadata_json: serde_json::Value::Null,
24121            messages: vec![Message {
24122                id: None,
24123                idx: 0,
24124                role: MessageRole::User,
24125                author: None,
24126                created_at: Some(1_700_000_000_000),
24127                content: "hello".into(),
24128                extra_json: serde_json::Value::Null,
24129                snippets: Vec::new(),
24130            }],
24131            source_id: "   ".into(),
24132            origin_host: Some("user@batch-host".into()),
24133        };
24134
24135        storage
24136            .insert_conversations_batched(&[(agent_id, None, &conversation)])
24137            .unwrap();
24138
24139        assert!(storage.get_source("   ").unwrap().is_none());
24140        let source = storage
24141            .get_source("user@batch-host")
24142            .unwrap()
24143            .expect("normalized batched remote source row should exist");
24144        assert_eq!(source.kind, SourceKind::Ssh);
24145        assert_eq!(source.host_label.as_deref(), Some("user@batch-host"));
24146
24147        let conversations = storage.list_conversations(10, 0).unwrap();
24148        assert_eq!(conversations.len(), 1);
24149        assert_eq!(conversations[0].source_id, "user@batch-host");
24150        assert_eq!(
24151            conversations[0].origin_host.as_deref(),
24152            Some("user@batch-host")
24153        );
24154    }
24155
24156    #[test]
24157    fn get_source_ids_excludes_local() {
24158        let dir = TempDir::new().unwrap();
24159        let db_path = dir.path().join("test.db");
24160        let storage = SqliteStorage::open(&db_path).unwrap();
24161
24162        // Add a non-local source
24163        let source = Source {
24164            id: "remote-1".into(),
24165            kind: SourceKind::Ssh,
24166            host_label: Some("server".into()),
24167            machine_id: None,
24168            platform: None,
24169            config_json: None,
24170            created_at: Some(SqliteStorage::now_millis()),
24171            updated_at: None,
24172        };
24173        storage.upsert_source(&source).unwrap();
24174
24175        let ids = storage.get_source_ids().unwrap();
24176        assert!(!ids.contains(&LOCAL_SOURCE_ID.to_string()));
24177        assert!(ids.contains(&"remote-1".to_string()));
24178    }
24179
24180    // =========================================================================
24181    // Scan timestamp tests (bead yln.4)
24182    // =========================================================================
24183
24184    #[test]
24185    fn get_last_scan_ts_returns_none_initially() {
24186        let dir = TempDir::new().unwrap();
24187        let db_path = dir.path().join("test.db");
24188        let storage = SqliteStorage::open(&db_path).unwrap();
24189
24190        let ts = storage.get_last_scan_ts().unwrap();
24191        assert!(ts.is_none());
24192    }
24193
24194    #[test]
24195    fn set_and_get_last_scan_ts() {
24196        let dir = TempDir::new().unwrap();
24197        let db_path = dir.path().join("test.db");
24198        let storage = SqliteStorage::open(&db_path).unwrap();
24199
24200        let expected_ts = 1700000000000_i64;
24201        storage.set_last_scan_ts(expected_ts).unwrap();
24202
24203        let actual_ts = storage.get_last_scan_ts().unwrap();
24204        assert_eq!(actual_ts, Some(expected_ts));
24205    }
24206
24207    #[test]
24208    fn connector_last_scan_ts_round_trip_normalizes_name() -> anyhow::Result<()> {
24209        let dir = TempDir::new()?;
24210        let db_path = dir.path().join("test.db");
24211        let storage = SqliteStorage::open(&db_path)?;
24212
24213        assert_eq!(storage.get_connector_last_scan_ts(" Codex ")?, None);
24214
24215        let expected_ts = 1_700_000_123_456_i64;
24216        storage.set_connector_last_scan_ts(" Codex ", expected_ts)?;
24217
24218        assert_eq!(
24219            storage.get_connector_last_scan_ts("codex")?,
24220            Some(expected_ts)
24221        );
24222        assert_eq!(
24223            storage.get_connector_last_scan_ts("CODEX")?,
24224            Some(expected_ts)
24225        );
24226        assert_eq!(storage.get_connector_last_scan_ts("claude-code")?, None);
24227        Ok(())
24228    }
24229
24230    #[test]
24231    fn connector_has_conversations_tracks_archived_agent_slug() -> anyhow::Result<()> {
24232        let dir = TempDir::new()?;
24233        let db_path = dir.path().join("test.db");
24234        let storage = SqliteStorage::open(&db_path)?;
24235        let agent_id = storage.ensure_agent(&Agent {
24236            id: None,
24237            slug: "codex".into(),
24238            name: "Codex".into(),
24239            version: None,
24240            kind: AgentKind::Cli,
24241        })?;
24242
24243        assert!(!storage.connector_has_conversations("codex")?);
24244
24245        let conversation = Conversation {
24246            id: None,
24247            agent_slug: "codex".into(),
24248            workspace: None,
24249            external_id: Some("connector-watermark-fixture".into()),
24250            title: Some("Connector watermark fixture".into()),
24251            source_path: PathBuf::from("/tmp/connector-watermark-fixture.jsonl"),
24252            started_at: Some(1_700_000_000_000),
24253            ended_at: Some(1_700_000_000_001),
24254            approx_tokens: None,
24255            metadata_json: serde_json::Value::Null,
24256            messages: vec![Message {
24257                id: None,
24258                idx: 0,
24259                role: MessageRole::User,
24260                author: None,
24261                created_at: Some(1_700_000_000_000),
24262                content: "per-connector watermark regression".into(),
24263                extra_json: serde_json::Value::Null,
24264                snippets: Vec::new(),
24265            }],
24266            source_id: LOCAL_SOURCE_ID.into(),
24267            origin_host: None,
24268        };
24269        storage.insert_conversation_tree(agent_id, None, &conversation)?;
24270
24271        assert!(storage.connector_has_conversations(" Codex ")?);
24272        assert!(!storage.connector_has_conversations("claude-code")?);
24273        assert!(!storage.connector_has_conversations(" ")?);
24274        Ok(())
24275    }
24276
24277    #[test]
24278    fn connector_scan_states_loads_watermarks_and_agent_presence() -> anyhow::Result<()> {
24279        let dir = TempDir::new()?;
24280        let db_path = dir.path().join("test.db");
24281        let storage = SqliteStorage::open(&db_path)?;
24282        let agent_id = storage.ensure_agent(&Agent {
24283            id: None,
24284            slug: "claude_code".into(),
24285            name: "Claude Code".into(),
24286            version: None,
24287            kind: AgentKind::Cli,
24288        })?;
24289        storage.set_connector_last_scan_ts(" Codex ", 1_700_000_123_456)?;
24290
24291        let conversation = Conversation {
24292            id: None,
24293            agent_slug: "claude_code".into(),
24294            workspace: None,
24295            external_id: Some("connector-scan-states-fixture".into()),
24296            title: Some("Connector scan states fixture".into()),
24297            source_path: PathBuf::from("/tmp/connector-scan-states-fixture.jsonl"),
24298            started_at: Some(1_700_000_000_000),
24299            ended_at: Some(1_700_000_000_001),
24300            approx_tokens: None,
24301            metadata_json: serde_json::Value::Null,
24302            messages: vec![Message {
24303                id: None,
24304                idx: 0,
24305                role: MessageRole::User,
24306                author: None,
24307                created_at: Some(1_700_000_000_000),
24308                content: "bulk connector scan state regression".into(),
24309                extra_json: serde_json::Value::Null,
24310                snippets: Vec::new(),
24311            }],
24312            source_id: LOCAL_SOURCE_ID.into(),
24313            origin_host: None,
24314        };
24315        storage.insert_conversation_tree(agent_id, None, &conversation)?;
24316
24317        let states = storage.connector_scan_states(&["codex", "claude", "gemini"])?;
24318        assert_eq!(
24319            states.get("codex").copied(),
24320            Some((Some(1_700_000_123_456), false)),
24321            "bulk state should preserve connector-specific watermarks"
24322        );
24323        assert_eq!(
24324            states.get("claude").copied(),
24325            Some((None, true)),
24326            "bulk state should honor known connector slug aliases"
24327        );
24328        assert_eq!(
24329            states.get("gemini").copied(),
24330            Some((None, false)),
24331            "bulk state should identify newly enabled connectors with no archived rows"
24332        );
24333        Ok(())
24334    }
24335
24336    #[test]
24337    fn connector_has_conversations_checks_known_agent_slug_aliases() -> anyhow::Result<()> {
24338        let dir = TempDir::new()?;
24339        let db_path = dir.path().join("test.db");
24340        let storage = SqliteStorage::open(&db_path)?;
24341        let agent_id = storage.ensure_agent(&Agent {
24342            id: None,
24343            slug: "claude_code".into(),
24344            name: "Claude Code".into(),
24345            version: None,
24346            kind: AgentKind::Cli,
24347        })?;
24348
24349        let conversation = Conversation {
24350            id: None,
24351            agent_slug: "claude_code".into(),
24352            workspace: None,
24353            external_id: Some("connector-watermark-claude-fixture".into()),
24354            title: Some("Claude connector watermark fixture".into()),
24355            source_path: PathBuf::from("/tmp/connector-watermark-claude-fixture.jsonl"),
24356            started_at: Some(1_700_000_000_000),
24357            ended_at: Some(1_700_000_000_001),
24358            approx_tokens: None,
24359            metadata_json: serde_json::Value::Null,
24360            messages: vec![Message {
24361                id: None,
24362                idx: 0,
24363                role: MessageRole::User,
24364                author: None,
24365                created_at: Some(1_700_000_000_000),
24366                content: "claude connector alias regression".into(),
24367                extra_json: serde_json::Value::Null,
24368                snippets: Vec::new(),
24369            }],
24370            source_id: LOCAL_SOURCE_ID.into(),
24371            origin_host: None,
24372        };
24373        storage.insert_conversation_tree(agent_id, None, &conversation)?;
24374
24375        assert!(
24376            storage.connector_has_conversations("claude")?,
24377            "the claude connector factory name must recognize legacy claude_code rows"
24378        );
24379        assert!(storage.connector_has_conversations("claude-code")?);
24380        assert!(storage.connector_has_conversations("claude_code")?);
24381        assert!(!storage.connector_has_conversations("codex")?);
24382        Ok(())
24383    }
24384
24385    // =========================================================================
24386    // now_millis utility test (bead yln.4)
24387    // =========================================================================
24388
24389    #[test]
24390    fn now_millis_returns_reasonable_value() {
24391        let ts = SqliteStorage::now_millis();
24392        // Should be after Jan 1, 2020 (approx 1577836800000)
24393        assert!(ts > 1577836800000);
24394        // Should be before Jan 1, 2100 (approx 4102444800000)
24395        assert!(ts < 4102444800000);
24396    }
24397
24398    // =========================================================================
24399    // Binary Metadata Serialization Tests (Opt 3.1)
24400    // =========================================================================
24401
24402    #[test]
24403    fn msgpack_roundtrip_basic_object() {
24404        let value = serde_json::json!({
24405            "key": "value",
24406            "number": 42,
24407            "nested": { "inner": true }
24408        });
24409
24410        let bytes = serialize_json_to_msgpack(&value).expect("should serialize");
24411        let recovered = deserialize_msgpack_to_json(&bytes);
24412
24413        assert_eq!(value, recovered);
24414    }
24415
24416    #[test]
24417    fn msgpack_returns_none_for_null() {
24418        let value = serde_json::Value::Null;
24419        assert!(serialize_json_to_msgpack(&value).is_none());
24420    }
24421
24422    #[test]
24423    fn message_insert_stores_null_extra_json_as_sql_null() {
24424        let dir = TempDir::new().unwrap();
24425        let db_path = dir.path().join("test.db");
24426        let storage = SqliteStorage::open(&db_path).unwrap();
24427        let agent_id = storage
24428            .ensure_agent(&Agent {
24429                id: None,
24430                slug: "codex".into(),
24431                name: "Codex".into(),
24432                version: None,
24433                kind: AgentKind::Cli,
24434            })
24435            .unwrap();
24436        let conversation = Conversation {
24437            id: None,
24438            agent_slug: "codex".into(),
24439            workspace: None,
24440            external_id: Some("null-extra-json".into()),
24441            title: Some("Null extra_json".into()),
24442            source_path: PathBuf::from("/tmp/null-extra-json.jsonl"),
24443            started_at: Some(1_700_000_000_000),
24444            ended_at: Some(1_700_000_000_001),
24445            approx_tokens: None,
24446            metadata_json: serde_json::Value::Null,
24447            messages: vec![Message {
24448                id: None,
24449                idx: 0,
24450                role: MessageRole::User,
24451                author: None,
24452                created_at: Some(1_700_000_000_000),
24453                content: "null metadata message".into(),
24454                extra_json: serde_json::Value::Null,
24455                snippets: Vec::new(),
24456            }],
24457            source_id: LOCAL_SOURCE_ID.into(),
24458            origin_host: None,
24459        };
24460
24461        let conversation_id = storage
24462            .insert_conversation_tree(agent_id, None, &conversation)
24463            .unwrap()
24464            .conversation_id;
24465
24466        let (extra_json, extra_bin): (Option<String>, Option<Vec<u8>>) = storage
24467            .conn
24468            .query_row_map(
24469                "SELECT extra_json, extra_bin FROM messages WHERE conversation_id = ?1",
24470                fparams![conversation_id],
24471                |row| Ok((row.get_typed(0)?, row.get_typed(1)?)),
24472            )
24473            .unwrap();
24474        assert!(extra_json.is_none());
24475        assert!(extra_bin.is_none());
24476
24477        let stored = storage.fetch_messages(conversation_id).unwrap();
24478        assert!(stored[0].extra_json.is_null());
24479    }
24480
24481    #[test]
24482    fn message_insert_stores_nonempty_extra_json_as_msgpack_only() {
24483        let dir = TempDir::new().unwrap();
24484        let db_path = dir.path().join("test.db");
24485        let storage = SqliteStorage::open(&db_path).unwrap();
24486        let agent_id = storage
24487            .ensure_agent(&Agent {
24488                id: None,
24489                slug: "codex".into(),
24490                name: "Codex".into(),
24491                version: None,
24492                kind: AgentKind::Cli,
24493            })
24494            .unwrap();
24495        let extra_json = serde_json::json!({ "idx": 7, "kind": "profile" });
24496        let conversation = Conversation {
24497            id: None,
24498            agent_slug: "codex".into(),
24499            workspace: None,
24500            external_id: Some("msgpack-extra-json".into()),
24501            title: Some("MessagePack extra_json".into()),
24502            source_path: PathBuf::from("/tmp/msgpack-extra-json.jsonl"),
24503            started_at: Some(1_700_000_000_000),
24504            ended_at: Some(1_700_000_000_001),
24505            approx_tokens: None,
24506            metadata_json: serde_json::Value::Null,
24507            messages: vec![Message {
24508                id: None,
24509                idx: 0,
24510                role: MessageRole::User,
24511                author: None,
24512                created_at: Some(1_700_000_000_000),
24513                content: "msgpack metadata message".into(),
24514                extra_json: extra_json.clone(),
24515                snippets: Vec::new(),
24516            }],
24517            source_id: LOCAL_SOURCE_ID.into(),
24518            origin_host: None,
24519        };
24520
24521        let conversation_id = storage
24522            .insert_conversation_tree(agent_id, None, &conversation)
24523            .unwrap()
24524            .conversation_id;
24525
24526        let (extra_json_text, extra_bin): (Option<String>, Option<Vec<u8>>) = storage
24527            .conn
24528            .query_row_map(
24529                "SELECT extra_json, extra_bin FROM messages WHERE conversation_id = ?1",
24530                fparams![conversation_id],
24531                |row| Ok((row.get_typed(0)?, row.get_typed(1)?)),
24532            )
24533            .unwrap();
24534        assert!(extra_json_text.is_none());
24535        assert!(extra_bin.is_some());
24536
24537        let stored = storage.fetch_messages(conversation_id).unwrap();
24538        assert_eq!(stored[0].extra_json, extra_json);
24539    }
24540
24541    #[test]
24542    fn conversation_insert_preserves_null_metadata_json_as_json_null() {
24543        let dir = TempDir::new().unwrap();
24544        let db_path = dir.path().join("test.db");
24545        let storage = SqliteStorage::open(&db_path).unwrap();
24546        let agent_id = storage
24547            .ensure_agent(&Agent {
24548                id: None,
24549                slug: "codex".into(),
24550                name: "Codex".into(),
24551                version: None,
24552                kind: AgentKind::Cli,
24553            })
24554            .unwrap();
24555        let conversation = Conversation {
24556            id: None,
24557            agent_slug: "codex".into(),
24558            workspace: None,
24559            external_id: Some("null-conversation-metadata".into()),
24560            title: Some("Null conversation metadata".into()),
24561            source_path: PathBuf::from("/tmp/null-conversation-metadata.jsonl"),
24562            started_at: Some(1_700_000_000_000),
24563            ended_at: Some(1_700_000_000_001),
24564            approx_tokens: None,
24565            metadata_json: serde_json::Value::Null,
24566            messages: vec![Message {
24567                id: None,
24568                idx: 0,
24569                role: MessageRole::User,
24570                author: None,
24571                created_at: Some(1_700_000_000_000),
24572                content: "null conversation metadata message".into(),
24573                extra_json: serde_json::Value::Null,
24574                snippets: Vec::new(),
24575            }],
24576            source_id: LOCAL_SOURCE_ID.into(),
24577            origin_host: None,
24578        };
24579
24580        storage
24581            .insert_conversation_tree(agent_id, None, &conversation)
24582            .unwrap();
24583
24584        let (metadata_json, metadata_bin): (Option<String>, Option<Vec<u8>>) = storage
24585            .conn
24586            .query_row_map(
24587                "SELECT metadata_json, metadata_bin FROM conversations WHERE external_id = ?1",
24588                fparams!["null-conversation-metadata"],
24589                |row| Ok((row.get_typed(0)?, row.get_typed(1)?)),
24590            )
24591            .unwrap();
24592        assert_eq!(metadata_json.as_deref(), Some("null"));
24593        assert!(metadata_bin.is_none());
24594
24595        let listed = storage.list_conversations(10, 0).unwrap();
24596        assert!(listed[0].metadata_json.is_null());
24597    }
24598
24599    #[test]
24600    fn conversation_insert_stores_nonempty_metadata_as_msgpack_only() {
24601        let dir = TempDir::new().unwrap();
24602        let db_path = dir.path().join("test.db");
24603        let storage = SqliteStorage::open(&db_path).unwrap();
24604        let agent_id = storage
24605            .ensure_agent(&Agent {
24606                id: None,
24607                slug: "codex".into(),
24608                name: "Codex".into(),
24609                version: None,
24610                kind: AgentKind::Cli,
24611            })
24612            .unwrap();
24613        let metadata_json = serde_json::json!({ "bench": true, "source": "profile" });
24614        let conversation = Conversation {
24615            id: None,
24616            agent_slug: "codex".into(),
24617            workspace: None,
24618            external_id: Some("msgpack-conversation-metadata".into()),
24619            title: Some("MessagePack conversation metadata".into()),
24620            source_path: PathBuf::from("/tmp/msgpack-conversation-metadata.jsonl"),
24621            started_at: Some(1_700_000_000_000),
24622            ended_at: Some(1_700_000_000_001),
24623            approx_tokens: None,
24624            metadata_json: metadata_json.clone(),
24625            messages: vec![Message {
24626                id: None,
24627                idx: 0,
24628                role: MessageRole::User,
24629                author: None,
24630                created_at: Some(1_700_000_000_000),
24631                content: "msgpack conversation metadata message".into(),
24632                extra_json: serde_json::Value::Null,
24633                snippets: Vec::new(),
24634            }],
24635            source_id: LOCAL_SOURCE_ID.into(),
24636            origin_host: None,
24637        };
24638
24639        storage
24640            .insert_conversation_tree(agent_id, None, &conversation)
24641            .unwrap();
24642
24643        let (metadata_text, metadata_bin): (Option<String>, Option<Vec<u8>>) = storage
24644            .conn
24645            .query_row_map(
24646                "SELECT metadata_json, metadata_bin FROM conversations WHERE external_id = ?1",
24647                fparams!["msgpack-conversation-metadata"],
24648                |row| Ok((row.get_typed(0)?, row.get_typed(1)?)),
24649            )
24650            .unwrap();
24651        assert!(metadata_text.is_none());
24652        assert!(metadata_bin.is_some());
24653
24654        let listed = storage.list_conversations(10, 0).unwrap();
24655        assert_eq!(listed[0].metadata_json, metadata_json);
24656    }
24657
24658    #[test]
24659    fn msgpack_returns_none_for_empty_object() {
24660        let value = serde_json::json!({});
24661        assert!(serialize_json_to_msgpack(&value).is_none());
24662    }
24663
24664    #[test]
24665    fn parse_historical_json_column_preserves_large_payloads_as_raw_json() {
24666        let raw = format!("{{\"blob\":\"{}\"}}", "x".repeat(1_000_000));
24667
24668        let value = parse_historical_json_column(Some(raw.clone()));
24669
24670        assert_eq!(historical_raw_json(&value), Some(raw.as_str()));
24671        assert_eq!(json_value_size_hint(&value), raw.len());
24672    }
24673
24674    #[test]
24675    fn parse_historical_json_column_preserves_small_payloads_as_raw_json() {
24676        let raw = String::from("{\"ok\":true,\"n\":1}");
24677
24678        let value = parse_historical_json_column(Some(raw.clone()));
24679
24680        assert_eq!(historical_raw_json(&value), Some(raw.as_str()));
24681    }
24682
24683    #[test]
24684    fn msgpack_serializes_non_empty_array() {
24685        let value = serde_json::json!([1, 2, 3]);
24686        let bytes = serialize_json_to_msgpack(&value).expect("should serialize array");
24687        let recovered = deserialize_msgpack_to_json(&bytes);
24688        assert_eq!(value, recovered);
24689    }
24690
24691    #[test]
24692    fn msgpack_smaller_than_json() {
24693        let value = serde_json::json!({
24694            "field_name_one": "some_value",
24695            "field_name_two": 123456,
24696            "field_name_three": [1, 2, 3, 4, 5],
24697            "field_name_four": { "nested": true }
24698        });
24699
24700        let json_bytes = serde_json::to_vec(&value).unwrap();
24701        let msgpack_bytes = serialize_json_to_msgpack(&value).unwrap();
24702
24703        // MessagePack should be smaller due to more compact encoding
24704        assert!(
24705            msgpack_bytes.len() < json_bytes.len(),
24706            "MessagePack ({} bytes) should be smaller than JSON ({} bytes)",
24707            msgpack_bytes.len(),
24708            json_bytes.len()
24709        );
24710    }
24711
24712    #[test]
24713    fn migration_v7_adds_binary_columns() {
24714        let dir = TempDir::new().unwrap();
24715        let db_path = dir.path().join("test.db");
24716        let storage = SqliteStorage::open(&db_path).unwrap();
24717
24718        // Verify metadata_bin column exists
24719        let has_metadata_bin = storage
24720            .raw()
24721            .query("PRAGMA table_info(conversations)")
24722            .unwrap()
24723            .iter()
24724            .any(|row| row.get_typed::<String>(1).unwrap() == "metadata_bin");
24725        assert!(
24726            has_metadata_bin,
24727            "conversations should have metadata_bin column"
24728        );
24729
24730        // Verify extra_bin column exists
24731        let has_extra_bin = storage
24732            .raw()
24733            .query("PRAGMA table_info(messages)")
24734            .unwrap()
24735            .iter()
24736            .any(|row| row.get_typed::<String>(1).unwrap() == "extra_bin");
24737        assert!(has_extra_bin, "messages should have extra_bin column");
24738    }
24739
24740    #[test]
24741    fn insert_conversation_tree_rehydrates_append_tail_state_cache_after_manual_clear() {
24742        let dir = TempDir::new().unwrap();
24743        let db_path = dir.path().join("append-tail-state-cache.db");
24744        let storage = SqliteStorage::open(&db_path).unwrap();
24745        let agent_id = storage
24746            .ensure_agent(&Agent {
24747                id: None,
24748                slug: "codex".into(),
24749                name: "Codex".into(),
24750                version: None,
24751                kind: AgentKind::Cli,
24752            })
24753            .unwrap();
24754        let workspace = PathBuf::from("/ws/profiled-append-remote");
24755        let workspace_id = storage.ensure_workspace(&workspace, None).unwrap();
24756
24757        let initial = make_profiled_append_remote_merge_conversation(11, 5);
24758        let insert_outcome = storage
24759            .insert_conversation_tree(agent_id, Some(workspace_id), &initial)
24760            .unwrap();
24761        let conversation_id = insert_outcome.conversation_id;
24762
24763        let initial_tail: (Option<i64>, Option<i64>, Option<i64>) = storage
24764            .raw()
24765            .query_row_map(
24766                "SELECT ended_at, last_message_idx, last_message_created_at
24767                 FROM conversation_tail_state
24768                 WHERE conversation_id = ?1",
24769                fparams![conversation_id],
24770                |row| Ok((row.get_typed(0)?, row.get_typed(1)?, row.get_typed(2)?)),
24771            )
24772            .unwrap();
24773        assert_eq!(initial_tail, (Some(111_005), Some(4), Some(111_004)));
24774
24775        storage
24776            .raw()
24777            .execute_compat(
24778                "UPDATE conversations SET ended_at = ?1 WHERE id = ?2",
24779                fparams![111_999_i64, conversation_id],
24780            )
24781            .unwrap();
24782        storage
24783            .raw()
24784            .execute_compat(
24785                "DELETE FROM conversation_tail_state WHERE conversation_id = ?1",
24786                fparams![conversation_id],
24787            )
24788            .unwrap();
24789
24790        let appended = make_profiled_append_remote_merge_conversation(11, 10);
24791        let append_outcome = storage
24792            .insert_conversation_tree(agent_id, Some(workspace_id), &appended)
24793            .unwrap();
24794        assert_eq!(append_outcome.inserted_indices, vec![5, 6, 7, 8, 9]);
24795
24796        let final_tail: (Option<i64>, Option<i64>, Option<i64>) = storage
24797            .raw()
24798            .query_row_map(
24799                "SELECT ended_at, last_message_idx, last_message_created_at
24800                 FROM conversation_tail_state
24801                 WHERE conversation_id = ?1",
24802                fparams![conversation_id],
24803                |row| Ok((row.get_typed(0)?, row.get_typed(1)?, row.get_typed(2)?)),
24804            )
24805            .unwrap();
24806        assert_eq!(final_tail, (Some(111_999), Some(9), Some(111_009)));
24807    }
24808
24809    #[test]
24810    fn msgpack_deserialize_empty_returns_default() {
24811        let recovered = deserialize_msgpack_to_json(&[]);
24812        assert_eq!(recovered, serde_json::Value::Object(serde_json::Map::new()));
24813    }
24814
24815    #[test]
24816    fn msgpack_deserialize_garbage_returns_default() {
24817        // Use truncated msgpack data that will fail to parse
24818        // 0x85 indicates a fixmap with 5 elements, but we don't provide them
24819        let recovered = deserialize_msgpack_to_json(&[0x85]);
24820        assert_eq!(recovered, serde_json::Value::Object(serde_json::Map::new()));
24821    }
24822
24823    #[test]
24824    fn stats_aggregator_collects_and_expands() {
24825        let mut agg = StatsAggregator::new();
24826        assert!(agg.is_empty());
24827
24828        // Record some stats
24829        // Day 100, agent "claude", source "local"
24830        agg.record("claude", "local", 100, 5, 500);
24831        // Day 100, agent "codex", source "local"
24832        agg.record("codex", "local", 100, 3, 300);
24833        // Day 101, agent "claude", source "local"
24834        agg.record("claude", "local", 101, 2, 200);
24835
24836        assert!(!agg.is_empty());
24837        assert_eq!(agg.raw_entry_count(), 3);
24838
24839        let entries = agg.expand();
24840        // Each raw entry expands to 4 permutations.
24841        // But (all, local) and (all, all) will aggregate.
24842        //
24843        // Raw:
24844        // 1. (100, claude, local) -> 1 sess, 5 msgs, 500 chars
24845        // 2. (100, codex, local)  -> 1 sess, 3 msgs, 300 chars
24846        // 3. (101, claude, local) -> 1 sess, 2 msgs, 200 chars
24847        //
24848        // Expanded 1 (day 100):
24849        // - (100, claude, local): 1 sess, 5 msgs, 500 chars
24850        // - (100, all, local):    1 (from claude) + 1 (from codex) = 2 sess, 8 msgs, 800 chars
24851        // - (100, claude, all):   1 sess, 5 msgs, 500 chars
24852        // - (100, codex, local):  1 sess, 3 msgs, 300 chars
24853        // - (100, codex, all):    1 sess, 3 msgs, 300 chars
24854        // - (100, all, all):      2 sess, 8 msgs, 800 chars
24855        //
24856        // Expanded 3 (day 101):
24857        // - (101, claude, local): 1 sess, 2 msgs, 200 chars
24858        // - (101, all, local):    1 sess, 2 msgs, 200 chars
24859        // - (101, claude, all):   1 sess, 2 msgs, 200 chars
24860        // - (101, all, all):      1 sess, 2 msgs, 200 chars
24861        //
24862        // Total unique keys in expanded map:
24863        // Day 100: (claude, local), (codex, local), (all, local), (claude, all), (codex, all), (all, all) = 6
24864        // Day 101: (claude, local), (all, local), (claude, all), (all, all) = 4
24865        // Total = 10 entries
24866
24867        assert_eq!(entries.len(), 10);
24868
24869        // Verify totals for day 100, all/all
24870        let day100_all = entries
24871            .iter()
24872            .find(|(d, a, s, _)| *d == 100 && a == "all" && s == "all")
24873            .unwrap();
24874        assert_eq!(day100_all.3.session_count_delta, 2);
24875        assert_eq!(day100_all.3.message_count_delta, 8);
24876        assert_eq!(day100_all.3.total_chars_delta, 800);
24877    }
24878
24879    // =========================================================================
24880    // LazyFrankenDb tests (bd-1ueu)
24881    // =========================================================================
24882
24883    #[test]
24884    fn lazy_franken_db_not_open_before_get() {
24885        let dir = TempDir::new().unwrap();
24886        let db_path = dir.path().join("lazy_test.db");
24887
24888        // Create a real DB so the path exists
24889        let _storage = SqliteStorage::open(&db_path).unwrap();
24890
24891        let lazy = LazyFrankenDb::new(db_path);
24892        assert!(
24893            !lazy.is_open(),
24894            "LazyFrankenDb must not open on construction"
24895        );
24896    }
24897
24898    #[test]
24899    fn lazy_franken_db_opens_on_first_get() {
24900        let dir = TempDir::new().unwrap();
24901        let db_path = dir.path().join("lazy_test.db");
24902
24903        // Create a real DB so the path exists
24904        let _storage = SqliteStorage::open(&db_path).unwrap();
24905        drop(_storage);
24906
24907        let lazy = LazyFrankenDb::new(db_path);
24908        assert!(!lazy.is_open());
24909
24910        let conn = lazy.get("test").expect("should open successfully");
24911        let count: i64 = conn
24912            .query_row_map("SELECT COUNT(*) FROM conversations", fparams![], |r| {
24913                r.get_typed(0)
24914            })
24915            .unwrap();
24916        assert_eq!(count, 0);
24917        drop(conn);
24918
24919        assert!(lazy.is_open(), "LazyFrankenDb must be open after get()");
24920    }
24921
24922    #[test]
24923    fn lazy_franken_db_reuses_connection() {
24924        let dir = TempDir::new().unwrap();
24925        let db_path = dir.path().join("lazy_test.db");
24926        let _storage = SqliteStorage::open(&db_path).unwrap();
24927        drop(_storage);
24928
24929        let lazy = LazyFrankenDb::new(db_path);
24930
24931        // First access opens
24932        {
24933            let conn = lazy.get("first").unwrap();
24934            conn.execute_batch("CREATE TABLE IF NOT EXISTS test_tbl (id INTEGER)")
24935                .unwrap();
24936        }
24937
24938        // Second access reuses (table still exists)
24939        {
24940            let conn = lazy.get("second").unwrap();
24941            let count: i64 = conn
24942                .query_row_map("SELECT COUNT(*) FROM test_tbl", fparams![], |r| {
24943                    r.get_typed(0)
24944                })
24945                .unwrap();
24946            assert_eq!(count, 0);
24947        }
24948    }
24949
24950    #[test]
24951    fn lazy_franken_db_not_found_error() {
24952        let dir = TempDir::new().unwrap();
24953        let db_path = dir.path().join("nonexistent.db");
24954
24955        let lazy = LazyFrankenDb::new(db_path);
24956        let result = lazy.get("test");
24957        assert!(result.is_err());
24958        assert!(
24959            matches!(result.unwrap_err(), LazyDbError::NotFound(_)),
24960            "should return NotFound for missing DB"
24961        );
24962    }
24963
24964    #[test]
24965    fn lazy_franken_db_path_accessor() {
24966        let path = PathBuf::from("/tmp/test_lazy.db");
24967        let lazy = LazyFrankenDb::new(path.clone());
24968        assert_eq!(lazy.path(), path.as_path());
24969    }
24970
24971    // =========================================================================
24972    // Pricing / cost estimation tests (bead z9fse.10)
24973    // =========================================================================
24974
24975    #[test]
24976    fn sql_like_match_basic_patterns() {
24977        assert!(sql_like_match("claude-opus-4-20250101", "claude-opus-4%"));
24978        assert!(sql_like_match("claude-opus-4", "claude-opus-4%"));
24979        assert!(!sql_like_match("claude-sonnet-4", "claude-opus-4%"));
24980
24981        // Middle wildcard (gemini pattern)
24982        assert!(sql_like_match("gemini-2.0-flash-001", "gemini-2%flash%"));
24983        assert!(sql_like_match("gemini-2-flash", "gemini-2%flash%"));
24984        assert!(!sql_like_match("gemini-2-pro", "gemini-2%flash%"));
24985
24986        // Exact match
24987        assert!(sql_like_match("hello", "hello"));
24988        assert!(!sql_like_match("hello!", "hello"));
24989
24990        // Underscore wildcard
24991        assert!(sql_like_match("gpt-4o", "gpt-4_"));
24992        assert!(!sql_like_match("gpt-4oo", "gpt-4_"));
24993
24994        // Case insensitive
24995        assert!(sql_like_match("Claude-Opus-4", "claude-opus-4%"));
24996    }
24997
24998    #[test]
24999    fn date_str_to_day_id_converts_correctly() {
25000        // 2025-10-01 is 2100 days after 2020-01-01
25001        assert_eq!(date_str_to_day_id("2025-10-01").unwrap(), 2100);
25002        // 2024-04-01 is 1552 days after 2020-01-01
25003        assert_eq!(date_str_to_day_id("2024-04-01").unwrap(), 1552);
25004        assert!(date_str_to_day_id("invalid").is_err());
25005    }
25006
25007    #[test]
25008    fn pricing_table_lookup_selects_matching_entry() {
25009        let effective_day = date_str_to_day_id("2025-10-01").unwrap();
25010        let lookup_day = date_str_to_day_id("2026-02-06").unwrap();
25011        let table = PricingTable {
25012            entries: vec![
25013                PricingEntry {
25014                    model_pattern: "claude-opus-4%".into(),
25015                    provider: "anthropic".into(),
25016                    input_cost_per_mtok: 15.0,
25017                    output_cost_per_mtok: 75.0,
25018                    cache_read_cost_per_mtok: Some(1.5),
25019                    cache_creation_cost_per_mtok: Some(18.75),
25020                    effective_day_id: effective_day,
25021                },
25022                PricingEntry {
25023                    model_pattern: "claude-sonnet-4%".into(),
25024                    provider: "anthropic".into(),
25025                    input_cost_per_mtok: 3.0,
25026                    output_cost_per_mtok: 15.0,
25027                    cache_read_cost_per_mtok: Some(0.3),
25028                    cache_creation_cost_per_mtok: Some(3.75),
25029                    effective_day_id: effective_day,
25030                },
25031            ],
25032        };
25033
25034        let result = table.lookup("claude-opus-4-20260101", lookup_day);
25035        assert!(result.is_some());
25036        assert_eq!(result.unwrap().input_cost_per_mtok, 15.0);
25037
25038        let result = table.lookup("claude-sonnet-4-latest", lookup_day);
25039        assert!(result.is_some());
25040        assert_eq!(result.unwrap().input_cost_per_mtok, 3.0);
25041
25042        assert!(table.lookup("unknown-model", lookup_day).is_none());
25043    }
25044
25045    #[test]
25046    fn pricing_table_lookup_respects_effective_date() {
25047        let effective_day_1 = date_str_to_day_id("2025-10-01").unwrap();
25048        let effective_day_2 = date_str_to_day_id("2026-01-01").unwrap();
25049        let table = PricingTable {
25050            entries: vec![
25051                PricingEntry {
25052                    model_pattern: "claude-opus-4%".into(),
25053                    provider: "anthropic".into(),
25054                    input_cost_per_mtok: 15.0,
25055                    output_cost_per_mtok: 75.0,
25056                    cache_read_cost_per_mtok: None,
25057                    cache_creation_cost_per_mtok: None,
25058                    effective_day_id: effective_day_1,
25059                },
25060                PricingEntry {
25061                    model_pattern: "claude-opus-4%".into(),
25062                    provider: "anthropic".into(),
25063                    input_cost_per_mtok: 12.0,
25064                    output_cost_per_mtok: 60.0,
25065                    cache_read_cost_per_mtok: None,
25066                    cache_creation_cost_per_mtok: None,
25067                    effective_day_id: effective_day_2,
25068                },
25069            ],
25070        };
25071
25072        // Before price drop
25073        let result = table.lookup("claude-opus-4", date_str_to_day_id("2025-11-01").unwrap());
25074        assert!(result.is_some());
25075        assert_eq!(result.unwrap().input_cost_per_mtok, 15.0);
25076
25077        // After price drop
25078        let result = table.lookup("claude-opus-4", date_str_to_day_id("2026-02-01").unwrap());
25079        assert!(result.is_some());
25080        assert_eq!(result.unwrap().input_cost_per_mtok, 12.0);
25081
25082        // Before all pricing
25083        assert!(
25084            table
25085                .lookup("claude-opus-4", date_str_to_day_id("2024-01-01").unwrap())
25086                .is_none()
25087        );
25088    }
25089
25090    #[test]
25091    fn pricing_table_lookup_specificity_tiebreak() {
25092        let effective_day = date_str_to_day_id("2025-01-01").unwrap();
25093        let lookup_day = date_str_to_day_id("2026-01-01").unwrap();
25094        let table = PricingTable {
25095            entries: vec![
25096                PricingEntry {
25097                    model_pattern: "gpt-4%".into(),
25098                    provider: "openai".into(),
25099                    input_cost_per_mtok: 10.0,
25100                    output_cost_per_mtok: 30.0,
25101                    cache_read_cost_per_mtok: None,
25102                    cache_creation_cost_per_mtok: None,
25103                    effective_day_id: effective_day,
25104                },
25105                PricingEntry {
25106                    model_pattern: "gpt-4-turbo%".into(),
25107                    provider: "openai".into(),
25108                    input_cost_per_mtok: 5.0,
25109                    output_cost_per_mtok: 15.0,
25110                    cache_read_cost_per_mtok: None,
25111                    cache_creation_cost_per_mtok: None,
25112                    effective_day_id: effective_day,
25113                },
25114            ],
25115        };
25116
25117        // Longer pattern wins for specific model
25118        let result = table.lookup("gpt-4-turbo-2025", lookup_day);
25119        assert!(result.is_some());
25120        assert_eq!(result.unwrap().input_cost_per_mtok, 5.0);
25121
25122        // Shorter pattern matches broader model
25123        let result = table.lookup("gpt-4o", lookup_day);
25124        assert!(result.is_some());
25125        assert_eq!(result.unwrap().input_cost_per_mtok, 10.0);
25126    }
25127
25128    #[test]
25129    fn pricing_table_compute_cost_basic() {
25130        let effective_day = date_str_to_day_id("2025-10-01").unwrap();
25131        let table = PricingTable {
25132            entries: vec![PricingEntry {
25133                model_pattern: "claude-opus-4%".into(),
25134                provider: "anthropic".into(),
25135                input_cost_per_mtok: 15.0,
25136                output_cost_per_mtok: 75.0,
25137                cache_read_cost_per_mtok: Some(1.5),
25138                cache_creation_cost_per_mtok: Some(18.75),
25139                effective_day_id: effective_day,
25140            }],
25141        };
25142
25143        let cost = table.compute_cost(
25144            Some("claude-opus-4-latest"),
25145            date_str_to_day_id("2026-02-06").unwrap(),
25146            Some(1000),
25147            Some(500),
25148            None,
25149            None,
25150        );
25151        assert!(cost.is_some());
25152        // 1000 * 15.0 / 1M + 500 * 75.0 / 1M = 0.015 + 0.0375 = 0.0525
25153        assert!((cost.unwrap() - 0.0525).abs() < 1e-10);
25154    }
25155
25156    #[test]
25157    fn pricing_table_compute_cost_with_cache() {
25158        let effective_day = date_str_to_day_id("2025-10-01").unwrap();
25159        let table = PricingTable {
25160            entries: vec![PricingEntry {
25161                model_pattern: "claude-opus-4%".into(),
25162                provider: "anthropic".into(),
25163                input_cost_per_mtok: 15.0,
25164                output_cost_per_mtok: 75.0,
25165                cache_read_cost_per_mtok: Some(1.5),
25166                cache_creation_cost_per_mtok: Some(18.75),
25167                effective_day_id: effective_day,
25168            }],
25169        };
25170
25171        let cost = table.compute_cost(
25172            Some("claude-opus-4-latest"),
25173            date_str_to_day_id("2026-02-06").unwrap(),
25174            Some(1_000_000),
25175            Some(100_000),
25176            Some(500_000),
25177            Some(200_000),
25178        );
25179        assert!(cost.is_some());
25180        // input excludes cache tokens to avoid double-charging them at both the
25181        // full input rate and the cache-specific rates.
25182        // non-cache input: 300K * 15/1M = 4.5, output: 100K * 75/1M = 7.5
25183        // cache_read: 500K * 1.5/1M = 0.75, cache_creation: 200K * 18.75/1M = 3.75
25184        // total = 16.5
25185        assert!((cost.unwrap() - 16.5).abs() < 1e-10);
25186    }
25187
25188    #[test]
25189    fn pricing_table_compute_cost_returns_none_for_unknown_model() {
25190        let effective_day = date_str_to_day_id("2025-10-01").unwrap();
25191        let lookup_day = date_str_to_day_id("2026-02-06").unwrap();
25192        let table = PricingTable {
25193            entries: vec![PricingEntry {
25194                model_pattern: "claude-opus-4%".into(),
25195                provider: "anthropic".into(),
25196                input_cost_per_mtok: 15.0,
25197                output_cost_per_mtok: 75.0,
25198                cache_read_cost_per_mtok: None,
25199                cache_creation_cost_per_mtok: None,
25200                effective_day_id: effective_day,
25201            }],
25202        };
25203
25204        assert!(
25205            table
25206                .compute_cost(
25207                    Some("unknown-model"),
25208                    lookup_day,
25209                    Some(1000),
25210                    Some(500),
25211                    None,
25212                    None
25213                )
25214                .is_none()
25215        );
25216        assert!(
25217            table
25218                .compute_cost(None, lookup_day, Some(1000), Some(500), None, None)
25219                .is_none()
25220        );
25221        assert!(
25222            table
25223                .compute_cost(Some("claude-opus-4"), lookup_day, None, None, None, None)
25224                .is_none()
25225        );
25226    }
25227
25228    #[test]
25229    fn pricing_table_load_from_db() {
25230        let dir = TempDir::new().unwrap();
25231        let db_path = dir.path().join("test.db");
25232        let storage = SqliteStorage::open(&db_path).unwrap();
25233
25234        let table = PricingTable::load(&storage.conn).unwrap();
25235        assert!(!table.is_empty());
25236
25237        let lookup_day = date_str_to_day_id("2026-02-06").unwrap();
25238
25239        let opus = table.lookup("claude-opus-4-latest", lookup_day);
25240        assert!(opus.is_some());
25241        assert_eq!(opus.unwrap().input_cost_per_mtok, 15.0);
25242
25243        let flash = table.lookup("gemini-2.0-flash-001", lookup_day);
25244        assert!(flash.is_some());
25245        assert_eq!(flash.unwrap().input_cost_per_mtok, 0.075);
25246    }
25247
25248    #[test]
25249    fn pricing_table_load_rejects_invalid_effective_date() {
25250        let dir = TempDir::new().unwrap();
25251        let db_path = dir.path().join("test.db");
25252        let storage = SqliteStorage::open(&db_path).unwrap();
25253
25254        storage
25255            .conn
25256            .execute_compat(
25257                "INSERT INTO model_pricing (
25258                    model_pattern, provider, input_cost_per_mtok, output_cost_per_mtok,
25259                    cache_read_cost_per_mtok, cache_creation_cost_per_mtok, effective_date
25260                 ) VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7)",
25261                fparams![
25262                    "broken-model%",
25263                    "test",
25264                    1.0_f64,
25265                    2.0_f64,
25266                    Option::<f64>::None,
25267                    Option::<f64>::None,
25268                    "not-a-date"
25269                ],
25270            )
25271            .unwrap();
25272
25273        let err = PricingTable::load(&storage.conn).unwrap_err();
25274        assert!(err.to_string().contains("invalid effective_date"));
25275    }
25276
25277    #[test]
25278    fn pricing_diagnostics_tracks_coverage() {
25279        let mut diag = PricingDiagnostics::default();
25280        diag.record_priced();
25281        diag.record_priced();
25282        diag.record_unpriced(Some("custom-model-v1"));
25283        diag.record_unpriced(Some("custom-model-v1"));
25284        diag.record_unpriced(None);
25285
25286        assert_eq!(diag.priced_count, 2);
25287        assert_eq!(diag.unpriced_count, 3);
25288        assert_eq!(diag.unknown_models.len(), 2);
25289        assert_eq!(diag.unknown_models["custom-model-v1"], 2);
25290        assert_eq!(diag.unknown_models["(none)"], 1);
25291    }
25292
25293    // =========================================================================
25294    // FrankenStorage migration tests (bead 2j6p6)
25295    // =========================================================================
25296
25297    /// Helper: create a FrankenStorage wrapping an in-memory connection and
25298    /// run migrations. This exercises the same code path as `open()` but avoids
25299    /// frankensqlite's file-based autoindex renaming limitation (V5 uses
25300    /// ALTER TABLE RENAME which triggers sqlite_autoindex lookup issues on
25301    /// file-based pagers).
25302    fn franken_storage_in_memory() -> FrankenStorage {
25303        let conn = FrankenConnection::open(":memory:").unwrap();
25304        let storage = FrankenStorage::new(conn, PathBuf::from(":memory:"));
25305        storage.run_migrations().unwrap();
25306        storage.apply_config().unwrap();
25307        storage
25308    }
25309
25310    #[test]
25311    fn franken_migrations_create_all_tables() {
25312        let storage = franken_storage_in_memory();
25313
25314        // Should be at CURRENT_SCHEMA_VERSION.
25315        let version = storage.schema_version().unwrap();
25316        assert_eq!(
25317            version, CURRENT_SCHEMA_VERSION,
25318            "fresh FrankenStorage should be at current schema version"
25319        );
25320
25321        // Core tables from V1 should exist.
25322        let rows = storage
25323            .raw()
25324            .query("SELECT name FROM sqlite_master WHERE type='table' ORDER BY name;")
25325            .unwrap();
25326        let table_names: Vec<String> = rows
25327            .iter()
25328            .filter_map(|r| r.get_typed::<String>(0).ok())
25329            .collect();
25330
25331        for required in [
25332            "meta",
25333            "agents",
25334            "workspaces",
25335            "conversations",
25336            "messages",
25337            "snippets",
25338            "tags",
25339            "conversation_tags",
25340        ] {
25341            assert!(
25342                table_names.contains(&required.to_string()),
25343                "missing table: {required}"
25344            );
25345        }
25346
25347        // V4 sources table.
25348        assert!(
25349            table_names.contains(&"sources".to_string()),
25350            "missing sources table"
25351        );
25352
25353        // V8 daily_stats table.
25354        assert!(
25355            table_names.contains(&"daily_stats".to_string()),
25356            "missing daily_stats table"
25357        );
25358
25359        // V9 embedding_jobs table.
25360        assert!(
25361            table_names.contains(&"embedding_jobs".to_string()),
25362            "missing embedding_jobs table"
25363        );
25364
25365        // V11 message_metrics, usage_hourly, usage_daily tables.
25366        for analytics_table in ["message_metrics", "usage_hourly", "usage_daily"] {
25367            assert!(
25368                table_names.contains(&analytics_table.to_string()),
25369                "missing table: {analytics_table}"
25370            );
25371        }
25372        assert!(
25373            table_names.contains(&"conversation_tail_state".to_string()),
25374            "missing conversation_tail_state table"
25375        );
25376        assert!(
25377            table_names.contains(&"conversation_external_lookup".to_string()),
25378            "missing conversation_external_lookup table"
25379        );
25380        assert!(
25381            table_names.contains(&"conversation_external_tail_lookup".to_string()),
25382            "missing conversation_external_tail_lookup table"
25383        );
25384
25385        // Fresh frankensqlite databases should record the combined V13 base
25386        // schema plus every additive post-V13 migration.
25387        let rows = storage
25388            .raw()
25389            .query("SELECT COUNT(*) FROM _schema_migrations;")
25390            .unwrap();
25391        let count: i64 = rows.first().unwrap().get_typed(0).unwrap();
25392        assert_eq!(
25393            count,
25394            (13..=CURRENT_SCHEMA_VERSION).count() as i64,
25395            "_schema_migrations should record the V13 base schema and post-V13 migrations"
25396        );
25397
25398        // The latest applied migration should be the current schema version.
25399        let rows = storage
25400            .raw()
25401            .query("SELECT version FROM _schema_migrations ORDER BY version;")
25402            .unwrap();
25403        let versions: Vec<i64> = rows
25404            .iter()
25405            .map(|row| row.get_typed(0))
25406            .collect::<std::result::Result<_, _>>()
25407            .unwrap();
25408        assert_eq!(
25409            versions,
25410            (13..=CURRENT_SCHEMA_VERSION).collect::<Vec<i64>>(),
25411            "_schema_migrations should contain v13 through current"
25412        );
25413    }
25414
25415    #[test]
25416    fn franken_migrations_idempotent() {
25417        let storage = franken_storage_in_memory();
25418        assert_eq!(storage.schema_version().unwrap(), CURRENT_SCHEMA_VERSION);
25419
25420        // Re-running migrations on the same connection is a no-op.
25421        storage.run_migrations().unwrap();
25422        assert_eq!(storage.schema_version().unwrap(), CURRENT_SCHEMA_VERSION);
25423    }
25424
25425    #[test]
25426    fn migration_v20_backfills_conversation_external_tail_lookup() {
25427        let storage = franken_storage_in_memory();
25428        let agent_id = storage
25429            .ensure_agent(&Agent {
25430                id: None,
25431                slug: "codex".into(),
25432                name: "Codex".into(),
25433                version: None,
25434                kind: AgentKind::Cli,
25435            })
25436            .unwrap();
25437        let workspace_id = storage
25438            .ensure_workspace(&PathBuf::from("/ws/profiled-storage-remote"), None)
25439            .unwrap();
25440        let mut conv = make_profiled_storage_remote_conversation(1919, 2);
25441        conv.source_id = "profiled-storage-remote-source-東京".into();
25442        conv.external_id = Some("profiled-storage-remote-☃-1919".into());
25443        let outcome = storage
25444            .insert_conversation_tree(agent_id, Some(workspace_id), &conv)
25445            .unwrap();
25446        let external_id = conv.external_id.as_deref().unwrap();
25447        let lookup_key = conversation_external_lookup_key(&conv.source_id, agent_id, external_id);
25448
25449        storage
25450            .raw()
25451            .execute("DELETE FROM conversation_external_tail_lookup")
25452            .unwrap();
25453        storage
25454            .raw()
25455            .execute("DELETE FROM _schema_migrations WHERE version = 20")
25456            .unwrap();
25457        storage
25458            .raw()
25459            .execute_compat(
25460                "UPDATE meta SET value = ?1 WHERE key = 'schema_version'",
25461                fparams!["19"],
25462            )
25463            .unwrap();
25464
25465        storage.run_migrations().unwrap();
25466
25467        let backfilled: (i64, Option<i64>, Option<i64>, Option<i64>) = storage
25468            .raw()
25469            .query_row_map(
25470                "SELECT conversation_id, ended_at, last_message_idx, last_message_created_at
25471                 FROM conversation_external_tail_lookup
25472                 WHERE lookup_key = ?1",
25473                fparams![lookup_key.as_str()],
25474                |row| {
25475                    Ok((
25476                        row.get_typed(0)?,
25477                        row.get_typed(1)?,
25478                        row.get_typed(2)?,
25479                        row.get_typed(3)?,
25480                    ))
25481                },
25482            )
25483            .unwrap();
25484        assert_eq!(
25485            backfilled,
25486            (
25487                outcome.conversation_id,
25488                conv.ended_at,
25489                Some(1),
25490                conv.messages[1].created_at
25491            )
25492        );
25493        assert_eq!(storage.schema_version().unwrap(), CURRENT_SCHEMA_VERSION);
25494    }
25495
25496    #[test]
25497    fn migration_v15_creates_lazy_tail_state_cache() {
25498        let conn = FrankenConnection::open(":memory:").unwrap();
25499        conn.execute_batch(
25500            "CREATE TABLE conversations (
25501                 id INTEGER PRIMARY KEY,
25502                 ended_at INTEGER
25503             );
25504             CREATE TABLE messages (
25505                 id INTEGER PRIMARY KEY,
25506                 conversation_id INTEGER NOT NULL,
25507                 idx INTEGER NOT NULL,
25508                 created_at INTEGER
25509             );
25510             INSERT INTO conversations(id, ended_at) VALUES
25511                 (1, 1710000000300),
25512                 (2, NULL);
25513             INSERT INTO messages(id, conversation_id, idx, created_at) VALUES
25514                 (10, 1, 0, 1710000000100),
25515                 (11, 1, 1, 1710000000200),
25516                 (12, 2, 0, 1710000000400);",
25517        )
25518        .unwrap();
25519
25520        conn.execute(
25521            "CREATE TABLE _schema_migrations (
25522                version INTEGER PRIMARY KEY,
25523                name TEXT NOT NULL,
25524                applied_at TEXT NOT NULL DEFAULT (strftime('%Y-%m-%dT%H:%M:%SZ', 'now'))
25525             );",
25526        )
25527        .unwrap();
25528
25529        assert!(
25530            apply_conversation_tail_state_cache_migration(&conn).unwrap(),
25531            "v15 migration should apply once"
25532        );
25533        assert!(
25534            !apply_conversation_tail_state_cache_migration(&conn).unwrap(),
25535            "v15 migration should be idempotent once recorded"
25536        );
25537
25538        let columns = conn.query("PRAGMA table_info(conversations);").unwrap();
25539        let column_names: HashSet<String> = columns
25540            .iter()
25541            .map(|row| row.get_typed(1))
25542            .collect::<std::result::Result<_, frankensqlite::FrankenError>>()
25543            .unwrap();
25544        assert!(column_names.contains("last_message_idx"));
25545        assert!(column_names.contains("last_message_created_at"));
25546
25547        let tail_rows: i64 = conn
25548            .query("SELECT COUNT(*) FROM conversation_tail_state;")
25549            .unwrap()
25550            .first()
25551            .unwrap()
25552            .get_typed(0)
25553            .unwrap();
25554        assert_eq!(
25555            tail_rows, 0,
25556            "v15 should create the cache without an open-time message scan"
25557        );
25558
25559        let applied: i64 = conn
25560            .query("SELECT COUNT(*) FROM _schema_migrations WHERE version = 15;")
25561            .unwrap()
25562            .first()
25563            .unwrap()
25564            .get_typed(0)
25565            .unwrap();
25566        assert_eq!(applied, 1);
25567    }
25568
25569    #[test]
25570    fn schema_repair_adds_missing_conversations_token_columns() {
25571        let conn = FrankenConnection::open(":memory:").unwrap();
25572        conn.execute_batch(
25573            "CREATE TABLE conversations (
25574                 id INTEGER PRIMARY KEY,
25575                 agent_id INTEGER NOT NULL,
25576                 source_path TEXT NOT NULL
25577             );",
25578        )
25579        .unwrap();
25580        let storage = FrankenStorage::new(conn, std::path::PathBuf::from(":memory:"));
25581
25582        storage.repair_missing_conversation_token_columns().unwrap();
25583        storage.repair_missing_conversation_token_columns().unwrap();
25584
25585        let columns = franken_table_column_names(&storage.conn, "conversations").unwrap();
25586        for &(column_name, _) in REQUIRED_CONVERSATION_TOKEN_COLUMNS {
25587            assert!(
25588                columns.contains(column_name),
25589                "schema repair should add conversations.{column_name}"
25590            );
25591        }
25592    }
25593
25594    #[test]
25595    fn franken_meta_schema_version_in_sync() {
25596        let storage = franken_storage_in_memory();
25597
25598        // meta.schema_version should be kept in sync.
25599        let rows = storage
25600            .raw()
25601            .query("SELECT value FROM meta WHERE key = 'schema_version';")
25602            .unwrap();
25603        let meta_version: String = rows.first().unwrap().get_typed(0).unwrap();
25604        assert_eq!(
25605            meta_version,
25606            CURRENT_SCHEMA_VERSION.to_string(),
25607            "meta.schema_version should match CURRENT_SCHEMA_VERSION"
25608        );
25609    }
25610
25611    #[test]
25612    fn franken_transition_from_meta_version() {
25613        let dir = TempDir::new().unwrap();
25614        let db_path = dir.path().join("test_transition.db");
25615
25616        // Simulate an existing database created by SqliteStorage at version 10.
25617        // We create just enough schema to test the transition.
25618        let conn = FrankenConnection::open(db_path.to_string_lossy().to_string()).unwrap();
25619        conn.execute("CREATE TABLE meta (key TEXT PRIMARY KEY, value TEXT NOT NULL);")
25620            .unwrap();
25621        conn.execute("INSERT INTO meta(key, value) VALUES('schema_version', '10');")
25622            .unwrap();
25623        // Create a dummy conversations table so transition doesn't think it's corrupted.
25624        conn.execute("CREATE TABLE conversations (id INTEGER PRIMARY KEY);")
25625            .unwrap();
25626        drop(conn);
25627
25628        // Now run the transition function.
25629        let conn = FrankenConnection::open(db_path.to_string_lossy().to_string()).unwrap();
25630        transition_from_meta_version(&conn).unwrap();
25631
25632        // The frankensqlite path uses a combined V13 base migration, so a
25633        // legacy V10 marker is bridged to V13 and later idempotent repair fills
25634        // in any missing V11-V13 objects.
25635        let rows = conn
25636            .query("SELECT version FROM _schema_migrations ORDER BY version;")
25637            .unwrap();
25638        let versions: Vec<i64> = rows.iter().filter_map(|r| r.get_typed(0).ok()).collect();
25639        assert_eq!(
25640            versions,
25641            (1..=13).collect::<Vec<i64>>(),
25642            "transition should bridge legacy V10 databases through the combined V13 base marker"
25643        );
25644    }
25645
25646    #[test]
25647    fn franken_transition_from_current_meta_backfills_current_schema_marker() {
25648        let dir = TempDir::new().unwrap();
25649        let db_path = dir.path().join("test_current_transition.db");
25650
25651        let conn = FrankenConnection::open(db_path.to_string_lossy().to_string()).unwrap();
25652        conn.execute("CREATE TABLE meta (key TEXT PRIMARY KEY, value TEXT NOT NULL);")
25653            .unwrap();
25654        conn.execute_compat(
25655            "INSERT INTO meta(key, value) VALUES('schema_version', ?1);",
25656            &[ParamValue::from(CURRENT_SCHEMA_VERSION.to_string())],
25657        )
25658        .unwrap();
25659        conn.execute("CREATE TABLE conversations (id INTEGER PRIMARY KEY);")
25660            .unwrap();
25661        drop(conn);
25662
25663        let conn = FrankenConnection::open(db_path.to_string_lossy().to_string()).unwrap();
25664        transition_from_meta_version(&conn).unwrap();
25665
25666        let rows = conn
25667            .query("SELECT version FROM _schema_migrations ORDER BY version;")
25668            .unwrap();
25669        let versions: Vec<i64> = rows.iter().filter_map(|r| r.get_typed(0).ok()).collect();
25670        assert_eq!(
25671            versions,
25672            (1..=CURRENT_SCHEMA_VERSION).collect::<Vec<i64>>(),
25673            "current meta schema marker should backfill every known migration"
25674        );
25675    }
25676
25677    #[test]
25678    fn franken_transition_skips_when_already_done() {
25679        let dir = TempDir::new().unwrap();
25680        let db_path = dir.path().join("test_transition_skip.db");
25681
25682        // Create a DB that already has _schema_migrations.
25683        let conn = FrankenConnection::open(db_path.to_string_lossy().to_string()).unwrap();
25684        conn.execute(
25685            "CREATE TABLE _schema_migrations (version INTEGER PRIMARY KEY, name TEXT NOT NULL, applied_at TEXT NOT NULL DEFAULT 'now');",
25686        ).unwrap();
25687        conn.execute("INSERT INTO _schema_migrations (version, name) VALUES (1, 'test');")
25688            .unwrap();
25689
25690        // Transition should be a no-op.
25691        transition_from_meta_version(&conn).unwrap();
25692
25693        // Should still have exactly 1 entry.
25694        let rows = conn
25695            .query("SELECT COUNT(*) FROM _schema_migrations;")
25696            .unwrap();
25697        let count: i64 = rows.first().unwrap().get_typed(0).unwrap();
25698        assert_eq!(
25699            count, 1,
25700            "transition should not re-run on already-transitioned DB"
25701        );
25702    }
25703
25704    #[test]
25705    fn franken_transition_fresh_db_is_noop() {
25706        let dir = TempDir::new().unwrap();
25707        let db_path = dir.path().join("test_fresh_noop.db");
25708
25709        // Empty database — no meta table, no tables at all.
25710        let conn = FrankenConnection::open(db_path.to_string_lossy().to_string()).unwrap();
25711        transition_from_meta_version(&conn).unwrap();
25712
25713        // _schema_migrations should NOT have been created.
25714        let res = conn.query("SELECT * FROM \"_schema_migrations\";");
25715        assert!(
25716            res.is_err(),
25717            "transition should not create _schema_migrations on fresh DB"
25718        );
25719    }
25720
25721    #[test]
25722    fn franken_transition_with_fts_virtual_table_succeeds() {
25723        let dir = TempDir::new().unwrap();
25724        let db_path = dir.path().join("test_transition_with_fts.db");
25725
25726        let conn = FrankenConnection::open(db_path.to_string_lossy().into_owned()).unwrap();
25727        conn.execute_batch(
25728            "CREATE TABLE meta (key TEXT PRIMARY KEY, value TEXT NOT NULL);
25729             INSERT INTO meta(key, value) VALUES('schema_version', '13');
25730             CREATE TABLE conversations (id INTEGER PRIMARY KEY);
25731             CREATE VIRTUAL TABLE fts_messages USING fts5(
25732                 content,
25733                 title,
25734                 agent,
25735                 workspace,
25736                 source_path,
25737                 created_at,
25738                 content='',
25739                 tokenize='porter unicode61'
25740             );",
25741        )
25742        .unwrap();
25743        drop(conn);
25744
25745        let conn = FrankenConnection::open(db_path.to_string_lossy().to_string()).unwrap();
25746        transition_from_meta_version(&conn).unwrap();
25747
25748        let rows = conn
25749            .query("SELECT version FROM _schema_migrations ORDER BY version;")
25750            .unwrap();
25751        let versions: Vec<i64> = rows.iter().filter_map(|r| r.get_typed(0).ok()).collect();
25752        assert_eq!(versions, (1..=13).collect::<Vec<i64>>());
25753    }
25754
25755    #[test]
25756    fn franken_storage_open_legacy_v13_with_fts_virtual_table_succeeds() {
25757        let dir = TempDir::new().unwrap();
25758        let db_path = dir.path().join("test_open_legacy_v13_with_fts.db");
25759
25760        let conn = FrankenConnection::open(db_path.to_string_lossy().into_owned()).unwrap();
25761        conn.execute_batch(
25762            "CREATE TABLE meta (key TEXT PRIMARY KEY, value TEXT NOT NULL);
25763             INSERT INTO meta(key, value) VALUES('schema_version', '13');
25764             CREATE TABLE agents (
25765                 id INTEGER PRIMARY KEY,
25766                 slug TEXT NOT NULL
25767             );
25768             CREATE TABLE workspaces (
25769                 id INTEGER PRIMARY KEY,
25770                 path TEXT NOT NULL
25771             );
25772             CREATE TABLE sources (
25773                 id TEXT PRIMARY KEY,
25774                 kind TEXT NOT NULL,
25775                 host_label TEXT,
25776                 machine_id TEXT,
25777                 platform TEXT,
25778                 config_json TEXT,
25779                 created_at INTEGER NOT NULL,
25780                 updated_at INTEGER NOT NULL
25781             );
25782             CREATE TABLE conversations (
25783                 id INTEGER PRIMARY KEY,
25784                 agent_id INTEGER NOT NULL,
25785                 workspace_id INTEGER,
25786                 source_id TEXT NOT NULL DEFAULT 'local',
25787                 external_id TEXT,
25788                 title TEXT,
25789                 source_path TEXT NOT NULL,
25790                 started_at INTEGER,
25791                 ended_at INTEGER
25792             );
25793             CREATE TABLE messages (
25794                 id INTEGER PRIMARY KEY,
25795                 conversation_id INTEGER NOT NULL,
25796                 idx INTEGER NOT NULL,
25797                 role TEXT NOT NULL,
25798                 author TEXT,
25799                 created_at INTEGER,
25800                 content TEXT NOT NULL,
25801                 extra_json TEXT,
25802                 extra_bin BLOB
25803             );
25804             INSERT INTO agents(id, slug) VALUES (1, 'codex');
25805             INSERT INTO workspaces(id, path) VALUES (1, '/data/projects/coding_agent_session_search');
25806             INSERT INTO sources(id, kind, host_label, created_at, updated_at)
25807             VALUES ('local', 'local', NULL, 1710000000000, 1710000000000);
25808             INSERT INTO conversations(
25809                 id,
25810                 agent_id,
25811                 workspace_id,
25812                 source_id,
25813                 external_id,
25814                 title,
25815                 source_path,
25816                 started_at
25817             )
25818             VALUES (
25819                 1,
25820                 1,
25821                 1,
25822                 'local',
25823                 'legacy-session',
25824                 'legacy session',
25825                 '/tmp/legacy.jsonl',
25826                 1710000000000
25827             );
25828             INSERT INTO messages(id, conversation_id, idx, role, author, created_at, content)
25829             VALUES (1, 1, 0, 'user', 'tester', 1710000000000, 'legacy content');
25830             CREATE VIRTUAL TABLE fts_messages USING fts5(
25831                 content,
25832                 title,
25833                 agent,
25834                 workspace,
25835                 source_path,
25836                 created_at,
25837                 message_id,
25838                 content='',
25839                 tokenize='porter unicode61'
25840             );",
25841        )
25842        .unwrap();
25843        drop(conn);
25844
25845        let storage = FrankenStorage::open(&db_path).unwrap();
25846        assert_eq!(storage.schema_version().unwrap(), CURRENT_SCHEMA_VERSION);
25847
25848        let rows = storage
25849            .raw()
25850            .query("SELECT version FROM _schema_migrations ORDER BY version;")
25851            .unwrap();
25852        let versions: Vec<i64> = rows.iter().filter_map(|r| r.get_typed(0).ok()).collect();
25853        assert_eq!(versions, (1..=CURRENT_SCHEMA_VERSION).collect::<Vec<i64>>());
25854    }
25855
    /// A database whose `sqlite_master` holds two `fts_messages` rows must
    /// still open at `CURRENT_SCHEMA_VERSION`. The open itself must not write
    /// the FTS rebuild marker; after `ensure_search_fallback_fts_consistency`
    /// exactly one schema row remains and the FTS row count equals the message
    /// count.
    #[test]
    fn franken_storage_open_repairs_duplicate_fts_messages_schema_rows() {
        let dir = TempDir::new().unwrap();
        let db_path = dir.path().join("test_open_repairs_duplicate_fts_schema.db");

        // Start from a normally created database holding one conversation with
        // a single message.
        let storage = FrankenStorage::open(&db_path).unwrap();
        let agent = Agent {
            id: None,
            slug: "codex".into(),
            name: "Codex".into(),
            version: None,
            kind: AgentKind::Cli,
        };
        let agent_id = storage.ensure_agent(&agent).unwrap();
        let conversation = Conversation {
            id: None,
            agent_slug: "codex".into(),
            workspace: Some(PathBuf::from("/tmp/workspace")),
            external_id: Some("dup-fts-schema".into()),
            title: Some("Duplicate FTS schema".into()),
            source_path: PathBuf::from("/tmp/dup-fts-schema.jsonl"),
            started_at: Some(1_700_000_000_000),
            ended_at: Some(1_700_000_000_100),
            approx_tokens: Some(42),
            metadata_json: serde_json::Value::Null,
            messages: vec![Message {
                id: None,
                idx: 0,
                role: MessageRole::User,
                author: Some("user".into()),
                created_at: Some(1_700_000_000_050),
                content: "message that should remain queryable".into(),
                extra_json: serde_json::Value::Null,
                snippets: Vec::new(),
            }],
            source_id: LOCAL_SOURCE_ID.into(),
            origin_host: None,
        };
        storage
            .insert_conversation_tree(agent_id, None, &conversation)
            .unwrap();
        drop(storage);
        // NOTE(review): presumably creates the current fts_messages schema via
        // rusqlite so a real first row exists before the duplicate is injected;
        // confirm against the helper.
        materialize_fresh_fts_schema_via_rusqlite(&db_path).unwrap();

        // Inject a second sqlite_master row for fts_messages carrying a legacy
        // CREATE statement. writable_schema must be ON for the direct insert.
        let duplicate_legacy_fts_sql = "CREATE VIRTUAL TABLE fts_messages USING fts5(content, title, agent, workspace, source_path, created_at UNINDEXED, message_id UNINDEXED, tokenize='porter')";
        let conn = rusqlite_test_fixture_conn(&db_path);
        conn.execute_batch("PRAGMA writable_schema = ON;").unwrap();
        conn.execute(
            "INSERT INTO sqlite_master(type, name, tbl_name, rootpage, sql)
             VALUES('table', 'fts_messages', 'fts_messages', 0, ?1)",
            [duplicate_legacy_fts_sql],
        )
        .unwrap();
        // Simulate a pre-fix upgraded database that has never gone through the
        // authoritative frankensqlite FTS rebuild generation yet, by removing
        // the rebuild marker from `meta`.
        conn.execute(
            "DELETE FROM meta WHERE key = ?1",
            [FTS_FRANKEN_REBUILD_META_KEY],
        )
        .unwrap();
        conn.execute_batch("PRAGMA writable_schema = OFF;").unwrap();

        // Sanity-check the fixture: both schema rows are now present.
        let duplicate_rows: i64 = conn
            .query_row(
                "SELECT COUNT(*) FROM sqlite_master WHERE name = 'fts_messages'",
                [],
                |row| row.get(0),
            )
            .unwrap();
        assert_eq!(duplicate_rows, 2);
        drop(conn);

        // Reopening must succeed and must not write the rebuild marker.
        let reopened = FrankenStorage::open(&db_path).unwrap();
        assert_eq!(reopened.schema_version().unwrap(), CURRENT_SCHEMA_VERSION);
        let generation_rows: Vec<String> = reopened
            .raw()
            .query_map_collect(
                "SELECT value FROM meta WHERE key = ?1",
                fparams![FTS_FRANKEN_REBUILD_META_KEY],
                |row| row.get_typed(0),
            )
            .unwrap();
        assert_eq!(
            generation_rows.len(),
            0,
            "canonical open should not eagerly rewrite FTS repair metadata"
        );
        // The explicit consistency pass is what collapses the duplicate row;
        // verify the result through an independent connection.
        reopened.ensure_search_fallback_fts_consistency().unwrap();
        let repaired = FrankenConnection::open(db_path.to_string_lossy().into_owned()).unwrap();
        assert_eq!(franken_fts_schema_rows(&repaired).unwrap(), 1);

        // Every stored message must be represented in the FTS table.
        let total_messages: i64 = reopened
            .raw()
            .query_row_map("SELECT COUNT(*) FROM messages", fparams![], |row| {
                row.get_typed(0)
            })
            .unwrap();
        let total_fts_rows: i64 = reopened
            .raw()
            .query_row_map("SELECT COUNT(*) FROM fts_messages", fparams![], |row| {
                row.get_typed(0)
            })
            .unwrap();
        assert_eq!(total_fts_rows, total_messages);
    }
25961
25962    #[test]
25963    fn fts_messages_integrity_reports_missing_shadow_tables() {
25964        let dir = TempDir::new().unwrap();
25965        let healthy_db_path = dir.path().join("healthy_fts.db");
25966
25967        {
25968            let storage = FrankenStorage::open(&healthy_db_path).unwrap();
25969            storage.ensure_search_fallback_fts_consistency().unwrap();
25970            storage
25971                .validate_fts_messages_integrity()
25972                .expect("freshly materialized fts_messages should pass integrity validation");
25973        }
25974
25975        let corrupt_db_path = dir.path().join("test_corrupt_fts_missing_shadows.db");
25976        {
25977            let conn = rusqlite_test_fixture_conn(&corrupt_db_path);
25978            conn.execute("CREATE TABLE schema_anchor(id INTEGER PRIMARY KEY)", [])
25979                .unwrap();
25980            let orphaned_fts_sql = "CREATE VIRTUAL TABLE fts_messages USING fts5(content, title, agent, workspace, source_path, created_at UNINDEXED, message_id UNINDEXED, tokenize='porter')";
25981            conn.execute_batch("PRAGMA writable_schema = ON;").unwrap();
25982            conn.execute(
25983                "INSERT INTO sqlite_master(type, name, tbl_name, rootpage, sql)
25984                 VALUES('table', 'fts_messages', 'fts_messages', 0, ?1)",
25985                [orphaned_fts_sql],
25986            )
25987            .unwrap();
25988            conn.execute_batch("PRAGMA writable_schema = OFF;").unwrap();
25989        }
25990
25991        let open_err = FrankenConnection::open(corrupt_db_path.to_string_lossy().to_string())
25992            .expect_err("orphaned fts_messages schema should fail during connection open");
25993        let integrity = fts_messages_integrity_error_from_message(open_err.to_string())
25994            .expect("open-time FTS corruption should map to the typed FTS integrity kind");
25995        assert_eq!(integrity.missing_shadow_tables(), &["fts_messages_content"]);
25996        let rendered = integrity.to_string();
25997        assert!(
25998            rendered.contains("fts_messages")
25999                && rendered.contains("required FTS5 shadow tables")
26000                && rendered.contains("fts_messages_content"),
26001            "error should be an operator-facing FTS corruption diagnosis: {rendered}"
26002        );
26003    }
26004
26005    #[test]
26006    fn franken_storage_open_fresh_db_keeps_single_franken_fts_schema_row() {
26007        let dir = TempDir::new().unwrap();
26008        let db_path = dir.path().join("fresh-franken-storage-open.db");
26009
26010        let storage = FrankenStorage::open(&db_path).unwrap();
26011        assert_eq!(storage.schema_version().unwrap(), CURRENT_SCHEMA_VERSION);
26012
26013        // The FTS5 virtual table is no longer created eagerly by the
26014        // migration runner (V14 drops the old internal-content table and the
26015        // current contentless table is recreated lazily — see MIGRATION_V14).
26016        // Invoke the repair path to match normal cass startup, then assert
26017        // there is exactly one fts_messages entry in sqlite_schema (no
26018        // duplicates).
26019        storage
26020            .ensure_search_fallback_fts_consistency()
26021            .expect("ensure FTS consistency after fresh open");
26022        drop(storage);
26023
26024        let c_reader = FrankenConnection::open(db_path.to_string_lossy().into_owned())
26025            .expect("open DB via frankensqlite for sqlite_master inspection");
26026        assert_eq!(
26027            franken_fts_schema_rows(&c_reader).unwrap(),
26028            1,
26029            "exactly one fts_messages schema row should exist after ensure_search_fallback_fts_consistency"
26030        );
26031        drop(c_reader);
26032
26033        let storage = FrankenStorage::open(&db_path).unwrap();
26034        assert!(
26035            storage
26036                .raw()
26037                .query("SELECT COUNT(*) FROM fts_messages")
26038                .is_ok(),
26039            "fts_messages must be queryable through frankensqlite after open"
26040        );
26041    }
26042
26043    #[test]
26044    fn franken_storage_open_repairs_missing_analytics_tables_when_version_markers_lie() {
26045        let dir = TempDir::new().unwrap();
26046        let db_path = dir.path().join("test_repair_missing_analytics.db");
26047
26048        {
26049            let storage = FrankenStorage::open(&db_path).unwrap();
26050            assert_eq!(storage.schema_version().unwrap(), CURRENT_SCHEMA_VERSION);
26051        }
26052
26053        {
26054            let conn = FrankenConnection::open(db_path.to_string_lossy().into_owned()).unwrap();
26055            for table in &[
26056                "usage_models_daily",
26057                "usage_daily",
26058                "usage_hourly",
26059                "message_metrics",
26060                "token_daily_stats",
26061                "token_usage",
26062                "model_pricing",
26063                "embedding_jobs",
26064                "daily_stats",
26065            ] {
26066                conn.execute(&format!("DROP TABLE IF EXISTS {table}"))
26067                    .unwrap();
26068            }
26069            conn.execute_compat(
26070                "UPDATE meta SET value = ?1 WHERE key = 'schema_version'",
26071                &[ParamValue::from(CURRENT_SCHEMA_VERSION.to_string())],
26072            )
26073            .unwrap();
26074        }
26075
26076        let repaired = FrankenStorage::open(&db_path).unwrap();
26077        assert_eq!(repaired.schema_version().unwrap(), CURRENT_SCHEMA_VERSION);
26078
26079        let analytics_count: i64 = repaired
26080            .raw()
26081            .query_row_map(
26082                "SELECT COUNT(*) FROM sqlite_master
26083                 WHERE type='table'
26084                   AND name IN (
26085                     'daily_stats',
26086                     'embedding_jobs',
26087                     'token_usage',
26088                     'token_daily_stats',
26089                     'model_pricing',
26090                     'message_metrics',
26091                     'usage_hourly',
26092                     'usage_daily',
26093                     'usage_models_daily'
26094                   )",
26095                &[],
26096                |row| row.get_typed(0),
26097            )
26098            .unwrap();
26099        assert_eq!(
26100            analytics_count, 9,
26101            "open() should recreate the missing analytics tables even when schema_version already says current"
26102        );
26103    }
26104
26105    #[test]
26106    fn current_schema_repair_batches_cover_every_required_probe() {
26107        let missing_tables: Vec<&'static str> = REQUIRED_CURRENT_SCHEMA_TABLE_PROBES
26108            .iter()
26109            .map(|(table_name, _)| *table_name)
26110            .collect();
26111
26112        let batches = current_schema_repair_batches_for_missing_tables(&missing_tables).unwrap();
26113        let covered_tables: HashSet<&'static str> = batches
26114            .iter()
26115            .flat_map(|batch| batch.tables.iter().copied())
26116            .collect();
26117
26118        for table_name in missing_tables {
26119            assert!(
26120                covered_tables.contains(table_name),
26121                "missing repair coverage for {table_name}"
26122            );
26123        }
26124    }
26125
26126    #[test]
26127    fn current_schema_repair_batches_do_not_replay_core_schema_bootstrap() {
26128        for batch in CURRENT_SCHEMA_REPAIR_BATCHES {
26129            assert!(
26130                !batch.sql.contains("CREATE TABLE IF NOT EXISTS meta"),
26131                "repair batch {} should not recreate meta",
26132                batch.name
26133            );
26134            assert!(
26135                !batch.sql.contains("CREATE TABLE IF NOT EXISTS agents"),
26136                "repair batch {} should not recreate agents",
26137                batch.name
26138            );
26139            assert!(
26140                !batch.sql.contains("CREATE TABLE IF NOT EXISTS workspaces"),
26141                "repair batch {} should not recreate workspaces",
26142                batch.name
26143            );
26144            assert!(
26145                !batch
26146                    .sql
26147                    .contains("CREATE TABLE IF NOT EXISTS conversations"),
26148                "repair batch {} should not recreate conversations",
26149                batch.name
26150            );
26151            assert!(
26152                !batch.sql.contains("CREATE TABLE IF NOT EXISTS messages"),
26153                "repair batch {} should not recreate messages",
26154                batch.name
26155            );
26156            assert!(
26157                !batch.sql.contains("CREATE TABLE IF NOT EXISTS snippets"),
26158                "repair batch {} should not recreate snippets",
26159                batch.name
26160            );
26161            assert!(
26162                !batch.sql.contains("CREATE VIRTUAL TABLE fts_messages"),
26163                "repair batch {} should not recreate FTS tables",
26164                batch.name
26165            );
26166            assert!(
26167                !batch.sql.contains("DROP TABLE"),
26168                "repair batch {} should never drop tables",
26169                batch.name
26170            );
26171        }
26172    }
26173
26174    #[test]
26175    fn build_cass_migrations_applies_combined_v13() {
26176        let conn = FrankenConnection::open(":memory:").unwrap();
26177        let base_result = build_cass_migrations_before_tail_cache()
26178            .run(&conn)
26179            .unwrap();
26180        assert!(apply_conversation_tail_state_cache_migration(&conn).unwrap());
26181        let post_result = build_cass_migrations_after_tail_cache().run(&conn).unwrap();
26182
26183        assert!(base_result.was_fresh);
26184        let mut applied = base_result.applied;
26185        applied.push(15);
26186        applied.extend(post_result.applied);
26187        assert_eq!(
26188            applied,
26189            (13..=CURRENT_SCHEMA_VERSION).collect::<Vec<i64>>(),
26190            "should apply combined V13 plus additive post-V13 migrations"
26191        );
26192        let current: i64 = conn
26193            .query("SELECT MAX(version) FROM _schema_migrations;")
26194            .unwrap()
26195            .first()
26196            .unwrap()
26197            .get_typed(0)
26198            .unwrap();
26199        assert_eq!(current, CURRENT_SCHEMA_VERSION);
26200    }
26201
26202    #[test]
26203    fn franken_insert_conversations_batched_populates_analytics_rollups() {
26204        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
26205        use frankensqlite::compat::{ConnectionExt, RowExt};
26206        use std::path::PathBuf;
26207
26208        let dir = TempDir::new().unwrap();
26209        let db_path = dir.path().join("franken-index.db");
26210        let storage = FrankenStorage::open(&db_path).unwrap();
26211
26212        let agent = Agent {
26213            id: None,
26214            slug: "claude_code".into(),
26215            name: "Claude Code".into(),
26216            version: Some("1.0".into()),
26217            kind: AgentKind::Cli,
26218        };
26219        let agent_id = storage.ensure_agent(&agent).unwrap();
26220
26221        let ts_ms = 1_770_551_400_000_i64;
26222        let usage_json = serde_json::json!({
26223            "message": {
26224                "model": "claude-opus-4-6",
26225                "usage": {
26226                    "input_tokens": 100,
26227                    "output_tokens": 50,
26228                    "cache_read_input_tokens": 25,
26229                    "cache_creation_input_tokens": 10,
26230                    "service_tier": "standard"
26231                }
26232            }
26233        });
26234
26235        let conv = Conversation {
26236            id: None,
26237            agent_slug: "claude_code".into(),
26238            workspace: Some(PathBuf::from("/tmp/workspace")),
26239            external_id: Some("franken-batch-upsert".into()),
26240            title: Some("Franken batch upsert".into()),
26241            source_path: PathBuf::from("/tmp/franken.jsonl"),
26242            started_at: Some(ts_ms),
26243            ended_at: Some(ts_ms + 60_000),
26244            approx_tokens: None,
26245            metadata_json: serde_json::Value::Null,
26246            messages: vec![
26247                Message {
26248                    id: None,
26249                    idx: 0,
26250                    role: MessageRole::User,
26251                    author: None,
26252                    created_at: Some(ts_ms),
26253                    content: "Please make a plan.".into(),
26254                    extra_json: serde_json::Value::Null,
26255                    snippets: vec![],
26256                },
26257                Message {
26258                    id: None,
26259                    idx: 1,
26260                    role: MessageRole::Agent,
26261                    author: None,
26262                    created_at: Some(ts_ms + 30_000),
26263                    content: "## Plan\n\n1. Reproduce\n2. Patch\n3. Verify".into(),
26264                    extra_json: usage_json,
26265                    snippets: vec![],
26266                },
26267            ],
26268            source_id: "local".into(),
26269            origin_host: None,
26270        };
26271
26272        let outcomes = storage
26273            .insert_conversations_batched(&[(agent_id, None, &conv)])
26274            .unwrap();
26275        assert_eq!(outcomes.len(), 1);
26276        assert_eq!(outcomes[0].inserted_indices, vec![0, 1]);
26277
26278        let conn = storage.raw();
26279        let daily_stats_rows: i64 = conn
26280            .query_row_map("SELECT COUNT(*) FROM daily_stats", fparams![], |row| {
26281                row.get_typed(0)
26282            })
26283            .unwrap();
26284        let token_daily_rows: i64 = conn
26285            .query_row_map(
26286                "SELECT COUNT(*) FROM token_daily_stats",
26287                fparams![],
26288                |row| row.get_typed(0),
26289            )
26290            .unwrap();
26291        let usage_daily_rows: i64 = conn
26292            .query_row_map("SELECT COUNT(*) FROM usage_daily", fparams![], |row| {
26293                row.get_typed(0)
26294            })
26295            .unwrap();
26296        let model_daily_rows: i64 = conn
26297            .query_row_map(
26298                "SELECT COUNT(*) FROM usage_models_daily",
26299                fparams![],
26300                |row| row.get_typed(0),
26301            )
26302            .unwrap();
26303
26304        assert!(daily_stats_rows > 0, "daily_stats should be populated");
26305        assert!(
26306            token_daily_rows > 0,
26307            "token_daily_stats should be populated"
26308        );
26309        assert!(usage_daily_rows > 0, "usage_daily should be populated");
26310        assert!(
26311            model_daily_rows > 0,
26312            "usage_models_daily should be populated"
26313        );
26314    }
26315
26316    // =========================================================================
26317    // FrankenConnectionManager tests (bead 3rlf8)
26318    // =========================================================================
26319
26320    #[test]
26321    fn connection_manager_creates_readers() {
26322        let dir = TempDir::new().unwrap();
26323        let db_path = dir.path().join("cm.db");
26324
26325        // Create the DB first
26326        let fs = FrankenStorage::open(&db_path).unwrap();
26327        drop(fs);
26328
26329        let config = ConnectionManagerConfig {
26330            reader_count: 3,
26331            max_writers: 2,
26332        };
26333        let mgr = FrankenConnectionManager::new(&db_path, config).unwrap();
26334        assert_eq!(mgr.reader_count(), 3);
26335        assert_eq!(mgr.max_writers(), 2);
26336    }
26337
26338    #[test]
26339    fn connection_manager_clamps_zero_writer_limit_to_prevent_deadlock() {
26340        let dir = TempDir::new().unwrap();
26341        let db_path = dir.path().join("cm.db");
26342
26343        let fs = FrankenStorage::open(&db_path).unwrap();
26344        drop(fs);
26345
26346        let mgr = std::sync::Arc::new(
26347            FrankenConnectionManager::new(
26348                &db_path,
26349                ConnectionManagerConfig {
26350                    reader_count: 0,
26351                    max_writers: 0,
26352                },
26353            )
26354            .unwrap(),
26355        );
26356        assert_eq!(mgr.reader_count(), 1);
26357        assert_eq!(mgr.max_writers(), 1);
26358
26359        let (tx, rx) = std::sync::mpsc::channel();
26360        let mgr_for_thread = std::sync::Arc::clone(&mgr);
26361        std::thread::spawn(move || {
26362            let result = mgr_for_thread.writer().map(|mut guard| {
26363                guard.mark_committed();
26364            });
26365            tx.send(result.is_ok()).expect("writer result send");
26366        });
26367
26368        assert!(
26369            rx.recv_timeout(Duration::from_secs(10)).unwrap(),
26370            "writer acquisition should not block forever when configured with zero writer slots"
26371        );
26372    }
26373
26374    #[test]
26375    fn connection_manager_reader_round_robin() {
26376        let dir = TempDir::new().unwrap();
26377        let db_path = dir.path().join("cm.db");
26378
26379        let fs = FrankenStorage::open(&db_path).unwrap();
26380        drop(fs);
26381
26382        let config = ConnectionManagerConfig {
26383            reader_count: 2,
26384            max_writers: 1,
26385        };
26386        let mgr = FrankenConnectionManager::new(&db_path, config).unwrap();
26387
26388        // Reader index should advance (round-robin)
26389        let idx_before = mgr.reader_idx.load(std::sync::atomic::Ordering::Relaxed);
26390        let _r1 = mgr.reader();
26391        let idx_after = mgr.reader_idx.load(std::sync::atomic::Ordering::Relaxed);
26392        assert_eq!(idx_after, idx_before + 1, "reader index should advance");
26393    }
26394
26395    #[test]
26396    fn connection_manager_writer_reads_and_writes() {
26397        use frankensqlite::compat::RowExt;
26398
26399        let dir = TempDir::new().unwrap();
26400        let db_path = dir.path().join("cm.db");
26401
26402        let fs = FrankenStorage::open(&db_path).unwrap();
26403        drop(fs);
26404
26405        let mgr = FrankenConnectionManager::new(&db_path, Default::default()).unwrap();
26406
26407        // Acquire writer and insert data
26408        {
26409            let mut guard = mgr.writer().unwrap();
26410            guard
26411                .storage()
26412                .raw()
26413                .execute("CREATE TABLE IF NOT EXISTS cm_test (id INTEGER PRIMARY KEY, val TEXT)")
26414                .unwrap();
26415            guard
26416                .storage()
26417                .raw()
26418                .execute("INSERT INTO cm_test (val) VALUES ('hello')")
26419                .unwrap();
26420            guard.mark_committed();
26421        }
26422
26423        // Verify via reader (returns MutexGuard<SendFrankenConnection>)
26424        let reader_guard = mgr.reader();
26425        let rows = reader_guard.query("SELECT val FROM cm_test").unwrap();
26426        assert_eq!(rows.len(), 1);
26427        assert_eq!(rows[0].get_typed::<String>(0).unwrap(), "hello");
26428    }
26429
26430    #[test]
26431    fn connection_manager_writer_guard_drops_releases_slot() {
26432        let dir = TempDir::new().unwrap();
26433        let db_path = dir.path().join("cm.db");
26434
26435        let fs = FrankenStorage::open(&db_path).unwrap();
26436        drop(fs);
26437
26438        let config = ConnectionManagerConfig {
26439            reader_count: 1,
26440            max_writers: 1,
26441        };
26442        let mgr = FrankenConnectionManager::new(&db_path, config).unwrap();
26443
26444        // Acquire and release writer
26445        {
26446            let mut guard = mgr.writer().unwrap();
26447            guard.mark_committed();
26448        }
26449
26450        // Should be able to acquire again (slot released)
26451        let mut guard2 = mgr.writer().unwrap();
26452        guard2.mark_committed();
26453    }
26454
26455    #[test]
26456    fn connection_manager_concurrent_writer_works() {
26457        use frankensqlite::compat::RowExt;
26458
26459        let dir = TempDir::new().unwrap();
26460        let db_path = dir.path().join("cm.db");
26461
26462        let fs = FrankenStorage::open(&db_path).unwrap();
26463        drop(fs);
26464
26465        let config = ConnectionManagerConfig {
26466            reader_count: 1,
26467            max_writers: 2,
26468        };
26469        let mgr = FrankenConnectionManager::new(&db_path, config).unwrap();
26470
26471        {
26472            let mut guard = mgr.concurrent_writer().unwrap();
26473            guard
26474                .storage()
26475                .raw()
26476                .execute("CREATE TABLE IF NOT EXISTS cm_conc (id INTEGER PRIMARY KEY, val TEXT)")
26477                .unwrap();
26478            guard
26479                .storage()
26480                .raw()
26481                .execute("INSERT INTO cm_conc (val) VALUES ('concurrent')")
26482                .unwrap();
26483            guard.mark_committed();
26484        }
26485
26486        let reader_guard = mgr.reader();
26487        let rows = reader_guard.query("SELECT val FROM cm_conc").unwrap();
26488        assert_eq!(rows.len(), 1);
26489        assert_eq!(rows[0].get_typed::<String>(0).unwrap(), "concurrent");
26490    }
26491
26492    #[test]
26493    fn connection_manager_default_config() {
26494        let config = ConnectionManagerConfig::default();
26495        assert_eq!(config.reader_count, 4);
26496        assert!(config.max_writers > 0);
26497    }
26498
26499    #[test]
26500    fn purge_agent_archive_data_removes_only_target_agent_and_rebuilds_derived_tables() {
26501        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
26502        use std::path::PathBuf;
26503
26504        fn seed_conversation(storage: &FrankenStorage, agent_slug: &str, marker: &str) {
26505            let agent = Agent {
26506                id: None,
26507                slug: agent_slug.into(),
26508                name: agent_slug.into(),
26509                version: None,
26510                kind: AgentKind::Cli,
26511            };
26512            let agent_id = storage.ensure_agent(&agent).unwrap();
26513            let conversation = Conversation {
26514                id: None,
26515                agent_slug: agent_slug.into(),
26516                workspace: Some(PathBuf::from("/tmp/workspace")),
26517                external_id: Some(format!("{agent_slug}-{marker}")),
26518                title: Some(format!("{agent_slug} {marker}")),
26519                source_path: PathBuf::from(format!("/tmp/{agent_slug}-{marker}.jsonl")),
26520                started_at: Some(1_700_000_000_000),
26521                ended_at: Some(1_700_000_000_100),
26522                approx_tokens: None,
26523                metadata_json: serde_json::Value::Null,
26524                messages: vec![
26525                    Message {
26526                        id: None,
26527                        idx: 0,
26528                        role: MessageRole::User,
26529                        author: Some("user".into()),
26530                        created_at: Some(1_700_000_000_010),
26531                        content: format!("{agent_slug} {marker} user"),
26532                        extra_json: serde_json::Value::Null,
26533                        snippets: Vec::new(),
26534                    },
26535                    Message {
26536                        id: None,
26537                        idx: 1,
26538                        role: MessageRole::Agent,
26539                        author: Some("assistant".into()),
26540                        created_at: Some(1_700_000_000_020),
26541                        content: format!("{agent_slug} {marker} assistant"),
26542                        extra_json: serde_json::Value::Null,
26543                        snippets: Vec::new(),
26544                    },
26545                ],
26546                source_id: LOCAL_SOURCE_ID.into(),
26547                origin_host: None,
26548            };
26549            storage
26550                .insert_conversation_tree(agent_id, None, &conversation)
26551                .unwrap();
26552        }
26553
26554        let dir = TempDir::new().unwrap();
26555        let db_path = dir.path().join("agent_search.db");
26556        let storage = FrankenStorage::open(&db_path).unwrap();
26557
26558        seed_conversation(&storage, "openclaw", "purge-target");
26559        seed_conversation(&storage, "codex", "keep-target");
26560
26561        let purge = storage.purge_agent_archive_data("openclaw").unwrap();
26562        assert_eq!(purge.conversations_deleted, 1);
26563        assert_eq!(purge.messages_deleted, 2);
26564
26565        storage.rebuild_fts().unwrap();
26566        storage.rebuild_analytics().unwrap();
26567        storage.rebuild_daily_stats().unwrap();
26568        storage.rebuild_token_daily_stats().unwrap();
26569
26570        let agents = storage.list_agents().unwrap();
26571        assert_eq!(agents.len(), 1);
26572        assert_eq!(agents[0].slug, "codex");
26573        assert_eq!(storage.total_conversation_count().unwrap(), 1);
26574        assert_eq!(storage.total_message_count().unwrap(), 2);
26575
26576        let fts_rows: i64 = storage
26577            .raw()
26578            .query_row_map("SELECT COUNT(*) FROM fts_messages", fparams![], |row| {
26579                row.get_typed(0)
26580            })
26581            .unwrap();
26582        assert_eq!(fts_rows, 2);
26583
26584        let total_daily_sessions: i64 = storage
26585            .raw()
26586            .query_row_map(
26587                "SELECT COALESCE(SUM(session_count), 0)
26588                 FROM daily_stats
26589                 WHERE agent_slug = 'all' AND source_id = 'all'",
26590                fparams![],
26591                |row| row.get_typed(0),
26592            )
26593            .unwrap();
26594        assert_eq!(total_daily_sessions, 1);
26595
26596        let openclaw_token_rows: i64 = storage
26597            .raw()
26598            .query_row_map(
26599                "SELECT COUNT(*) FROM token_daily_stats WHERE agent_slug = 'openclaw'",
26600                fparams![],
26601                |row| row.get_typed(0),
26602            )
26603            .unwrap();
26604        assert_eq!(openclaw_token_rows, 0);
26605    }
26606
26607    /// Regression for cass#202: a `Connection` dropped mid-transaction can
26608    /// leave child rows persisted without a matching parent. The next indexer
26609    /// pass then trips `FOREIGN KEY constraint failed` on every write, the
26610    /// session never gets marked indexed, and the pending backlog grows
26611    /// without bound. `cleanup_orphan_fk_rows` is the indexer-startup
26612    /// self-heal that breaks the cycle.
26613    #[test]
26614    fn cleanup_orphan_fk_rows_removes_orphans_and_is_noop_on_clean_db() {
26615        let dir = TempDir::new().unwrap();
26616        let db_path = dir.path().join("orphan_fk_self_heal.db");
26617        let storage = FrankenStorage::open(&db_path).unwrap();
26618
26619        // Plant orphan rows directly: rows whose FK parent does not exist.
26620        // FK enforcement is temporarily off so the planted rows can land.
26621        storage.raw().execute("PRAGMA foreign_keys = OFF").unwrap();
26622
26623        // Seed a real conversation so a subset of children DO have valid
26624        // parents — we want the cleanup to be precise, not a table-flush.
26625        storage
26626            .raw()
26627            .execute_compat(
26628                "INSERT INTO agents(id, slug, name, kind, created_at, updated_at) \
26629                 VALUES(1, 'test-agent', 'Test Agent', 'cli', 0, 0)",
26630                fparams![],
26631            )
26632            .unwrap();
26633        storage
26634            .raw()
26635            .execute_compat(
26636                "INSERT INTO conversations(id, agent_id, source_id, source_path, started_at) \
26637                 VALUES(1, 1, 'local', '/tmp/real.jsonl', 0)",
26638                fparams![],
26639            )
26640            .unwrap();
26641        storage
26642            .raw()
26643            .execute_compat(
26644                "INSERT INTO messages(id, conversation_id, idx, role, content) \
26645                 VALUES(1, 1, 0, 'user', 'real message')",
26646                fparams![],
26647            )
26648            .unwrap();
26649
26650        // Plant orphan messages referencing conversation_id=99999 (does not exist)
26651        // and conversation_id=0 (the specific shape reported in #202). Distinct
26652        // (conversation_id, idx) pairs are required by the UNIQUE constraint.
26653        for (mid, cid, idx) in [(101_i64, 99_999_i64, 0_i64), (102, 0, 0), (103, 0, 1)] {
26654            storage
26655                .raw()
26656                .execute_compat(
26657                    "INSERT INTO messages(id, conversation_id, idx, role, content) \
26658                     VALUES(?1, ?2, ?3, 'user', 'orphan message')",
26659                    fparams![mid, cid, idx],
26660                )
26661                .unwrap();
26662        }
26663
26664        // Rows below are not directly orphaned because their immediate
26665        // `messages` parent exists, but that parent is itself orphaned. The
26666        // cleanup deletes them explicitly before deleting orphan messages so the
26667        // FK cascade engine does not have to run one delete program per orphan.
26668        for message_id in [1_i64, 101_i64, 102_i64] {
26669            storage
26670                .raw()
26671                .execute_compat(
26672                    "INSERT INTO message_metrics(
26673                         message_id, created_at_ms, hour_id, day_id, agent_slug,
26674                         role, content_chars, content_tokens_est
26675                     ) VALUES(?1, 0, 0, 0, 'test-agent', 'user', 13, 2)",
26676                    fparams![message_id],
26677                )
26678                .unwrap();
26679            storage
26680                .raw()
26681                .execute_compat(
26682                    "INSERT INTO token_usage(
26683                         message_id, conversation_id, agent_id, timestamp_ms, day_id,
26684                         role, content_chars
26685                     ) VALUES(?1, 1, 1, 0, 0, 'user', 13)",
26686                    fparams![message_id],
26687                )
26688                .unwrap();
26689        }
26690
26691        // Plant a directly-orphan snippet — message_id=99999 does not exist
26692        // anywhere, so this exercises the snippets DELETE path rather than
26693        // riding on the cascade from the orphan-message DELETE.
26694        storage
26695            .raw()
26696            .execute_compat(
26697                "INSERT INTO snippets(message_id, file_path, start_line, end_line, language, snippet_text) \
26698                 VALUES(99999, '/tmp/orphan-snippet.rs', 1, 2, 'rust', 'fn main() {}')",
26699                fparams![],
26700            )
26701            .unwrap();
26702
26703        storage.raw().execute("PRAGMA foreign_keys = ON").unwrap();
26704
26705        // Sanity: the planted orphans are visible.
26706        let messages_before: i64 = storage
26707            .raw()
26708            .query_row_map("SELECT COUNT(*) FROM messages", fparams![], |row| {
26709                row.get_typed(0)
26710            })
26711            .unwrap();
26712        assert_eq!(messages_before, 4); // 1 real + 3 orphans
26713        let snippets_before: i64 = storage
26714            .raw()
26715            .query_row_map("SELECT COUNT(*) FROM snippets", fparams![], |row| {
26716                row.get_typed(0)
26717            })
26718            .unwrap();
26719        assert_eq!(snippets_before, 1);
26720        let metrics_before: i64 = storage
26721            .raw()
26722            .query_row_map("SELECT COUNT(*) FROM message_metrics", fparams![], |row| {
26723                row.get_typed(0)
26724            })
26725            .unwrap();
26726        assert_eq!(metrics_before, 3);
26727        let token_usage_before: i64 = storage
26728            .raw()
26729            .query_row_map("SELECT COUNT(*) FROM token_usage", fparams![], |row| {
26730                row.get_typed(0)
26731            })
26732            .unwrap();
26733        assert_eq!(token_usage_before, 3);
26734
26735        // Run the self-heal.
26736        let report = storage.cleanup_orphan_fk_rows().unwrap();
26737
26738        // 3 orphan messages + 1 directly-orphan snippet = 4 primary orphans
26739        // reported. Dependent message_metrics/token_usage rows for orphan
26740        // messages are pruned too, but they are not double-counted because the
26741        // orphan message is the root row that made them invalid.
26742        let messages_after: i64 = storage
26743            .raw()
26744            .query_row_map("SELECT COUNT(*) FROM messages", fparams![], |row| {
26745                row.get_typed(0)
26746            })
26747            .unwrap();
26748        assert_eq!(messages_after, 1, "real message must be preserved");
26749        let snippets_after: i64 = storage
26750            .raw()
26751            .query_row_map("SELECT COUNT(*) FROM snippets", fparams![], |row| {
26752                row.get_typed(0)
26753            })
26754            .unwrap();
26755        assert_eq!(snippets_after, 0);
26756        let metrics_after: i64 = storage
26757            .raw()
26758            .query_row_map("SELECT COUNT(*) FROM message_metrics", fparams![], |row| {
26759                row.get_typed(0)
26760            })
26761            .unwrap();
26762        assert_eq!(metrics_after, 1, "real message metric must be preserved");
26763        let token_usage_after: i64 = storage
26764            .raw()
26765            .query_row_map("SELECT COUNT(*) FROM token_usage", fparams![], |row| {
26766                row.get_typed(0)
26767            })
26768            .unwrap();
26769        assert_eq!(token_usage_after, 1, "real token row must be preserved");
26770
26771        assert_eq!(report.total, 4, "report total: {:?}", report);
26772        let messages_count = report
26773            .per_table
26774            .iter()
26775            .find(|(t, _)| *t == "messages")
26776            .map(|(_, c)| *c);
26777        assert_eq!(messages_count, Some(3));
26778        let snippets_count = report
26779            .per_table
26780            .iter()
26781            .find(|(t, _)| *t == "snippets")
26782            .map(|(_, c)| *c);
26783        assert_eq!(snippets_count, Some(1));
26784
26785        // Second invocation on a now-clean DB must be a no-op.
26786        let second = storage.cleanup_orphan_fk_rows().unwrap();
26787        assert_eq!(second.total, 0);
26788        assert!(second.per_table.is_empty());
26789    }
26790
26791    #[test]
26792    fn cleanup_orphan_fk_rows_handles_more_than_one_delete_chunk() {
26793        let dir = TempDir::new().unwrap();
26794        let db_path = dir.path().join("orphan_fk_chunked_self_heal.db");
26795        let storage = FrankenStorage::open(&db_path).unwrap();
26796        let orphan_count = ORPHAN_FK_ID_CHUNK_SIZE + 3;
26797
26798        storage.raw().execute("PRAGMA foreign_keys = OFF").unwrap();
26799        {
26800            let mut tx = storage.raw().transaction().unwrap();
26801            for idx in 0..orphan_count {
26802                let message_id = 10_000_i64 + i64::try_from(idx).unwrap();
26803                let conversation_id = 20_000_i64 + i64::try_from(idx).unwrap();
26804                tx.execute_compat(
26805                    "INSERT INTO messages(id, conversation_id, idx, role, content) \
26806                     VALUES(?1, ?2, 0, 'user', 'orphan message')",
26807                    fparams![message_id, conversation_id],
26808                )
26809                .unwrap();
26810                tx.execute_compat(
26811                    "INSERT INTO message_metrics(
26812                         message_id, created_at_ms, hour_id, day_id, agent_slug,
26813                         role, content_chars, content_tokens_est
26814                     ) VALUES(?1, 0, 0, 0, 'test-agent', 'user', 14, 2)",
26815                    fparams![message_id],
26816                )
26817                .unwrap();
26818            }
26819            tx.commit().unwrap();
26820        }
26821        storage.raw().execute("PRAGMA foreign_keys = ON").unwrap();
26822
26823        let report = storage.cleanup_orphan_fk_rows().unwrap();
26824
26825        assert_eq!(report.total, i64::try_from(orphan_count).unwrap());
26826        let messages_count = report
26827            .per_table
26828            .iter()
26829            .find(|(table, _)| *table == "messages")
26830            .map(|(_, count)| *count);
26831        assert_eq!(messages_count, Some(i64::try_from(orphan_count).unwrap()));
26832        let messages_after: i64 = storage
26833            .raw()
26834            .query_row_map("SELECT COUNT(*) FROM messages", fparams![], |row| {
26835                row.get_typed(0)
26836            })
26837            .unwrap();
26838        assert_eq!(messages_after, 0);
26839        let metrics_after: i64 = storage
26840            .raw()
26841            .query_row_map("SELECT COUNT(*) FROM message_metrics", fparams![], |row| {
26842                row.get_typed(0)
26843            })
26844            .unwrap();
26845        assert_eq!(metrics_after, 0);
26846    }
26847
26848    #[test]
26849    fn cleanup_orphan_fk_rows_pages_direct_child_orphans() {
26850        let dir = TempDir::new().unwrap();
26851        let db_path = dir.path().join("direct_orphan_fk_paged_self_heal.db");
26852        let storage = FrankenStorage::open(&db_path).unwrap();
26853        let orphan_count = (ORPHAN_FK_ID_CHUNK_SIZE * 2) + 5;
26854
26855        storage.raw().execute("PRAGMA foreign_keys = OFF").unwrap();
26856        {
26857            let mut tx = storage.raw().transaction().unwrap();
26858            for idx in 0..orphan_count {
26859                let message_id = 50_000_i64 + i64::try_from(idx).unwrap();
26860                tx.execute_compat(
26861                    "INSERT INTO message_metrics(
26862                         message_id, created_at_ms, hour_id, day_id, agent_slug,
26863                         role, content_chars, content_tokens_est
26864                     ) VALUES(?1, 0, 0, 0, 'test-agent', 'user', 21, 3)",
26865                    fparams![message_id],
26866                )
26867                .unwrap();
26868            }
26869            tx.commit().unwrap();
26870        }
26871        storage.raw().execute("PRAGMA foreign_keys = ON").unwrap();
26872
26873        let report = storage.cleanup_orphan_fk_rows().unwrap();
26874
26875        assert_eq!(report.total, i64::try_from(orphan_count).unwrap());
26876        let metrics_count = report
26877            .per_table
26878            .iter()
26879            .filter(|(table, _)| *table == "message_metrics")
26880            .map(|(_, count)| *count)
26881            .sum::<i64>();
26882        assert_eq!(metrics_count, i64::try_from(orphan_count).unwrap());
26883        assert_eq!(
26884            report
26885                .per_table
26886                .iter()
26887                .filter(|(table, _)| *table == "message_metrics")
26888                .count(),
26889            1,
26890            "paged cleanup should aggregate report entries by table: {report:?}"
26891        );
26892        let metrics_after: i64 = storage
26893            .raw()
26894            .query_row_map("SELECT COUNT(*) FROM message_metrics", fparams![], |row| {
26895                row.get_typed(0)
26896            })
26897            .unwrap();
26898        assert_eq!(metrics_after, 0);
26899    }
26900}