Skip to main content

coding_agent_search/storage/
sqlite.rs

1//! `SQLite` backend: schema, pragmas, and migrations.
2
3use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole, Snippet};
4use crate::sources::provenance::{LOCAL_SOURCE_ID, Source, SourceKind};
5use anyhow::{Context, Result, anyhow, bail};
6use frankensqlite::{
7    Connection as FrankenConnection, Row as FrankenRow, SqliteValue,
8    compat::{
9        ConnectionExt as FrankenConnectionExt, OpenFlags as FrankenOpenFlags,
10        OptionalExtension as FrankenOptionalExtension, ParamValue, RowExt as FrankenRowExt,
11        Transaction as FrankenTransaction, TransactionExt as FrankenTransactionExt,
12        open_with_flags as open_franken_with_flags, param_slice_to_values, params_from_iter,
13    },
14    migrate::MigrationRunner,
15};
16use serde::{Deserialize, Serialize};
17use smallvec::SmallVec;
18use std::borrow::Cow;
19use std::collections::{HashMap, HashSet};
20use std::fs;
21use std::io::{BufRead, BufReader, Write};
22use std::process::{Command, Stdio};
23use std::sync::{
24    Arc,
25    atomic::{AtomicBool, AtomicI8, AtomicI64, AtomicU64, AtomicUsize, Ordering},
26};
27
/// Frankensqlite parameter list builder.
///
/// Expands to a `&[ParamValue]` slice. `fparams![]` yields an empty slice;
/// `fparams![a, b, ...]` converts every expression with `ParamValue::from`.
/// A trailing comma is accepted.
macro_rules! fparams {
    // No arguments: an empty, explicitly typed parameter slice.
    () => {
        &[] as &[ParamValue]
    };
    // One or more expressions, each converted through `ParamValue::from`.
    ($($val:expr),+ $(,)?) => {
        &[$(ParamValue::from($val)),+] as &[ParamValue]
    };
}
37use std::path::{Path, PathBuf};
38use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH};
39use thiserror::Error;
40use tracing::info;
41
/// Timeout passed to `acquire_doctor_mutation_db_open_guard` when
/// `LazyFrankenDb::get` opens the database.
const DOCTOR_MUTATION_DB_OPEN_LOCK_TIMEOUT: Duration = Duration::from_secs(30);
/// Upper bound (64 KiB) on the number of bytes read from the doctor lock file
/// when looking for its `pid=` metadata line.
const DOCTOR_MUTATION_LOCK_MAX_METADATA_READ: u64 = 64 * 1024;
44
45// -------------------------------------------------------------------------
46// Lazy FrankenSQLite Connection (bd-1ueu)
47// -------------------------------------------------------------------------
48// Defers opening the database until first use, cutting startup cost for
49// commands that may not need the DB at all.  Thread-safe via parking_lot
50// Mutex; logs the reason and duration of the open on first access.
51
/// Error from lazy database initialization.
#[derive(Debug, Error)]
pub enum LazyDbError {
    /// The database file does not exist at the given path.
    #[error("Database not found at {0}")]
    NotFound(PathBuf),
    /// Opening the database failed.
    ///
    /// Also produced when the doctor mutation guard cannot be acquired or a
    /// timed open does not complete; those cases are wrapped in
    /// `FrankenError::Internal` with a descriptive message.
    #[error("Failed to open FrankenSQLite database at {path}: {source}")]
    FrankenOpenFailed {
        /// Path of the database that could not be opened.
        path: PathBuf,
        /// Underlying frankensqlite error.
        source: frankensqlite::FrankenError,
    },
}
63
64// -------------------------------------------------------------------------
65// LazyFrankenDb — lazy wrapper around FrankenConnection
66// -------------------------------------------------------------------------
67
/// Wrapper around `FrankenConnection` that implements `Send`.
///
/// `FrankenConnection` is `!Send` because it uses `Rc` internally.
/// However, the `Rc` values are entirely self-contained within the Connection
/// and are not shared externally.  When wrapped in a `Mutex`,
/// exclusive access is guaranteed, making cross-thread transfer safe.
///
/// Tuple fields, in order: the connection, the index-writer checkpoint page
/// count (`i64`), and the index-writer busy timeout in milliseconds (`u64`);
/// see `new_with_index_writer_state`. `new` fills the last two with the
/// `UNSET_INDEX_WRITER_*` sentinels.
pub struct SendFrankenConnection(FrankenConnection, i64, u64);
75
// SAFETY: Rc fields inside FrankenConnection are not cloned or shared externally.
// The Mutex<Option<SendFrankenConnection>> ensures exclusive access.
// NOTE(review): soundness depends on `FrankenConnection` never handing out
// clones of its internal `Rc`s and holding no thread-affine state — confirm
// against the frankensqlite implementation whenever that crate is upgraded.
unsafe impl Send for SendFrankenConnection {}
79
80impl SendFrankenConnection {
81    pub(crate) fn new(conn: FrankenConnection) -> Self {
82        Self(
83            conn,
84            UNSET_INDEX_WRITER_CHECKPOINT_PAGES,
85            UNSET_INDEX_WRITER_BUSY_TIMEOUT_MS,
86        )
87    }
88
89    pub(crate) fn new_with_index_writer_state(
90        conn: FrankenConnection,
91        checkpoint_pages: i64,
92        busy_timeout_ms: u64,
93    ) -> Self {
94        Self(conn, checkpoint_pages, busy_timeout_ms)
95    }
96
97    pub(crate) fn into_parts(self) -> (FrankenConnection, i64, u64) {
98        (self.0, self.1, self.2)
99    }
100}
101
102impl std::ops::Deref for SendFrankenConnection {
103    type Target = FrankenConnection;
104    fn deref(&self) -> &FrankenConnection {
105        &self.0
106    }
107}
108
/// Lazy-opening wrapper for `FrankenConnection` (frankensqlite).
///
/// Constructing a `LazyFrankenDb` is cheap (no I/O).  The underlying
/// `FrankenConnection` is opened on the first call to [`LazyFrankenDb::get`]
/// (or [`LazyFrankenDb::get_with_timeout`]).
/// Subsequent calls return the cached connection.
pub struct LazyFrankenDb {
    /// Location of the database file; never changes after construction.
    path: PathBuf,
    /// Cached connection; `None` until the first successful open.
    conn: parking_lot::Mutex<Option<SendFrankenConnection>>,
}
118
/// RAII guard that dereferences to the inner `FrankenConnection`.
///
/// Wraps the `parking_lot::MutexGuard` over the cached connection slot, so the
/// `LazyFrankenDb` mutex stays locked for as long as this guard is alive.
/// Dereferencing panics if the slot is still `None`.
pub struct LazyFrankenDbGuard<'a>(parking_lot::MutexGuard<'a, Option<SendFrankenConnection>>);
121
122impl std::fmt::Debug for LazyFrankenDbGuard<'_> {
123    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
124        f.debug_tuple("LazyFrankenDbGuard")
125            .field(&self.0.is_some())
126            .finish()
127    }
128}
129
130impl std::ops::Deref for LazyFrankenDbGuard<'_> {
131    type Target = FrankenConnection;
132    fn deref(&self) -> &FrankenConnection {
133        self.0
134            .as_ref()
135            .expect("LazyFrankenDb connection must be initialized before access")
136    }
137}
138
139impl LazyFrankenDb {
140    /// Create a lazy handle pointing at `path`.  No I/O is performed.
141    pub fn new(path: PathBuf) -> Self {
142        Self {
143            path,
144            conn: parking_lot::Mutex::new(None),
145        }
146    }
147
148    /// Resolve path from optional CLI overrides.
149    ///
150    /// Uses `data_dir / agent_search.db` as fallback.
151    pub fn from_overrides(data_dir: &Option<PathBuf>, db_override: Option<PathBuf>) -> Self {
152        let data_dir = data_dir.clone().unwrap_or_else(crate::default_data_dir);
153        let path = db_override.unwrap_or_else(|| data_dir.join("agent_search.db"));
154        Self::new(path)
155    }
156
157    /// Get the connection, opening the database on first access.
158    ///
159    /// `reason` is logged alongside the open duration so callers can
160    /// identify which command triggered the open.
161    pub fn get(&self, reason: &str) -> std::result::Result<LazyFrankenDbGuard<'_>, LazyDbError> {
162        let mut guard = self.conn.lock();
163        if guard.is_none() {
164            if !self.path.exists() {
165                return Err(LazyDbError::NotFound(self.path.clone()));
166            }
167            let start = Instant::now();
168            let _doctor_guard = acquire_doctor_mutation_db_open_guard(
169                &self.path,
170                DOCTOR_MUTATION_DB_OPEN_LOCK_TIMEOUT,
171            )
172            .map_err(|err| LazyDbError::FrankenOpenFailed {
173                path: self.path.clone(),
174                source: frankensqlite::FrankenError::Internal(err.to_string()),
175            })?;
176            let conn =
177                FrankenConnection::open(self.path.to_string_lossy().into_owned()).map_err(|e| {
178                    LazyDbError::FrankenOpenFailed {
179                        path: self.path.clone(),
180                        source: e,
181                    }
182                })?;
183            let elapsed_ms = start.elapsed().as_millis();
184            info!(
185                path = %self.path.display(),
186                elapsed_ms = elapsed_ms,
187                reason = reason,
188                "lazily opened FrankenSQLite database"
189            );
190            *guard = Some(SendFrankenConnection::new(conn));
191        }
192        Ok(LazyFrankenDbGuard(guard))
193    }
194
195    /// Get the connection with a timeout, opening the database on first access.
196    ///
197    /// Like [`get`] but spawns the open in a background thread and waits up to
198    /// `timeout` for it to complete. Returns `LazyDbError::FrankenOpenFailed`
199    /// with a descriptive message if the timeout elapses. Fix for #128.
200    pub fn get_with_timeout(
201        &self,
202        reason: &str,
203        timeout: Duration,
204    ) -> std::result::Result<LazyFrankenDbGuard<'_>, LazyDbError> {
205        let mut guard = self.conn.lock();
206        if guard.is_none() {
207            if !self.path.exists() {
208                return Err(LazyDbError::NotFound(self.path.clone()));
209            }
210            let start = Instant::now();
211            let path_owned = self.path.to_string_lossy().into_owned();
212            let path_for_guard = self.path.clone();
213            let (tx, rx) = std::sync::mpsc::channel();
214            std::thread::spawn(move || {
215                let _doctor_guard =
216                    match acquire_doctor_mutation_db_open_guard(&path_for_guard, timeout) {
217                        Ok(guard) => guard,
218                        Err(err) => {
219                            let _ = tx
220                                .send(Err(frankensqlite::FrankenError::Internal(err.to_string())));
221                            return;
222                        }
223                    };
224                let _ =
225                    tx.send(FrankenConnection::open(path_owned).map(SendFrankenConnection::new));
226            });
227            let conn = rx
228                .recv_timeout(timeout)
229                .map_err(|_| LazyDbError::FrankenOpenFailed {
230                    path: self.path.clone(),
231                    source: frankensqlite::FrankenError::Internal(format!(
232                        "database open timed out after {}s (possible corruption or lock contention)",
233                        timeout.as_secs()
234                    )),
235                })?
236                .map_err(|e| LazyDbError::FrankenOpenFailed {
237                    path: self.path.clone(),
238                    source: e,
239                })?;
240            let elapsed_ms = start.elapsed().as_millis();
241            info!(
242                path = %self.path.display(),
243                elapsed_ms = elapsed_ms,
244                reason = reason,
245                "lazily opened FrankenSQLite database (with timeout)"
246            );
247            *guard = Some(conn);
248        }
249        Ok(LazyFrankenDbGuard(guard))
250    }
251
252    /// Path to the database file (even if not yet opened).
253    pub fn path(&self) -> &Path {
254        &self.path
255    }
256
257    /// Whether the connection has been opened.
258    pub fn is_open(&self) -> bool {
259        self.conn.lock().is_some()
260    }
261}
262
/// Shared state for the retry-jitter generator; advanced by
/// `next_franken_retry_jitter_ms` on every call.
static FRANKEN_RETRY_JITTER_STATE: AtomicU64 = AtomicU64::new(0x9e37_79b9_7f4a_7c15);
/// Number of live `DoctorMutationDbOpenBypassGuard`s; while non-zero, database
/// opens skip the doctor mutation lock.
static DOCTOR_MUTATION_DB_OPEN_BYPASS_DEPTH: AtomicUsize = AtomicUsize::new(0);
/// Gate for the message-lookup counters below; they only advance while `true`.
static MESSAGE_LOOKUP_TRACE_ENABLED: AtomicBool = AtomicBool::new(false);
/// Count of exact-index probes recorded while tracing is enabled.
static MESSAGE_LOOKUP_EXACT_IDX_PROBES: AtomicU64 = AtomicU64::new(0);
/// Count of bounded lookup queries recorded while tracing is enabled.
static MESSAGE_LOOKUP_BOUNDED_QUERIES: AtomicU64 = AtomicU64::new(0);
/// Count of full-scan queries recorded while tracing is enabled.
static MESSAGE_LOOKUP_FULL_SCAN_QUERIES: AtomicU64 = AtomicU64::new(0);
/// Total rows reported by bounded and full-scan lookups while tracing is enabled.
static MESSAGE_LOOKUP_ROWS_MATERIALIZED: AtomicU64 = AtomicU64::new(0);
270
/// Point-in-time copy of the process-wide message-lookup trace counters, as
/// produced by `message_lookup_trace_snapshot`.
#[derive(Debug, Clone, Copy, Default, Serialize)]
pub(crate) struct MessageLookupTraceCounters {
    /// Exact-index probes recorded.
    pub exact_idx_probes: u64,
    /// Bounded lookup queries recorded.
    pub bounded_lookup_queries: u64,
    /// Full-scan queries recorded.
    pub full_scan_queries: u64,
    /// Rows reported by bounded and full-scan lookups combined.
    pub rows_materialized: u64,
}
278
279impl MessageLookupTraceCounters {
280    pub(crate) fn saturating_sub(self, before: Self) -> Self {
281        Self {
282            exact_idx_probes: self
283                .exact_idx_probes
284                .saturating_sub(before.exact_idx_probes),
285            bounded_lookup_queries: self
286                .bounded_lookup_queries
287                .saturating_sub(before.bounded_lookup_queries),
288            full_scan_queries: self
289                .full_scan_queries
290                .saturating_sub(before.full_scan_queries),
291            rows_materialized: self
292                .rows_materialized
293                .saturating_sub(before.rows_materialized),
294        }
295    }
296
297    pub(crate) fn lookups_against_global(self) -> u64 {
298        self.exact_idx_probes.saturating_add(self.rows_materialized)
299    }
300}
301
302pub(crate) fn set_message_lookup_trace_enabled(enabled: bool) -> bool {
303    MESSAGE_LOOKUP_TRACE_ENABLED.swap(enabled, Ordering::Relaxed)
304}
305
306pub(crate) fn message_lookup_trace_snapshot() -> MessageLookupTraceCounters {
307    MessageLookupTraceCounters {
308        exact_idx_probes: MESSAGE_LOOKUP_EXACT_IDX_PROBES.load(Ordering::Relaxed),
309        bounded_lookup_queries: MESSAGE_LOOKUP_BOUNDED_QUERIES.load(Ordering::Relaxed),
310        full_scan_queries: MESSAGE_LOOKUP_FULL_SCAN_QUERIES.load(Ordering::Relaxed),
311        rows_materialized: MESSAGE_LOOKUP_ROWS_MATERIALIZED.load(Ordering::Relaxed),
312    }
313}
314
315fn record_message_lookup_exact_idx_probe() {
316    if MESSAGE_LOOKUP_TRACE_ENABLED.load(Ordering::Relaxed) {
317        MESSAGE_LOOKUP_EXACT_IDX_PROBES.fetch_add(1, Ordering::Relaxed);
318    }
319}
320
321fn record_message_lookup_bounded_queries(query_count: u64, rows: usize) {
322    if MESSAGE_LOOKUP_TRACE_ENABLED.load(Ordering::Relaxed) {
323        MESSAGE_LOOKUP_BOUNDED_QUERIES.fetch_add(query_count, Ordering::Relaxed);
324        MESSAGE_LOOKUP_ROWS_MATERIALIZED.fetch_add(rows as u64, Ordering::Relaxed);
325    }
326}
327
328fn record_message_lookup_full_scan_query(rows: usize) {
329    if MESSAGE_LOOKUP_TRACE_ENABLED.load(Ordering::Relaxed) {
330        MESSAGE_LOOKUP_FULL_SCAN_QUERIES.fetch_add(1, Ordering::Relaxed);
331        MESSAGE_LOOKUP_ROWS_MATERIALIZED.fetch_add(rows as u64, Ordering::Relaxed);
332    }
333}
334
335pub(crate) struct DoctorMutationDbOpenBypassGuard;
336
337impl Drop for DoctorMutationDbOpenBypassGuard {
338    fn drop(&mut self) {
339        DOCTOR_MUTATION_DB_OPEN_BYPASS_DEPTH.fetch_sub(1, Ordering::SeqCst);
340    }
341}
342
343pub(crate) fn enter_doctor_mutation_db_open_bypass() -> DoctorMutationDbOpenBypassGuard {
344    DOCTOR_MUTATION_DB_OPEN_BYPASS_DEPTH.fetch_add(1, Ordering::SeqCst);
345    DoctorMutationDbOpenBypassGuard
346}
347
348fn doctor_mutation_db_open_bypass_active() -> bool {
349    DOCTOR_MUTATION_DB_OPEN_BYPASS_DEPTH.load(Ordering::SeqCst) > 0
350}
351
352fn next_franken_retry_jitter_ms(max_inclusive: u64) -> u64 {
353    let mut value = FRANKEN_RETRY_JITTER_STATE.fetch_add(0x9e37_79b9_7f4a_7c15, Ordering::Relaxed);
354    value ^= value >> 30;
355    value = value.wrapping_mul(0xbf58_476d_1ce4_e5b9);
356    value ^= value >> 27;
357    value = value.wrapping_mul(0x94d0_49bb_1331_11eb);
358    value ^= value >> 31;
359    value % max_inclusive.saturating_add(1)
360}
361
362/// Sleep with jittered exponential backoff to avoid lock-step retry storms
363/// when many threads hit the same transient SQLite/frankensqlite contention.
364pub(crate) fn sleep_with_franken_retry_backoff(
365    backoff: &mut Duration,
366    remaining: Duration,
367    max_backoff: Duration,
368) {
369    let capped = (*backoff).min(remaining);
370    let extra_budget = remaining.saturating_sub(capped).min(capped);
371    let extra_ms = extra_budget.as_millis().min(u128::from(u64::MAX)) as u64;
372    let sleep_for = if extra_ms == 0 {
373        capped
374    } else {
375        capped
376            .saturating_add(Duration::from_millis(next_franken_retry_jitter_ms(
377                extra_ms,
378            )))
379            .min(remaining)
380    };
381    std::thread::sleep(sleep_for);
382    *backoff = backoff.saturating_mul(2).min(max_backoff);
383}
384
385struct DoctorMutationDbOpenGuard(Option<fs::File>);
386
387impl Drop for DoctorMutationDbOpenGuard {
388    fn drop(&mut self) {
389        if let Some(file) = self.0.as_ref() {
390            let _ = fs2::FileExt::unlock(file);
391        }
392    }
393}
394
/// Compute the doctor repair lock path that guards `db_path`.
///
/// Only a file named exactly `agent_search.db` is guarded; for it the lock
/// lives at `<parent>/doctor/locks/doctor-repair.lock`. Returns `None` for any
/// other file name or when the path has no parent.
fn doctor_mutation_lock_path_for_db_open(db_path: &Path) -> Option<PathBuf> {
    let is_guarded_db = matches!(
        db_path.file_name().and_then(|name| name.to_str()),
        Some("agent_search.db")
    );
    if !is_guarded_db {
        return None;
    }

    let data_dir = db_path.parent()?;
    let mut lock_path = data_dir.to_path_buf();
    lock_path.extend(["doctor", "locks", "doctor-repair.lock"]);
    Some(lock_path)
}
408
/// Whether lock-file metadata names the current process as its owner.
///
/// `raw` is scanned line by line for `key=value` pairs (split at the first
/// `=`). Returns `true` if any line has a key that trims to `pid` and a value
/// that trims to a `u32` equal to `std::process::id()`.
fn doctor_lock_metadata_pid_is_current_process(raw: &str) -> bool {
    let current_pid = std::process::id();
    raw.lines()
        .filter_map(|line| line.split_once('='))
        .filter(|(key, _)| key.trim() == "pid")
        .filter_map(|(_, value)| value.trim().parse::<u32>().ok())
        .any(|pid| pid == current_pid)
}
421
422fn doctor_lock_file_pid_is_current_process(file: &fs::File) -> bool {
423    use std::io::Read as _;
424
425    let Ok(mut file) = file.try_clone() else {
426        return false;
427    };
428    let mut raw = String::new();
429    let _ = std::io::Read::take(&mut file, DOCTOR_MUTATION_LOCK_MAX_METADATA_READ)
430        .read_to_string(&mut raw);
431    doctor_lock_metadata_pid_is_current_process(&raw)
432}
433
434fn acquire_doctor_mutation_db_open_guard(
435    db_path: &Path,
436    timeout: Duration,
437) -> Result<DoctorMutationDbOpenGuard> {
438    let Some(lock_path) = doctor_mutation_lock_path_for_db_open(db_path) else {
439        return Ok(DoctorMutationDbOpenGuard(None));
440    };
441    if doctor_mutation_db_open_bypass_active() {
442        return Ok(DoctorMutationDbOpenGuard(None));
443    }
444
445    if let Some(parent) = lock_path.parent() {
446        fs::create_dir_all(parent).with_context(|| {
447            format!(
448                "creating doctor mutation lock directory {} before opening {}",
449                parent.display(),
450                db_path.display()
451            )
452        })?;
453    }
454
455    let deadline = Instant::now() + timeout;
456    let mut backoff = Duration::from_millis(4);
457    loop {
458        let file = fs::OpenOptions::new()
459            .create(true)
460            .truncate(false)
461            .read(true)
462            .write(true)
463            .open(&lock_path)
464            .with_context(|| {
465                format!(
466                    "opening doctor mutation lock {} before opening {}",
467                    lock_path.display(),
468                    db_path.display()
469                )
470            })?;
471
472        if doctor_lock_file_pid_is_current_process(&file) {
473            return Ok(DoctorMutationDbOpenGuard(None));
474        }
475
476        match fs2::FileExt::try_lock_shared(&file) {
477            Ok(()) => return Ok(DoctorMutationDbOpenGuard(Some(file))),
478            Err(err) if err.kind() == std::io::ErrorKind::WouldBlock => {
479                let now = Instant::now();
480                if now >= deadline {
481                    return Err(anyhow!(
482                        "doctor mutation lock {} is active while opening {}; refusing to open during repair after waiting {}ms",
483                        lock_path.display(),
484                        db_path.display(),
485                        timeout.as_millis()
486                    ));
487                }
488                let remaining = deadline.saturating_duration_since(now);
489                sleep_with_franken_retry_backoff(
490                    &mut backoff,
491                    remaining,
492                    Duration::from_millis(128),
493                );
494            }
495            Err(err) => {
496                return Err(anyhow!(
497                    "failed to acquire shared doctor mutation lock {} before opening {}: {}",
498                    lock_path.display(),
499                    db_path.display(),
500                    err
501                ));
502            }
503        }
504    }
505}
506
507pub(crate) fn open_franken_storage_with_timeout(
508    path: &Path,
509    timeout: Duration,
510) -> Result<FrankenStorage> {
511    if !path.exists() {
512        return Err(anyhow!("Database not found at {}", path.display()));
513    }
514
515    let deadline = Instant::now() + timeout;
516    let mut backoff = Duration::from_millis(4);
517    loop {
518        match FrankenStorage::open(path) {
519            Ok(storage) => return Ok(storage),
520            Err(err) if retryable_franken_anyhow(&err) => {
521                let now = Instant::now();
522                if now >= deadline {
523                    return Err(err);
524                }
525                let remaining = deadline.saturating_duration_since(now);
526                sleep_with_franken_retry_backoff(
527                    &mut backoff,
528                    remaining,
529                    Duration::from_millis(128),
530                );
531            }
532            Err(err) => return Err(err),
533        }
534    }
535}
536
537pub(crate) fn open_current_schema_storage_with_timeout(
538    path: &Path,
539    timeout: Duration,
540) -> Result<Option<FrankenStorage>> {
541    if !path.exists() {
542        return Ok(None);
543    }
544
545    let mut storage = FrankenStorage::new(
546        open_franken_raw_connection_with_timeout(path, timeout)?,
547        path.to_path_buf(),
548    );
549    storage.apply_open_stage_busy_timeout();
550
551    let version = storage
552        .raw()
553        .query("SELECT value FROM meta WHERE key = 'schema_version';")
554        .ok()
555        .and_then(|rows| rows.first().cloned())
556        .and_then(|row| row.get_typed::<String>(0).ok())
557        .and_then(|raw| raw.parse::<i64>().ok());
558
559    if version != Some(CURRENT_SCHEMA_VERSION) {
560        if let Err(close_err) = storage.close_without_checkpoint_in_place() {
561            tracing::debug!(
562                error = %close_err,
563                db_path = %path.display(),
564                "open_current_schema_storage_with_timeout: close_without_checkpoint_in_place failed; falling back to best-effort close"
565            );
566            storage.close_best_effort_in_place();
567        }
568        return Ok(None);
569    }
570
571    transition_from_meta_version(&storage.conn)?;
572    storage.repair_missing_current_schema_objects()?;
573    storage.apply_config()?;
574    Ok(Some(storage))
575}
576
577pub(crate) fn open_franken_readonly_storage_with_timeout(
578    path: &Path,
579    timeout: Duration,
580) -> Result<FrankenStorage> {
581    if !path.exists() {
582        return Err(anyhow!("Database not found at {}", path.display()));
583    }
584
585    let deadline = Instant::now() + timeout;
586    let mut backoff = Duration::from_millis(4);
587    loop {
588        match FrankenStorage::open_readonly(path) {
589            Ok(storage) => return Ok(storage),
590            Err(err) if retryable_franken_anyhow(&err) => {
591                let now = Instant::now();
592                if now >= deadline {
593                    return Err(err);
594                }
595                let remaining = deadline.saturating_duration_since(now);
596                sleep_with_franken_retry_backoff(
597                    &mut backoff,
598                    remaining,
599                    Duration::from_millis(128),
600                );
601            }
602            Err(err) => return Err(err),
603        }
604    }
605}
606
607pub(crate) fn open_franken_raw_connection_with_timeout(
608    path: &Path,
609    timeout: Duration,
610) -> Result<FrankenConnection> {
611    if !path.exists() {
612        return Err(anyhow!("Database not found at {}", path.display()));
613    }
614
615    let path_str = path.to_string_lossy().to_string();
616    let deadline = Instant::now() + timeout;
617    let mut backoff = Duration::from_millis(4);
618    loop {
619        let _doctor_guard = acquire_doctor_mutation_db_open_guard(path, timeout)?;
620        match FrankenConnection::open(&path_str)
621            .with_context(|| format!("opening raw frankensqlite db at {}", path.display()))
622        {
623            Ok(conn) => return Ok(conn),
624            Err(err) if retryable_franken_anyhow(&err) => {
625                let now = Instant::now();
626                if now >= deadline {
627                    return Err(err);
628                }
629                let remaining = deadline.saturating_duration_since(now);
630                sleep_with_franken_retry_backoff(
631                    &mut backoff,
632                    remaining,
633                    Duration::from_millis(128),
634                );
635            }
636            Err(err) => return Err(err),
637        }
638    }
639}
640
641pub(crate) fn open_franken_raw_readonly_connection_with_timeout(
642    path: &Path,
643    timeout: Duration,
644) -> Result<FrankenConnection> {
645    if !path.exists() {
646        return Err(anyhow!("Database not found at {}", path.display()));
647    }
648
649    let path_str = path.to_string_lossy().to_string();
650    let deadline = Instant::now() + timeout;
651    let mut backoff = Duration::from_millis(4);
652    loop {
653        let _doctor_guard = acquire_doctor_mutation_db_open_guard(path, timeout)?;
654        match open_franken_with_flags(&path_str, FrankenOpenFlags::SQLITE_OPEN_READ_ONLY)
655            .with_context(|| {
656                format!(
657                    "opening raw frankensqlite db readonly at {}",
658                    path.display()
659                )
660            }) {
661            Ok(conn) => return Ok(conn),
662            Err(err) if retryable_franken_anyhow(&err) => {
663                let now = Instant::now();
664                if now >= deadline {
665                    return Err(err);
666                }
667                let remaining = deadline.saturating_duration_since(now);
668                sleep_with_franken_retry_backoff(
669                    &mut backoff,
670                    remaining,
671                    Duration::from_millis(128),
672                );
673            }
674            Err(err) => return Err(err),
675        }
676    }
677}
678
679pub(crate) fn retryable_franken_error(err: &frankensqlite::FrankenError) -> bool {
680    matches!(
681        err,
682        frankensqlite::FrankenError::Busy
683            | frankensqlite::FrankenError::BusyRecovery
684            | frankensqlite::FrankenError::BusySnapshot { .. }
685            | frankensqlite::FrankenError::DatabaseLocked { .. }
686            | frankensqlite::FrankenError::LockFailed { .. }
687            | frankensqlite::FrankenError::WriteConflict { .. }
688            | frankensqlite::FrankenError::SerializationFailure { .. }
689    ) || retryable_storage_error_message(&err.to_string())
690}
691
/// Whether an error message looks like transient storage contention.
///
/// Case-insensitive (ASCII) substring match against a fixed list of markers.
/// Matching is by substring, so e.g. "blocked" also matches via "locked".
pub(crate) fn retryable_storage_error_message(message: &str) -> bool {
    const TRANSIENT_MARKERS: [&str; 6] = [
        "busy",
        "locked",
        "locking",
        "contention",
        "temporarily unavailable",
        "would block",
    ];

    let lowered = message.to_ascii_lowercase();
    TRANSIENT_MARKERS
        .iter()
        .any(|&marker| lowered.contains(marker))
}
701
702pub(crate) fn retryable_franken_anyhow(err: &anyhow::Error) -> bool {
703    err.chain().any(|cause| {
704        cause
705            .downcast_ref::<frankensqlite::FrankenError>()
706            .is_some_and(retryable_franken_error)
707            || retryable_storage_error_message(&cause.to_string())
708    })
709}
710
711impl Drop for LazyFrankenDb {
712    fn drop(&mut self) {
713        let Some(mut conn) = self.conn.get_mut().take() else {
714            return;
715        };
716        conn.0.close_best_effort_in_place();
717    }
718}
719
720// -------------------------------------------------------------------------
721// FrankenSQLite Connection Manager (bead 3rlf8)
722// -------------------------------------------------------------------------
723// Multi-connection management: reader pool + concurrent writer connections.
724// Replaces the LazyFrankenDb single-connection bottleneck for high-throughput
725// scenarios (indexer parallel writes, concurrent TUI reads + indexer writes).
726
/// Configuration for the [`FrankenConnectionManager`].
///
/// `FrankenConnectionManager::new` raises either value to at least 1.
#[derive(Debug, Clone)]
pub struct ConnectionManagerConfig {
    /// Number of pre-opened reader connections (default: 4).
    pub reader_count: usize,
    /// Maximum concurrent writer connections (default: available parallelism,
    /// or 4 when that cannot be determined).
    pub max_writers: usize,
}
735
736impl Default for ConnectionManagerConfig {
737    fn default() -> Self {
738        let cpus = std::thread::available_parallelism()
739            .map(|n| n.get())
740            .unwrap_or(4);
741        Self {
742            reader_count: 4,
743            max_writers: cpus,
744        }
745    }
746}
747
/// Multi-connection manager for frankensqlite.
///
/// Provides:
/// - A pool of pre-opened reader connections (round-robin, Mutex-protected)
/// - Controlled creation of writer connections with token-based limits
/// - RAII guards that auto-rollback uncommitted transactions on drop
///
/// Thread-safe: reader connections are wrapped in Mutex (FrankenConnection is !Sync).
/// Writer connections are created per-request (each thread gets its own).
pub struct FrankenConnectionManager {
    /// Database file every reader and writer connection is opened against.
    db_path: PathBuf,
    /// Pre-opened reader connections, each behind its own mutex.
    readers: Vec<parking_lot::Mutex<SendFrankenConnection>>,
    /// Monotonic counter used to pick the next reader (taken modulo pool size).
    reader_idx: std::sync::atomic::AtomicUsize,
    /// Token-based writer limit: channel pre-filled with `max_writers` tokens.
    /// `recv()` = acquire slot, `send()` = release slot.
    writer_tokens: (
        crossbeam_channel::Sender<()>,
        crossbeam_channel::Receiver<()>,
    ),
    /// Effective configuration, with both counts already raised to at least 1.
    config: ConnectionManagerConfig,
}
769
// SAFETY: FrankenConnectionManager is Send+Sync because:
// - readers wrapped in Mutex<SendFrankenConnection> (exclusive access)
// - writer_tokens uses crossbeam (Send+Sync)
// - db_path is PathBuf (Send+Sync)
// NOTE(review): given `SendFrankenConnection: Send`, every field looks like it
// is already auto `Send + Sync`, which would make these manual impls redundant
// — confirm, and prefer dropping them so the compiler keeps checking the bound.
unsafe impl Send for FrankenConnectionManager {}
unsafe impl Sync for FrankenConnectionManager {}
776
777impl FrankenConnectionManager {
778    /// Create a new connection manager.
779    ///
780    /// Opens `config.reader_count` reader connections immediately.
781    /// Writer connections are created on demand (up to `config.max_writers`).
782    pub fn new(db_path: impl Into<PathBuf>, config: ConnectionManagerConfig) -> Result<Self> {
783        let db_path = db_path.into();
784        let path_str = db_path.to_string_lossy().to_string();
785
786        let reader_count = config.reader_count.max(1);
787        let mut readers = Vec::with_capacity(reader_count);
788        for _ in 0..reader_count {
789            let conn = FrankenConnection::open(&path_str)
790                .with_context(|| format!("opening reader connection at {}", db_path.display()))?;
791            // Apply read-tuned config (no migration, no write PRAGMAs)
792            let _ = conn.execute("PRAGMA busy_timeout = 5000;"); // match writer config
793            let _ = conn.execute("PRAGMA cache_size = -16384;"); // 16MB reader cache
794            readers.push(parking_lot::Mutex::new(SendFrankenConnection::new(conn)));
795        }
796
797        let max_writers = config.max_writers.max(1);
798
799        // Pre-fill bounded channel with tokens (acts as counting semaphore).
800        // A zero-capacity channel with no initial tokens would make the first
801        // writer acquisition block forever.
802        let (tx, rx) = crossbeam_channel::bounded(max_writers);
803        for _ in 0..max_writers {
804            tx.send(())
805                .map_err(|_| anyhow!("writer token channel closed during initialization"))?;
806        }
807
808        Ok(Self {
809            db_path,
810            readers,
811            reader_idx: std::sync::atomic::AtomicUsize::new(0),
812            writer_tokens: (tx, rx),
813            config: ConnectionManagerConfig {
814                reader_count,
815                max_writers,
816            },
817        })
818    }
819
820    /// Get a reader connection (round-robin from the pool).
821    ///
822    /// Returns a mutex guard wrapping the connection. The guard prevents
823    /// concurrent access to the same connection (FrankenConnection is !Sync).
824    pub fn reader(&self) -> parking_lot::MutexGuard<'_, SendFrankenConnection> {
825        let idx = self
826            .reader_idx
827            .fetch_add(1, std::sync::atomic::Ordering::Relaxed);
828        self.readers[idx % self.readers.len()].lock()
829    }
830
831    /// Acquire a writer connection.
832    ///
833    /// Opens a new frankensqlite connection with full config (no migration).
834    /// Blocks if `max_writers` connections are already in use.
835    /// The returned [`WriterGuard`] auto-rolls back on drop.
836    pub fn writer(&self) -> Result<WriterGuard<'_>> {
837        self.writer_tokens
838            .1
839            .recv()
840            .map_err(|_| anyhow!("writer token channel closed"))?;
841        let path_str = self.db_path.to_string_lossy().to_string();
842        let conn = match FrankenConnection::open(&path_str) {
843            Ok(c) => c,
844            Err(e) => {
845                let _ = self.writer_tokens.0.send(());
846                return Err(anyhow::Error::from(e).context(format!(
847                    "opening writer connection at {}",
848                    self.db_path.display()
849                )));
850            }
851        };
852        let storage = FrankenStorage::new(conn, self.db_path.clone());
853        if let Err(e) = storage.apply_config() {
854            let _ = self.writer_tokens.0.send(());
855            return Err(e);
856        }
857        Ok(WriterGuard {
858            storage,
859            mgr: self,
860            committed: false,
861        })
862    }
863
864    /// Acquire a concurrent writer connection (BEGIN CONCURRENT via MVCC).
865    ///
866    /// Similar to [`writer`] but tuned for the parallel indexer write pool.
867    /// Uses reduced cache size and is designed for short-lived batch inserts.
868    pub fn concurrent_writer(&self) -> Result<WriterGuard<'_>> {
869        self.writer_tokens
870            .1
871            .recv()
872            .map_err(|_| anyhow!("writer token channel closed"))?;
873        let path_str = self.db_path.to_string_lossy().to_string();
874        let conn = match FrankenConnection::open(&path_str) {
875            Ok(c) => c,
876            Err(e) => {
877                let _ = self.writer_tokens.0.send(());
878                return Err(anyhow::Error::from(e).context(format!(
879                    "opening concurrent writer at {}",
880                    self.db_path.display()
881                )));
882            }
883        };
884        let storage = FrankenStorage::new(conn, self.db_path.clone());
885        if let Err(e) = storage.apply_config() {
886            let _ = self.writer_tokens.0.send(());
887            return Err(e);
888        }
889        // Reduced cache for concurrent writers (they're short-lived)
890        let _ = storage.raw().execute("PRAGMA cache_size = -4096;");
891        Ok(WriterGuard {
892            storage,
893            mgr: self,
894            committed: false,
895        })
896    }
897
898    /// Database path managed by this pool.
899    pub fn db_path(&self) -> &Path {
900        &self.db_path
901    }
902
903    /// Number of reader connections in the pool.
904    pub fn reader_count(&self) -> usize {
905        self.readers.len()
906    }
907
908    /// Maximum concurrent writers allowed.
909    pub fn max_writers(&self) -> usize {
910        self.config.max_writers
911    }
912}
913
914impl Drop for FrankenConnectionManager {
915    fn drop(&mut self) {
916        for reader in &mut self.readers {
917            reader.get_mut().0.close_best_effort_in_place();
918        }
919    }
920}
921
922/// RAII guard for a writer connection.
923///
924/// Provides access to a [`FrankenStorage`] for write operations.
925/// Releases the writer semaphore slot when dropped.
926pub struct WriterGuard<'a> {
927    storage: FrankenStorage,
928    mgr: &'a FrankenConnectionManager,
929    committed: bool,
930}
931
932impl<'a> WriterGuard<'a> {
933    /// Access the underlying storage for read/write operations.
934    pub fn storage(&self) -> &FrankenStorage {
935        &self.storage
936    }
937
938    /// Mark this writer as successfully committed.
939    ///
940    /// Call after your transaction's `commit()` succeeds. Prevents the drop
941    /// guard from attempting a rollback.
942    pub fn mark_committed(&mut self) {
943        self.committed = true;
944    }
945}
946
947impl Drop for WriterGuard<'_> {
948    fn drop(&mut self) {
949        if !self.committed {
950            // Best-effort rollback — connection may already be in autocommit
951            let _ = self.storage.raw().execute("ROLLBACK;");
952        }
953        self.storage.close_best_effort_in_place();
954        // Release writer token
955        let _ = self.mgr.writer_tokens.0.send(());
956    }
957}
958
959// -------------------------------------------------------------------------
960// Binary Metadata Serialization (Opt 3.1)
961// -------------------------------------------------------------------------
962// MessagePack provides 50-70% storage reduction vs JSON and faster parsing.
963// New rows use binary columns; existing JSON is read on fallback.
964
965/// Serialize a JSON value to MessagePack bytes.
966/// Returns None for null/empty values to save storage.
967fn serialize_json_to_msgpack(value: &serde_json::Value) -> Option<Vec<u8>> {
968    if value.is_null() || value.as_object().is_some_and(|o| o.is_empty()) {
969        return None;
970    }
971    rmp_serde::to_vec(value).ok()
972}
973
974/// Deserialize MessagePack bytes to a JSON value.
975/// Returns default Value::Object({}) on error or empty input.
976fn deserialize_msgpack_to_json(bytes: &[u8]) -> serde_json::Value {
977    if bytes.is_empty() {
978        return serde_json::Value::Object(serde_json::Map::new());
979    }
980    rmp_serde::from_slice(bytes).unwrap_or_else(|e| {
981        tracing::debug!(
982            error = %e,
983            bytes_len = bytes.len(),
984            "Failed to deserialize metadata - returning empty object"
985        );
986        serde_json::Value::Object(serde_json::Map::new())
987    })
988}
989
990/// Read metadata from a frankensqlite Row, preferring binary (msgpack) over JSON.
991fn franken_read_metadata_compat(
992    row: &FrankenRow,
993    json_idx: usize,
994    bin_idx: usize,
995) -> serde_json::Value {
996    // Try binary column first (new format)
997    if let Ok(Some(bytes)) = row.get_typed::<Option<Vec<u8>>>(bin_idx)
998        && !bytes.is_empty()
999    {
1000        return deserialize_msgpack_to_json(&bytes);
1001    }
1002
1003    // Fall back to JSON column (old format or migration in progress)
1004    if let Ok(Some(json_str)) = row.get_typed::<Option<String>>(json_idx) {
1005        return serde_json::from_str(&json_str)
1006            .unwrap_or_else(|_| serde_json::Value::Object(serde_json::Map::new()));
1007    }
1008
1009    serde_json::Value::Object(serde_json::Map::new())
1010}
1011
1012fn franken_read_message_extra_compat(
1013    row: &FrankenRow,
1014    json_idx: usize,
1015    bin_idx: usize,
1016) -> serde_json::Value {
1017    if let Ok(Some(bytes)) = row.get_typed::<Option<Vec<u8>>>(bin_idx)
1018        && !bytes.is_empty()
1019    {
1020        return deserialize_msgpack_to_json(&bytes);
1021    }
1022
1023    if let Ok(Some(json_str)) = row.get_typed::<Option<String>>(json_idx) {
1024        return serde_json::from_str(&json_str).unwrap_or(serde_json::Value::Null);
1025    }
1026
1027    serde_json::Value::Null
1028}
1029
1030// -------------------------------------------------------------------------
1031// Migration Error Types (P1.5)
1032// -------------------------------------------------------------------------
1033
1034/// Error type for schema migration operations.
1035#[derive(Debug, Error)]
1036pub enum MigrationError {
1037    /// The schema requires a full rebuild. The database has been backed up.
1038    #[error("Rebuild required: {reason}")]
1039    RebuildRequired {
1040        reason: String,
1041        backup_path: Option<std::path::PathBuf>,
1042    },
1043
1044    /// A database error occurred during migration.
1045    #[error("Database error: {0}")]
1046    Database(#[from] frankensqlite::FrankenError),
1047
1048    /// An I/O error occurred during backup.
1049    #[error("I/O error: {0}")]
1050    Io(#[from] std::io::Error),
1051
1052    /// Other migration error.
1053    #[error("{0}")]
1054    Other(String),
1055}
1056
1057impl From<anyhow::Error> for MigrationError {
1058    fn from(e: anyhow::Error) -> Self {
1059        MigrationError::Other(e.to_string())
1060    }
1061}
1062
/// Upper bound on the number of backup files kept around.
const MAX_BACKUPS: usize = 3;
/// PRAGMA executed on the backup connection before `VACUUM INTO`
/// (busy timeout of 30000, presumably milliseconds as in stock SQLite).
const BACKUP_VACUUM_BUSY_TIMEOUT_PRAGMA: &str = "PRAGMA busy_timeout = 30000;";
1066
/// File names holding user-authored state; these must NEVER be deleted during rebuild.
const USER_DATA_FILES: &[&str] = &["bookmarks.db", "tui_state.json", "sources.toml", ".env"];

/// Report whether `path` names a user-authored file that a rebuild must preserve.
///
/// Only the final path component is compared (exact match against
/// `USER_DATA_FILES`). Paths without a file name, or whose file name is not
/// valid UTF-8, are never treated as user data.
pub fn is_user_data_file(path: &Path) -> bool {
    let Some(file_name) = path.file_name().and_then(|raw| raw.to_str()) else {
        return false;
    };
    USER_DATA_FILES.contains(&file_name)
}
1077
/// Statement that registers the `fts_messages` FTS5 virtual table on a
/// frankensqlite connection.
///
/// FrankenSQLite skips virtual-table entries (rootpage=0) when it loads
/// `sqlite_master` from a database written by stock SQLite. Running this
/// `CREATE` goes through the legacy FTS5 fallback path and materialises the
/// table, so later FTS queries work.
pub const FTS5_REGISTER_SQL: &str = "\
    CREATE VIRTUAL TABLE IF NOT EXISTS fts_messages USING fts5(\
        content, title, agent, workspace, source_path, \
        created_at UNINDEXED, \
        content='', tokenize='porter'\
    )";
1090
// Meta-table keys and generation markers. NOTE(review): judging by the names
// these track FTS rebuild and daily-stats health state; confirm at the use sites.
const FTS_FRANKEN_REBUILD_META_KEY: &str = "fts_frankensqlite_rebuild_generation";
const FTS_FRANKEN_REBUILD_FINGERPRINT_META_KEY: &str = "fts_frankensqlite_archive_fingerprint";
const FTS_FRANKEN_REBUILD_GENERATION: i64 = 1;
const DAILY_STATS_HEALTH_META_KEY: &str = "daily_stats_archive_fingerprint";
const DAILY_STATS_HEALTH_GENERATION_META_KEY: &str = "daily_stats_health_generation";
const DAILY_STATS_HEALTH_GENERATION: i64 = 1;

/// Statement that empties the contentless `fts_messages` table.
///
/// A plain `DELETE FROM ...` is rejected by contentless FTS5 tables, so the
/// special `'delete-all'` command is used instead.
pub const FTS5_DELETE_ALL_SQL: &str =
    "INSERT INTO fts_messages(fts_messages) VALUES('delete-all');";

/// Shadow tables that must accompany `fts_messages` for it to count as intact.
pub const FTS_MESSAGES_REQUIRED_SHADOW_TABLES: [&str; 5] = [
    "fts_messages_config",
    "fts_messages_content",
    "fts_messages_data",
    "fts_messages_docsize",
    "fts_messages_idx",
];

/// Zero-row query used to check that `fts_messages` can be queried at all.
pub const FTS_MESSAGES_INTEGRITY_PROBE_SQL: &str = "SELECT * FROM fts_messages LIMIT 0";

/// Recovery advice appended to FTS corruption error messages.
pub const FTS_MESSAGES_CORRUPTION_RECOVERY_HINT: &str = "Stop all cass index/watch processes, back up the current database, then run \
     'cass doctor check --json' for a read-only diagnosis before using a supported \
     repair/rebuild path.";
1117
/// Describes a structurally broken `fts_messages` index: which shadow tables
/// are missing, which SQL statement failed (if any), and the underlying
/// error text (if any).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct FtsMessagesIntegrityError {
    missing_shadow_tables: Vec<&'static str>,
    failed_sql: Option<&'static str>,
    source_error: Option<String>,
}

impl FtsMessagesIntegrityError {
    /// Bundle the three pieces of diagnostic information into an error value.
    fn new(
        missing_shadow_tables: Vec<&'static str>,
        failed_sql: Option<&'static str>,
        source_error: Option<String>,
    ) -> Self {
        FtsMessagesIntegrityError {
            missing_shadow_tables,
            failed_sql,
            source_error,
        }
    }

    /// Names of the required shadow tables that were found to be missing.
    pub fn missing_shadow_tables(&self) -> &[&'static str] {
        self.missing_shadow_tables.as_slice()
    }
}
1142
1143impl std::fmt::Display for FtsMessagesIntegrityError {
1144    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
1145        write!(
1146            f,
1147            "CASS database FTS5 index is corrupt: fts_messages exists, but required FTS5 shadow tables are missing or unreadable"
1148        )?;
1149        if !self.missing_shadow_tables.is_empty() {
1150            write!(
1151                f,
1152                "; missing shadow tables: {}",
1153                self.missing_shadow_tables.join(", ")
1154            )?;
1155        }
1156        if let Some(sql) = self.failed_sql {
1157            write!(f, "; failed SQL: {sql}")?;
1158        }
1159        if let Some(source_error) = &self.source_error {
1160            write!(f, "; error: {source_error}")?;
1161        }
1162        write!(
1163            f,
1164            ". Suggested recovery: {FTS_MESSAGES_CORRUPTION_RECOVERY_HINT}"
1165        )
1166    }
1167}
1168
1169impl std::error::Error for FtsMessagesIntegrityError {}
1170
1171pub fn fts_messages_integrity_error_from_message(
1172    source_error: impl Into<String>,
1173) -> Option<FtsMessagesIntegrityError> {
1174    let source_error = source_error.into();
1175    let lower = source_error.to_ascii_lowercase();
1176    if !lower.contains("fts_messages") {
1177        return None;
1178    }
1179
1180    let mentions_structural_fts_failure = lower.contains("shadow table")
1181        || lower.contains("vtable constructor failed")
1182        || lower.contains("sqlite_corrupt")
1183        || lower.contains("databasecorrupt")
1184        || lower.contains("database corrupt")
1185        || lower.contains("missing required");
1186    if !mentions_structural_fts_failure {
1187        return None;
1188    }
1189
1190    let missing_shadow_tables = FTS_MESSAGES_REQUIRED_SHADOW_TABLES
1191        .iter()
1192        .copied()
1193        .filter(|table| lower.contains(&table.to_ascii_lowercase()))
1194        .collect::<Vec<_>>();
1195
1196    Some(FtsMessagesIntegrityError::new(
1197        missing_shadow_tables,
1198        Some(FTS_MESSAGES_INTEGRITY_PROBE_SQL),
1199        Some(source_error),
1200    ))
1201}
1202
/// Decide whether a `CREATE VIRTUAL TABLE` statement describes an FTS schema
/// for which missing shadow-table metadata is acceptable.
///
/// The statement is compared with all whitespace removed and ASCII letters
/// lowercased. It qualifies when it uses `fts5(`, declares `content=''`
/// (contentless), and does not mention `message_id`.
fn fts_schema_tolerates_missing_shadow_metadata(sql: &str) -> bool {
    let mut compact = String::with_capacity(sql.len());
    compact.extend(sql.chars().filter(|ch| !ch.is_whitespace()));
    compact.make_ascii_lowercase();

    if !compact.contains("usingfts5(") {
        return false;
    }
    if !compact.contains("content=''") {
        return false;
    }
    !compact.contains("message_id")
}
1213
1214pub fn validate_fts_messages_integrity_for_connection(conn: &FrankenConnection) -> Result<()> {
1215    let fts_schema_sql: Vec<String> = conn
1216        .query_map_collect(
1217            "SELECT sql FROM sqlite_master WHERE type = 'table' AND name = 'fts_messages'",
1218            fparams![],
1219            |row: &FrankenRow| row.get_typed::<String>(0),
1220        )
1221        .with_context(|| "checking for fts_messages in sqlite_master")?;
1222    if fts_schema_sql.is_empty() {
1223        return Ok(());
1224    }
1225
1226    let probe_error = conn.query(FTS_MESSAGES_INTEGRITY_PROBE_SQL).err();
1227    if probe_error.is_none()
1228        && fts_schema_sql
1229            .iter()
1230            .all(|sql| fts_schema_tolerates_missing_shadow_metadata(sql))
1231    {
1232        return Ok(());
1233    }
1234
1235    let present_shadow_tables: HashSet<String> = conn
1236        .query_map_collect(
1237            "SELECT name FROM sqlite_master
1238             WHERE type = 'table'
1239               AND name IN (
1240                 'fts_messages_config',
1241                 'fts_messages_content',
1242                 'fts_messages_data',
1243                 'fts_messages_docsize',
1244                 'fts_messages_idx'
1245               )",
1246            fparams![],
1247            |row: &FrankenRow| row.get_typed::<String>(0),
1248        )
1249        .map(|rows| rows.into_iter().collect())
1250        .map_err(|err| {
1251            FtsMessagesIntegrityError::new(
1252                Vec::new(),
1253                Some(
1254                    "SELECT name FROM sqlite_master WHERE name IN \
1255                     ('fts_messages_config','fts_messages_content','fts_messages_data','fts_messages_docsize','fts_messages_idx')",
1256                ),
1257                Some(err.to_string()),
1258            )
1259        })?;
1260    let missing_shadow_tables = FTS_MESSAGES_REQUIRED_SHADOW_TABLES
1261        .iter()
1262        .copied()
1263        .filter(|table| !present_shadow_tables.contains(*table))
1264        .collect::<Vec<_>>();
1265
1266    // If every required shadow table is present, the FTS5 schema is
1267    // structurally sound. A probe-SQL failure here typically reflects an
1268    // incomplete FTS5 runtime emulation (e.g. frankensqlite's vtable path)
1269    // rather than fixture corruption — and conflating the two would
1270    // wrongly reject every database with the new message_id schema that
1271    // frankensqlite happens to serve via a different code path. Returning
1272    // Ok here keeps the false-positive surface narrow; the truly-missing-
1273    // shadow case below still surfaces as before.
1274    if missing_shadow_tables.is_empty() {
1275        return Ok(());
1276    }
1277
1278    Err(FtsMessagesIntegrityError::new(
1279        missing_shadow_tables,
1280        probe_error
1281            .as_ref()
1282            .map(|_| FTS_MESSAGES_INTEGRITY_PROBE_SQL),
1283        probe_error.map(|err| err.to_string()),
1284    )
1285    .into())
1286}
1287
/// Test helper: make sure the FTS schema exists and matches the message rows.
///
/// Despite the name, this opens the database through `FrankenStorage` and
/// runs `rebuild_fts_via_frankensqlite`, discarding the returned row count.
#[cfg(test)]
pub(crate) fn materialize_fresh_fts_schema_via_rusqlite(db_path: &Path) -> Result<()> {
    let open_context = || {
        format!(
            "opening frankensqlite db at {} for FTS materialization",
            db_path.display()
        )
    };
    let storage = FrankenStorage::open(db_path).with_context(open_context)?;
    storage.rebuild_fts_via_frankensqlite()?;
    Ok(())
}
1303
/// Test helper: rebuild the FTS index and record the rebuild generation.
///
/// Returns the row count reported by `rebuild_fts_via_frankensqlite`.
#[cfg(test)]
pub(crate) fn rebuild_fts_via_rusqlite(db_path: &Path) -> Result<usize> {
    let open_context = || {
        format!(
            "opening frankensqlite db at {} for FTS rebuild",
            db_path.display()
        )
    };
    let storage = FrankenStorage::open(db_path).with_context(open_context)?;
    let inserted_rows = storage.rebuild_fts_via_frankensqlite()?;
    storage.record_fts_franken_rebuild_generation()?;
    Ok(inserted_rows)
}
1316
1317pub(crate) fn ensure_fts_consistency_via_rusqlite(db_path: &Path) -> Result<FtsConsistencyRepair> {
1318    // Delegates to the FrankenStorage-native path. The function name retains
1319    // the `_via_rusqlite` suffix only for backwards compatibility with the
1320    // few test-site callers; all operations now run through frankensqlite.
1321    let storage = FrankenStorage::open(db_path).with_context(|| {
1322        format!(
1323            "opening frankensqlite db at {} for FTS consistency check",
1324            db_path.display()
1325        )
1326    })?;
1327    storage.ensure_search_fallback_fts_consistency()
1328}
1329
1330/// Create a uniquely named backup of the database file.
1331///
1332/// Returns the path to the backup file, or None if the source doesn't exist.
1333pub fn create_backup(db_path: &Path) -> Result<Option<std::path::PathBuf>, MigrationError> {
1334    if !bundle_path_exists(db_path)? {
1335        return Ok(None);
1336    }
1337
1338    if !copyable_bundle_file_exists(db_path)? {
1339        return Ok(None);
1340    }
1341    let _ = copyable_bundle_sidecar_sources(db_path)?;
1342
1343    let backup_path = unique_backup_path(db_path);
1344    let vacuum_stage_path = vacuum_stage_backup_path(&backup_path);
1345
1346    // Try to use SQLite's VACUUM INTO command first, which safely handles WAL files
1347    // and produces a clean, minimized backup.
1348    match vacuum_into_backup_stage(db_path, &vacuum_stage_path) {
1349        Ok(()) => {
1350            fs::rename(&vacuum_stage_path, &backup_path)?;
1351        }
1352        Err(err) if backup_vacuum_error_requires_consistent_retry(&err) => {
1353            tracing::warn!(
1354                db_path = %db_path.display(),
1355                error = %err,
1356                "create_backup: VACUUM INTO hit transient contention; refusing raw WAL bundle copy"
1357            );
1358            return Err(MigrationError::Database(err));
1359        }
1360        Err(err) => {
1361            tracing::warn!(
1362                db_path = %db_path.display(),
1363                error = %err,
1364                "create_backup: VACUUM INTO failed; falling back to raw evidence copy"
1365            );
1366        }
1367    }
1368
1369    if backup_path.exists() {
1370        sync_file_if_exists(&backup_path)?;
1371        if let Some(parent) = backup_path.parent() {
1372            sync_parent_directory(parent)?;
1373        }
1374        return Ok(Some(backup_path));
1375    }
1376
1377    // Fallback to a raw evidence copy if VACUUM INTO failed (e.g., older SQLite
1378    // or corruption). Keep this on the same symlink-safe bundle path as
1379    // historical seeding so a malformed archive root cannot make us copy an
1380    // arbitrary symlink target or publish a partial sidecar backup.
1381    copy_database_bundle(db_path, &backup_path)?;
1382
1383    Ok(Some(backup_path))
1384}
1385
1386fn vacuum_into_backup_stage(
1387    db_path: &Path,
1388    stage_path: &Path,
1389) -> std::result::Result<(), frankensqlite::FrankenError> {
1390    let mut conn = open_franken_with_flags(
1391        &db_path.to_string_lossy(),
1392        FrankenOpenFlags::SQLITE_OPEN_READ_ONLY,
1393    )?;
1394    let result = (|| {
1395        conn.execute(BACKUP_VACUUM_BUSY_TIMEOUT_PRAGMA)?;
1396        let path_str = stage_path.to_string_lossy();
1397        conn.execute_compat("VACUUM INTO ?", fparams![path_str.as_ref()])?;
1398        Ok(())
1399    })();
1400    if let Err(close_err) = conn.close_in_place() {
1401        tracing::warn!(
1402            error = %close_err,
1403            db_path = %db_path.display(),
1404            "create_backup: close_in_place failed after VACUUM INTO; falling back to best-effort close"
1405        );
1406        conn.close_best_effort_in_place();
1407    }
1408    result
1409}
1410
1411fn backup_vacuum_error_requires_consistent_retry(err: &frankensqlite::FrankenError) -> bool {
1412    retryable_franken_error(err)
1413}
1414
/// Records which members of a database bundle were moved.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub struct DatabaseBundleMoveResult {
    /// The main database file was moved.
    pub database: bool,
    /// The `-wal` sidecar was moved.
    pub wal: bool,
    /// The `-shm` sidecar was moved.
    pub shm: bool,
}

impl DatabaseBundleMoveResult {
    /// `true` when at least one bundle member was moved.
    pub fn moved_any(&self) -> bool {
        [self.database, self.wal, self.shm].contains(&true)
    }
}
1427
/// Build the path of a database sidecar by appending `suffix` (e.g. `"-wal"`)
/// to the full database path.
///
/// The suffix is appended to the raw OS string, so paths that are not valid
/// UTF-8 are preserved byte-for-byte. A lossy string conversion would replace
/// the invalid bytes and yield a path that no longer names the real sidecar.
fn database_sidecar_path(path: &Path, suffix: &str) -> PathBuf {
    let mut sidecar = path.as_os_str().to_os_string();
    sidecar.push(suffix);
    PathBuf::from(sidecar)
}
1431
1432/// Move a database file and its WAL/SHM sidecars to a new basename.
1433///
1434/// This is used for non-destructive quarantine of a corrupted bundle before a
1435/// rebuild. If the main database file is already missing but orphaned sidecars
1436/// remain, those sidecars are still moved so a fresh database can be created
1437/// without inheriting stale WAL state.
1438pub(crate) fn move_database_bundle(
1439    source_root: &Path,
1440    destination_root: &Path,
1441) -> std::io::Result<DatabaseBundleMoveResult> {
1442    let mut moved = DatabaseBundleMoveResult::default();
1443    if let Some(parent) = destination_root.parent() {
1444        fs::create_dir_all(parent)?;
1445        sync_parent_directory(parent)?;
1446    }
1447
1448    if bundle_path_exists(source_root)? {
1449        fs::rename(source_root, destination_root)?;
1450        moved.database = true;
1451    }
1452
1453    let wal_source = database_sidecar_path(source_root, "-wal");
1454    if bundle_path_exists(&wal_source)? {
1455        fs::rename(&wal_source, database_sidecar_path(destination_root, "-wal"))?;
1456        moved.wal = true;
1457    }
1458
1459    let shm_source = database_sidecar_path(source_root, "-shm");
1460    if bundle_path_exists(&shm_source)? {
1461        fs::rename(&shm_source, database_sidecar_path(destination_root, "-shm"))?;
1462        moved.shm = true;
1463    }
1464
1465    if moved.moved_any() {
1466        if let Some(parent) = source_root.parent() {
1467            sync_parent_directory(parent)?;
1468        }
1469        if let Some(parent) = destination_root.parent() {
1470            sync_parent_directory(parent)?;
1471        }
1472    }
1473
1474    Ok(moved)
1475}
1476
/// Report whether anything exists at `path`, without following symlinks.
///
/// A dangling symlink therefore counts as existing. `NotFound` maps to
/// `Ok(false)`; every other metadata error is returned to the caller.
fn bundle_path_exists(path: &Path) -> std::io::Result<bool> {
    match fs::symlink_metadata(path) {
        Err(err) if err.kind() == std::io::ErrorKind::NotFound => Ok(false),
        lookup => lookup.map(|_| true),
    }
}
1484
1485fn copy_database_bundle(source_root: &Path, destination_root: &Path) -> Result<()> {
1486    if let Some(parent) = destination_root.parent() {
1487        fs::create_dir_all(parent).with_context(|| {
1488            format!(
1489                "creating destination directory for database bundle copy: {}",
1490                parent.display()
1491            )
1492        })?;
1493        sync_parent_directory(parent)
1494            .with_context(|| format!("syncing destination directory {}", parent.display()))?;
1495    }
1496
1497    if !copyable_bundle_file_exists(source_root)? {
1498        bail!(
1499            "database bundle root is missing before copy: {}",
1500            source_root.display()
1501        );
1502    }
1503
1504    let sidecars = copyable_bundle_sidecar_sources(source_root)?;
1505
1506    fs::copy(source_root, destination_root).with_context(|| {
1507        format!(
1508            "copying database bundle {} -> {}",
1509            source_root.display(),
1510            destination_root.display()
1511        )
1512    })?;
1513    sync_file_if_exists(destination_root).with_context(|| {
1514        format!(
1515            "syncing copied database bundle {}",
1516            destination_root.display()
1517        )
1518    })?;
1519
1520    for (source_sidecar, suffix) in sidecars {
1521        let destination_sidecar = database_sidecar_path(destination_root, suffix);
1522        fs::copy(&source_sidecar, &destination_sidecar).with_context(|| {
1523            format!(
1524                "copying database bundle sidecar {} -> {}",
1525                source_sidecar.display(),
1526                destination_sidecar.display()
1527            )
1528        })?;
1529        sync_file_if_exists(&destination_sidecar).with_context(|| {
1530            format!(
1531                "syncing copied database bundle sidecar {}",
1532                destination_sidecar.display()
1533            )
1534        })?;
1535    }
1536
1537    if let Some(parent) = destination_root.parent() {
1538        sync_parent_directory(parent)
1539            .with_context(|| format!("syncing destination directory {}", parent.display()))?;
1540    }
1541
1542    Ok(())
1543}
1544
1545fn copyable_bundle_sidecar_sources(source_root: &Path) -> Result<Vec<(PathBuf, &'static str)>> {
1546    let mut sidecars = Vec::new();
1547    for suffix in ["-wal", "-shm"] {
1548        let source_sidecar = database_sidecar_path(source_root, suffix);
1549        if copyable_bundle_file_exists(&source_sidecar)? {
1550            sidecars.push((source_sidecar, suffix));
1551        }
1552    }
1553    Ok(sidecars)
1554}
1555
1556fn copyable_bundle_file_exists(path: &Path) -> Result<bool> {
1557    match fs::symlink_metadata(path) {
1558        Ok(metadata) => {
1559            let file_type = metadata.file_type();
1560            if file_type.is_symlink() {
1561                bail!(
1562                    "refusing to copy database bundle symlink: {}",
1563                    path.display()
1564                );
1565            }
1566            if !file_type.is_file() {
1567                bail!(
1568                    "refusing to copy non-file database bundle path: {}",
1569                    path.display()
1570                );
1571            }
1572            Ok(true)
1573        }
1574        Err(err) if err.kind() == std::io::ErrorKind::NotFound => Ok(false),
1575        Err(err) => Err(err).with_context(|| {
1576            format!(
1577                "checking database bundle path before copy: {}",
1578                path.display()
1579            )
1580        }),
1581    }
1582}
1583
1584/// Helper to safely remove a database file and its potential WAL/SHM sidecars.
1585pub(crate) fn remove_database_files(path: &Path) -> std::io::Result<()> {
1586    let mut removed_any = false;
1587
1588    match fs::remove_file(path) {
1589        Ok(()) => removed_any = true,
1590        Err(err) if err.kind() == std::io::ErrorKind::NotFound => {}
1591        Err(err) => return Err(err),
1592    }
1593
1594    // Best-effort removal of sidecar files (ignore errors if they don't exist)
1595    for suffix in ["-wal", "-shm"] {
1596        match fs::remove_file(database_sidecar_path(path, suffix)) {
1597            Ok(()) => removed_any = true,
1598            Err(err) if err.kind() == std::io::ErrorKind::NotFound => {}
1599            Err(err) => return Err(err),
1600        }
1601    }
1602
1603    if removed_any && let Some(parent) = path.parent() {
1604        sync_parent_directory(parent)?;
1605    }
1606
1607    Ok(())
1608}
1609
/// Flush a directory to disk by opening it and calling `sync_all`.
#[cfg(not(windows))]
fn sync_parent_directory(path: &Path) -> std::io::Result<()> {
    let directory = fs::File::open(path)?;
    directory.sync_all()
}

/// Windows variant: does nothing and always succeeds.
#[cfg(windows)]
fn sync_parent_directory(_path: &Path) -> std::io::Result<()> {
    Ok(())
}
1619
/// Flush `path` to disk when `Path::exists` reports it present; otherwise do nothing.
///
/// # Errors
///
/// Returns the I/O error from opening or syncing an existing path.
fn sync_file_if_exists(path: &Path) -> std::io::Result<()> {
    if !path.exists() {
        return Ok(());
    }
    let file = fs::File::open(path)?;
    file.sync_all()?;
    Ok(())
}
1626
1627/// Remove old backup files, keeping only the most recent `keep_count`.
1628pub fn cleanup_old_backups(db_path: &Path, keep_count: usize) -> Result<(), std::io::Error> {
1629    let parent = match db_path.parent() {
1630        Some(p) => p,
1631        None => return Ok(()),
1632    };
1633
1634    let db_name = db_path.file_name().and_then(|n| n.to_str()).unwrap_or("db");
1635
1636    let prefix = format!("{}.backup.", db_name);
1637
1638    // Collect backup files matching the pattern
1639    let mut backups: Vec<(std::path::PathBuf, SystemTime)> = Vec::new();
1640
1641    if let Ok(entries) = fs::read_dir(parent) {
1642        for entry in entries.flatten() {
1643            let path = entry.path();
1644            if let Some(name) = path.file_name().and_then(|n| n.to_str())
1645                && is_backup_root_name(name, &prefix)
1646                && let Ok(meta) = fs::metadata(&path)
1647                && meta.is_file()
1648                && let Ok(mtime) = meta.modified()
1649            {
1650                backups.push((path, mtime));
1651            }
1652        }
1653    }
1654
1655    // Sort by modification time, newest first
1656    backups.sort_by_key(|entry| std::cmp::Reverse(entry.1));
1657
1658    // Delete oldest backups beyond keep_count
1659    for (path, _) in backups.into_iter().skip(keep_count) {
1660        let _ = fs::remove_file(&path);
1661
1662        // Also try to cleanup potential sidecars from fs::copy fallback
1663        let _ = fs::remove_file(database_sidecar_path(&path, "-wal"));
1664        let _ = fs::remove_file(database_sidecar_path(&path, "-shm"));
1665    }
1666
1667    Ok(())
1668}
1669
1670#[derive(Debug, Clone)]
1671pub(crate) struct HistoricalDatabaseBundle {
1672    root_path: PathBuf,
1673    total_bytes: u64,
1674    modified_at_ms: i64,
1675    supports_direct_readonly: bool,
1676    probe: HistoricalBundleProbe,
1677}
1678
/// Lightweight facts sampled from a historical bundle; the `Default` value
/// means "nothing could be read".
#[derive(Clone, Copy, Debug, Default)]
struct HistoricalBundleProbe {
    /// Parsed `meta.schema_version`, if present and numeric.
    schema_version: Option<i64>,
    /// Number of `sqlite_master` rows named `fts_messages`, if countable.
    fts_schema_rows: Option<i64>,
    /// Whether a query against `fts_messages` succeeded.
    fts_queryable: bool,
    /// Largest `messages.id` seen (0 when empty or unreadable).
    max_message_id: i64,
}
1686
/// Test-only snapshot of a database's health indicators.
#[cfg(test)]
#[allow(dead_code)]
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub(crate) struct SqliteDatabaseHealthProbe {
    /// Parsed `meta.schema_version`, if present and numeric.
    pub schema_version: Option<i64>,
    /// Whether `PRAGMA quick_check(1)` reported `ok`.
    pub quick_check_ok: bool,
    /// Number of `sqlite_master` rows named `fts_messages`.
    pub fts_schema_rows: i64,
    /// Whether `fts_messages` could be queried.
    pub fts_queryable: bool,
    /// Row count of `messages` (0 when the quick check failed).
    pub message_count: i64,
    /// Largest `messages.id` (0 when the quick check failed).
    pub max_message_id: i64,
}
1698
/// Outcome of an FTS consistency pass.
///
/// NOTE(review): the producer of these values is outside this excerpt; the
/// per-variant descriptions below are read off the variant and field names
/// and should be confirmed against it.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub(crate) enum FtsConsistencyRepair {
    /// Presumably: nothing needed repair; `rows` is the existing row count.
    AlreadyHealthy {
        rows: usize,
    },
    /// Presumably: only missing rows were added.
    IncrementalCatchUp {
        inserted_rows: usize,
        total_rows: usize,
    },
    /// Presumably: the index was rebuilt from scratch.
    Rebuilt {
        inserted_rows: usize,
    },
}
1712
/// Running totals for a historical salvage pass.
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub struct HistoricalSalvageOutcome {
    /// Bundles that were examined.
    pub bundles_considered: usize,
    /// Bundles that contributed data.
    pub bundles_imported: usize,
    /// Conversations imported across all bundles.
    pub conversations_imported: usize,
    /// Messages imported across all bundles.
    pub messages_imported: usize,
}

impl HistoricalSalvageOutcome {
    /// Adds every counter of `other` onto the matching counter of `self`.
    pub(crate) fn accumulate(&mut self, other: Self) {
        // Destructuring makes the compiler flag any counter added to the
        // struct later but forgotten here.
        let Self {
            bundles_considered,
            bundles_imported,
            conversations_imported,
            messages_imported,
        } = other;
        self.bundles_considered += bundles_considered;
        self.bundles_imported += bundles_imported;
        self.conversations_imported += conversations_imported;
        self.messages_imported += messages_imported;
    }
}
1729
1730#[derive(Debug)]
1731struct HistoricalReadConnection {
1732    conn: FrankenConnection,
1733    method: &'static str,
1734    root_path: PathBuf,
1735    _tempdir: Option<tempfile::TempDir>,
1736}
1737
/// Schema applied to the scratch database that receives rows replayed from
/// `sqlite3 .recover` output.
///
/// It declares only the six tables whose `INSERT` statements are replayed
/// (`sources`, `agents`, `workspaces`, `conversations`, `messages`,
/// `snippets`). Apart from primary keys, no constraints, indexes, or
/// triggers are declared here.
const HISTORICAL_RECOVERY_CORE_SCHEMA: &str = r"
CREATE TABLE sources (
    id TEXT PRIMARY KEY,
    kind TEXT,
    host_label TEXT,
    machine_id TEXT,
    platform TEXT,
    config_json TEXT,
    created_at INTEGER,
    updated_at INTEGER
);
CREATE TABLE agents (
    id INTEGER PRIMARY KEY,
    slug TEXT,
    name TEXT,
    version TEXT,
    kind TEXT,
    created_at INTEGER,
    updated_at INTEGER
);
CREATE TABLE workspaces (
    id INTEGER PRIMARY KEY,
    path TEXT,
    display_name TEXT
);
CREATE TABLE conversations (
    id INTEGER PRIMARY KEY,
    agent_id INTEGER,
    workspace_id INTEGER,
    source_id TEXT,
    external_id TEXT,
    title TEXT,
    source_path TEXT,
    started_at INTEGER,
    ended_at INTEGER,
    approx_tokens INTEGER,
    metadata_json TEXT,
    origin_host TEXT,
    metadata_bin BLOB,
    total_input_tokens INTEGER,
    total_output_tokens INTEGER,
    total_cache_read_tokens INTEGER,
    total_cache_creation_tokens INTEGER,
    grand_total_tokens INTEGER,
    estimated_cost_usd REAL,
    primary_model TEXT,
    api_call_count INTEGER,
    tool_call_count INTEGER,
    user_message_count INTEGER,
    assistant_message_count INTEGER,
    last_message_idx INTEGER,
    last_message_created_at INTEGER
);
CREATE TABLE messages (
    id INTEGER PRIMARY KEY,
    conversation_id INTEGER,
    idx INTEGER,
    role TEXT,
    author TEXT,
    created_at INTEGER,
    content TEXT,
    extra_json TEXT,
    extra_bin BLOB
);
CREATE TABLE snippets (
    id INTEGER PRIMARY KEY,
    message_id INTEGER,
    file_path TEXT,
    start_line INTEGER,
    end_line INTEGER,
    language TEXT,
    snippet_text TEXT
);
";
/// Version stamped as `salvage_version` into each per-bundle import record
/// written to the `meta` table.
const HISTORICAL_SALVAGE_LEDGER_VERSION: u32 = 2;
/// Format version for salvage progress records.
/// NOTE(review): presumably what `HistoricalBundleProgress::progress_version`
/// is compared against; the usage is outside this excerpt.
const HISTORICAL_SALVAGE_PROGRESS_VERSION: u32 = 1;
/// Five minutes, in milliseconds.
/// NOTE(review): by its name, the allowed difference between conversation
/// start times when merging by source path; confirm at the use site.
const SOURCE_PATH_MERGE_START_TOLERANCE_MS: i64 = 5 * 60 * 1000;
1815
1816#[derive(Debug, Clone, Serialize, Deserialize)]
1817struct HistoricalBundleProgress {
1818    progress_version: u32,
1819    path: String,
1820    bytes: u64,
1821    modified_at_ms: i64,
1822    method: String,
1823    last_completed_source_row_id: i64,
1824    conversations_imported: usize,
1825    messages_imported: usize,
1826    updated_at_ms: i64,
1827}
1828
1829#[derive(Debug, Clone)]
1830struct HistoricalBatchEntry {
1831    source_row_id: i64,
1832    agent_id: i64,
1833    workspace_id: Option<i64>,
1834    conversation: Conversation,
1835}
1836
/// Counters reported for one batch import; both start at zero via `Default`.
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
struct HistoricalBatchImportTotals {
    /// Source rows inserted by the batch.
    inserted_source_rows: usize,
    /// Messages inserted by the batch.
    inserted_messages: usize,
}
1842
1843fn historical_bundle_root_paths(db_path: &Path) -> Vec<PathBuf> {
1844    let mut roots = Vec::new();
1845    let Some(parent) = db_path.parent() else {
1846        return roots;
1847    };
1848    let db_name = db_path
1849        .file_name()
1850        .and_then(|n| n.to_str())
1851        .unwrap_or("agent_search.db");
1852    let db_stem = db_path
1853        .file_stem()
1854        .and_then(|n| n.to_str())
1855        .unwrap_or("agent_search");
1856
1857    let mut push_root = |path: PathBuf| {
1858        if path == db_path {
1859            return;
1860        }
1861        if !roots.iter().any(|existing| existing == &path) {
1862            roots.push(path);
1863        }
1864    };
1865
1866    if let Ok(entries) = fs::read_dir(parent) {
1867        for entry in entries.flatten() {
1868            let path = entry.path();
1869            let Some(name) = path.file_name().and_then(|n| n.to_str()) else {
1870                continue;
1871            };
1872            if has_db_sidecar_suffix(name) {
1873                continue;
1874            }
1875            if name.starts_with(&format!("{db_name}.backup."))
1876                || name.starts_with(&format!("{db_stem}.corrupt."))
1877            {
1878                push_root(path);
1879            }
1880        }
1881    }
1882
1883    let backups_dir = parent.join("backups");
1884    if let Ok(entries) = fs::read_dir(backups_dir) {
1885        for entry in entries.flatten() {
1886            let path = entry.path();
1887            let Some(name) = path.file_name().and_then(|n| n.to_str()) else {
1888                continue;
1889            };
1890            if has_db_sidecar_suffix(name) {
1891                continue;
1892            }
1893            if name.starts_with(&format!("{db_name}.")) && name.ends_with(".bak") {
1894                push_root(path);
1895            }
1896        }
1897    }
1898
1899    push_named_database_children(&mut roots, db_path, &parent.join("repair-lab"), db_name);
1900    push_named_database_children(&mut roots, db_path, &parent.join("snapshots"), db_name);
1901
1902    roots
1903}
1904
/// Appends `<dir>/<child>/<db_name>` to `roots` for every child of `dir`
/// where that file exists.
///
/// Candidates equal to `canonical_db_path` and candidates already present in
/// `roots` are skipped. If `dir` cannot be listed, `roots` is left untouched.
fn push_named_database_children(
    roots: &mut Vec<PathBuf>,
    canonical_db_path: &Path,
    dir: &Path,
    db_name: &str,
) {
    let Ok(children) = fs::read_dir(dir) else {
        return;
    };

    for child in children.flatten() {
        let candidate = child.path().join(db_name);
        if candidate == canonical_db_path {
            continue;
        }
        if !candidate.exists() {
            continue;
        }
        let already_listed = roots.iter().any(|existing| existing == &candidate);
        if !already_listed {
            roots.push(candidate);
        }
    }
}
1923
/// Returns the modification time of `path` in milliseconds since the Unix
/// epoch.
///
/// Returns 0 when the metadata or timestamp cannot be read, or when the
/// timestamp lies before the epoch. A millisecond count that does not fit in
/// `i64` saturates to `i64::MAX` instead of being silently truncated.
fn file_mtime_ms(path: &Path) -> i64 {
    fs::metadata(path)
        .and_then(|meta| meta.modified())
        .ok()
        .and_then(|ts| ts.duration_since(UNIX_EPOCH).ok())
        .map_or(0, |elapsed| {
            i64::try_from(elapsed.as_millis()).unwrap_or(i64::MAX)
        })
}
1932
1933fn bundle_total_bytes(root_path: &Path) -> u64 {
1934    let mut total = fs::metadata(root_path).map(|meta| meta.len()).unwrap_or(0);
1935    for suffix in ["-wal", "-shm"] {
1936        let sidecar = database_sidecar_path(root_path, suffix);
1937        total = total.saturating_add(fs::metadata(sidecar).map(|meta| meta.len()).unwrap_or(0));
1938    }
1939    total
1940}
1941
1942pub(crate) fn discover_historical_database_bundles(
1943    db_path: &Path,
1944) -> Vec<HistoricalDatabaseBundle> {
1945    let mut bundles: Vec<_> = historical_bundle_root_paths(db_path)
1946        .into_iter()
1947        .filter(|root| root.exists())
1948        .map(|root_path| {
1949            let modified_at_ms = file_mtime_ms(&root_path);
1950            let total_bytes = bundle_total_bytes(&root_path);
1951            let supports_direct_readonly = historical_bundle_supports_direct_readonly(&root_path);
1952            let probe = probe_historical_bundle(&root_path);
1953            HistoricalDatabaseBundle {
1954                modified_at_ms,
1955                total_bytes,
1956                supports_direct_readonly,
1957                root_path,
1958                probe,
1959            }
1960        })
1961        .filter(|bundle| bundle.total_bytes > 0)
1962        .collect();
1963
1964    fn bundle_priority(path: &Path) -> i32 {
1965        let path_str = path.to_string_lossy();
1966        if path_str.contains("/repair-lab/replay-") {
1967            return 5;
1968        }
1969        if path_str.contains("/repair-lab/") {
1970            return 4;
1971        }
1972        if path_str.contains("/snapshots/") {
1973            return 3;
1974        }
1975        if path_str.contains(".corrupt.") || path_str.contains("failed-baseline-seed") {
1976            return 0;
1977        }
1978        1
1979    }
1980
1981    fn bundle_health_rank(bundle: &HistoricalDatabaseBundle) -> i32 {
1982        // Classify FTS health. The probe only sets `fts_queryable = true`
1983        // when `fts_schema_rows == Some(1)` (see
1984        // `historical_bundle_fts_queryable_via_frankensqlite`), so we have
1985        // two legitimate "clean" shapes for a bundle:
1986        //
1987        //   * `fts_schema_rows == Some(1) && fts_queryable` — a pre-V14
1988        //     bundle where the FTS virtual table was eagerly created by
1989        //     migration and is queryable right now.
1990        //
1991        //   * `fts_schema_rows == Some(0) && schema_version == Some(V14+)` —
1992        //     a modern bundle where `MIGRATION_V14` dropped fts_messages on
1993        //     purpose and cass recreates it lazily via
1994        //     `ensure_search_fallback_fts_consistency` on the first open.
1995        //     Gating on `schema_version == CURRENT_SCHEMA_VERSION` is critical
1996        //     so an incomplete pre-V14 bundle with 0 fts rows is not promoted
1997        //     alongside real lazy-V14+ bundles. A `None` schema_version
1998        //     (schema marker unreadable) is excluded for the same reason.
1999        //
2000        // Everything else — `Some(1)` without queryability, `Some(n)` for
2001        // n >= 2 (duplicated CREATE VIRTUAL TABLE rows from a broken legacy
2002        // rebuild), `None` entirely, or `Some(0)` on a non-current schema —
2003        // is not "fts clean".
2004        let fts_clean = match bundle.probe.fts_schema_rows {
2005            Some(1) => bundle.probe.fts_queryable,
2006            Some(0) => bundle.probe.schema_version == Some(CURRENT_SCHEMA_VERSION),
2007            _ => false,
2008        };
2009
2010        let clean_schema14_fts =
2011            bundle.probe.schema_version == Some(CURRENT_SCHEMA_VERSION) && fts_clean;
2012        if clean_schema14_fts {
2013            return 5;
2014        }
2015
2016        if fts_clean {
2017            return 4;
2018        }
2019
2020        if bundle.probe.schema_version == Some(CURRENT_SCHEMA_VERSION)
2021            && bundle.supports_direct_readonly
2022        {
2023            return 3;
2024        }
2025
2026        if bundle.supports_direct_readonly {
2027            return 2;
2028        }
2029
2030        1
2031    }
2032
2033    bundles.sort_by(|left, right| {
2034        bundle_health_rank(right)
2035            .cmp(&bundle_health_rank(left))
2036            .then_with(|| right.probe.max_message_id.cmp(&left.probe.max_message_id))
2037            .then_with(|| bundle_priority(&right.root_path).cmp(&bundle_priority(&left.root_path)))
2038            .then_with(|| {
2039                right
2040                    .supports_direct_readonly
2041                    .cmp(&left.supports_direct_readonly)
2042            })
2043            .then_with(|| right.total_bytes.cmp(&left.total_bytes))
2044            .then_with(|| right.modified_at_ms.cmp(&left.modified_at_ms))
2045            .then_with(|| right.root_path.cmp(&left.root_path))
2046    });
2047    bundles
2048}
2049
2050fn probe_historical_bundle(root_path: &Path) -> HistoricalBundleProbe {
2051    let Ok(conn) = open_historical_bundle_readonly(root_path) else {
2052        return probe_historical_bundle_via_sqlite3_metadata(root_path).unwrap_or_default();
2053    };
2054
2055    let schema_version = read_meta_schema_version(&conn).ok().flatten();
2056    let fts_schema_rows: Option<i64> = conn
2057        .query_row_map(
2058            "SELECT COUNT(*) FROM sqlite_master WHERE name = 'fts_messages'",
2059            fparams![],
2060            |row| row.get_typed(0),
2061        )
2062        .ok();
2063    let fts_queryable =
2064        historical_bundle_fts_queryable_via_frankensqlite(root_path, fts_schema_rows);
2065    let max_message_id: i64 = conn
2066        .query_row_map(
2067            "SELECT COALESCE(MAX(id), 0) FROM messages",
2068            fparams![],
2069            |row| row.get_typed(0),
2070        )
2071        .unwrap_or(0);
2072
2073    let probe = HistoricalBundleProbe {
2074        schema_version,
2075        fts_schema_rows,
2076        fts_queryable,
2077        max_message_id,
2078    };
2079
2080    if probe.schema_version.is_none()
2081        && probe.fts_schema_rows.is_none()
2082        && probe.max_message_id == 0
2083    {
2084        return probe_historical_bundle_via_sqlite3_metadata(root_path).unwrap_or(probe);
2085    }
2086
2087    probe
2088}
2089
2090fn probe_historical_bundle_via_sqlite3_metadata(root_path: &Path) -> Option<HistoricalBundleProbe> {
2091    let bundle_uri = format!("file:{}?immutable=1", root_path.to_string_lossy());
2092    let output = Command::new("sqlite3")
2093        .arg("-batch")
2094        .arg("-noheader")
2095        .arg(&bundle_uri)
2096        .arg(
2097            "PRAGMA writable_schema=ON;
2098             SELECT COALESCE((SELECT value FROM meta WHERE key = 'schema_version'), '');
2099             SELECT COUNT(*) FROM sqlite_master WHERE name = 'fts_messages';
2100             SELECT COALESCE(MAX(id), 0) FROM messages;",
2101        )
2102        .output()
2103        .ok()?;
2104    if !output.status.success() {
2105        return None;
2106    }
2107
2108    let stdout = String::from_utf8(output.stdout).ok()?;
2109    let mut lines = stdout.lines();
2110    let schema_version = lines.next().and_then(|raw| raw.trim().parse::<i64>().ok());
2111    let fts_schema_rows = lines.next().and_then(|raw| raw.trim().parse::<i64>().ok());
2112    let max_message_id = lines
2113        .next()
2114        .and_then(|raw| raw.trim().parse::<i64>().ok())
2115        .unwrap_or(0);
2116
2117    Some(HistoricalBundleProbe {
2118        schema_version,
2119        fts_schema_rows,
2120        fts_queryable: false,
2121        max_message_id,
2122    })
2123}
2124
2125fn historical_bundle_fts_queryable_via_frankensqlite(
2126    root_path: &Path,
2127    fts_schema_rows: Option<i64>,
2128) -> bool {
2129    matches!(fts_schema_rows, Some(1))
2130        && FrankenStorage::open_readonly(root_path)
2131            .map(|storage| {
2132                storage
2133                    .raw()
2134                    .query("SELECT COUNT(*) FROM fts_messages")
2135                    .is_ok()
2136            })
2137            .unwrap_or(false)
2138}
2139
2140fn historical_bundle_supports_direct_readonly(root_path: &Path) -> bool {
2141    open_historical_bundle_readonly(root_path)
2142        .and_then(|conn| historical_bundle_has_queryable_core_tables(&conn))
2143        .is_ok()
2144}
2145
2146fn historical_table_exists(conn: &FrankenConnection, table: &str) -> Result<bool> {
2147    let found: Option<i64> = conn
2148        .query_row_map(
2149            "SELECT 1 FROM sqlite_master WHERE type = 'table' AND name = ?1 LIMIT 1",
2150            fparams![table],
2151            |row| row.get_typed(0),
2152        )
2153        .optional()
2154        .with_context(|| format!("checking for historical table {table}"))?;
2155    Ok(found.is_some())
2156}
2157
2158fn probe_historical_table_reads(conn: &FrankenConnection, table: &str) -> Result<()> {
2159    if !historical_table_exists(conn, table)? {
2160        return Err(anyhow!(
2161            "historical database missing required table {table}"
2162        ));
2163    }
2164
2165    let sql = format!("SELECT rowid FROM {table} LIMIT 1");
2166    let _: Option<i64> = conn
2167        .query_row_map(&sql, fparams![], |row| row.get_typed(0))
2168        .optional()
2169        .with_context(|| format!("probing rows from historical table {table}"))?;
2170    Ok(())
2171}
2172
2173fn historical_bundle_has_queryable_core_tables(conn: &FrankenConnection) -> Result<()> {
2174    probe_historical_table_reads(conn, "conversations")?;
2175    probe_historical_table_reads(conn, "messages")?;
2176    Ok(())
2177}
2178
2179fn open_historical_bundle_readonly(root_path: &Path) -> Result<FrankenConnection> {
2180    let path_str = root_path.to_string_lossy();
2181    let flags = FrankenOpenFlags::SQLITE_OPEN_READ_ONLY;
2182    let conn = open_franken_with_flags(&path_str, flags)
2183        .with_context(|| format!("opening historical database {}", root_path.display()))?;
2184    Ok(conn)
2185}
2186
/// Returns `true` when `line` is an `INSERT` into one of the core tables that
/// the recovery import replays.
///
/// Accepted shape: the line starts with `INSERT INTO ` or
/// `INSERT OR IGNORE INTO `, immediately followed by the table name wrapped
/// in matching single or double quotes. Leading whitespace, unquoted names,
/// mismatched quotes, and any other table are rejected.
///
/// The check allocates nothing, which matters because it runs once per line
/// of `.recover` output.
fn is_recoverable_insert_line(line: &str) -> bool {
    const RECOVERABLE_TABLES: [&str; 6] = [
        "sources",
        "agents",
        "workspaces",
        "conversations",
        "messages",
        "snippets",
    ];

    let Some(after_verb) = line
        .strip_prefix("INSERT INTO ")
        .or_else(|| line.strip_prefix("INSERT OR IGNORE INTO "))
    else {
        return false;
    };

    let mut chars = after_verb.chars();
    let quote = match chars.next() {
        Some(q @ ('\'' | '"')) => q,
        _ => return false,
    };
    let after_quote = chars.as_str();

    // The closing quote must be the same character as the opening one and
    // must directly follow the table name (so `messages_fts` is rejected).
    RECOVERABLE_TABLES.iter().any(|table| {
        after_quote
            .strip_prefix(*table)
            .is_some_and(|tail| tail.starts_with(quote))
    })
}
2204
2205fn recover_historical_bundle_via_sqlite3(
2206    bundle: &HistoricalDatabaseBundle,
2207) -> Result<HistoricalReadConnection> {
2208    let tempdir = tempfile::TempDir::new().context("creating temporary salvage directory")?;
2209    let recovered_db = tempdir.path().join("historical-recovered.db");
2210    let temp_conn = FrankenConnection::open(recovered_db.to_string_lossy().as_ref())
2211        .with_context(|| format!("creating recovered database {}", recovered_db.display()))?;
2212    temp_conn
2213        .execute_batch(HISTORICAL_RECOVERY_CORE_SCHEMA)
2214        .with_context(|| format!("initializing recovered schema {}", recovered_db.display()))?;
2215    drop(temp_conn);
2216
2217    let bundle_uri = format!("file:{}?immutable=1", bundle.root_path.to_string_lossy());
2218    let mut recover = Command::new("sqlite3")
2219        .arg(&bundle_uri)
2220        .arg(".recover")
2221        .stdout(Stdio::piped())
2222        .spawn()
2223        .with_context(|| {
2224            format!(
2225                "launching sqlite3 .recover for historical bundle {}",
2226                bundle.root_path.display()
2227            )
2228        })?;
2229    let recover_stdout = recover
2230        .stdout
2231        .take()
2232        .context("capturing sqlite3 .recover stdout")?;
2233
2234    let mut importer = Command::new("sqlite3")
2235        .arg(&recovered_db)
2236        .stdin(Stdio::piped())
2237        .spawn()
2238        .with_context(|| {
2239            format!(
2240                "launching sqlite3 importer for recovered bundle {}",
2241                recovered_db.display()
2242            )
2243        })?;
2244
2245    {
2246        let importer_stdin = importer
2247            .stdin
2248            .as_mut()
2249            .context("opening sqlite3 importer stdin")?;
2250        importer_stdin
2251            .write_all(b"BEGIN;\n")
2252            .context("starting recovery import transaction")?;
2253
2254        let reader = BufReader::new(recover_stdout);
2255        for line in reader.lines() {
2256            let line = line.context("reading sqlite3 .recover output")?;
2257            if is_recoverable_insert_line(&line) {
2258                importer_stdin
2259                    .write_all(line.as_bytes())
2260                    .context("writing recovered INSERT")?;
2261                importer_stdin
2262                    .write_all(b"\n")
2263                    .context("writing recovered INSERT newline")?;
2264            }
2265        }
2266
2267        importer_stdin
2268            .write_all(b"COMMIT;\n")
2269            .context("committing recovery import transaction")?;
2270    }
2271
2272    let recover_status = recover
2273        .wait()
2274        .context("waiting for sqlite3 .recover process")?;
2275    if !recover_status.success() {
2276        anyhow::bail!(
2277            "sqlite3 .recover exited with status {} for {}",
2278            recover_status,
2279            bundle.root_path.display()
2280        );
2281    }
2282
2283    let importer_status = importer
2284        .wait()
2285        .context("waiting for sqlite3 recovery importer")?;
2286    if !importer_status.success() {
2287        anyhow::bail!(
2288            "sqlite3 recovery importer exited with status {} for {}",
2289            importer_status,
2290            recovered_db.display()
2291        );
2292    }
2293
2294    let conn = open_historical_bundle_readonly(&recovered_db)?;
2295    historical_bundle_has_queryable_core_tables(&conn)?;
2296    Ok(HistoricalReadConnection {
2297        conn,
2298        method: "sqlite3-recover",
2299        root_path: recovered_db,
2300        _tempdir: Some(tempdir),
2301    })
2302}
2303
2304fn open_historical_bundle_for_salvage(
2305    bundle: &HistoricalDatabaseBundle,
2306) -> Result<HistoricalReadConnection> {
2307    match open_historical_bundle_readonly(&bundle.root_path) {
2308        Ok(conn) => {
2309            if historical_bundle_has_queryable_core_tables(&conn).is_ok() {
2310                return Ok(HistoricalReadConnection {
2311                    conn,
2312                    method: "direct-readonly",
2313                    root_path: bundle.root_path.clone(),
2314                    _tempdir: None,
2315                });
2316            }
2317        }
2318        Err(err) => {
2319            tracing::warn!(
2320                path = %bundle.root_path.display(),
2321                error = %err,
2322                "historical bundle direct open failed; falling back to sqlite3 .recover"
2323            );
2324        }
2325    }
2326
2327    recover_historical_bundle_via_sqlite3(bundle)
2328}
2329
2330fn historical_bundle_counts(conn: &FrankenConnection) -> Result<(usize, usize)> {
2331    let conversations: i64 =
2332        conn.query_row_map("SELECT COUNT(*) FROM conversations", fparams![], |row| {
2333            row.get_typed(0)
2334        })?;
2335    let messages: i64 = conn.query_row_map("SELECT COUNT(*) FROM messages", fparams![], |row| {
2336        row.get_typed(0)
2337    })?;
2338    Ok((
2339        usize::try_from(conversations.max(0)).unwrap_or(usize::MAX),
2340        usize::try_from(messages.max(0)).unwrap_or(usize::MAX),
2341    ))
2342}
2343
2344fn clear_seeded_runtime_meta(conn: &FrankenConnection) -> Result<()> {
2345    conn.execute(
2346        "DELETE FROM meta
2347         WHERE key LIKE 'historical_bundle_salvaged:%'
2348            OR key IN ('last_scan_ts', 'last_indexed_at', 'last_embedded_message_id')",
2349    )?;
2350    Ok(())
2351}
2352
2353fn record_historical_bundle_import(
2354    conn: &FrankenConnection,
2355    bundle: &HistoricalDatabaseBundle,
2356    method: &str,
2357    conversations_imported: usize,
2358    messages_imported: usize,
2359) -> Result<()> {
2360    let key = FrankenStorage::historical_bundle_meta_key(bundle);
2361    let value = serde_json::json!({
2362        "salvage_version": HISTORICAL_SALVAGE_LEDGER_VERSION,
2363        "path": bundle.root_path.display().to_string(),
2364        "bytes": bundle.total_bytes,
2365        "modified_at_ms": bundle.modified_at_ms,
2366        "method": method,
2367        "conversations_imported": conversations_imported,
2368        "messages_imported": messages_imported,
2369        "recorded_at_ms": FrankenStorage::now_millis(),
2370    });
2371    let value_str = serde_json::to_string(&value)?;
2372    conn.execute_compat(
2373        "INSERT OR REPLACE INTO meta(key, value) VALUES(?1, ?2)",
2374        fparams![key, value_str],
2375    )?;
2376    Ok(())
2377}
2378
2379fn finalize_seeded_canonical_bundle_via_rusqlite(
2380    canonical_db_path: &Path,
2381    bundle: &HistoricalDatabaseBundle,
2382    conversations_imported: usize,
2383    messages_imported: usize,
2384) -> Result<()> {
2385    let _fts_repair =
2386        ensure_fts_consistency_via_rusqlite(canonical_db_path).with_context(|| {
2387            format!(
2388                "repairing staged canonical FTS consistency before finalization: {}",
2389                canonical_db_path.display()
2390            )
2391        })?;
2392
2393    let path_str = canonical_db_path.to_string_lossy();
2394    let conn = FrankenConnection::open(path_str.as_ref()).with_context(|| {
2395        format!(
2396            "opening seeded canonical database for post-seed finalization: {}",
2397            canonical_db_path.display()
2398        )
2399    })?;
2400    conn.execute("PRAGMA busy_timeout = 30000;")
2401        .with_context(|| {
2402            format!(
2403                "configuring busy timeout for seeded canonical database {}",
2404                canonical_db_path.display()
2405            )
2406        })?;
2407    let schema_version = read_meta_schema_version(&conn)?;
2408
2409    if let Some(version) = schema_version
2410        && version < CURRENT_SCHEMA_VERSION
2411        && version != 13
2412    {
2413        anyhow::bail!(
2414            "seeded canonical bundle schema_version {version} is too old for baseline import and cannot be finalized automatically"
2415        );
2416    }
2417
2418    clear_seeded_runtime_meta(&conn)?;
2419
2420    conn.execute_compat(
2421        "INSERT OR REPLACE INTO meta(key, value) VALUES('schema_version', ?1)",
2422        fparams![CURRENT_SCHEMA_VERSION.to_string()],
2423    )?;
2424
2425    conn.execute_compat(
2426        "INSERT OR IGNORE INTO _schema_migrations(version, name) VALUES(?1, 'fts_contentless')",
2427        fparams![CURRENT_SCHEMA_VERSION],
2428    )?;
2429    record_historical_bundle_import(
2430        &conn,
2431        bundle,
2432        "baseline-bulk-sql-copy",
2433        conversations_imported,
2434        messages_imported,
2435    )?;
2436    Ok(())
2437}
2438
2439fn read_meta_schema_version(conn: &FrankenConnection) -> Result<Option<i64>> {
2440    let version: Option<String> = conn
2441        .query_row_map(
2442            "SELECT value FROM meta WHERE key = 'schema_version'",
2443            fparams![],
2444            |row| row.get_typed(0),
2445        )
2446        .optional()?;
2447    Ok(version.and_then(|raw| raw.parse::<i64>().ok()))
2448}
2449
/// Test helper: counts the `sqlite_master` rows named `fts_messages`.
#[cfg(test)]
fn franken_fts_schema_rows(conn: &FrankenConnection) -> Result<i64> {
    let row_count: i64 = conn
        .query_row_map(
            "SELECT COUNT(*) FROM sqlite_master WHERE name = 'fts_messages'",
            fparams![],
            |row| row.get_typed(0),
        )
        .context("counting sqlite_master rows for fts_messages via frankensqlite")?;
    Ok(row_count)
}
2459
/// Test helper: `true` when a `COUNT(*)` over `fts_messages` succeeds.
#[cfg(test)]
fn franken_fts_limit_probe(conn: &FrankenConnection) -> bool {
    matches!(conn.query("SELECT COUNT(*) FROM fts_messages"), Ok(_))
}
2464
/// Test helper: opens `db_path` and gathers its health indicators.
///
/// Collected in order: schema version, `PRAGMA quick_check(1)` result, FTS
/// schema row count, FTS queryability (only attempted when exactly one schema
/// row exists). Message count and max message id are read only when the
/// quick check passed; otherwise both are reported as 0.
///
/// # Errors
///
/// Fails when the database cannot be opened or any of the executed queries
/// fails. A failing quick check is not an error.
#[cfg(test)]
#[allow(dead_code)]
pub(crate) fn probe_database_health_via_frankensqlite(
    db_path: &Path,
) -> Result<SqliteDatabaseHealthProbe> {
    let conn = FrankenConnection::open(db_path.to_string_lossy().as_ref()).with_context(|| {
        format!(
            "opening frankensqlite db at {} for database health probe",
            db_path.display()
        )
    })?;
    conn.execute_batch("PRAGMA busy_timeout = 30000;")
        .with_context(|| {
            format!(
                "configuring busy timeout for database health probe at {}",
                db_path.display()
            )
        })?;

    let schema_version = read_meta_schema_version(&conn)?;
    let quick_check_status: String = conn
        .query_row_map("PRAGMA quick_check(1)", fparams![], |row| row.get_typed(0))
        .with_context(|| format!("running PRAGMA quick_check(1) for {}", db_path.display()))?;
    let quick_check_ok = quick_check_status.trim().eq_ignore_ascii_case("ok");
    let fts_schema_rows = franken_fts_schema_rows(&conn)?;
    let fts_queryable = fts_schema_rows == 1 && franken_fts_limit_probe(&conn);

    // Row statistics are only gathered from a database that passed the quick
    // check.
    let (message_count, max_message_id) = if quick_check_ok {
        let counted: i64 = conn
            .query_row_map("SELECT COUNT(*) FROM messages", fparams![], |row| {
                row.get_typed(0)
            })
            .context("counting messages during frankensqlite database health probe")?;
        let highest_id: i64 = conn
            .query_row_map(
                "SELECT COALESCE(MAX(id), 0) FROM messages",
                fparams![],
                |row| row.get_typed(0),
            )
            .context("reading max message id during frankensqlite database health probe")?;
        (counted, highest_id)
    } else {
        (0, 0)
    };

    Ok(SqliteDatabaseHealthProbe {
        schema_version,
        quick_check_ok,
        fts_schema_rows,
        fts_queryable,
        message_count,
        max_message_id,
    })
}
2526
/// A historical bundle copied into a temporary staging directory, ready to be
/// promoted to the canonical database path.
struct StagedHistoricalSeed {
    /// Staging directory, created inside the canonical database's parent
    /// directory; also used to hold the pre-seed backup of the canonical
    /// database during promotion.
    tempdir: tempfile::TempDir,
    /// Path of the staged database copy inside `tempdir`.
    db_path: PathBuf,
}
2531
2532fn stage_historical_bundle_for_seed(
2533    canonical_db_path: &Path,
2534    source_root_path: &Path,
2535) -> Result<StagedHistoricalSeed> {
2536    let canonical_parent = canonical_db_path.parent().unwrap_or_else(|| Path::new("."));
2537    fs::create_dir_all(canonical_parent).with_context(|| {
2538        format!(
2539            "creating canonical database directory before bulk historical seed import: {}",
2540            canonical_parent.display()
2541        )
2542    })?;
2543    let tempdir = tempfile::TempDir::new_in(canonical_parent)
2544        .context("creating temporary baseline seed directory")?;
2545    let staged_seed_db = tempdir.path().join("baseline-seed-output.db");
2546    copy_database_bundle(source_root_path, &staged_seed_db)?;
2547
2548    Ok(StagedHistoricalSeed {
2549        tempdir,
2550        db_path: staged_seed_db,
2551    })
2552}
2553
2554fn promote_staged_historical_seed(
2555    canonical_db_path: &Path,
2556    staged_seed: &StagedHistoricalSeed,
2557) -> Result<()> {
2558    let canonical_backup = staged_seed
2559        .tempdir
2560        .path()
2561        .join("pre-seed-canonical-backup.db");
2562    let had_canonical = canonical_db_path.exists()
2563        || database_sidecar_path(canonical_db_path, "-wal").exists()
2564        || database_sidecar_path(canonical_db_path, "-shm").exists();
2565
2566    if had_canonical {
2567        move_database_bundle(canonical_db_path, &canonical_backup).with_context(|| {
2568            format!(
2569                "backing up canonical database before promoting staged historical seed import: {}",
2570                canonical_db_path.display()
2571            )
2572        })?;
2573    }
2574
2575    if let Err(err) =
2576        move_database_bundle(&staged_seed.db_path, canonical_db_path).with_context(|| {
2577            format!(
2578                "promoting staged historical seed database bundle {} into canonical path {}",
2579                staged_seed.db_path.display(),
2580                canonical_db_path.display()
2581            )
2582        })
2583    {
2584        if had_canonical {
2585            let _ = move_database_bundle(&canonical_backup, canonical_db_path);
2586        }
2587        return Err(err);
2588    }
2589
2590    Ok(())
2591}
2592
2593pub(crate) fn seed_canonical_from_best_historical_bundle(
2594    canonical_db_path: &Path,
2595) -> Result<Option<HistoricalSalvageOutcome>> {
2596    let ordered_bundles = discover_historical_database_bundles(canonical_db_path);
2597    let mut last_seed_error: Option<anyhow::Error> = None;
2598    for bundle in ordered_bundles {
2599        if let Some(version) = bundle.probe.schema_version
2600            && version < 13
2601        {
2602            let err = anyhow!(
2603                "historical bundle {} schema_version {version} is too old for baseline import",
2604                bundle.root_path.display()
2605            );
2606            tracing::warn!(
2607                path = %bundle.root_path.display(),
2608                schema_version = version,
2609                "historical bundle is too old for baseline seed import"
2610            );
2611            last_seed_error = Some(err);
2612            continue;
2613        }
2614
2615        let source = open_historical_bundle_for_salvage(&bundle).with_context(|| {
2616            format!(
2617                "opening historical seed bundle {} for baseline import",
2618                bundle.root_path.display()
2619            )
2620        })?;
2621        let (conversations_imported, messages_imported) = historical_bundle_counts(&source.conn)?;
2622
2623        let staged_seed = match stage_historical_bundle_for_seed(
2624            canonical_db_path,
2625            &source.root_path,
2626        ) {
2627            Ok(staged_seed) => staged_seed,
2628            Err(err) => {
2629                tracing::warn!(
2630                    path = %bundle.root_path.display(),
2631                    error = %err,
2632                    "bulk baseline seed staging from historical bundle failed; trying next candidate"
2633                );
2634                last_seed_error = Some(err);
2635                continue;
2636            }
2637        };
2638
2639        if let Err(err) = finalize_seeded_canonical_bundle_via_rusqlite(
2640            &staged_seed.db_path,
2641            &bundle,
2642            conversations_imported,
2643            messages_imported,
2644        ) {
2645            tracing::warn!(
2646                path = %bundle.root_path.display(),
2647                error = %err,
2648                "finalizing staged historical seed import failed; trying next candidate"
2649            );
2650            last_seed_error = Some(err);
2651            continue;
2652        }
2653
2654        if let Err(err) = promote_staged_historical_seed(canonical_db_path, &staged_seed) {
2655            tracing::warn!(
2656                path = %bundle.root_path.display(),
2657                error = %err,
2658                "promoting staged historical seed import failed; trying next candidate"
2659            );
2660            last_seed_error = Some(err);
2661            continue;
2662        }
2663
2664        tracing::info!(
2665            path = %bundle.root_path.display(),
2666            conversations_imported,
2667            messages_imported,
2668            "seeded empty canonical database from largest healthy historical bundle"
2669        );
2670
2671        return Ok(Some(HistoricalSalvageOutcome {
2672            bundles_considered: 0,
2673            bundles_imported: 1,
2674            conversations_imported,
2675            messages_imported,
2676        }));
2677    }
2678    if let Some(err) = last_seed_error {
2679        return Err(err);
2680    }
2681    Ok(None)
2682}
2683
2684fn parse_json_column(value: Option<String>) -> serde_json::Value {
2685    value
2686        .and_then(|raw| serde_json::from_str(&raw).ok())
2687        .unwrap_or(serde_json::Value::Null)
2688}
2689
/// Key of the single-entry JSON object that carries a historical JSON column as
/// an unparsed string (built by `wrap_historical_raw_json`, recognised by
/// `historical_raw_json`).
const HISTORICAL_RAW_JSON_SENTINEL_KEY: &str = "__cass_historical_raw_json__";
2691
2692fn wrap_historical_raw_json(raw: String) -> serde_json::Value {
2693    serde_json::json!({ HISTORICAL_RAW_JSON_SENTINEL_KEY: raw })
2694}
2695
2696fn historical_raw_json(value: &serde_json::Value) -> Option<&str> {
2697    match value {
2698        serde_json::Value::Object(map) if map.len() == 1 => map
2699            .get(HISTORICAL_RAW_JSON_SENTINEL_KEY)
2700            .and_then(serde_json::Value::as_str),
2701        _ => None,
2702    }
2703}
2704
2705fn parse_historical_json_column(value: Option<String>) -> serde_json::Value {
2706    match value {
2707        Some(raw) if raw.trim().is_empty() => serde_json::Value::Null,
2708        Some(raw) => wrap_historical_raw_json(raw),
2709        None => serde_json::Value::Null,
2710    }
2711}
2712
/// Reports whether the `CASS_DEBUG_HISTORICAL_SALVAGE` environment variable is
/// set. Only presence is checked; the value (even an empty one) is ignored.
fn historical_salvage_debug_enabled() -> bool {
    const FLAG: &str = "CASS_DEBUG_HISTORICAL_SALVAGE";
    std::env::var_os(FLAG).is_some()
}
2716
/// Batch-size limits for historical imports, as resolved by
/// `historical_import_batch_limits` from CPU-count-based defaults and
/// environment overrides.
#[derive(Debug, Clone, Copy)]
struct HistoricalImportBatchLimits {
    /// Conversation limit; overridable via
    /// `CASS_HISTORICAL_IMPORT_BATCH_CONVERSATIONS`.
    conversations: usize,
    /// Message limit; overridable via `CASS_HISTORICAL_IMPORT_BATCH_MESSAGES`.
    messages: usize,
    /// Payload-size limit; overridable via `CASS_HISTORICAL_IMPORT_BATCH_CHARS`.
    // NOTE(review): presumably compared against `message_payload_size_hint`
    // totals (byte lengths) — confirm at the use site.
    payload_chars: usize,
}
2723
2724fn env_positive_usize(key: &str) -> Option<usize> {
2725    dotenvy::var(key)
2726        .ok()
2727        .and_then(|value| value.parse::<usize>().ok())
2728        .filter(|value| *value > 0)
2729}
2730
2731fn historical_import_batch_limits() -> HistoricalImportBatchLimits {
2732    let cpu_count = std::thread::available_parallelism()
2733        .map(std::num::NonZeroUsize::get)
2734        .unwrap_or(1);
2735
2736    let default_limits = if cpu_count >= 32 {
2737        HistoricalImportBatchLimits {
2738            conversations: 128,
2739            messages: 16_384,
2740            payload_chars: 12_000_000,
2741        }
2742    } else {
2743        HistoricalImportBatchLimits {
2744            conversations: 32,
2745            messages: 4_096,
2746            payload_chars: 3_000_000,
2747        }
2748    };
2749
2750    HistoricalImportBatchLimits {
2751        conversations: env_positive_usize("CASS_HISTORICAL_IMPORT_BATCH_CONVERSATIONS")
2752            .unwrap_or(default_limits.conversations),
2753        messages: env_positive_usize("CASS_HISTORICAL_IMPORT_BATCH_MESSAGES")
2754            .unwrap_or(default_limits.messages),
2755        payload_chars: env_positive_usize("CASS_HISTORICAL_IMPORT_BATCH_CHARS")
2756            .unwrap_or(default_limits.payload_chars),
2757    }
2758}
2759
2760fn json_value_size_hint(value: &serde_json::Value) -> usize {
2761    if let Some(raw) = historical_raw_json(value) {
2762        return raw.len();
2763    }
2764    match value {
2765        serde_json::Value::Null => 0,
2766        other => serde_json::to_string(other)
2767            .map(|raw| raw.len())
2768            .unwrap_or(0),
2769    }
2770}
2771
2772fn message_payload_size_hint(message: &Message) -> usize {
2773    message
2774        .content
2775        .len()
2776        .saturating_add(json_value_size_hint(&message.extra_json))
2777}
2778
/// Returns `true` when `name` starts with `prefix` and is not a `-wal` or
/// `-shm` sidecar file name.
fn is_backup_root_name(name: &str, prefix: &str) -> bool {
    if !name.starts_with(prefix) {
        return false;
    }
    let is_wal_or_shm = ["-wal", "-shm"]
        .iter()
        .any(|suffix| name.ends_with(suffix));
    !is_wal_or_shm
}
2782
// Recognises file names that belong to sqlite sidecar files, which must never
// be re-opened as a database root: the standard -wal/-shm pair and
// frankensqlite's Windows advisory-lock sidecars
// (-lock-shared/-lock-reserved/-lock-pending).
//
// Used by the directory enumeration in `historical_bundle_root_paths`. It is
// intentionally NOT used by `is_backup_root_name`, because the backup-rotation
// cleanup must keep sweeping up any pre-existing orphan lock sidecars.
fn has_db_sidecar_suffix(name: &str) -> bool {
    for suffix in [
        "-wal",
        "-shm",
        "-lock-shared",
        "-lock-reserved",
        "-lock-pending",
    ] {
        if name.ends_with(suffix) {
            return true;
        }
    }
    false
}
2799
/// Public schema version constant for external checks.
pub const CURRENT_SCHEMA_VERSION: i64 = 20;
/// Oldest schema version that can be migrated in place; databases with a lower
/// positive version are reported by `check_schema_compatibility` as needing a
/// rebuild.
const MIN_IN_PLACE_MIGRATION_SCHEMA_VERSION: i64 = 13;
2803
/// Result of checking schema compatibility.
///
/// Produced by `check_schema_compatibility` from the `schema_version` stored in
/// the database's `meta` table.
#[derive(Debug, Clone)]
pub enum SchemaCheck {
    /// Schema is up to date, no migration needed.
    Compatible,
    /// Schema needs migration but can be done incrementally.
    ///
    /// Also reported for a database that contains no tables at all.
    NeedsMigration,
    /// Schema is incompatible and needs a full rebuild (with reason).
    NeedsRebuild(String),
}
2814
2815fn schema_check_error_requires_rebuild(err: &frankensqlite::FrankenError) -> bool {
2816    // Only on-disk corruption classes justify destructive rebuild.
2817    // Locking, open, and generic I/O failures are often transient and must
2818    // surface as errors rather than deleting the database under the caller.
2819    matches!(
2820        err,
2821        frankensqlite::FrankenError::DatabaseCorrupt { .. }
2822            | frankensqlite::FrankenError::WalCorrupt { .. }
2823            | frankensqlite::FrankenError::NotADatabase { .. }
2824            | frankensqlite::FrankenError::ShortRead { .. }
2825    )
2826}
2827
/// Builds a sibling path for a backup of `path`, named
/// `<file name>.backup.<pid>.<nanoseconds since epoch>.<nonce>`.
///
/// The nonce is a process-wide counter, so repeated calls within one process
/// never return the same path. The file name falls back to `db` when `path`
/// has no UTF-8 file name, and the timestamp falls back to 0 when the system
/// clock reads earlier than the Unix epoch.
fn unique_backup_path(path: &Path) -> PathBuf {
    static NEXT_NONCE: AtomicU64 = AtomicU64::new(0);

    let nanos_since_epoch = match SystemTime::now().duration_since(UNIX_EPOCH) {
        Ok(elapsed) => elapsed.as_nanos(),
        Err(_) => 0,
    };
    let nonce = NEXT_NONCE.fetch_add(1, Ordering::Relaxed);
    let base_name = match path.file_name().and_then(|name| name.to_str()) {
        Some(name) => name,
        None => "db",
    };
    let pid = std::process::id();

    path.with_file_name(format!(
        "{base_name}.backup.{pid}.{nanos_since_epoch}.{nonce}"
    ))
}
2845
/// Derives the hidden staging path used while a backup is being vacuumed:
/// a sibling of `backup_path` named `.<file name>.vacuum-in-progress`.
///
/// The file name falls back to `db.backup` when `backup_path` has no UTF-8
/// file name.
fn vacuum_stage_backup_path(backup_path: &Path) -> PathBuf {
    let base_name = match backup_path.file_name().and_then(|name| name.to_str()) {
        Some(name) => name,
        None => "db.backup",
    };
    let staged_name = format!(".{base_name}.vacuum-in-progress");
    backup_path.with_file_name(staged_name)
}
2853
2854/// Check schema compatibility without modifying the database.
2855///
2856/// Opens the database read-only and checks the schema version.
2857fn check_schema_compatibility(
2858    path: &Path,
2859) -> std::result::Result<SchemaCheck, frankensqlite::FrankenError> {
2860    let mut conn = open_franken_with_flags(
2861        &path.to_string_lossy(),
2862        FrankenOpenFlags::SQLITE_OPEN_READ_ONLY,
2863    )?;
2864
2865    let result = (|| {
2866        // Check if meta table exists
2867        let meta_exists: i32 = conn.query_row_map(
2868            "SELECT COUNT(*) FROM sqlite_master WHERE type='table' AND name='meta'",
2869            fparams![],
2870            |row| row.get_typed(0),
2871        )?;
2872
2873        if meta_exists == 0 {
2874            // No meta table - could be empty or very old schema, needs rebuild
2875            // But first check if there are any tables at all
2876            let table_count: i32 = conn.query_row_map(
2877                "SELECT COUNT(*) FROM sqlite_master WHERE type='table'",
2878                fparams![],
2879                |row| row.get_typed(0),
2880            )?;
2881
2882            if table_count == 0 {
2883                // Empty database, will be initialized fresh
2884                return Ok(SchemaCheck::NeedsMigration);
2885            }
2886
2887            // Has tables but no meta - very old or corrupted
2888            return Ok(SchemaCheck::NeedsRebuild(
2889                "Database missing schema version metadata".to_string(),
2890            ));
2891        }
2892
2893        // Get the schema version
2894        let version: Option<i64> = conn
2895            .query_row_map(
2896                "SELECT value FROM meta WHERE key = 'schema_version'",
2897                fparams![],
2898                |row| Ok(row.get_typed::<String>(0)?.parse().ok()),
2899            )
2900            .ok()
2901            .flatten();
2902
2903        match version {
2904            Some(v) if v == SCHEMA_VERSION => Ok(SchemaCheck::Compatible),
2905            Some(v) if (MIN_IN_PLACE_MIGRATION_SCHEMA_VERSION..SCHEMA_VERSION).contains(&v) => {
2906                Ok(SchemaCheck::NeedsMigration)
2907            }
2908            Some(v) if v > 0 && v < MIN_IN_PLACE_MIGRATION_SCHEMA_VERSION => {
2909                Ok(SchemaCheck::NeedsRebuild(format!(
2910                    "Schema version {} is too old for in-place migration; supported upgrade path starts at version {}",
2911                    v, MIN_IN_PLACE_MIGRATION_SCHEMA_VERSION
2912                )))
2913            }
2914            Some(v) => {
2915                // v > SCHEMA_VERSION - database is from a newer version
2916                Ok(SchemaCheck::NeedsRebuild(format!(
2917                    "Schema version {} is newer than supported version {}",
2918                    v, SCHEMA_VERSION
2919                )))
2920            }
2921            None => Ok(SchemaCheck::NeedsRebuild(
2922                "Schema version not found or invalid".to_string(),
2923            )),
2924        }
2925    })();
2926
2927    if let Err(close_err) = conn.close_in_place() {
2928        tracing::warn!(
2929            error = %close_err,
2930            db_path = %path.display(),
2931            "check_schema_compatibility: close_in_place failed; falling back to best-effort close"
2932        );
2933        conn.close_best_effort_in_place();
2934    }
2935
2936    result
2937}
2938
/// Module-internal alias of [`CURRENT_SCHEMA_VERSION`], used by
/// `check_schema_compatibility`.
const SCHEMA_VERSION: i64 = CURRENT_SCHEMA_VERSION;
2940
/// Schema migration V1 SQL (compiled for tests only): creates the `meta`,
/// `agents`, `workspaces`, `conversations`, `messages`, `snippets`, `tags`, and
/// `conversation_tags` tables plus the initial conversation and message indexes.
#[cfg(test)]
const MIGRATION_V1: &str = r"
PRAGMA foreign_keys = ON;

CREATE TABLE IF NOT EXISTS meta (
    key TEXT PRIMARY KEY,
    value TEXT NOT NULL
);

CREATE TABLE IF NOT EXISTS agents (
    id INTEGER PRIMARY KEY,
    slug TEXT NOT NULL UNIQUE,
    name TEXT NOT NULL,
    version TEXT,
    kind TEXT NOT NULL,
    created_at INTEGER NOT NULL,
    updated_at INTEGER NOT NULL
);

CREATE TABLE IF NOT EXISTS workspaces (
    id INTEGER PRIMARY KEY,
    path TEXT NOT NULL UNIQUE,
    display_name TEXT
);

CREATE TABLE IF NOT EXISTS conversations (
    id INTEGER PRIMARY KEY,
    agent_id INTEGER NOT NULL REFERENCES agents(id),
    workspace_id INTEGER REFERENCES workspaces(id),
    external_id TEXT,
    title TEXT,
    source_path TEXT NOT NULL,
    started_at INTEGER,
    ended_at INTEGER,
    approx_tokens INTEGER,
    metadata_json TEXT,
    UNIQUE(agent_id, external_id)
);

CREATE TABLE IF NOT EXISTS messages (
    id INTEGER PRIMARY KEY,
    conversation_id INTEGER NOT NULL REFERENCES conversations(id) ON DELETE CASCADE,
    idx INTEGER NOT NULL,
    role TEXT NOT NULL,
    author TEXT,
    created_at INTEGER,
    content TEXT NOT NULL,
    extra_json TEXT,
    UNIQUE(conversation_id, idx)
);

CREATE TABLE IF NOT EXISTS snippets (
    id INTEGER PRIMARY KEY,
    message_id INTEGER NOT NULL REFERENCES messages(id) ON DELETE CASCADE,
    file_path TEXT,
    start_line INTEGER,
    end_line INTEGER,
    language TEXT,
    snippet_text TEXT
);

CREATE TABLE IF NOT EXISTS tags (
    id INTEGER PRIMARY KEY,
    name TEXT NOT NULL UNIQUE
);

CREATE TABLE IF NOT EXISTS conversation_tags (
    conversation_id INTEGER NOT NULL REFERENCES conversations(id) ON DELETE CASCADE,
    tag_id INTEGER NOT NULL REFERENCES tags(id) ON DELETE CASCADE,
    PRIMARY KEY (conversation_id, tag_id)
);

CREATE INDEX IF NOT EXISTS idx_conversations_agent_started
    ON conversations(agent_id, started_at DESC);

CREATE INDEX IF NOT EXISTS idx_messages_conv_idx
    ON messages(conversation_id, idx);

";
3020
/// Schema migration V2 SQL (compiled for tests only): creates the
/// `fts_messages` FTS5 virtual table (porter tokenizer) and populates it from
/// the existing messages joined with their conversation, agent, and workspace.
#[cfg(test)]
const MIGRATION_V2: &str = r"
CREATE VIRTUAL TABLE IF NOT EXISTS fts_messages USING fts5(
    content,
    title,
    agent,
    workspace,
    source_path,
    created_at UNINDEXED,
    message_id UNINDEXED,
    tokenize='porter'
);
INSERT INTO fts_messages(content, title, agent, workspace, source_path, created_at, message_id)
SELECT
    m.content,
    c.title,
    a.slug,
    w.path,
    c.source_path,
    m.created_at,
    m.id
FROM messages m
JOIN conversations c ON m.conversation_id = c.id
JOIN agents a ON c.agent_id = a.id
LEFT JOIN workspaces w ON c.workspace_id = w.id;
";
3047
/// Schema migration V3 SQL (compiled for tests only): drops and recreates the
/// `fts_messages` FTS5 virtual table, then repopulates it from the existing
/// messages.
// NOTE(review): `#[allow(dead_code)]` suggests no test currently references
// this constant — confirm before removing either the attribute or the const.
#[cfg(test)]
#[allow(dead_code)]
const MIGRATION_V3: &str = r"
DROP TABLE IF EXISTS fts_messages;
CREATE VIRTUAL TABLE fts_messages USING fts5(
    content,
    title,
    agent,
    workspace,
    source_path,
    created_at UNINDEXED,
    message_id UNINDEXED,
    tokenize='porter'
);
INSERT INTO fts_messages(content, title, agent, workspace, source_path, created_at, message_id)
SELECT
    m.content,
    c.title,
    a.slug,
    w.path,
    c.source_path,
    m.created_at,
    m.id
FROM messages m
JOIN conversations c ON m.conversation_id = c.id
JOIN agents a ON c.agent_id = a.id
LEFT JOIN workspaces w ON c.workspace_id = w.id;
";
3076
/// Schema migration V4 SQL (compiled for tests only): creates the `sources`
/// table and inserts the default `local` source row if it is not present.
#[cfg(test)]
const MIGRATION_V4: &str = r"
-- Sources table for tracking where conversations come from
CREATE TABLE IF NOT EXISTS sources (
    id TEXT PRIMARY KEY,           -- source_id (e.g., 'local', 'work-laptop')
    kind TEXT NOT NULL,            -- 'local', 'ssh', etc.
    host_label TEXT,               -- display label
    machine_id TEXT,               -- optional stable machine id
    platform TEXT,                 -- 'macos', 'linux', 'windows'
    config_json TEXT,              -- JSON blob for extra config (SSH params, path rewrites)
    created_at INTEGER NOT NULL,
    updated_at INTEGER NOT NULL
);

-- Bootstrap: Insert the default 'local' source
INSERT OR IGNORE INTO sources (id, kind, host_label, created_at, updated_at)
VALUES ('local', 'local', NULL, strftime('%s','now')*1000, strftime('%s','now')*1000);
";
3095
/// Schema migration V5 SQL (compiled for tests only): rebuilds `conversations`
/// with the `source_id` and `origin_host` provenance columns and a
/// `UNIQUE(source_id, agent_id, external_id)` constraint, copying existing rows
/// with `source_id = 'local'`, then recreates the conversation indexes.
#[cfg(test)]
const MIGRATION_V5: &str = r"
-- Add provenance columns to conversations table
-- SQLite cannot alter unique constraints, so we need to recreate the table

-- Create new table with provenance columns and updated unique constraint
CREATE TABLE conversations_new (
    id INTEGER PRIMARY KEY,
    agent_id INTEGER NOT NULL REFERENCES agents(id),
    workspace_id INTEGER REFERENCES workspaces(id),
    source_id TEXT NOT NULL DEFAULT 'local' REFERENCES sources(id),
    external_id TEXT,
    title TEXT,
    source_path TEXT NOT NULL,
    started_at INTEGER,
    ended_at INTEGER,
    approx_tokens INTEGER,
    metadata_json TEXT,
    origin_host TEXT,
    UNIQUE(source_id, agent_id, external_id)
);

-- Copy data from old table (all existing conversations get source_id='local')
INSERT INTO conversations_new (id, agent_id, workspace_id, source_id, external_id, title,
                               source_path, started_at, ended_at, approx_tokens, metadata_json, origin_host)
SELECT id, agent_id, workspace_id, 'local', external_id, title,
       source_path, started_at, ended_at, approx_tokens, metadata_json, NULL
FROM conversations;

-- Drop old table and rename new
DROP TABLE conversations;
ALTER TABLE conversations_new RENAME TO conversations;

-- Recreate indexes
CREATE INDEX IF NOT EXISTS idx_conversations_agent_started ON conversations(agent_id, started_at DESC);
CREATE INDEX IF NOT EXISTS idx_conversations_source_id ON conversations(source_id);
";
3133
/// Schema migration V6 SQL (compiled for tests only): adds an index on
/// `conversations(source_path)`.
#[cfg(test)]
const MIGRATION_V6: &str = r"
-- Optimize lookup by source_path (used by TUI detail view)
CREATE INDEX IF NOT EXISTS idx_conversations_source_path ON conversations(source_path);
";
3139
/// Schema migration V7 SQL (compiled for tests only): adds the `metadata_bin`
/// BLOB column to `conversations` and the `extra_bin` BLOB column to `messages`.
#[cfg(test)]
const MIGRATION_V7: &str = r"
-- Add binary columns for MessagePack serialization (Opt 3.1)
-- Binary format is 50-70% smaller than JSON and faster to parse
ALTER TABLE conversations ADD COLUMN metadata_bin BLOB;
ALTER TABLE messages ADD COLUMN extra_bin BLOB;
";
3147
/// Schema migration V8 SQL (compiled for tests only): creates the `daily_stats`
/// aggregate table, keyed by `(day_id, agent_slug, source_id)`, and its two
/// lookup indexes.
#[cfg(test)]
const MIGRATION_V8: &str = r"
-- Opt 3.2: Daily stats materialized table for O(1) time-range histograms
-- Provides fast aggregated queries for stats/dashboard without full table scans

CREATE TABLE IF NOT EXISTS daily_stats (
    day_id INTEGER NOT NULL,              -- Days since 2020-01-01 (Unix epoch + offset)
    agent_slug TEXT NOT NULL,             -- 'all' for totals, or specific agent slug
    source_id TEXT NOT NULL DEFAULT 'all', -- 'all' for totals, or specific source
    session_count INTEGER NOT NULL DEFAULT 0,
    message_count INTEGER NOT NULL DEFAULT 0,
    total_chars INTEGER NOT NULL DEFAULT 0,
    last_updated INTEGER NOT NULL,
    PRIMARY KEY (day_id, agent_slug, source_id)
);

CREATE INDEX IF NOT EXISTS idx_daily_stats_agent ON daily_stats(agent_slug, day_id);
CREATE INDEX IF NOT EXISTS idx_daily_stats_source ON daily_stats(source_id, day_id);
";
3167
/// Schema migration V9 SQL (compiled for tests only): creates the
/// `embedding_jobs` table and a partial unique index that allows at most one
/// pending or running job per `(db_path, model_id)`.
#[cfg(test)]
const MIGRATION_V9: &str = r"
-- Background embedding jobs tracking table
CREATE TABLE IF NOT EXISTS embedding_jobs (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    db_path TEXT NOT NULL,
    model_id TEXT NOT NULL,
    status TEXT NOT NULL DEFAULT 'pending',
    total_docs INTEGER NOT NULL DEFAULT 0,
    completed_docs INTEGER NOT NULL DEFAULT 0,
    error_message TEXT,
    created_at TEXT NOT NULL DEFAULT (datetime('now')),
    started_at TEXT,
    completed_at TEXT
);

-- Only one pending or running job per (db_path, model_id) at a time.
-- Multiple completed/failed/cancelled jobs are allowed for history.
CREATE UNIQUE INDEX IF NOT EXISTS idx_embedding_jobs_active
ON embedding_jobs(db_path, model_id)
WHERE status IN ('pending', 'running');
";
3190
/// Schema migration V10 SQL (compiled for tests only): creates the
/// `token_usage` ledger, the `token_daily_stats` rollup table, and the
/// `model_pricing` lookup table (seeded with pricing rows), and adds token
/// summary columns to `conversations`.
#[cfg(test)]
const MIGRATION_V10: &str = r"
-- Token analytics: per-message token usage ledger
CREATE TABLE IF NOT EXISTS token_usage (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    message_id INTEGER NOT NULL REFERENCES messages(id) ON DELETE CASCADE,
    conversation_id INTEGER NOT NULL,
    agent_id INTEGER NOT NULL,
    workspace_id INTEGER,
    source_id TEXT NOT NULL DEFAULT 'local',

    -- Timing
    timestamp_ms INTEGER NOT NULL,
    day_id INTEGER NOT NULL,

    -- Model identification
    model_name TEXT,
    model_family TEXT,
    model_tier TEXT,
    service_tier TEXT,
    provider TEXT,

    -- Token counts (nullable — not all agents provide all fields)
    input_tokens INTEGER,
    output_tokens INTEGER,
    cache_read_tokens INTEGER,
    cache_creation_tokens INTEGER,
    thinking_tokens INTEGER,
    total_tokens INTEGER,

    -- Cost estimation
    estimated_cost_usd REAL,

    -- Message context
    role TEXT NOT NULL,
    content_chars INTEGER NOT NULL,
    has_tool_calls INTEGER NOT NULL DEFAULT 0,
    tool_call_count INTEGER NOT NULL DEFAULT 0,

    -- Data quality
    data_source TEXT NOT NULL DEFAULT 'api',

    UNIQUE(message_id)
);

CREATE INDEX IF NOT EXISTS idx_token_usage_day ON token_usage(day_id, agent_id);
CREATE INDEX IF NOT EXISTS idx_token_usage_conv ON token_usage(conversation_id);
CREATE INDEX IF NOT EXISTS idx_token_usage_model ON token_usage(model_family, day_id);
CREATE INDEX IF NOT EXISTS idx_token_usage_workspace ON token_usage(workspace_id, day_id);
CREATE INDEX IF NOT EXISTS idx_token_usage_timestamp ON token_usage(timestamp_ms);

-- Token analytics: pre-aggregated daily rollups
CREATE TABLE IF NOT EXISTS token_daily_stats (
    day_id INTEGER NOT NULL,
    agent_slug TEXT NOT NULL,
    source_id TEXT NOT NULL DEFAULT 'all',
    model_family TEXT NOT NULL DEFAULT 'all',

    api_call_count INTEGER NOT NULL DEFAULT 0,
    user_message_count INTEGER NOT NULL DEFAULT 0,
    assistant_message_count INTEGER NOT NULL DEFAULT 0,
    tool_message_count INTEGER NOT NULL DEFAULT 0,

    total_input_tokens INTEGER NOT NULL DEFAULT 0,
    total_output_tokens INTEGER NOT NULL DEFAULT 0,
    total_cache_read_tokens INTEGER NOT NULL DEFAULT 0,
    total_cache_creation_tokens INTEGER NOT NULL DEFAULT 0,
    total_thinking_tokens INTEGER NOT NULL DEFAULT 0,
    grand_total_tokens INTEGER NOT NULL DEFAULT 0,

    total_content_chars INTEGER NOT NULL DEFAULT 0,
    total_tool_calls INTEGER NOT NULL DEFAULT 0,

    estimated_cost_usd REAL NOT NULL DEFAULT 0.0,

    session_count INTEGER NOT NULL DEFAULT 0,

    last_updated INTEGER NOT NULL,

    PRIMARY KEY (day_id, agent_slug, source_id, model_family)
);

CREATE INDEX IF NOT EXISTS idx_token_daily_stats_agent ON token_daily_stats(agent_slug, day_id);
CREATE INDEX IF NOT EXISTS idx_token_daily_stats_model ON token_daily_stats(model_family, day_id);

-- Model pricing lookup table
CREATE TABLE IF NOT EXISTS model_pricing (
    model_pattern TEXT NOT NULL,
    provider TEXT NOT NULL,
    input_cost_per_mtok REAL NOT NULL,
    output_cost_per_mtok REAL NOT NULL,
    cache_read_cost_per_mtok REAL,
    cache_creation_cost_per_mtok REAL,
    effective_date TEXT NOT NULL,
    PRIMARY KEY (model_pattern, effective_date)
);

-- Seed with current pricing (as of 2026-02)
INSERT OR IGNORE INTO model_pricing VALUES
    ('claude-opus-4%', 'anthropic', 15.0, 75.0, 1.5, 18.75, '2025-10-01'),
    ('claude-sonnet-4%', 'anthropic', 3.0, 15.0, 0.3, 3.75, '2025-10-01'),
    ('claude-haiku-4%', 'anthropic', 0.80, 4.0, 0.08, 1.0, '2025-10-01'),
    ('gpt-4o%', 'openai', 2.50, 10.0, NULL, NULL, '2025-01-01'),
    ('gpt-4-turbo%', 'openai', 10.0, 30.0, NULL, NULL, '2024-04-01'),
    ('gpt-4.1%', 'openai', 2.0, 8.0, NULL, NULL, '2025-04-01'),
    ('o3%', 'openai', 2.0, 8.0, NULL, NULL, '2025-04-01'),
    ('o4-mini%', 'openai', 1.10, 4.40, NULL, NULL, '2025-04-01'),
    ('gemini-2%flash%', 'google', 0.075, 0.30, NULL, NULL, '2025-01-01'),
    ('gemini-2%pro%', 'google', 1.25, 10.0, NULL, NULL, '2025-01-01');

-- Extend conversations table with token summary columns
ALTER TABLE conversations ADD COLUMN total_input_tokens INTEGER;
ALTER TABLE conversations ADD COLUMN total_output_tokens INTEGER;
ALTER TABLE conversations ADD COLUMN total_cache_read_tokens INTEGER;
ALTER TABLE conversations ADD COLUMN total_cache_creation_tokens INTEGER;
ALTER TABLE conversations ADD COLUMN grand_total_tokens INTEGER;
ALTER TABLE conversations ADD COLUMN estimated_cost_usd REAL;
ALTER TABLE conversations ADD COLUMN primary_model TEXT;
ALTER TABLE conversations ADD COLUMN api_call_count INTEGER;
ALTER TABLE conversations ADD COLUMN tool_call_count INTEGER;
ALTER TABLE conversations ADD COLUMN user_message_count INTEGER;
ALTER TABLE conversations ADD COLUMN assistant_message_count INTEGER;
";
3314
/// Schema migration V14 SQL: drops the existing `fts_messages` table so it can
/// be recreated later in contentless form (see the SQL comments for the
/// rationale).
const MIGRATION_V14: &str = r"
-- Switch FTS5 from internal-content to contentless mode (CASS #163).
-- Drop the old V13 internal-content fts_messages first so that
-- sqlite_schema does not contain two conflicting CREATE VIRTUAL TABLE
-- entries, which makes the database completely unreadable.
-- The current contentless table is recreated lazily after open() only when the
-- frankensqlite FTS consistency check finds it missing or malformed.
DROP TABLE IF EXISTS fts_messages;
";
3324
/// SQL that creates the `conversation_tail_state` table without populating it.
/// The table definition matches the one in `MIGRATION_V18`.
const MIGRATION_V15_TAIL_STATE_TABLE: &str = r"
CREATE TABLE IF NOT EXISTS conversation_tail_state (
    -- Deliberately no FOREIGN KEY: this hot row is maintained by insert/append
    -- paths, and FK metadata keeps frankensqlite off the direct rowid update path.
    conversation_id INTEGER PRIMARY KEY,
    ended_at INTEGER,
    last_message_idx INTEGER,
    last_message_created_at INTEGER
);
";
3335
/// Schema migration V16 SQL: drops the `idx_messages_conv_idx` index, which
/// duplicates the index implied by `UNIQUE(conversation_id, idx)`.
const MIGRATION_V16: &str = r"
-- UNIQUE(conversation_id, idx) already creates sqlite_autoindex_messages_1,
-- which covers the same lookup/order key as idx_messages_conv_idx. Keeping both
-- doubles message insert index maintenance on the hot indexing path.
DROP INDEX IF EXISTS idx_messages_conv_idx;
";
3342
/// Schema migration V17 SQL: drops the `idx_messages_created` index.
const MIGRATION_V17: &str = r"
-- Drop the global messages(created_at) secondary index from the ingest hot
-- path. Search/time filters are served by the derived search layer and
-- conversation/analytics indexes, while this index is maintained on every
-- message insert.
DROP INDEX IF EXISTS idx_messages_created;
";
3350
/// Schema migration V18 SQL: creates `conversation_tail_state` and backfills it
/// from every `conversations` row that has at least one of `ended_at`,
/// `last_message_idx`, or `last_message_created_at` set.
const MIGRATION_V18: &str = r"
-- Move append-tail state out of the wide, indexed conversations row. The hot
-- append path updates this cache for every appended conversation; keeping it in
-- a tiny rowid table avoids rewriting the large conversation record.
CREATE TABLE IF NOT EXISTS conversation_tail_state (
    -- Deliberately no FOREIGN KEY: this hot row is maintained by insert/append
    -- paths, and FK metadata keeps frankensqlite off the direct rowid update path.
    conversation_id INTEGER PRIMARY KEY,
    ended_at INTEGER,
    last_message_idx INTEGER,
    last_message_created_at INTEGER
);

INSERT OR REPLACE INTO conversation_tail_state (
    conversation_id, ended_at, last_message_idx, last_message_created_at
)
SELECT id, ended_at, last_message_idx, last_message_created_at
FROM conversations
WHERE ended_at IS NOT NULL
   OR last_message_idx IS NOT NULL
   OR last_message_created_at IS NOT NULL;
";
3373
/// Schema migration V19 SQL: creates `conversation_external_lookup` and
/// backfills one row per conversation with a non-NULL `external_id`.
///
/// The lookup key is built as
/// `<len(source_id)>:<source_id>:<agent_id>:<len(external_id)>:<external_id>`.
const MIGRATION_V19: &str = r"
-- Materialize external conversation provenance into one compact lookup key.
-- This keeps the hot append/new-conversation probe on a single primary-key
-- lookup instead of a composite conversations-table predicate.
CREATE TABLE IF NOT EXISTS conversation_external_lookup (
    lookup_key TEXT PRIMARY KEY,
    conversation_id INTEGER NOT NULL
);

INSERT OR REPLACE INTO conversation_external_lookup (lookup_key, conversation_id)
SELECT
    CAST(length(source_id) AS TEXT) || ':' || source_id || ':' ||
    CAST(agent_id AS TEXT) || ':' ||
    CAST(length(external_id) AS TEXT) || ':' || external_id,
    id
FROM conversations
WHERE external_id IS NOT NULL;
";
3392
/// Schema migration V20: creates `conversation_external_tail_lookup`, which
/// stores the external lookup key together with the tail-state columns.
///
/// The backfill builds the same length-prefixed key expression as the V19
/// lookup table and pulls each tail column from `conversation_tail_state` via a
/// correlated subquery, so conversations without a tail-state row get NULLs.
const MIGRATION_V20: &str = r"
-- Fuse external conversation lookup with append-tail state. Append-heavy
-- workloads can resolve both the conversation id and tail plan from one
-- primary-key probe.
CREATE TABLE IF NOT EXISTS conversation_external_tail_lookup (
    lookup_key TEXT PRIMARY KEY,
    conversation_id INTEGER NOT NULL,
    ended_at INTEGER,
    last_message_idx INTEGER,
    last_message_created_at INTEGER
);

INSERT OR REPLACE INTO conversation_external_tail_lookup (
    lookup_key,
    conversation_id,
    ended_at,
    last_message_idx,
    last_message_created_at
)
SELECT
    CAST(length(c.source_id) AS TEXT) || ':' || c.source_id || ':' ||
    CAST(c.agent_id AS TEXT) || ':' ||
    CAST(length(c.external_id) AS TEXT) || ':' || c.external_id,
    c.id,
    (SELECT ts.ended_at
     FROM conversation_tail_state ts
     WHERE ts.conversation_id = c.id),
    (SELECT ts.last_message_idx
     FROM conversation_tail_state ts
     WHERE ts.conversation_id = c.id),
    (SELECT ts.last_message_created_at
     FROM conversation_tail_state ts
     WHERE ts.conversation_id = c.id)
FROM conversations c
WHERE c.external_id IS NOT NULL;
";
3429
/// Row from the embedding_jobs table.
///
/// Timestamp columns are carried as strings here.
/// NOTE(review): the timestamp format and the set of valid `status` values are
/// not visible in this chunk — confirm against the table definition.
#[derive(Debug, Clone)]
pub struct EmbeddingJobRow {
    pub id: i64,
    pub db_path: String,
    pub model_id: String,
    pub status: String,
    pub total_docs: i64,
    pub completed_docs: i64,
    // Optional columns: absent values are represented as `None`.
    pub error_message: Option<String>,
    pub created_at: String,
    pub started_at: Option<String>,
    pub completed_at: Option<String>,
}
3444
/// Lightweight conversation projection used while rebuilding the lexical index.
///
/// This intentionally omits `metadata_json` / `metadata_bin` and other bulky
/// fields because Tantivy only needs the stable envelope plus provenance
/// identifiers. Reading full metadata here can force frankensqlite to traverse
/// large overflow chains before the first lexical checkpoint is committed.
#[derive(Debug, Clone)]
pub struct LexicalRebuildConversationRow {
    // Envelope fields.
    pub id: Option<i64>,
    pub agent_slug: String,
    pub workspace: Option<PathBuf>,
    pub external_id: Option<String>,
    pub title: Option<String>,
    pub source_path: PathBuf,
    // NOTE(review): timestamp units are not visible in this chunk — confirm.
    pub started_at: Option<i64>,
    pub ended_at: Option<i64>,
    // Provenance identifiers.
    pub source_id: String,
    pub origin_host: Option<String>,
}
3464
/// Lightweight per-conversation footprint used to pre-plan lexical rebuild
/// shard boundaries without re-reading full message bodies in the hot path.
///
/// `message_bytes` may be an estimate rather than a measured size: the
/// count-based constructor in this file derives it from `message_count`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct LexicalRebuildConversationFootprintRow {
    pub conversation_id: i64,
    pub message_count: usize,
    pub message_bytes: usize,
}
3473
/// Bytes the rebuild planner assumes per message when only a count is known.
pub(crate) const LEXICAL_REBUILD_PLANNER_ESTIMATED_BYTES_PER_MESSAGE: usize = 4 * 1024;
/// Maximum number of conversations that may lack tail metadata while coverage
/// is still treated as sufficient.
const LEXICAL_REBUILD_FOOTPRINT_POINT_TAIL_FALLBACK_LIMIT: usize = 64;

/// Reports whether tail metadata covers enough conversations.
///
/// Coverage is sufficient when there are no conversations at all, or when the
/// number of uncovered conversations does not exceed
/// `LEXICAL_REBUILD_FOOTPRINT_POINT_TAIL_FALLBACK_LIMIT`. A covered count
/// larger than the total is clamped to the total.
fn lexical_rebuild_tail_metadata_coverage_is_sufficient(
    total_conversations: usize,
    covered_conversations: usize,
) -> bool {
    if total_conversations == 0 {
        return true;
    }

    let covered = covered_conversations.min(total_conversations);
    let uncovered = total_conversations.saturating_sub(covered);
    uncovered <= LEXICAL_REBUILD_FOOTPRINT_POINT_TAIL_FALLBACK_LIMIT
}
3485
/// Converts a zero-based last-message index into a message count (`idx + 1`).
///
/// Returns `None` when the index is absent, negative, or when the count does
/// not fit in `u64`/`usize`.
fn lexical_rebuild_message_count_from_tail_idx(last_message_idx: Option<i64>) -> Option<usize> {
    last_message_idx
        .and_then(|idx| u64::try_from(idx).ok())
        .and_then(|idx| idx.checked_add(1))
        .and_then(|count| usize::try_from(count).ok())
}
3491
3492fn lexical_rebuild_conversation_footprint_from_count(
3493    conversation_id: i64,
3494    message_count: usize,
3495) -> LexicalRebuildConversationFootprintRow {
3496    LexicalRebuildConversationFootprintRow {
3497        conversation_id,
3498        message_count,
3499        message_bytes: message_count
3500            .saturating_mul(LEXICAL_REBUILD_PLANNER_ESTIMATED_BYTES_PER_MESSAGE),
3501    }
3502}
3503
/// Lightweight message projection used by the streaming lexical rebuild path.
///
/// Carries both the owning `conversation_id` and the message's own `id`, plus
/// its `idx`.
/// NOTE(review): `idx` is presumably the message's position within its
/// conversation, and the unit of `created_at` is not visible here — confirm.
#[derive(Debug, Clone)]
pub struct LexicalRebuildMessageRow {
    pub conversation_id: i64,
    pub id: i64,
    pub idx: i64,
    pub role: String,
    pub author: Option<String>,
    pub created_at: Option<i64>,
    pub content: String,
}
3515
/// Even lighter message projection used only by the grouped lexical rebuild
/// stream hot path. It keeps just the per-message fields the rebuild consumes
/// and tracks the final message id at conversation scope instead.
///
/// Unlike `LexicalRebuildMessageRow`, the role is reduced to a single
/// `is_tool_role` flag and no message id, conversation id, or author is kept.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct LexicalRebuildGroupedMessageRow {
    pub idx: i64,
    pub is_tool_role: bool,
    pub created_at: Option<i64>,
    pub content: String,
}
3526
/// Grouped message rows for one conversation, held in a `SmallVec` with an
/// inline capacity of 32 rows.
pub type LexicalRebuildGroupedMessageRows = SmallVec<[LexicalRebuildGroupedMessageRow; 32]>;

/// Compatibility alias retained while call sites finish converging on `FrankenStorage`.
pub type SqliteStorage = FrankenStorage;
3531
/// Primary frankensqlite-backed storage backend.
///
/// Wraps one connection together with per-connection tuning state and a set
/// of `Arc`-shared "already ensured" caches that are handed to writer
/// connections opened from this storage.
pub struct FrankenStorage {
    /// The underlying frankensqlite connection.
    conn: FrankenConnection,
    /// Database path this storage was opened with; reused to open additional
    /// writer connections.
    db_path: PathBuf,
    /// Starts as `false`.
    /// NOTE(review): where this flag is set is not visible in this chunk.
    ephemeral_writer_preflight_verified: AtomicBool,
    /// Starts at `UNSET_INDEX_WRITER_CHECKPOINT_PAGES`; `apply_config` stores
    /// `DEFAULT_WAL_AUTOCHECKPOINT_PAGES` here.
    index_writer_checkpoint_pages: AtomicI64,
    /// Starts at `UNSET_INDEX_WRITER_BUSY_TIMEOUT_MS`.
    index_writer_busy_timeout_ms: AtomicU64,
    /// Slot holding the reusable ephemeral writer connection and its state.
    cached_ephemeral_writer: parking_lot::Mutex<CachedEphemeralWriter>,
    /// Agent key -> row id cache, shared with writers opened from this storage.
    ensured_agents: Arc<parking_lot::Mutex<HashMap<EnsuredAgentKey, i64>>>,
    /// Workspace key -> row id cache, shared with writers opened from this storage.
    ensured_workspaces: Arc<parking_lot::Mutex<HashMap<EnsuredWorkspaceKey, i64>>>,
    /// Set of conversation sources already ensured, shared with writers.
    ensured_conversation_sources: Arc<parking_lot::Mutex<HashSet<EnsuredConversationSourceKey>>>,
    /// Set of daily-stats keys already ensured, shared with writers.
    ensured_daily_stats_keys: Arc<parking_lot::Mutex<HashSet<EnsuredDailyStatsKey>>>,
    /// Tri-state cache (`FTS_MESSAGES_PRESENT_*`) of whether `fts_messages` exists.
    fts_messages_present_cache: AtomicI8,
}
3546
/// Keep ordinary storage commits from tripping over frequent auto-checkpoints
/// while still bounding WAL growth. Bulk index paths may override this through
/// their explicit checkpoint policy.
const DEFAULT_WAL_AUTOCHECKPOINT_PAGES: i64 = 4096;
/// Initial value of `FrankenStorage::index_writer_checkpoint_pages` before any
/// checkpoint cadence has been recorded.
const UNSET_INDEX_WRITER_CHECKPOINT_PAGES: i64 = i64::MIN;
/// Initial value of `FrankenStorage::index_writer_busy_timeout_ms` before any
/// busy timeout has been recorded.
const UNSET_INDEX_WRITER_BUSY_TIMEOUT_MS: u64 = 0;
// States of `FrankenStorage::fts_messages_present_cache`.
/// Presence of `fts_messages` has not been probed (or was invalidated).
const FTS_MESSAGES_PRESENT_UNKNOWN: i8 = 0;
/// The last probe recorded `fts_messages` as absent.
const FTS_MESSAGES_PRESENT_ABSENT: i8 = 1;
/// The last probe recorded `fts_messages` as present.
const FTS_MESSAGES_PRESENT_PRESENT: i8 = 2;
3556
/// State of the reusable ephemeral writer slot on a `FrankenStorage`.
enum CachedEphemeralWriter {
    /// No writer connection is parked and none is checked out.
    Uninitialized,
    /// An idle writer connection is parked and can be handed out again.
    Cached(Box<SendFrankenConnection>),
    /// The slot's writer is currently checked out by a caller.
    InUse,
}
3562
3563#[derive(Debug, Clone, PartialEq, Eq, Hash)]
3564struct EnsuredAgentKey {
3565    slug: String,
3566    name: String,
3567    version: Option<String>,
3568    kind: String,
3569}
3570
3571impl EnsuredAgentKey {
3572    fn from_agent(agent: &Agent) -> Self {
3573        Self {
3574            slug: agent.slug.clone(),
3575            name: agent.name.clone(),
3576            version: agent.version.clone(),
3577            kind: agent_kind_str(agent.kind.clone()),
3578        }
3579    }
3580}
3581
/// Cache key for the ensured-workspaces map: a workspace path plus its
/// optional display name.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
struct EnsuredWorkspaceKey {
    path: String,
    display_name: Option<String>,
}

impl EnsuredWorkspaceKey {
    /// Takes ownership of `path` and copies `display_name` when one is given.
    fn new(path: String, display_name: Option<&str>) -> Self {
        let display_name = display_name.map(String::from);
        Self { path, display_name }
    }
}
3596
3597#[derive(Debug, Clone, PartialEq, Eq, Hash)]
3598struct EnsuredConversationSourceKey {
3599    id: String,
3600    kind: SourceKind,
3601    host_label: Option<String>,
3602}
3603
3604impl EnsuredConversationSourceKey {
3605    fn from_source(source: &Source) -> Self {
3606        Self {
3607            id: source.id.clone(),
3608            kind: source.kind,
3609            host_label: source.host_label.clone(),
3610        }
3611    }
3612}
3613
/// Cache key for the ensured-daily-stats set: one (day, agent, source) triple.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
struct EnsuredDailyStatsKey {
    day_id: i64,
    agent_slug: String,
    source_id: String,
}

impl EnsuredDailyStatsKey {
    /// Builds a key holding owned copies of the borrowed slug and source id.
    fn new(day_id: i64, agent_slug: &str, source_id: &str) -> Self {
        let agent_slug = String::from(agent_slug);
        let source_id = String::from(source_id);

        Self {
            day_id,
            agent_slug,
            source_id,
        }
    }
}
3630
/// PRAGMA spellings tried, in order, to turn `autocommit_retain` off: the
/// `fsqlite.`-namespaced form first, then the bare form.
const AUTOCOMMIT_RETAIN_OFF_PRAGMAS: [&str; 2] = [
    "PRAGMA fsqlite.autocommit_retain = OFF;",
    "PRAGMA autocommit_retain = OFF;",
];
3635
3636fn disable_autocommit_retain<E>(
3637    mut execute: impl FnMut(&'static str) -> std::result::Result<(), E>,
3638) -> Result<&'static str>
3639where
3640    E: std::fmt::Display,
3641{
3642    let mut failures = Vec::new();
3643    for pragma in AUTOCOMMIT_RETAIN_OFF_PRAGMAS {
3644        match execute(pragma) {
3645            Ok(()) => return Ok(pragma),
3646            Err(err) => {
3647                let error = err.to_string();
3648                tracing::debug!(
3649                    %pragma,
3650                    error = %error,
3651                    "autocommit_retain PRAGMA variant not supported"
3652                );
3653                failures.push(format!("{pragma}: {error}"));
3654            }
3655        }
3656    }
3657
3658    Err(anyhow!(
3659        "failed to disable autocommit_retain on frankensqlite connection; \
3660         refusing to keep a long-lived MVCC connection that may accumulate \
3661         unbounded write snapshots. Upgrade frankensqlite to a version that \
3662         supports one of these PRAGMAs or use a short-lived connection path. \
3663         attempts: {}",
3664        failures.join("; ")
3665    ))
3666}
3667
3668impl FrankenStorage {
3669    fn new(conn: FrankenConnection, db_path: PathBuf) -> Self {
3670        Self::new_with_shared_caches(
3671            conn,
3672            db_path,
3673            Arc::new(parking_lot::Mutex::new(HashMap::new())),
3674            Arc::new(parking_lot::Mutex::new(HashMap::new())),
3675            Arc::new(parking_lot::Mutex::new(HashSet::new())),
3676            Arc::new(parking_lot::Mutex::new(HashSet::new())),
3677        )
3678    }
3679
3680    fn new_with_shared_caches(
3681        conn: FrankenConnection,
3682        db_path: PathBuf,
3683        ensured_agents: Arc<parking_lot::Mutex<HashMap<EnsuredAgentKey, i64>>>,
3684        ensured_workspaces: Arc<parking_lot::Mutex<HashMap<EnsuredWorkspaceKey, i64>>>,
3685        ensured_conversation_sources: Arc<
3686            parking_lot::Mutex<HashSet<EnsuredConversationSourceKey>>,
3687        >,
3688        ensured_daily_stats_keys: Arc<parking_lot::Mutex<HashSet<EnsuredDailyStatsKey>>>,
3689    ) -> Self {
3690        Self {
3691            conn,
3692            db_path,
3693            ephemeral_writer_preflight_verified: AtomicBool::new(false),
3694            index_writer_checkpoint_pages: AtomicI64::new(UNSET_INDEX_WRITER_CHECKPOINT_PAGES),
3695            index_writer_busy_timeout_ms: AtomicU64::new(UNSET_INDEX_WRITER_BUSY_TIMEOUT_MS),
3696            cached_ephemeral_writer: parking_lot::Mutex::new(CachedEphemeralWriter::Uninitialized),
3697            ensured_agents,
3698            ensured_workspaces,
3699            ensured_conversation_sources,
3700            ensured_daily_stats_keys,
3701            fts_messages_present_cache: AtomicI8::new(FTS_MESSAGES_PRESENT_UNKNOWN),
3702        }
3703    }
3704
3705    fn apply_open_stage_busy_timeout(&self) {
3706        if let Err(err) = self.conn.execute("PRAGMA busy_timeout = 5000;") {
3707            tracing::debug!(
3708                error = %err,
3709                "failed to apply open-stage busy_timeout before migrations"
3710            );
3711        }
3712    }
3713
3714    /// Open a frankensqlite connection, run migrations, and apply config.
3715    ///
3716    /// This initializes canonical schema state only. Derived fallback search
3717    /// structures like the in-database `fts_messages` table are repaired
3718    /// separately so ordinary opens never block on heavyweight maintenance.
3719    pub fn open(path: &Path) -> Result<Self> {
3720        if let Some(parent) = path.parent() {
3721            fs::create_dir_all(parent)
3722                .with_context(|| format!("creating db directory {}", parent.display()))?;
3723        }
3724
3725        let path_str = path.to_string_lossy().to_string();
3726        let _doctor_guard =
3727            acquire_doctor_mutation_db_open_guard(path, DOCTOR_MUTATION_DB_OPEN_LOCK_TIMEOUT)?;
3728        let conn = FrankenConnection::open(&path_str)
3729            .with_context(|| format!("opening frankensqlite db at {}", path.display()))?;
3730        let storage = Self::new(conn, path.to_path_buf());
3731        storage.apply_open_stage_busy_timeout();
3732        storage.run_migrations()?;
3733        storage.repair_missing_current_schema_objects()?;
3734        storage.apply_config()?;
3735        Ok(storage)
3736    }
3737
3738    /// Open a writer connection that skips migration (assumes DB already migrated).
3739    ///
3740    /// Used by the BEGIN CONCURRENT parallel writer pool: each writer needs its
3741    /// own connection with config applied, but migrations have already been run
3742    /// by the primary connection.
3743    pub fn open_writer(path: &Path) -> Result<Self> {
3744        Self::open_writer_with_shared_caches(
3745            path,
3746            Arc::new(parking_lot::Mutex::new(HashMap::new())),
3747            Arc::new(parking_lot::Mutex::new(HashMap::new())),
3748            Arc::new(parking_lot::Mutex::new(HashSet::new())),
3749            Arc::new(parking_lot::Mutex::new(HashSet::new())),
3750        )
3751    }
3752
3753    fn open_writer_with_shared_caches(
3754        path: &Path,
3755        ensured_agents: Arc<parking_lot::Mutex<HashMap<EnsuredAgentKey, i64>>>,
3756        ensured_workspaces: Arc<parking_lot::Mutex<HashMap<EnsuredWorkspaceKey, i64>>>,
3757        ensured_conversation_sources: Arc<
3758            parking_lot::Mutex<HashSet<EnsuredConversationSourceKey>>,
3759        >,
3760        ensured_daily_stats_keys: Arc<parking_lot::Mutex<HashSet<EnsuredDailyStatsKey>>>,
3761    ) -> Result<Self> {
3762        let path_str = path.to_string_lossy().to_string();
3763        let _doctor_guard =
3764            acquire_doctor_mutation_db_open_guard(path, DOCTOR_MUTATION_DB_OPEN_LOCK_TIMEOUT)?;
3765        let conn = FrankenConnection::open(&path_str)
3766            .with_context(|| format!("opening frankensqlite writer at {}", path.display()))?;
3767        let storage = Self::new_with_shared_caches(
3768            conn,
3769            path.to_path_buf(),
3770            ensured_agents,
3771            ensured_workspaces,
3772            ensured_conversation_sources,
3773            ensured_daily_stats_keys,
3774        );
3775        storage.apply_config()?;
3776        Ok(storage)
3777    }
3778
3779    pub(crate) fn acquire_cached_ephemeral_writer(&self) -> Result<(Self, bool)> {
3780        let mut cached = self.cached_ephemeral_writer.lock();
3781        match std::mem::replace(&mut *cached, CachedEphemeralWriter::InUse) {
3782            CachedEphemeralWriter::Cached(conn) => {
3783                let (conn, checkpoint_pages, busy_timeout_ms) = (*conn).into_parts();
3784                let writer = Self::new_with_shared_caches(
3785                    conn,
3786                    self.db_path.clone(),
3787                    Arc::clone(&self.ensured_agents),
3788                    Arc::clone(&self.ensured_workspaces),
3789                    Arc::clone(&self.ensured_conversation_sources),
3790                    Arc::clone(&self.ensured_daily_stats_keys),
3791                );
3792                writer
3793                    .index_writer_checkpoint_pages
3794                    .store(checkpoint_pages, Ordering::Relaxed);
3795                writer
3796                    .index_writer_busy_timeout_ms
3797                    .store(busy_timeout_ms, Ordering::Relaxed);
3798                Ok((writer, true))
3799            }
3800            CachedEphemeralWriter::Uninitialized => {
3801                drop(cached);
3802                match Self::open_writer_with_shared_caches(
3803                    &self.db_path,
3804                    Arc::clone(&self.ensured_agents),
3805                    Arc::clone(&self.ensured_workspaces),
3806                    Arc::clone(&self.ensured_conversation_sources),
3807                    Arc::clone(&self.ensured_daily_stats_keys),
3808                ) {
3809                    Ok(writer) => Ok((writer, true)),
3810                    Err(err) => {
3811                        let mut cached = self.cached_ephemeral_writer.lock();
3812                        if matches!(&*cached, CachedEphemeralWriter::InUse) {
3813                            *cached = CachedEphemeralWriter::Uninitialized;
3814                        }
3815                        Err(err)
3816                    }
3817                }
3818            }
3819            CachedEphemeralWriter::InUse => {
3820                *cached = CachedEphemeralWriter::InUse;
3821                drop(cached);
3822                Ok((
3823                    Self::open_writer_with_shared_caches(
3824                        &self.db_path,
3825                        Arc::clone(&self.ensured_agents),
3826                        Arc::clone(&self.ensured_workspaces),
3827                        Arc::clone(&self.ensured_conversation_sources),
3828                        Arc::clone(&self.ensured_daily_stats_keys),
3829                    )?,
3830                    false,
3831                ))
3832            }
3833        }
3834    }
3835
3836    pub(crate) fn release_cached_ephemeral_writer(&self, writer: Self) {
3837        let checkpoint_pages = writer.index_writer_checkpoint_pages.load(Ordering::Relaxed);
3838        let busy_timeout_ms = writer.index_writer_busy_timeout_ms.load(Ordering::Relaxed);
3839        let conn = writer.into_raw();
3840        let mut cached = self.cached_ephemeral_writer.lock();
3841        debug_assert!(
3842            matches!(&*cached, CachedEphemeralWriter::InUse),
3843            "cached ephemeral writer state should be in-use when releasing"
3844        );
3845        *cached = CachedEphemeralWriter::Cached(Box::new(
3846            SendFrankenConnection::new_with_index_writer_state(
3847                conn,
3848                checkpoint_pages,
3849                busy_timeout_ms,
3850            ),
3851        ));
3852    }
3853
3854    pub(crate) fn discard_cached_ephemeral_writer(&self, mut writer: Self) {
3855        writer.close_best_effort_in_place();
3856        let mut cached = self.cached_ephemeral_writer.lock();
3857        if matches!(&*cached, CachedEphemeralWriter::InUse) {
3858            *cached = CachedEphemeralWriter::Uninitialized;
3859        }
3860    }
3861
3862    fn cached_agent_id(&self, key: &EnsuredAgentKey) -> Option<i64> {
3863        self.ensured_agents.lock().get(key).copied()
3864    }
3865
3866    fn mark_agent_ensured(&self, key: EnsuredAgentKey, id: i64) {
3867        self.ensured_agents.lock().insert(key, id);
3868    }
3869
3870    fn cached_workspace_id(&self, key: &EnsuredWorkspaceKey) -> Option<i64> {
3871        self.ensured_workspaces.lock().get(key).copied()
3872    }
3873
3874    fn mark_workspace_ensured(&self, key: EnsuredWorkspaceKey, id: i64) {
3875        self.ensured_workspaces.lock().insert(key, id);
3876    }
3877
3878    fn conversation_source_already_ensured(&self, key: &EnsuredConversationSourceKey) -> bool {
3879        self.ensured_conversation_sources.lock().contains(key)
3880    }
3881
3882    fn mark_conversation_source_ensured(&self, key: EnsuredConversationSourceKey) {
3883        self.ensured_conversation_sources.lock().insert(key);
3884    }
3885
3886    fn daily_stats_key_already_ensured(&self, key: &EnsuredDailyStatsKey) -> bool {
3887        self.ensured_daily_stats_keys.lock().contains(key)
3888    }
3889
3890    fn daily_stats_keys_already_ensured(&self, keys: &[EnsuredDailyStatsKey; 4]) -> bool {
3891        let ensured = self.ensured_daily_stats_keys.lock();
3892        keys.iter().all(|key| ensured.contains(key))
3893    }
3894
3895    fn mark_daily_stats_key_ensured(&self, key: EnsuredDailyStatsKey) {
3896        self.ensured_daily_stats_keys.lock().insert(key);
3897    }
3898
3899    fn fts_messages_present_cached(&self, tx: &FrankenTransaction<'_>) -> bool {
3900        match self.fts_messages_present_cache.load(Ordering::Acquire) {
3901            FTS_MESSAGES_PRESENT_PRESENT => return true,
3902            FTS_MESSAGES_PRESENT_ABSENT => return false,
3903            _ => {}
3904        }
3905
3906        let present = tx
3907            .query_row_map(
3908                "SELECT COUNT(*) FROM sqlite_master
3909                 WHERE name = 'fts_messages'
3910                   AND rootpage > 0",
3911                fparams![],
3912                |row| row.get_typed::<i64>(0),
3913            )
3914            .map(|count| count > 0)
3915            .unwrap_or_else(|err| {
3916                tracing::debug!(
3917                    error = %err,
3918                    "failed to probe fts_messages presence; skipping db-resident FTS maintenance"
3919                );
3920                false
3921            });
3922        self.set_fts_messages_present_cache(present);
3923        present
3924    }
3925
3926    fn set_fts_messages_present_cache(&self, present: bool) {
3927        self.fts_messages_present_cache.store(
3928            if present {
3929                FTS_MESSAGES_PRESENT_PRESENT
3930            } else {
3931                FTS_MESSAGES_PRESENT_ABSENT
3932            },
3933            Ordering::Release,
3934        );
3935    }
3936
3937    fn invalidate_fts_messages_present_cache(&self) {
3938        self.fts_messages_present_cache
3939            .store(FTS_MESSAGES_PRESENT_UNKNOWN, Ordering::Release);
3940    }
3941
3942    fn invalidate_conversation_source_cache(&self, source_id: &str) {
3943        self.ensured_conversation_sources
3944            .lock()
3945            .retain(|key| key.id != source_id);
3946    }
3947
3948    fn close_cached_ephemeral_writer_best_effort_in_place(&mut self) {
3949        let cached = self.cached_ephemeral_writer.get_mut();
3950        if let CachedEphemeralWriter::Cached(conn) =
3951            std::mem::replace(cached, CachedEphemeralWriter::Uninitialized)
3952        {
3953            let mut conn = conn;
3954            conn.0.close_best_effort_in_place();
3955        }
3956    }
3957
3958    fn close_cached_ephemeral_writer_without_checkpoint_in_place(&mut self) -> Result<()> {
3959        let cached = self.cached_ephemeral_writer.get_mut();
3960        match std::mem::replace(cached, CachedEphemeralWriter::Uninitialized) {
3961            CachedEphemeralWriter::Cached(mut conn) => conn
3962                .0
3963                .close_without_checkpoint_in_place()
3964                .with_context(|| "closing cached frankensqlite writer without final checkpoint"),
3965            CachedEphemeralWriter::Uninitialized | CachedEphemeralWriter::InUse => Ok(()),
3966        }
3967    }
3968
3969    /// Open in read-only mode using frankensqlite compat flags.
3970    pub fn open_readonly(path: &Path) -> Result<Self> {
3971        Self::open_readonly_with_doctor_lock_timeout(path, DOCTOR_MUTATION_DB_OPEN_LOCK_TIMEOUT)
3972    }
3973
3974    /// Open in read-only mode with an explicit doctor mutation-lock timeout.
3975    ///
3976    /// This is primarily useful for probes that need to prove a reader would
3977    /// not enter the archive while `cass doctor --fix` owns the repair lock.
3978    pub fn open_readonly_with_doctor_lock_timeout(path: &Path, timeout: Duration) -> Result<Self> {
3979        let path_str = path.to_string_lossy().to_string();
3980        let _doctor_guard = acquire_doctor_mutation_db_open_guard(path, timeout)?;
3981        let conn = open_franken_with_flags(&path_str, FrankenOpenFlags::SQLITE_OPEN_READ_ONLY)
3982            .with_context(|| format!("opening frankensqlite db readonly at {}", path.display()))?;
3983        let storage = Self::new(conn, path.to_path_buf());
3984        storage.apply_readonly_config()?;
3985        Ok(storage)
3986    }
3987
3988    pub fn close(self) -> Result<()> {
3989        let mut this = self;
3990        this.close_cached_ephemeral_writer_best_effort_in_place();
3991        this.conn
3992            .close()
3993            .with_context(|| "closing frankensqlite connection")
3994    }
3995
3996    pub fn close_without_checkpoint(self) -> Result<()> {
3997        let mut this = self;
3998        this.close_cached_ephemeral_writer_without_checkpoint_in_place()?;
3999        this.conn
4000            .close_without_checkpoint()
4001            .with_context(|| "closing frankensqlite connection without final checkpoint")
4002    }
4003
4004    pub fn close_best_effort_in_place(&mut self) {
4005        self.close_cached_ephemeral_writer_best_effort_in_place();
4006        self.conn.close_best_effort_in_place();
4007    }
4008
4009    pub fn close_without_checkpoint_in_place(&mut self) -> Result<()> {
4010        self.close_cached_ephemeral_writer_without_checkpoint_in_place()?;
4011        self.conn
4012            .close_without_checkpoint_in_place()
4013            .with_context(|| "closing frankensqlite connection without final checkpoint")
4014    }
4015
4016    /// Access the raw frankensqlite connection.
4017    pub fn raw(&self) -> &FrankenConnection {
4018        &self.conn
4019    }
4020
4021    /// Consume the storage wrapper and return the underlying frankensqlite
4022    /// connection after migrations/repair have already been applied.
4023    pub fn into_raw(self) -> FrankenConnection {
4024        let mut this = self;
4025        this.close_cached_ephemeral_writer_best_effort_in_place();
4026        this.conn
4027    }
4028
4029    /// Apply connection PRAGMAs for parity with SqliteStorage's `apply_pragmas()`.
4030    ///
4031    /// Frankensqlite supports all PRAGMAs cass uses (journal_mode, synchronous,
4032    /// cache_size, foreign_keys, busy_timeout). Its default journal_mode is already
4033    /// WAL and default synchronous is NORMAL, matching cass's requirements.
4034    ///
4035    pub fn apply_config(&self) -> Result<()> {
4036        // journal_mode: frankensqlite defaults to WAL, same as cass.
4037        // synchronous: frankensqlite defaults to NORMAL, same as cass.
4038        // Both are set explicitly for clarity.
4039        self.conn
4040            .execute("PRAGMA journal_mode = WAL;")
4041            .with_context(|| "setting journal_mode")?;
4042        self.conn
4043            .execute("PRAGMA synchronous = NORMAL;")
4044            .with_context(|| "setting synchronous")?;
4045
4046        // cache_size: 64MB (negative value = KiB).
4047        self.conn
4048            .execute("PRAGMA cache_size = -65536;")
4049            .with_context(|| "setting cache_size")?;
4050
4051        // foreign_keys: enable constraint enforcement.
4052        self.conn
4053            .execute("PRAGMA foreign_keys = ON;")
4054            .with_context(|| "setting foreign_keys")?;
4055
4056        // busy_timeout: 5 seconds (in milliseconds).
4057        self.conn
4058            .execute("PRAGMA busy_timeout = 5000;")
4059            .with_context(|| "setting busy_timeout")?;
4060
4061        // temp_store = MEMORY and mmap_size are C SQLite performance knobs.
4062        // In frankensqlite's architecture (in-memory MVCC engine with pager
4063        // backend), temp_store is always memory-resident and mmap_size does not
4064        // apply. Skipped intentionally — these are no-ops or errors.
4065
4066        // wal_autocheckpoint: use a bounded cadence that avoids checkpointing
4067        // inside common append batches without deferring checkpoints forever.
4068        let checkpoint_pragma =
4069            format!("PRAGMA wal_autocheckpoint = {DEFAULT_WAL_AUTOCHECKPOINT_PAGES};");
4070        let _ = self.conn.execute(&checkpoint_pragma);
4071        self.index_writer_checkpoint_pages
4072            .store(DEFAULT_WAL_AUTOCHECKPOINT_PAGES, Ordering::Relaxed);
4073        // Explicitly enable concurrent writer mode for BEGIN/transaction paths.
4074        // Try both namespace variants for compatibility across fsqlite builds.
4075        let _ = self.conn.execute("PRAGMA fsqlite.concurrent_mode = ON;");
4076        let _ = self.conn.execute("PRAGMA concurrent_mode = ON;");
4077        // Frankensqlite retained autocommit currently mis-serves same-connection
4078        // read-after-write queries on cass's storage paths; keep it off here
4079        // until the upstream visibility bug is fixed.
4080        //
4081        // CASS #163 item 3: If neither PRAGMA variant succeeds, the MVCC engine
4082        // will accumulate write snapshots for the lifetime of the connection,
4083        // causing unbounded memory growth on long-lived watch-mode handles.
4084        // Log at warn level so the failure is visible instead of silently
4085        // swallowed, and set a flag for callers that need to periodically
4086        // recycle the connection.
4087        let autocommit_pragma =
4088            disable_autocommit_retain(|pragma| self.conn.execute(pragma).map(|_| ()))?;
4089        tracing::debug!(
4090            pragma = autocommit_pragma,
4091            "disabled frankensqlite autocommit_retain for storage connection"
4092        );
4093
4094        Ok(())
4095    }
4096
4097    fn apply_readonly_config(&self) -> Result<()> {
4098        self.conn
4099            .execute("PRAGMA query_only = 1;")
4100            .with_context(|| "setting query_only")?;
4101        self.conn
4102            .execute("PRAGMA busy_timeout = 5000;")
4103            .with_context(|| "setting busy_timeout")?;
4104        self.conn
4105            .execute("PRAGMA cache_size = -65536;")
4106            .with_context(|| "setting cache_size")?;
4107        self.conn
4108            .execute("PRAGMA foreign_keys = ON;")
4109            .with_context(|| "setting foreign_keys")?;
4110        Ok(())
4111    }
4112
4113    /// Run all schema migrations, handling transition from meta table versioning.
4114    ///
4115    /// The existing `SqliteStorage` tracks schema version in a `meta` table entry.
4116    /// The new `MigrationRunner` uses a `_schema_migrations` table. This method:
4117    /// 1. Transitions existing databases from meta table → `_schema_migrations`
4118    /// 2. Runs pending migrations via `MigrationRunner`
4119    /// 3. Syncs `meta.schema_version` for backward compatibility
4120    ///
4121    /// # Fresh vs existing databases
4122    ///
4123    /// Fresh databases use a single combined migration (`MIGRATION_FRESH_SCHEMA`)
4124    /// that creates the complete V13 schema directly. This avoids the incremental
4125    /// V5 migration which uses `DROP TABLE` — an operation that triggers a known
4126    /// frankensqlite autoindex limitation.
4127    ///
4128    /// Existing databases (transitioned from SqliteStorage) are typically at
4129    /// V13 or newer already; additive post-V13 migrations are applied normally.
4130    pub fn run_migrations(&self) -> Result<()> {
4131        transition_from_meta_version(&self.conn)?;
4132
4133        let base_result = build_cass_migrations_before_tail_cache()
4134            .run(&self.conn)
4135            .with_context(|| "running base schema migrations")?;
4136
4137        let mut applied = base_result.applied;
4138        if apply_conversation_tail_state_cache_migration(&self.conn)
4139            .with_context(|| "running conversation tail-state cache migration")?
4140        {
4141            applied.push(15);
4142        }
4143
4144        let post_result = build_cass_migrations_after_tail_cache()
4145            .run(&self.conn)
4146            .with_context(|| "running post-tail-cache schema migrations")?;
4147        applied.extend(post_result.applied);
4148
4149        let current = self.schema_version()?;
4150        if !applied.is_empty() {
4151            info!(
4152                applied = ?applied,
4153                current,
4154                was_fresh = base_result.was_fresh,
4155                "frankensqlite schema migrations applied"
4156            );
4157        }
4158
4159        // Keep meta.schema_version in sync for backward compatibility.
4160        self.sync_meta_schema_version(current)?;
4161
4162        Ok(())
4163    }
4164
4165    /// Some historical canonical rebuild paths produced databases whose
4166    /// version markers claim the current schema while post-V10 analytics
4167    /// tables were never materialized. Detect that drift and backfill the
4168    /// idempotent table/index set from the combined schema migration.
4169    fn repair_missing_current_schema_objects(&self) -> Result<()> {
4170        let mut missing_tables = Vec::new();
4171        for &(table_name, probe_sql) in REQUIRED_CURRENT_SCHEMA_TABLE_PROBES {
4172            if let Err(err) = self.conn.query(probe_sql) {
4173                if error_indicates_missing_table(&err) {
4174                    missing_tables.push(table_name);
4175                    continue;
4176                }
4177                return Err(err).with_context(|| {
4178                    format!("probing required schema table {table_name} for completeness")
4179                });
4180            }
4181        }
4182
4183        if !missing_tables.is_empty() {
4184            info!(
4185                missing_tables = ?missing_tables,
4186                "repairing missing current-schema tables on an already-versioned cass database"
4187            );
4188
4189            for batch in current_schema_repair_batches_for_missing_tables(&missing_tables)? {
4190                self.conn
4191                    .execute_batch(batch.sql)
4192                    .with_context(|| format!("repairing current-schema batch {}", batch.name))?;
4193            }
4194
4195            for &(table_name, probe_sql) in REQUIRED_CURRENT_SCHEMA_TABLE_PROBES {
4196                if !missing_tables.contains(&table_name) {
4197                    continue;
4198                }
4199                self.conn
4200                    .query(probe_sql)
4201                    .with_context(|| format!("verifying repaired schema table {table_name}"))?;
4202            }
4203        }
4204        self.repair_missing_conversation_token_columns()?;
4205        Ok(())
4206    }
4207
4208    fn repair_missing_conversation_token_columns(&self) -> Result<()> {
4209        let columns = franken_table_column_names(&self.conn, "conversations")
4210            .with_context(|| "inspecting conversations columns for token-summary repair")?;
4211        let mut missing_columns = Vec::new();
4212        for &(column_name, column_type) in REQUIRED_CONVERSATION_TOKEN_COLUMNS {
4213            if columns.contains(column_name) {
4214                continue;
4215            }
4216            let sql = format!("ALTER TABLE conversations ADD COLUMN {column_name} {column_type};");
4217            self.conn.execute(&sql).with_context(|| {
4218                format!("adding missing conversations.{column_name} token-summary column")
4219            })?;
4220            missing_columns.push(column_name);
4221        }
4222        if !missing_columns.is_empty() {
4223            tracing::warn!(
4224                target: "cass::schema_repair",
4225                db_path = %self.db_path.display(),
4226                missing_columns = ?missing_columns,
4227                "cass#222: repaired missing conversations token-summary columns"
4228            );
4229        }
4230        Ok(())
4231    }
4232
4233    /// Detect and remove orphan rows whose FK parent has gone missing.
4234    ///
4235    /// A `Connection` dropped mid-transaction (the `drop_close` warning emitted
4236    /// by frankensqlite's `Drop` impl) can leave child rows persisted without a
4237    /// matching parent — `messages` referencing a `conversation_id` that does
4238    /// not exist, `message_metrics`/`token_usage`/`snippets` referencing a
4239    /// `message_id` that does not exist, etc. With `PRAGMA foreign_keys = ON`,
4240    /// every subsequent indexer pass then trips `FOREIGN KEY constraint failed`
4241    /// on the next write, the session never gets marked indexed, and the
4242    /// pending backlog grows without bound (issue #202).
4243    ///
4244    /// This pass runs at indexer startup as defense in depth: it scans each
4245    /// child table for rows whose parent row has gone missing and removes them
4246    /// in bounded committed chunks, breaking the failure cycle even when the
4247    /// underlying transaction-discipline bug has not been fully root-caused.
4248    /// The pass is idempotent (a clean database is a no-op), and emits a
4249    /// `WARN` after successful cleanup so the upstream `drop_close` condition
4250    /// stays visible.
4251    pub(crate) fn cleanup_orphan_fk_rows(&self) -> Result<OrphanFkCleanupReport> {
4252        let mut report = OrphanFkCleanupReport::default();
4253        let orphan_message_ids = match collect_orphan_message_ids(&self.conn) {
4254            Ok(ids) => ids,
4255            Err(err) if error_indicates_missing_table(&err) => {
4256                tracing::debug!(
4257                    target: "cass::fk_repair",
4258                    child_table = "messages",
4259                    error = %err,
4260                    "skipping orphan-message probe (table or column unavailable)"
4261                );
4262                Vec::new()
4263            }
4264            Err(err) => return Err(err),
4265        };
4266        if !orphan_message_ids.is_empty() {
4267            report.record("messages", orphan_message_ids.len() as i64);
4268        }
4269
4270        if !orphan_message_ids.is_empty() {
4271            delete_orphan_message_ids_bisecting_oom(&self.conn, &orphan_message_ids)
4272                .context("deleting orphan message rows and dependent children")?;
4273        }
4274
4275        for entry in ORPHAN_DIRECT_CHILD_TABLES {
4276            loop {
4277                let ids = match collect_direct_orphan_id_page(&self.conn, entry) {
4278                    Ok(ids) => ids,
4279                    Err(err)
4280                        if error_indicates_missing_table(&err)
4281                            || error_indicates_missing_column(&err) =>
4282                    {
4283                        // Tolerant probe: a missing child/parent table or FK
4284                        // column on older schemas means there is nothing to
4285                        // clean up for this table.
4286                        tracing::debug!(
4287                            target: "cass::fk_repair",
4288                            child_table = entry.child_table,
4289                            error = %err,
4290                            "skipping orphan probe (table or column unavailable)"
4291                        );
4292                        break;
4293                    }
4294                    Err(err) => {
4295                        return Err(err).with_context(|| {
4296                            format!("probing orphan rows in {}", entry.child_table)
4297                        });
4298                    }
4299                };
4300                if ids.is_empty() {
4301                    break;
4302                }
4303
4304                let deleted = delete_direct_orphan_ids_bisecting_oom(&self.conn, entry, &ids)
4305                    .with_context(|| format!("deleting orphan rows from {}", entry.child_table))?;
4306                if deleted == 0 {
4307                    break;
4308                }
4309                report.record(
4310                    entry.child_table,
4311                    i64::try_from(deleted).unwrap_or(i64::MAX),
4312                );
4313            }
4314        }
4315
4316        if report.total == 0 {
4317            return Ok(report);
4318        }
4319
4320        // WARN only fires after a successful commit so the message accurately
4321        // reflects what actually happened on disk. db_path is included so logs
4322        // from concurrent indexers against different databases stay
4323        // disambiguated.
4324        tracing::warn!(
4325            target: "cass::fk_repair",
4326            db_path = %self.db_path.display(),
4327            total_orphans = report.total,
4328            per_table = ?report.per_table,
4329            "cass#202: removed orphan rows left behind by interrupted index transactions"
4330        );
4331
4332        Ok(report)
4333    }
4334
4335    /// Return the current schema version from `_schema_migrations`.
4336    pub fn schema_version(&self) -> Result<i64> {
4337        let rows = self
4338            .conn
4339            .query("SELECT MAX(version) FROM _schema_migrations;")
4340            .with_context(|| "reading schema version from _schema_migrations")?;
4341
4342        if let Some(row) = rows.first()
4343            && let Ok(v) = row.get_typed::<Option<i64>>(0)
4344        {
4345            return Ok(v.unwrap_or(0));
4346        }
4347        Ok(0)
4348    }
4349
4350    /// Keep `meta.schema_version` in sync for backward compatibility with `SqliteStorage`.
4351    fn sync_meta_schema_version(&self, version: i64) -> Result<()> {
4352        // The meta table is created by V1 migration. If it doesn't exist yet,
4353        // there's nothing to sync.
4354        if self.conn.query("SELECT key FROM meta LIMIT 1;").is_err() {
4355            return Ok(());
4356        }
4357
4358        // Only write if the version needs updating to avoid write lock contention
4359        if let Ok(rows) = self
4360            .conn
4361            .query("SELECT value FROM meta WHERE key = 'schema_version';")
4362            && let Some(row) = rows.first()
4363            && let Ok(val) = row.get_typed::<String>(0)
4364            && val == version.to_string()
4365        {
4366            return Ok(()); // Already up to date
4367        }
4368
4369        self.conn
4370            .execute_compat(
4371                "INSERT OR REPLACE INTO meta(key, value) VALUES('schema_version', ?1);",
4372                &[ParamValue::from(version.to_string())],
4373            )
4374            .with_context(|| "syncing meta schema_version")?;
4375
4376        Ok(())
4377    }
4378
4379    /// Resolve the database file path for this connection.
4380    pub fn database_path(&self) -> Result<PathBuf> {
4381        Ok(self.db_path.clone())
4382    }
4383
4384    pub(crate) fn ephemeral_writer_preflight_verified(&self) -> bool {
4385        self.ephemeral_writer_preflight_verified
4386            .load(Ordering::Relaxed)
4387    }
4388
4389    pub(crate) fn mark_ephemeral_writer_preflight_verified(&self) {
4390        self.ephemeral_writer_preflight_verified
4391            .store(true, Ordering::Relaxed);
4392    }
4393
4394    pub(crate) fn index_writer_checkpoint_pages(&self) -> Option<i64> {
4395        let pages = self.index_writer_checkpoint_pages.load(Ordering::Relaxed);
4396        (pages != UNSET_INDEX_WRITER_CHECKPOINT_PAGES).then_some(pages)
4397    }
4398
4399    pub(crate) fn mark_index_writer_checkpoint_pages(&self, pages: i64) {
4400        self.index_writer_checkpoint_pages
4401            .store(pages, Ordering::Relaxed);
4402    }
4403
4404    pub(crate) fn index_writer_busy_timeout_ms(&self) -> Option<u64> {
4405        let timeout_ms = self.index_writer_busy_timeout_ms.load(Ordering::Relaxed);
4406        (timeout_ms != UNSET_INDEX_WRITER_BUSY_TIMEOUT_MS).then_some(timeout_ms)
4407    }
4408
4409    pub(crate) fn mark_index_writer_busy_timeout_ms(&self, timeout_ms: u64) {
4410        self.index_writer_busy_timeout_ms
4411            .store(timeout_ms, Ordering::Relaxed);
4412    }
4413
4414    /// Open database with migration, backing up if schema is incompatible.
4415    pub fn open_or_rebuild(path: &Path) -> std::result::Result<Self, MigrationError> {
4416        if let Some(parent) = path.parent() {
4417            fs::create_dir_all(parent)?;
4418        }
4419
4420        if path.exists() {
4421            let check_result = check_schema_compatibility(path);
4422            match check_result {
4423                Ok(SchemaCheck::Compatible) | Ok(SchemaCheck::NeedsMigration) => {
4424                    // Continue with normal open
4425                }
4426                Ok(SchemaCheck::NeedsRebuild(reason)) => {
4427                    let backup_path = create_backup(path)?;
4428                    cleanup_old_backups(path, MAX_BACKUPS)?;
4429                    remove_database_files(path)?;
4430                    return Err(MigrationError::RebuildRequired {
4431                        reason,
4432                        backup_path,
4433                    });
4434                }
4435                Err(err) if schema_check_error_requires_rebuild(&err) => {
4436                    let backup_path = create_backup(path)?;
4437                    cleanup_old_backups(path, MAX_BACKUPS)?;
4438                    remove_database_files(path)?;
4439                    return Err(MigrationError::RebuildRequired {
4440                        reason: format!("Database appears corrupted: {err}"),
4441                        backup_path,
4442                    });
4443                }
4444                Err(err) => return Err(MigrationError::Database(err)),
4445            }
4446        }
4447
4448        let storage = Self::open(path).map_err(|e| MigrationError::Other(e.to_string()))?;
4449        Ok(storage)
4450    }
4451}
4452
4453// -------------------------------------------------------------------------
4454// Frankensqlite migration helpers
4455// -------------------------------------------------------------------------
4456
4457/// Build the `MigrationRunner` for the frankensqlite migration path.
4458///
4459/// Uses a single combined migration (version 13) that creates the complete
4460/// final schema in one step. This avoids the V5 `DROP TABLE conversations`
4461/// operation which triggers a known frankensqlite limitation: autoindex entries
4462/// in sqlite_master are not properly cleaned up during DROP TABLE, causing
4463/// "sqlite_master entry not found" errors.
4464///
4465/// For existing databases transitioned from SqliteStorage, the transition
4466/// function backfills `_schema_migrations`; post-V13 additive migrations then
4467/// run normally.
4468fn build_cass_migrations_before_tail_cache() -> MigrationRunner {
4469    MigrationRunner::new()
4470        .add(13, "full_schema_v13", MIGRATION_FRESH_SCHEMA)
4471        .add(14, "fts_contentless", MIGRATION_V14)
4472}
4473
4474fn build_cass_migrations_after_tail_cache() -> MigrationRunner {
4475    MigrationRunner::new()
4476        .add(16, "drop_redundant_message_conv_idx", MIGRATION_V16)
4477        .add(17, "drop_message_created_idx", MIGRATION_V17)
4478        .add(18, "conversation_tail_state_hot_table", MIGRATION_V18)
4479        .add(19, "conversation_external_lookup", MIGRATION_V19)
4480        .add(20, "conversation_external_tail_lookup", MIGRATION_V20)
4481}
4482
4483fn schema_migration_is_applied(conn: &FrankenConnection, version: i64) -> Result<bool> {
4484    let rows = conn
4485        .query_with_params(
4486            "SELECT 1 FROM _schema_migrations WHERE version = ?1 LIMIT 1;",
4487            &[SqliteValue::from(version)],
4488        )
4489        .with_context(|| format!("checking schema migration version {version}"))?;
4490    Ok(!rows.is_empty())
4491}
4492
4493fn apply_conversation_tail_state_cache_migration(conn: &FrankenConnection) -> Result<bool> {
4494    conn.execute("BEGIN IMMEDIATE;")
4495        .with_context(|| "starting v15 conversation tail-state migration transaction")?;
4496
4497    let result = (|| -> Result<bool> {
4498        if schema_migration_is_applied(conn, 15)? {
4499            conn.execute("COMMIT;")
4500                .with_context(|| "committing already-applied v15 migration transaction")?;
4501            return Ok(false);
4502        }
4503
4504        let started = Instant::now();
4505        let conversation_columns = franken_table_column_names(conn, "conversations")
4506            .with_context(|| "inspecting conversations columns before v15 migration")?;
4507        if !conversation_columns.contains("last_message_idx") {
4508            conn.execute("ALTER TABLE conversations ADD COLUMN last_message_idx INTEGER;")
4509                .with_context(|| "adding v15 conversations.last_message_idx column")?;
4510        }
4511        if !conversation_columns.contains("last_message_created_at") {
4512            conn.execute("ALTER TABLE conversations ADD COLUMN last_message_created_at INTEGER;")
4513                .with_context(|| "adding v15 conversations.last_message_created_at column")?;
4514        }
4515        conn.execute_batch(MIGRATION_V15_TAIL_STATE_TABLE)
4516            .with_context(|| "applying v15 conversation tail-state table schema")?;
4517        conn.execute_compat(
4518            "INSERT INTO _schema_migrations (version, name) VALUES (?1, ?2);",
4519            fparams![15_i64, "conversation_tail_state_cache"],
4520        )
4521        .with_context(|| "recording v15 conversation tail-state migration")?;
4522        conn.execute("COMMIT;")
4523            .with_context(|| "committing v15 conversation tail-state migration")?;
4524        info!(
4525            elapsed_ms = started.elapsed().as_millis(),
4526            "applied v15 conversation tail-state cache migration"
4527        );
4528        Ok(true)
4529    })();
4530
4531    if result.is_err() {
4532        let _ = conn.execute("ROLLBACK;");
4533    }
4534
4535    result
4536}
4537
4538fn franken_table_column_names(
4539    conn: &FrankenConnection,
4540    table_name: &str,
4541) -> Result<HashSet<String>> {
4542    if !table_name
4543        .chars()
4544        .all(|c| c.is_ascii_alphanumeric() || c == '_')
4545    {
4546        return Err(anyhow!(
4547            "unsafe table name for PRAGMA table_info: {table_name}"
4548        ));
4549    }
4550
4551    conn.query_map_collect(
4552        &format!("PRAGMA table_info({table_name})"),
4553        fparams![],
4554        |row: &FrankenRow| row.get_typed::<String>(1),
4555    )
4556    .with_context(|| format!("reading PRAGMA table_info({table_name})"))
4557    .map(|columns| columns.into_iter().collect())
4558}
4559
4560/// Combined V13 schema for fresh databases.
4561///
4562/// Creates the complete final schema in a single migration, avoiding the
4563/// incremental V5 `DROP TABLE conversations` which triggers a frankensqlite
4564/// autoindex limitation. All columns from V1-V13 are included in their
4565/// respective CREATE TABLE statements.
4566///
4567/// Table creation order respects foreign key references:
4568/// sources → agents/workspaces → conversations → messages → snippets, etc.
4569const MIGRATION_FRESH_SCHEMA: &str = r"
4570-- Core tables (V1)
4571CREATE TABLE IF NOT EXISTS meta (
4572    key TEXT PRIMARY KEY,
4573    value TEXT NOT NULL
4574);
4575
4576CREATE TABLE IF NOT EXISTS agents (
4577    id INTEGER PRIMARY KEY,
4578    slug TEXT NOT NULL UNIQUE,
4579    name TEXT NOT NULL,
4580    version TEXT,
4581    kind TEXT NOT NULL,
4582    created_at INTEGER NOT NULL,
4583    updated_at INTEGER NOT NULL
4584);
4585
4586CREATE TABLE IF NOT EXISTS workspaces (
4587    id INTEGER PRIMARY KEY,
4588    path TEXT NOT NULL UNIQUE,
4589    display_name TEXT
4590);
4591
4592-- Sources (V4)
4593CREATE TABLE IF NOT EXISTS sources (
4594    id TEXT PRIMARY KEY,
4595    kind TEXT NOT NULL,
4596    host_label TEXT,
4597    machine_id TEXT,
4598    platform TEXT,
4599    config_json TEXT,
4600    created_at INTEGER NOT NULL,
4601    updated_at INTEGER NOT NULL
4602);
4603
4604INSERT OR IGNORE INTO sources (id, kind, host_label, created_at, updated_at)
4605VALUES ('local', 'local', NULL, strftime('%s','now')*1000, strftime('%s','now')*1000);
4606
4607-- Conversations: V1 base + V5 provenance + V7 metadata_bin + V10 token summary
4608CREATE TABLE IF NOT EXISTS conversations (
4609    id INTEGER PRIMARY KEY,
4610    agent_id INTEGER NOT NULL REFERENCES agents(id),
4611    workspace_id INTEGER REFERENCES workspaces(id),
4612    source_id TEXT NOT NULL DEFAULT 'local' REFERENCES sources(id),
4613    external_id TEXT,
4614    title TEXT,
4615    source_path TEXT NOT NULL,
4616    started_at INTEGER,
4617    ended_at INTEGER,
4618    approx_tokens INTEGER,
4619    metadata_json TEXT,
4620    origin_host TEXT,
4621    metadata_bin BLOB,
4622    total_input_tokens INTEGER,
4623    total_output_tokens INTEGER,
4624    total_cache_read_tokens INTEGER,
4625    total_cache_creation_tokens INTEGER,
4626    grand_total_tokens INTEGER,
4627    estimated_cost_usd REAL,
4628    primary_model TEXT,
4629    api_call_count INTEGER,
4630    tool_call_count INTEGER,
4631    user_message_count INTEGER,
4632    assistant_message_count INTEGER,
4633    -- V15 columns are included in the fresh schema so fresh DB creation does
4634    -- not need ALTER TABLE on conversations. That ALTER path can duplicate
4635    -- provenance autoindex state in frankensqlite when the named unique
4636    -- provenance index already exists.
4637    last_message_idx INTEGER,
4638    last_message_created_at INTEGER
4639);
4640
4641-- Named unique index avoids autoindex issues if table is ever recreated
4642CREATE UNIQUE INDEX IF NOT EXISTS idx_conversations_provenance
4643    ON conversations(source_id, agent_id, external_id);
4644
4645-- Messages: V1 base + V7 extra_bin
4646CREATE TABLE IF NOT EXISTS messages (
4647    id INTEGER PRIMARY KEY,
4648    conversation_id INTEGER NOT NULL REFERENCES conversations(id) ON DELETE CASCADE,
4649    idx INTEGER NOT NULL,
4650    role TEXT NOT NULL,
4651    author TEXT,
4652    created_at INTEGER,
4653    content TEXT NOT NULL,
4654    extra_json TEXT,
4655    extra_bin BLOB,
4656    UNIQUE(conversation_id, idx)
4657);
4658
4659CREATE TABLE IF NOT EXISTS snippets (
4660    id INTEGER PRIMARY KEY,
4661    message_id INTEGER NOT NULL REFERENCES messages(id) ON DELETE CASCADE,
4662    file_path TEXT,
4663    start_line INTEGER,
4664    end_line INTEGER,
4665    language TEXT,
4666    snippet_text TEXT
4667);
4668
4669CREATE TABLE IF NOT EXISTS tags (
4670    id INTEGER PRIMARY KEY,
4671    name TEXT NOT NULL UNIQUE
4672);
4673
4674CREATE TABLE IF NOT EXISTS conversation_tags (
4675    conversation_id INTEGER NOT NULL REFERENCES conversations(id) ON DELETE CASCADE,
4676    tag_id INTEGER NOT NULL REFERENCES tags(id) ON DELETE CASCADE,
4677    PRIMARY KEY (conversation_id, tag_id)
4678);
4679
4680-- Daily stats (V8)
4681CREATE TABLE IF NOT EXISTS daily_stats (
4682    day_id INTEGER NOT NULL,
4683    agent_slug TEXT NOT NULL,
4684    source_id TEXT NOT NULL DEFAULT 'all',
4685    session_count INTEGER NOT NULL DEFAULT 0,
4686    message_count INTEGER NOT NULL DEFAULT 0,
4687    total_chars INTEGER NOT NULL DEFAULT 0,
4688    last_updated INTEGER NOT NULL,
4689    PRIMARY KEY (day_id, agent_slug, source_id)
4690);
4691
4692-- Embedding jobs (V9)
4693CREATE TABLE IF NOT EXISTS embedding_jobs (
4694    id INTEGER PRIMARY KEY AUTOINCREMENT,
4695    db_path TEXT NOT NULL,
4696    model_id TEXT NOT NULL,
4697    status TEXT NOT NULL DEFAULT 'pending',
4698    total_docs INTEGER NOT NULL DEFAULT 0,
4699    completed_docs INTEGER NOT NULL DEFAULT 0,
4700    error_message TEXT,
4701    created_at TEXT NOT NULL DEFAULT (datetime('now')),
4702    started_at TEXT,
4703    completed_at TEXT
4704);
4705
4706CREATE UNIQUE INDEX IF NOT EXISTS idx_embedding_jobs_active
4707ON embedding_jobs(db_path, model_id)
4708WHERE status IN ('pending', 'running');
4709
4710-- Token usage ledger (V10)
4711CREATE TABLE IF NOT EXISTS token_usage (
4712    id INTEGER PRIMARY KEY AUTOINCREMENT,
4713    message_id INTEGER NOT NULL REFERENCES messages(id) ON DELETE CASCADE,
4714    conversation_id INTEGER NOT NULL,
4715    agent_id INTEGER NOT NULL,
4716    workspace_id INTEGER,
4717    source_id TEXT NOT NULL DEFAULT 'local',
4718    timestamp_ms INTEGER NOT NULL,
4719    day_id INTEGER NOT NULL,
4720    model_name TEXT,
4721    model_family TEXT,
4722    model_tier TEXT,
4723    service_tier TEXT,
4724    provider TEXT,
4725    input_tokens INTEGER,
4726    output_tokens INTEGER,
4727    cache_read_tokens INTEGER,
4728    cache_creation_tokens INTEGER,
4729    thinking_tokens INTEGER,
4730    total_tokens INTEGER,
4731    estimated_cost_usd REAL,
4732    role TEXT NOT NULL,
4733    content_chars INTEGER NOT NULL,
4734    has_tool_calls INTEGER NOT NULL DEFAULT 0,
4735    tool_call_count INTEGER NOT NULL DEFAULT 0,
4736    data_source TEXT NOT NULL DEFAULT 'api',
4737    UNIQUE(message_id)
4738);
4739
4740-- Token daily stats (V10)
4741CREATE TABLE IF NOT EXISTS token_daily_stats (
4742    day_id INTEGER NOT NULL,
4743    agent_slug TEXT NOT NULL,
4744    source_id TEXT NOT NULL DEFAULT 'all',
4745    model_family TEXT NOT NULL DEFAULT 'all',
4746    api_call_count INTEGER NOT NULL DEFAULT 0,
4747    user_message_count INTEGER NOT NULL DEFAULT 0,
4748    assistant_message_count INTEGER NOT NULL DEFAULT 0,
4749    tool_message_count INTEGER NOT NULL DEFAULT 0,
4750    total_input_tokens INTEGER NOT NULL DEFAULT 0,
4751    total_output_tokens INTEGER NOT NULL DEFAULT 0,
4752    total_cache_read_tokens INTEGER NOT NULL DEFAULT 0,
4753    total_cache_creation_tokens INTEGER NOT NULL DEFAULT 0,
4754    total_thinking_tokens INTEGER NOT NULL DEFAULT 0,
4755    grand_total_tokens INTEGER NOT NULL DEFAULT 0,
4756    total_content_chars INTEGER NOT NULL DEFAULT 0,
4757    total_tool_calls INTEGER NOT NULL DEFAULT 0,
4758    estimated_cost_usd REAL NOT NULL DEFAULT 0.0,
4759    session_count INTEGER NOT NULL DEFAULT 0,
4760    last_updated INTEGER NOT NULL,
4761    PRIMARY KEY (day_id, agent_slug, source_id, model_family)
4762);
4763
4764-- Model pricing (V10)
4765CREATE TABLE IF NOT EXISTS model_pricing (
4766    model_pattern TEXT NOT NULL,
4767    provider TEXT NOT NULL,
4768    input_cost_per_mtok REAL NOT NULL,
4769    output_cost_per_mtok REAL NOT NULL,
4770    cache_read_cost_per_mtok REAL,
4771    cache_creation_cost_per_mtok REAL,
4772    effective_date TEXT NOT NULL,
4773    PRIMARY KEY (model_pattern, effective_date)
4774);
4775
4776INSERT OR IGNORE INTO model_pricing VALUES
4777    ('claude-opus-4%', 'anthropic', 15.0, 75.0, 1.5, 18.75, '2025-10-01'),
4778    ('claude-sonnet-4%', 'anthropic', 3.0, 15.0, 0.3, 3.75, '2025-10-01'),
4779    ('claude-haiku-4%', 'anthropic', 0.80, 4.0, 0.08, 1.0, '2025-10-01'),
4780    ('gpt-4o%', 'openai', 2.50, 10.0, NULL, NULL, '2025-01-01'),
4781    ('gpt-4-turbo%', 'openai', 10.0, 30.0, NULL, NULL, '2024-04-01'),
4782    ('gpt-4.1%', 'openai', 2.0, 8.0, NULL, NULL, '2025-04-01'),
4783    ('o3%', 'openai', 2.0, 8.0, NULL, NULL, '2025-04-01'),
4784    ('o4-mini%', 'openai', 1.10, 4.40, NULL, NULL, '2025-04-01'),
4785    ('gemini-2%flash%', 'google', 0.075, 0.30, NULL, NULL, '2025-01-01'),
4786    ('gemini-2%pro%', 'google', 1.25, 10.0, NULL, NULL, '2025-01-01');
4787
4788-- Message metrics: V11 base + V12 model dimensions
4789CREATE TABLE IF NOT EXISTS message_metrics (
4790    message_id INTEGER PRIMARY KEY REFERENCES messages(id) ON DELETE CASCADE,
4791    created_at_ms INTEGER NOT NULL,
4792    hour_id INTEGER NOT NULL,
4793    day_id INTEGER NOT NULL,
4794    agent_slug TEXT NOT NULL,
4795    workspace_id INTEGER NOT NULL DEFAULT 0,
4796    source_id TEXT NOT NULL DEFAULT 'local',
4797    role TEXT NOT NULL,
4798    content_chars INTEGER NOT NULL,
4799    content_tokens_est INTEGER NOT NULL,
4800    api_input_tokens INTEGER,
4801    api_output_tokens INTEGER,
4802    api_cache_read_tokens INTEGER,
4803    api_cache_creation_tokens INTEGER,
4804    api_thinking_tokens INTEGER,
4805    api_service_tier TEXT,
4806    api_data_source TEXT NOT NULL DEFAULT 'estimated',
4807    tool_call_count INTEGER NOT NULL DEFAULT 0,
4808    has_tool_calls INTEGER NOT NULL DEFAULT 0,
4809    has_plan INTEGER NOT NULL DEFAULT 0,
4810    model_name TEXT,
4811    model_family TEXT NOT NULL DEFAULT 'unknown',
4812    model_tier TEXT NOT NULL DEFAULT 'unknown',
4813    provider TEXT NOT NULL DEFAULT 'unknown'
4814);
4815
4816-- Hourly rollups: V11 base + V13 plan columns
4817CREATE TABLE IF NOT EXISTS usage_hourly (
4818    hour_id INTEGER NOT NULL,
4819    agent_slug TEXT NOT NULL,
4820    workspace_id INTEGER NOT NULL DEFAULT 0,
4821    source_id TEXT NOT NULL DEFAULT 'local',
4822    message_count INTEGER NOT NULL DEFAULT 0,
4823    user_message_count INTEGER NOT NULL DEFAULT 0,
4824    assistant_message_count INTEGER NOT NULL DEFAULT 0,
4825    tool_call_count INTEGER NOT NULL DEFAULT 0,
4826    plan_message_count INTEGER NOT NULL DEFAULT 0,
4827    api_coverage_message_count INTEGER NOT NULL DEFAULT 0,
4828    content_tokens_est_total INTEGER NOT NULL DEFAULT 0,
4829    content_tokens_est_user INTEGER NOT NULL DEFAULT 0,
4830    content_tokens_est_assistant INTEGER NOT NULL DEFAULT 0,
4831    api_tokens_total INTEGER NOT NULL DEFAULT 0,
4832    api_input_tokens_total INTEGER NOT NULL DEFAULT 0,
4833    api_output_tokens_total INTEGER NOT NULL DEFAULT 0,
4834    api_cache_read_tokens_total INTEGER NOT NULL DEFAULT 0,
4835    api_cache_creation_tokens_total INTEGER NOT NULL DEFAULT 0,
4836    api_thinking_tokens_total INTEGER NOT NULL DEFAULT 0,
4837    last_updated INTEGER NOT NULL DEFAULT 0,
4838    plan_content_tokens_est_total INTEGER NOT NULL DEFAULT 0,
4839    plan_api_tokens_total INTEGER NOT NULL DEFAULT 0,
4840    PRIMARY KEY (hour_id, agent_slug, workspace_id, source_id)
4841);
4842
4843-- Daily rollups: V11 base + V13 plan columns
4844CREATE TABLE IF NOT EXISTS usage_daily (
4845    day_id INTEGER NOT NULL,
4846    agent_slug TEXT NOT NULL,
4847    workspace_id INTEGER NOT NULL DEFAULT 0,
4848    source_id TEXT NOT NULL DEFAULT 'local',
4849    message_count INTEGER NOT NULL DEFAULT 0,
4850    user_message_count INTEGER NOT NULL DEFAULT 0,
4851    assistant_message_count INTEGER NOT NULL DEFAULT 0,
4852    tool_call_count INTEGER NOT NULL DEFAULT 0,
4853    plan_message_count INTEGER NOT NULL DEFAULT 0,
4854    api_coverage_message_count INTEGER NOT NULL DEFAULT 0,
4855    content_tokens_est_total INTEGER NOT NULL DEFAULT 0,
4856    content_tokens_est_user INTEGER NOT NULL DEFAULT 0,
4857    content_tokens_est_assistant INTEGER NOT NULL DEFAULT 0,
4858    api_tokens_total INTEGER NOT NULL DEFAULT 0,
4859    api_input_tokens_total INTEGER NOT NULL DEFAULT 0,
4860    api_output_tokens_total INTEGER NOT NULL DEFAULT 0,
4861    api_cache_read_tokens_total INTEGER NOT NULL DEFAULT 0,
4862    api_cache_creation_tokens_total INTEGER NOT NULL DEFAULT 0,
4863    api_thinking_tokens_total INTEGER NOT NULL DEFAULT 0,
4864    last_updated INTEGER NOT NULL DEFAULT 0,
4865    plan_content_tokens_est_total INTEGER NOT NULL DEFAULT 0,
4866    plan_api_tokens_total INTEGER NOT NULL DEFAULT 0,
4867    PRIMARY KEY (day_id, agent_slug, workspace_id, source_id)
4868);
4869
4870-- Model daily rollups (V12)
4871CREATE TABLE IF NOT EXISTS usage_models_daily (
4872    day_id INTEGER NOT NULL,
4873    agent_slug TEXT NOT NULL,
4874    workspace_id INTEGER NOT NULL DEFAULT 0,
4875    source_id TEXT NOT NULL DEFAULT 'local',
4876    model_family TEXT NOT NULL DEFAULT 'unknown',
4877    model_tier TEXT NOT NULL DEFAULT 'unknown',
4878    message_count INTEGER NOT NULL DEFAULT 0,
4879    user_message_count INTEGER NOT NULL DEFAULT 0,
4880    assistant_message_count INTEGER NOT NULL DEFAULT 0,
4881    tool_call_count INTEGER NOT NULL DEFAULT 0,
4882    plan_message_count INTEGER NOT NULL DEFAULT 0,
4883    api_coverage_message_count INTEGER NOT NULL DEFAULT 0,
4884    content_tokens_est_total INTEGER NOT NULL DEFAULT 0,
4885    content_tokens_est_user INTEGER NOT NULL DEFAULT 0,
4886    content_tokens_est_assistant INTEGER NOT NULL DEFAULT 0,
4887    api_tokens_total INTEGER NOT NULL DEFAULT 0,
4888    api_input_tokens_total INTEGER NOT NULL DEFAULT 0,
4889    api_output_tokens_total INTEGER NOT NULL DEFAULT 0,
4890    api_cache_read_tokens_total INTEGER NOT NULL DEFAULT 0,
4891    api_cache_creation_tokens_total INTEGER NOT NULL DEFAULT 0,
4892    api_thinking_tokens_total INTEGER NOT NULL DEFAULT 0,
4893    last_updated INTEGER NOT NULL DEFAULT 0,
4894    PRIMARY KEY (day_id, agent_slug, workspace_id, source_id, model_family, model_tier)
4895);
4896
4897-- All indexes
4898CREATE INDEX IF NOT EXISTS idx_conversations_agent_started ON conversations(agent_id, started_at DESC);
4899CREATE INDEX IF NOT EXISTS idx_conversations_source_id ON conversations(source_id);
4900CREATE INDEX IF NOT EXISTS idx_conversations_source_path ON conversations(source_path);
4901CREATE INDEX IF NOT EXISTS idx_daily_stats_agent ON daily_stats(agent_slug, day_id);
4902CREATE INDEX IF NOT EXISTS idx_daily_stats_source ON daily_stats(source_id, day_id);
4903CREATE INDEX IF NOT EXISTS idx_token_usage_day ON token_usage(day_id, agent_id);
4904CREATE INDEX IF NOT EXISTS idx_token_usage_conv ON token_usage(conversation_id);
4905CREATE INDEX IF NOT EXISTS idx_token_usage_model ON token_usage(model_family, day_id);
4906CREATE INDEX IF NOT EXISTS idx_token_usage_workspace ON token_usage(workspace_id, day_id);
4907CREATE INDEX IF NOT EXISTS idx_token_usage_timestamp ON token_usage(timestamp_ms);
4908CREATE INDEX IF NOT EXISTS idx_token_daily_stats_agent ON token_daily_stats(agent_slug, day_id);
4909CREATE INDEX IF NOT EXISTS idx_token_daily_stats_model ON token_daily_stats(model_family, day_id);
4910CREATE INDEX IF NOT EXISTS idx_mm_hour ON message_metrics(hour_id);
4911CREATE INDEX IF NOT EXISTS idx_mm_day ON message_metrics(day_id);
4912CREATE INDEX IF NOT EXISTS idx_mm_agent_hour ON message_metrics(agent_slug, hour_id);
4913CREATE INDEX IF NOT EXISTS idx_mm_agent_day ON message_metrics(agent_slug, day_id);
4914CREATE INDEX IF NOT EXISTS idx_mm_workspace_hour ON message_metrics(workspace_id, hour_id);
4915CREATE INDEX IF NOT EXISTS idx_mm_source_hour ON message_metrics(source_id, hour_id);
4916CREATE INDEX IF NOT EXISTS idx_mm_model_family_day ON message_metrics(model_family, day_id);
4917CREATE INDEX IF NOT EXISTS idx_mm_provider_day ON message_metrics(provider, day_id);
4918CREATE INDEX IF NOT EXISTS idx_uh_agent ON usage_hourly(agent_slug, hour_id);
4919CREATE INDEX IF NOT EXISTS idx_uh_workspace ON usage_hourly(workspace_id, hour_id);
4920CREATE INDEX IF NOT EXISTS idx_uh_source ON usage_hourly(source_id, hour_id);
4921CREATE INDEX IF NOT EXISTS idx_ud_agent ON usage_daily(agent_slug, day_id);
4922CREATE INDEX IF NOT EXISTS idx_ud_workspace ON usage_daily(workspace_id, day_id);
4923CREATE INDEX IF NOT EXISTS idx_ud_source ON usage_daily(source_id, day_id);
4924CREATE INDEX IF NOT EXISTS idx_umd_model_day ON usage_models_daily(model_family, day_id);
4925CREATE INDEX IF NOT EXISTS idx_umd_agent_day ON usage_models_daily(agent_slug, day_id);
4926CREATE INDEX IF NOT EXISTS idx_umd_workspace_day ON usage_models_daily(workspace_id, day_id);
4927CREATE INDEX IF NOT EXISTS idx_umd_source_day ON usage_models_daily(source_id, day_id);
4928";
4929
/// One registered SQL batch that recreates a group of related current-schema tables.
///
/// `Debug` is derived so a batch can be logged or shown in assertion output.
#[derive(Debug, Clone, Copy)]
struct SchemaRepairBatch {
    /// Short label identifying the batch.
    name: &'static str,
    /// Tables this batch is registered for; the batch is selected when any of
    /// them is reported missing.
    tables: &'static [&'static str],
    /// SQL script executed for this batch.
    sql: &'static str,
}
4936
/// Repair script for the `sources` table.
///
/// Creates the table when it is absent and seeds the built-in `'local'` source
/// row; `INSERT OR IGNORE` leaves an existing `'local'` row untouched.
const CURRENT_SCHEMA_REPAIR_SOURCES_SQL: &str = r"
CREATE TABLE IF NOT EXISTS sources (
    id TEXT PRIMARY KEY,
    kind TEXT NOT NULL,
    host_label TEXT,
    machine_id TEXT,
    platform TEXT,
    config_json TEXT,
    created_at INTEGER NOT NULL,
    updated_at INTEGER NOT NULL
);

INSERT OR IGNORE INTO sources (id, kind, host_label, created_at, updated_at)
VALUES ('local', 'local', NULL, strftime('%s','now')*1000, strftime('%s','now')*1000);
";
4952
/// Repair script for the `daily_stats` table.
///
/// Creates the table (keyed by `day_id`, `agent_slug`, `source_id`) and its two
/// lookup indexes; every statement uses `IF NOT EXISTS`.
const CURRENT_SCHEMA_REPAIR_DAILY_STATS_SQL: &str = r"
CREATE TABLE IF NOT EXISTS daily_stats (
    day_id INTEGER NOT NULL,
    agent_slug TEXT NOT NULL,
    source_id TEXT NOT NULL DEFAULT 'all',
    session_count INTEGER NOT NULL DEFAULT 0,
    message_count INTEGER NOT NULL DEFAULT 0,
    total_chars INTEGER NOT NULL DEFAULT 0,
    last_updated INTEGER NOT NULL,
    PRIMARY KEY (day_id, agent_slug, source_id)
);

CREATE INDEX IF NOT EXISTS idx_daily_stats_agent ON daily_stats(agent_slug, day_id);
CREATE INDEX IF NOT EXISTS idx_daily_stats_source ON daily_stats(source_id, day_id);
";
4968
/// Repair script for the `conversation_external_lookup` table.
///
/// Creates the table when absent, then (re)populates it from every row of
/// `conversations` that has a non-NULL `external_id`. The lookup key is built as
/// `<len(source_id)>:<source_id>:<agent_id>:<len(external_id)>:<external_id>`;
/// `INSERT OR REPLACE` overwrites any existing entry with the same key.
const CURRENT_SCHEMA_REPAIR_CONVERSATION_EXTERNAL_LOOKUP_SQL: &str = r"
CREATE TABLE IF NOT EXISTS conversation_external_lookup (
    lookup_key TEXT PRIMARY KEY,
    conversation_id INTEGER NOT NULL
);

INSERT OR REPLACE INTO conversation_external_lookup (lookup_key, conversation_id)
SELECT
    CAST(length(source_id) AS TEXT) || ':' || source_id || ':' ||
    CAST(agent_id AS TEXT) || ':' ||
    CAST(length(external_id) AS TEXT) || ':' || external_id,
    id
FROM conversations
WHERE external_id IS NOT NULL;
";
4984
/// Repair script for `conversation_tail_state` and
/// `conversation_external_tail_lookup`.
///
/// Creates both tables when absent, then (re)populates the tail lookup from
/// every `conversations` row with a non-NULL `external_id`. Tail columns come
/// from a `LEFT JOIN` on `conversation_tail_state`, so they are NULL for
/// conversations without a tail-state row. The lookup key uses the format
/// `<len(source_id)>:<source_id>:<agent_id>:<len(external_id)>:<external_id>`.
const CURRENT_SCHEMA_REPAIR_CONVERSATION_EXTERNAL_TAIL_LOOKUP_SQL: &str = r"
CREATE TABLE IF NOT EXISTS conversation_tail_state (
    conversation_id INTEGER PRIMARY KEY,
    ended_at INTEGER,
    last_message_idx INTEGER,
    last_message_created_at INTEGER
);

CREATE TABLE IF NOT EXISTS conversation_external_tail_lookup (
    lookup_key TEXT PRIMARY KEY,
    conversation_id INTEGER NOT NULL,
    ended_at INTEGER,
    last_message_idx INTEGER,
    last_message_created_at INTEGER
);

INSERT OR REPLACE INTO conversation_external_tail_lookup (
    lookup_key,
    conversation_id,
    ended_at,
    last_message_idx,
    last_message_created_at
)
SELECT
    CAST(length(c.source_id) AS TEXT) || ':' || c.source_id || ':' ||
    CAST(c.agent_id AS TEXT) || ':' ||
    CAST(length(c.external_id) AS TEXT) || ':' || c.external_id,
    c.id,
    ts.ended_at,
    ts.last_message_idx,
    ts.last_message_created_at
FROM conversations c
LEFT JOIN conversation_tail_state ts ON ts.conversation_id = c.id
WHERE c.external_id IS NOT NULL;
";
5020
/// Repair script for the `embedding_jobs` table.
///
/// Creates the table when absent, plus a partial unique index on
/// `(db_path, model_id)` that only covers rows whose `status` is `'pending'` or
/// `'running'`.
const CURRENT_SCHEMA_REPAIR_EMBEDDING_JOBS_SQL: &str = r"
CREATE TABLE IF NOT EXISTS embedding_jobs (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    db_path TEXT NOT NULL,
    model_id TEXT NOT NULL,
    status TEXT NOT NULL DEFAULT 'pending',
    total_docs INTEGER NOT NULL DEFAULT 0,
    completed_docs INTEGER NOT NULL DEFAULT 0,
    error_message TEXT,
    created_at TEXT NOT NULL DEFAULT (datetime('now')),
    started_at TEXT,
    completed_at TEXT
);

CREATE UNIQUE INDEX IF NOT EXISTS idx_embedding_jobs_active
ON embedding_jobs(db_path, model_id)
WHERE status IN ('pending', 'running');
";
5039
/// Repair script for the token analytics tables.
///
/// Creates `token_usage`, `token_daily_stats`, and `model_pricing` (with their
/// indexes) when absent, then seeds `model_pricing` with a fixed set of rows.
/// The seed uses `INSERT OR IGNORE`, so rows whose
/// `(model_pattern, effective_date)` key already exists are left as they are.
const CURRENT_SCHEMA_REPAIR_TOKEN_ANALYTICS_SQL: &str = r"
CREATE TABLE IF NOT EXISTS token_usage (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    message_id INTEGER NOT NULL REFERENCES messages(id) ON DELETE CASCADE,
    conversation_id INTEGER NOT NULL,
    agent_id INTEGER NOT NULL,
    workspace_id INTEGER,
    source_id TEXT NOT NULL DEFAULT 'local',
    timestamp_ms INTEGER NOT NULL,
    day_id INTEGER NOT NULL,
    model_name TEXT,
    model_family TEXT,
    model_tier TEXT,
    service_tier TEXT,
    provider TEXT,
    input_tokens INTEGER,
    output_tokens INTEGER,
    cache_read_tokens INTEGER,
    cache_creation_tokens INTEGER,
    thinking_tokens INTEGER,
    total_tokens INTEGER,
    estimated_cost_usd REAL,
    role TEXT NOT NULL,
    content_chars INTEGER NOT NULL,
    has_tool_calls INTEGER NOT NULL DEFAULT 0,
    tool_call_count INTEGER NOT NULL DEFAULT 0,
    data_source TEXT NOT NULL DEFAULT 'api',
    UNIQUE(message_id)
);

CREATE INDEX IF NOT EXISTS idx_token_usage_day ON token_usage(day_id, agent_id);
CREATE INDEX IF NOT EXISTS idx_token_usage_conv ON token_usage(conversation_id);
CREATE INDEX IF NOT EXISTS idx_token_usage_model ON token_usage(model_family, day_id);
CREATE INDEX IF NOT EXISTS idx_token_usage_workspace ON token_usage(workspace_id, day_id);
CREATE INDEX IF NOT EXISTS idx_token_usage_timestamp ON token_usage(timestamp_ms);

CREATE TABLE IF NOT EXISTS token_daily_stats (
    day_id INTEGER NOT NULL,
    agent_slug TEXT NOT NULL,
    source_id TEXT NOT NULL DEFAULT 'all',
    model_family TEXT NOT NULL DEFAULT 'all',
    api_call_count INTEGER NOT NULL DEFAULT 0,
    user_message_count INTEGER NOT NULL DEFAULT 0,
    assistant_message_count INTEGER NOT NULL DEFAULT 0,
    tool_message_count INTEGER NOT NULL DEFAULT 0,
    total_input_tokens INTEGER NOT NULL DEFAULT 0,
    total_output_tokens INTEGER NOT NULL DEFAULT 0,
    total_cache_read_tokens INTEGER NOT NULL DEFAULT 0,
    total_cache_creation_tokens INTEGER NOT NULL DEFAULT 0,
    total_thinking_tokens INTEGER NOT NULL DEFAULT 0,
    grand_total_tokens INTEGER NOT NULL DEFAULT 0,
    total_content_chars INTEGER NOT NULL DEFAULT 0,
    total_tool_calls INTEGER NOT NULL DEFAULT 0,
    estimated_cost_usd REAL NOT NULL DEFAULT 0.0,
    session_count INTEGER NOT NULL DEFAULT 0,
    last_updated INTEGER NOT NULL,
    PRIMARY KEY (day_id, agent_slug, source_id, model_family)
);

CREATE INDEX IF NOT EXISTS idx_token_daily_stats_agent ON token_daily_stats(agent_slug, day_id);
CREATE INDEX IF NOT EXISTS idx_token_daily_stats_model ON token_daily_stats(model_family, day_id);

CREATE TABLE IF NOT EXISTS model_pricing (
    model_pattern TEXT NOT NULL,
    provider TEXT NOT NULL,
    input_cost_per_mtok REAL NOT NULL,
    output_cost_per_mtok REAL NOT NULL,
    cache_read_cost_per_mtok REAL,
    cache_creation_cost_per_mtok REAL,
    effective_date TEXT NOT NULL,
    PRIMARY KEY (model_pattern, effective_date)
);

INSERT OR IGNORE INTO model_pricing VALUES
    ('claude-opus-4%', 'anthropic', 15.0, 75.0, 1.5, 18.75, '2025-10-01'),
    ('claude-sonnet-4%', 'anthropic', 3.0, 15.0, 0.3, 3.75, '2025-10-01'),
    ('claude-haiku-4%', 'anthropic', 0.80, 4.0, 0.08, 1.0, '2025-10-01'),
    ('gpt-4o%', 'openai', 2.50, 10.0, NULL, NULL, '2025-01-01'),
    ('gpt-4-turbo%', 'openai', 10.0, 30.0, NULL, NULL, '2024-04-01'),
    ('gpt-4.1%', 'openai', 2.0, 8.0, NULL, NULL, '2025-04-01'),
    ('o3%', 'openai', 2.0, 8.0, NULL, NULL, '2025-04-01'),
    ('o4-mini%', 'openai', 1.10, 4.40, NULL, NULL, '2025-04-01'),
    ('gemini-2%flash%', 'google', 0.075, 0.30, NULL, NULL, '2025-01-01'),
    ('gemini-2%pro%', 'google', 1.25, 10.0, NULL, NULL, '2025-01-01');
";
5125
/// Repair script for the per-message metrics table and its rollup tables.
///
/// Creates `message_metrics`, `usage_hourly`, `usage_daily`, and
/// `usage_models_daily` when absent, followed by all of their indexes. Every
/// statement uses `IF NOT EXISTS`; the script inserts no rows.
const CURRENT_SCHEMA_REPAIR_MESSAGE_METRICS_SQL: &str = r"
CREATE TABLE IF NOT EXISTS message_metrics (
    message_id INTEGER PRIMARY KEY REFERENCES messages(id) ON DELETE CASCADE,
    created_at_ms INTEGER NOT NULL,
    hour_id INTEGER NOT NULL,
    day_id INTEGER NOT NULL,
    agent_slug TEXT NOT NULL,
    workspace_id INTEGER NOT NULL DEFAULT 0,
    source_id TEXT NOT NULL DEFAULT 'local',
    role TEXT NOT NULL,
    content_chars INTEGER NOT NULL,
    content_tokens_est INTEGER NOT NULL,
    api_input_tokens INTEGER,
    api_output_tokens INTEGER,
    api_cache_read_tokens INTEGER,
    api_cache_creation_tokens INTEGER,
    api_thinking_tokens INTEGER,
    api_service_tier TEXT,
    api_data_source TEXT NOT NULL DEFAULT 'estimated',
    tool_call_count INTEGER NOT NULL DEFAULT 0,
    has_tool_calls INTEGER NOT NULL DEFAULT 0,
    has_plan INTEGER NOT NULL DEFAULT 0,
    model_name TEXT,
    model_family TEXT NOT NULL DEFAULT 'unknown',
    model_tier TEXT NOT NULL DEFAULT 'unknown',
    provider TEXT NOT NULL DEFAULT 'unknown'
);

CREATE TABLE IF NOT EXISTS usage_hourly (
    hour_id INTEGER NOT NULL,
    agent_slug TEXT NOT NULL,
    workspace_id INTEGER NOT NULL DEFAULT 0,
    source_id TEXT NOT NULL DEFAULT 'local',
    message_count INTEGER NOT NULL DEFAULT 0,
    user_message_count INTEGER NOT NULL DEFAULT 0,
    assistant_message_count INTEGER NOT NULL DEFAULT 0,
    tool_call_count INTEGER NOT NULL DEFAULT 0,
    plan_message_count INTEGER NOT NULL DEFAULT 0,
    api_coverage_message_count INTEGER NOT NULL DEFAULT 0,
    content_tokens_est_total INTEGER NOT NULL DEFAULT 0,
    content_tokens_est_user INTEGER NOT NULL DEFAULT 0,
    content_tokens_est_assistant INTEGER NOT NULL DEFAULT 0,
    api_tokens_total INTEGER NOT NULL DEFAULT 0,
    api_input_tokens_total INTEGER NOT NULL DEFAULT 0,
    api_output_tokens_total INTEGER NOT NULL DEFAULT 0,
    api_cache_read_tokens_total INTEGER NOT NULL DEFAULT 0,
    api_cache_creation_tokens_total INTEGER NOT NULL DEFAULT 0,
    api_thinking_tokens_total INTEGER NOT NULL DEFAULT 0,
    last_updated INTEGER NOT NULL DEFAULT 0,
    plan_content_tokens_est_total INTEGER NOT NULL DEFAULT 0,
    plan_api_tokens_total INTEGER NOT NULL DEFAULT 0,
    PRIMARY KEY (hour_id, agent_slug, workspace_id, source_id)
);

CREATE TABLE IF NOT EXISTS usage_daily (
    day_id INTEGER NOT NULL,
    agent_slug TEXT NOT NULL,
    workspace_id INTEGER NOT NULL DEFAULT 0,
    source_id TEXT NOT NULL DEFAULT 'local',
    message_count INTEGER NOT NULL DEFAULT 0,
    user_message_count INTEGER NOT NULL DEFAULT 0,
    assistant_message_count INTEGER NOT NULL DEFAULT 0,
    tool_call_count INTEGER NOT NULL DEFAULT 0,
    plan_message_count INTEGER NOT NULL DEFAULT 0,
    api_coverage_message_count INTEGER NOT NULL DEFAULT 0,
    content_tokens_est_total INTEGER NOT NULL DEFAULT 0,
    content_tokens_est_user INTEGER NOT NULL DEFAULT 0,
    content_tokens_est_assistant INTEGER NOT NULL DEFAULT 0,
    api_tokens_total INTEGER NOT NULL DEFAULT 0,
    api_input_tokens_total INTEGER NOT NULL DEFAULT 0,
    api_output_tokens_total INTEGER NOT NULL DEFAULT 0,
    api_cache_read_tokens_total INTEGER NOT NULL DEFAULT 0,
    api_cache_creation_tokens_total INTEGER NOT NULL DEFAULT 0,
    api_thinking_tokens_total INTEGER NOT NULL DEFAULT 0,
    last_updated INTEGER NOT NULL DEFAULT 0,
    plan_content_tokens_est_total INTEGER NOT NULL DEFAULT 0,
    plan_api_tokens_total INTEGER NOT NULL DEFAULT 0,
    PRIMARY KEY (day_id, agent_slug, workspace_id, source_id)
);

CREATE TABLE IF NOT EXISTS usage_models_daily (
    day_id INTEGER NOT NULL,
    agent_slug TEXT NOT NULL,
    workspace_id INTEGER NOT NULL DEFAULT 0,
    source_id TEXT NOT NULL DEFAULT 'local',
    model_family TEXT NOT NULL DEFAULT 'unknown',
    model_tier TEXT NOT NULL DEFAULT 'unknown',
    message_count INTEGER NOT NULL DEFAULT 0,
    user_message_count INTEGER NOT NULL DEFAULT 0,
    assistant_message_count INTEGER NOT NULL DEFAULT 0,
    tool_call_count INTEGER NOT NULL DEFAULT 0,
    plan_message_count INTEGER NOT NULL DEFAULT 0,
    api_coverage_message_count INTEGER NOT NULL DEFAULT 0,
    content_tokens_est_total INTEGER NOT NULL DEFAULT 0,
    content_tokens_est_user INTEGER NOT NULL DEFAULT 0,
    content_tokens_est_assistant INTEGER NOT NULL DEFAULT 0,
    api_tokens_total INTEGER NOT NULL DEFAULT 0,
    api_input_tokens_total INTEGER NOT NULL DEFAULT 0,
    api_output_tokens_total INTEGER NOT NULL DEFAULT 0,
    api_cache_read_tokens_total INTEGER NOT NULL DEFAULT 0,
    api_cache_creation_tokens_total INTEGER NOT NULL DEFAULT 0,
    api_thinking_tokens_total INTEGER NOT NULL DEFAULT 0,
    last_updated INTEGER NOT NULL DEFAULT 0,
    PRIMARY KEY (day_id, agent_slug, workspace_id, source_id, model_family, model_tier)
);

CREATE INDEX IF NOT EXISTS idx_mm_hour ON message_metrics(hour_id);
CREATE INDEX IF NOT EXISTS idx_mm_day ON message_metrics(day_id);
CREATE INDEX IF NOT EXISTS idx_mm_agent_hour ON message_metrics(agent_slug, hour_id);
CREATE INDEX IF NOT EXISTS idx_mm_agent_day ON message_metrics(agent_slug, day_id);
CREATE INDEX IF NOT EXISTS idx_mm_workspace_hour ON message_metrics(workspace_id, hour_id);
CREATE INDEX IF NOT EXISTS idx_mm_source_hour ON message_metrics(source_id, hour_id);
CREATE INDEX IF NOT EXISTS idx_mm_model_family_day ON message_metrics(model_family, day_id);
CREATE INDEX IF NOT EXISTS idx_mm_provider_day ON message_metrics(provider, day_id);
CREATE INDEX IF NOT EXISTS idx_uh_agent ON usage_hourly(agent_slug, hour_id);
CREATE INDEX IF NOT EXISTS idx_uh_workspace ON usage_hourly(workspace_id, hour_id);
CREATE INDEX IF NOT EXISTS idx_uh_source ON usage_hourly(source_id, hour_id);
CREATE INDEX IF NOT EXISTS idx_ud_agent ON usage_daily(agent_slug, day_id);
CREATE INDEX IF NOT EXISTS idx_ud_workspace ON usage_daily(workspace_id, day_id);
CREATE INDEX IF NOT EXISTS idx_ud_source ON usage_daily(source_id, day_id);
CREATE INDEX IF NOT EXISTS idx_umd_model_day ON usage_models_daily(model_family, day_id);
CREATE INDEX IF NOT EXISTS idx_umd_agent_day ON usage_models_daily(agent_slug, day_id);
CREATE INDEX IF NOT EXISTS idx_umd_workspace_day ON usage_models_daily(workspace_id, day_id);
CREATE INDEX IF NOT EXISTS idx_umd_source_day ON usage_models_daily(source_id, day_id);
";
5251
/// Registry of current-schema repair batches, in the order they are selected.
///
/// Each entry pairs a SQL script with the tables that script creates.
/// `current_schema_repair_batches_for_missing_tables` picks every batch that
/// lists at least one missing table, so a batch covering several tables runs
/// as a whole even when only one of them is missing.
const CURRENT_SCHEMA_REPAIR_BATCHES: &[SchemaRepairBatch] = &[
    SchemaRepairBatch {
        name: "sources",
        tables: &["sources"],
        sql: CURRENT_SCHEMA_REPAIR_SOURCES_SQL,
    },
    SchemaRepairBatch {
        name: "daily_stats",
        tables: &["daily_stats"],
        sql: CURRENT_SCHEMA_REPAIR_DAILY_STATS_SQL,
    },
    SchemaRepairBatch {
        name: "conversation_external_lookup",
        tables: &["conversation_external_lookup"],
        sql: CURRENT_SCHEMA_REPAIR_CONVERSATION_EXTERNAL_LOOKUP_SQL,
    },
    SchemaRepairBatch {
        name: "conversation_external_tail_lookup",
        tables: &[
            "conversation_tail_state",
            "conversation_external_tail_lookup",
        ],
        sql: CURRENT_SCHEMA_REPAIR_CONVERSATION_EXTERNAL_TAIL_LOOKUP_SQL,
    },
    SchemaRepairBatch {
        name: "embedding_jobs",
        tables: &["embedding_jobs"],
        sql: CURRENT_SCHEMA_REPAIR_EMBEDDING_JOBS_SQL,
    },
    SchemaRepairBatch {
        name: "token_analytics",
        tables: &["token_usage", "token_daily_stats", "model_pricing"],
        sql: CURRENT_SCHEMA_REPAIR_TOKEN_ANALYTICS_SQL,
    },
    SchemaRepairBatch {
        name: "message_rollups",
        tables: &[
            "message_metrics",
            "usage_hourly",
            "usage_daily",
            "usage_models_daily",
        ],
        sql: CURRENT_SCHEMA_REPAIR_MESSAGE_METRICS_SQL,
    },
];
5297
5298fn current_schema_repair_batches_for_missing_tables(
5299    missing_tables: &[&'static str],
5300) -> Result<Vec<&'static SchemaRepairBatch>> {
5301    let missing_set: HashSet<&'static str> = missing_tables.iter().copied().collect();
5302    let mut selected_batches = Vec::new();
5303    let mut covered_tables = HashSet::new();
5304
5305    for batch in CURRENT_SCHEMA_REPAIR_BATCHES {
5306        if !batch
5307            .tables
5308            .iter()
5309            .any(|table_name| missing_set.contains(table_name))
5310        {
5311            continue;
5312        }
5313        selected_batches.push(batch);
5314        covered_tables.extend(batch.tables.iter().copied());
5315    }
5316
5317    for &table_name in missing_tables {
5318        if !covered_tables.contains(table_name) {
5319            return Err(anyhow!(
5320                "no current-schema repair batch registered for missing table {table_name}"
5321            ));
5322        }
5323    }
5324
5325    Ok(selected_batches)
5326}
5327
/// Migration name lookup for backfilling `_schema_migrations` during transition.
///
/// Entries are `(version, name)` pairs listed in ascending version order.
/// `transition_from_meta_version` walks the array front to back and stops at the
/// first version above its backfill target, so the ordering must be preserved
/// when entries are added.
const MIGRATION_NAMES: [(i64, &str); 20] = [
    (1, "core_tables"),
    (2, "fts_messages"),
    (3, "fts_messages_rebuild"),
    (4, "sources"),
    (5, "provenance_columns"),
    (6, "source_path_index"),
    (7, "msgpack_columns"),
    (8, "daily_stats"),
    (9, "embedding_jobs"),
    (10, "token_analytics"),
    (11, "message_metrics"),
    (12, "model_dimensions"),
    (13, "plan_token_rollups"),
    (14, "fts_contentless"),
    (15, "conversation_tail_state_cache"),
    (16, "drop_redundant_message_conv_idx"),
    (17, "drop_message_created_idx"),
    (18, "conversation_tail_state_hot_table"),
    (19, "conversation_external_lookup"),
    (20, "conversation_external_tail_lookup"),
];
5351
5352/// Transitions an existing database from `meta` table schema versioning to the
5353/// `_schema_migrations` table used by `MigrationRunner`.
5354///
5355/// The existing `SqliteStorage` tracks schema version as a string value in
5356/// `meta WHERE key = 'schema_version'`. The bead spec references
5357/// `PRAGMA user_version`, but the actual cass code uses the `meta` table.
5358/// This function handles the real code path.
5359///
5360/// Behavior:
5361/// - If `_schema_migrations` already exists → skip (already transitioned)
5362/// - If `meta` table has `schema_version > 0` → create `_schema_migrations`
5363///   and backfill entries for versions `1..=current_version`
5364/// - Legacy V10-V12 databases are represented as V13 in `_schema_migrations`
5365///   because frankensqlite uses one combined V13 base migration instead of
5366///   replaying the old incremental V11-V13 steps.
5367/// - If `meta` table missing or `schema_version = 0` with no tables → fresh DB,
5368///   let `MigrationRunner` handle it
5369/// - If `schema_version = 0` but tables exist → corrupted state, log warning
5370fn transition_from_meta_version(conn: &FrankenConnection) -> Result<()> {
5371    // Avoid sqlite_master enumeration here. Databases with FTS virtual tables
5372    // can trigger frankensqlite parse-recovery on sqlite_master reads, which is
5373    // enough to break the transition on otherwise-healthy legacy cass DBs.
5374    if conn
5375        .query("SELECT version FROM \"_schema_migrations\";")
5376        .is_ok()
5377    {
5378        return Ok(());
5379    }
5380
5381    // Check if the meta table exists.
5382    if conn.query("SELECT key FROM meta;").is_err() {
5383        // No meta table → fresh database, let MigrationRunner handle it.
5384        return Ok(());
5385    }
5386
5387    // Read the current schema version from the meta table.
5388    let rows = conn
5389        .query("SELECT value FROM meta WHERE key = 'schema_version';")
5390        .with_context(|| "reading schema_version from meta")?;
5391
5392    let current_version: i64 = rows
5393        .first()
5394        .and_then(|row| row.get_typed::<String>(0).ok())
5395        .and_then(|s| s.parse().ok())
5396        .unwrap_or(0);
5397
5398    if current_version == 0 {
5399        // Check if tables actually exist (corrupted state: tables present but version=0).
5400        if conn.query("SELECT id FROM conversations LIMIT 1;").is_err() {
5401            // Truly fresh DB (meta table exists but empty/reset). Let MigrationRunner handle it.
5402            return Ok(());
5403        }
5404
5405        // Tables exist but version=0: corrupted state. Log and skip transition;
5406        // MigrationRunner will fail on "table already exists" and surface the error.
5407        info!("meta.schema_version=0 but tables exist; skipping transition (corrupted state)");
5408        return Ok(());
5409    }
5410
5411    // Create _schema_migrations and backfill entries for all applied versions.
5412    info!(
5413        current_version,
5414        "transitioning schema tracking from meta table to _schema_migrations"
5415    );
5416
5417    conn.execute(
5418        "CREATE TABLE IF NOT EXISTS _schema_migrations (\
5419            version INTEGER PRIMARY KEY, \
5420            name TEXT NOT NULL, \
5421            applied_at TEXT NOT NULL DEFAULT (strftime('%Y-%m-%dT%H:%M:%SZ', 'now'))\
5422        );",
5423    )
5424    .with_context(|| "creating _schema_migrations table for transition")?;
5425
5426    let backfill_through_version = if (10..13).contains(&current_version) {
5427        13
5428    } else {
5429        current_version
5430    };
5431
5432    for &(version, name) in &MIGRATION_NAMES {
5433        if version > backfill_through_version {
5434            break;
5435        }
5436        conn.execute_compat(
5437            "INSERT INTO _schema_migrations (version, name) VALUES (?1, ?2);",
5438            &[ParamValue::from(version), ParamValue::from(name)],
5439        )
5440        .with_context(|| format!("backfilling _schema_migrations version {version}"))?;
5441    }
5442
5443    info!(
5444        current_version,
5445        backfill_through_version,
5446        "schema version transition complete: backfilled legacy meta schema versions"
5447    );
5448
5449    Ok(())
5450}
5451
/// `(table name, probe query)` pairs for tables the current schema requires.
///
/// Each probe is a minimal `SELECT … LIMIT 1` against the named table. Every
/// table listed here is also covered by an entry in
/// `CURRENT_SCHEMA_REPAIR_BATCHES`.
// NOTE(review): presumably a probe failing with "no such table" marks the table
// as missing and triggers its repair batch — confirm against the caller.
const REQUIRED_CURRENT_SCHEMA_TABLE_PROBES: &[(&str, &str)] = &[
    ("sources", "SELECT id FROM sources LIMIT 1;"),
    ("daily_stats", "SELECT day_id FROM daily_stats LIMIT 1;"),
    (
        "conversation_external_lookup",
        "SELECT lookup_key FROM conversation_external_lookup LIMIT 1;",
    ),
    (
        "conversation_tail_state",
        "SELECT conversation_id FROM conversation_tail_state LIMIT 1;",
    ),
    (
        "conversation_external_tail_lookup",
        "SELECT lookup_key, last_message_idx FROM conversation_external_tail_lookup LIMIT 1;",
    ),
    ("embedding_jobs", "SELECT id FROM embedding_jobs LIMIT 1;"),
    ("token_usage", "SELECT id FROM token_usage LIMIT 1;"),
    (
        "token_daily_stats",
        "SELECT day_id FROM token_daily_stats LIMIT 1;",
    ),
    (
        "model_pricing",
        "SELECT model_pattern FROM model_pricing LIMIT 1;",
    ),
    (
        "message_metrics",
        "SELECT message_id FROM message_metrics LIMIT 1;",
    ),
    ("usage_hourly", "SELECT hour_id FROM usage_hourly LIMIT 1;"),
    ("usage_daily", "SELECT day_id FROM usage_daily LIMIT 1;"),
    (
        "usage_models_daily",
        "SELECT day_id FROM usage_models_daily LIMIT 1;",
    ),
];
5488
/// `(column name, SQL type)` pairs for the token-summary columns that must exist.
// NOTE(review): judging by the name these are columns of `conversations`, likely
// added via ALTER TABLE when absent — the consumer is not visible here; confirm.
const REQUIRED_CONVERSATION_TOKEN_COLUMNS: &[(&str, &str)] = &[
    ("total_input_tokens", "INTEGER"),
    ("total_output_tokens", "INTEGER"),
    ("total_cache_read_tokens", "INTEGER"),
    ("total_cache_creation_tokens", "INTEGER"),
    ("grand_total_tokens", "INTEGER"),
    ("estimated_cost_usd", "REAL"),
    ("primary_model", "TEXT"),
    ("api_call_count", "INTEGER"),
    ("tool_call_count", "INTEGER"),
    ("user_message_count", "INTEGER"),
    ("assistant_message_count", "INTEGER"),
];
5502
/// Returns `true` when the rendered error text contains `no such table`,
/// compared ASCII-case-insensitively.
fn error_indicates_missing_table(err: &impl std::fmt::Display) -> bool {
    let mut rendered = err.to_string();
    // Lowercase in place so the match ignores ASCII case.
    rendered.make_ascii_lowercase();
    rendered.contains("no such table")
}
5508
/// Returns `true` when the rendered error text contains `no such column`,
/// compared ASCII-case-insensitively.
fn error_indicates_missing_column(err: &impl std::fmt::Display) -> bool {
    let mut rendered = err.to_string();
    // Lowercase in place so the match ignores ASCII case.
    rendered.make_ascii_lowercase();
    rendered.contains("no such column")
}
5514
/// Maximum number of ids handled per chunk (and bound per `DELETE` statement)
/// by the orphan foreign-key cleanup helpers.
const ORPHAN_FK_ID_CHUNK_SIZE: usize = 256;
5516
5517fn collect_orphan_message_ids(conn: &FrankenConnection) -> Result<Vec<i64>> {
5518    let min_conversation_id = conn
5519        .query_map_collect(
5520            "SELECT conversation_id
5521             FROM messages
5522             ORDER BY conversation_id ASC
5523             LIMIT 1",
5524            fparams![],
5525            |row| row.get_typed(0),
5526        )
5527        .context("finding minimum message conversation id for orphan FK cleanup")?
5528        .into_iter()
5529        .next();
5530    let Some(min_conversation_id) = min_conversation_id else {
5531        return Ok(Vec::new());
5532    };
5533    let max_conversation_id: i64 = conn
5534        .query_row_map(
5535            "SELECT conversation_id
5536             FROM messages
5537             ORDER BY conversation_id DESC
5538             LIMIT 1",
5539            fparams![],
5540            |row| row.get_typed(0),
5541        )
5542        .context("finding maximum message conversation id for orphan FK cleanup")?;
5543
5544    let parent_conversation_ids: Vec<i64> = conn
5545        .query_map_collect(
5546            "SELECT id
5547             FROM conversations
5548             WHERE id BETWEEN ?1 AND ?2
5549             ORDER BY id",
5550            fparams![min_conversation_id, max_conversation_id],
5551            |row| row.get_typed(0),
5552        )
5553        .context("listing parent conversation ids for orphan FK cleanup")?;
5554
5555    let mut message_ids = Vec::new();
5556    let mut gap_start = min_conversation_id;
5557    for parent_id in parent_conversation_ids {
5558        if parent_id < gap_start {
5559            continue;
5560        }
5561        if parent_id > max_conversation_id {
5562            break;
5563        }
5564        if gap_start < parent_id {
5565            collect_message_ids_for_conversation_gap(
5566                conn,
5567                gap_start,
5568                parent_id.saturating_sub(1),
5569                &mut message_ids,
5570            )?;
5571        }
5572        if parent_id == i64::MAX {
5573            return Ok(message_ids);
5574        }
5575        gap_start = parent_id + 1;
5576    }
5577    if gap_start <= max_conversation_id {
5578        collect_message_ids_for_conversation_gap(
5579            conn,
5580            gap_start,
5581            max_conversation_id,
5582            &mut message_ids,
5583        )?;
5584    }
5585
5586    Ok(message_ids)
5587}
5588
5589fn collect_message_ids_for_conversation_gap(
5590    conn: &FrankenConnection,
5591    gap_start: i64,
5592    gap_end: i64,
5593    message_ids: &mut Vec<i64>,
5594) -> Result<()> {
5595    let (sql, params) = if gap_start == gap_end {
5596        (
5597            "SELECT id FROM messages WHERE conversation_id = ?1",
5598            vec![SqliteValue::from(gap_start)],
5599        )
5600    } else {
5601        (
5602            "SELECT id FROM messages WHERE conversation_id BETWEEN ?1 AND ?2",
5603            vec![SqliteValue::from(gap_start), SqliteValue::from(gap_end)],
5604        )
5605    };
5606    let rows = conn.query_with_params(sql, &params).with_context(|| {
5607        format!("listing orphan message ids for conversation-id gap {gap_start}..={gap_end}")
5608    })?;
5609    message_ids.reserve(rows.len());
5610    for row in rows {
5611        message_ids.push(row.get_typed(0)?);
5612    }
5613    Ok(())
5614}
5615
5616fn delete_rows_by_i64_chunks(
5617    tx: &FrankenTransaction<'_>,
5618    delete_many_sql_prefix: &'static str,
5619    ids: &[i64],
5620) -> Result<usize> {
5621    if ids.is_empty() {
5622        return Ok(0);
5623    }
5624
5625    let full_chunk_sql = delete_rows_by_i64_sql(delete_many_sql_prefix, ORPHAN_FK_ID_CHUNK_SIZE);
5626    let tail_len = ids.len() % ORPHAN_FK_ID_CHUNK_SIZE;
5627    let tail_sql =
5628        (tail_len != 0).then(|| delete_rows_by_i64_sql(delete_many_sql_prefix, tail_len));
5629
5630    let mut deleted = 0;
5631    for chunk in ids.chunks(ORPHAN_FK_ID_CHUNK_SIZE) {
5632        let sql = if chunk.len() == ORPHAN_FK_ID_CHUNK_SIZE {
5633            &full_chunk_sql
5634        } else {
5635            tail_sql.as_ref().unwrap_or(&full_chunk_sql)
5636        };
5637        let params = chunk
5638            .iter()
5639            .map(|id| SqliteValue::from(*id))
5640            .collect::<Vec<_>>();
5641        deleted += tx.execute_with_params(sql, &params)?;
5642    }
5643    Ok(deleted)
5644}
5645
5646fn delete_rows_by_i64_sql(delete_many_sql_prefix: &'static str, count: usize) -> String {
5647    let placeholders = sql_placeholders(count);
5648    format!("{delete_many_sql_prefix} ({placeholders})")
5649}
5650
/// Returns `count` `?` placeholders separated by `", "` (empty for zero).
fn sql_placeholders(count: usize) -> String {
    let mut placeholders = String::new();
    for position in 0..count {
        if position > 0 {
            placeholders.push_str(", ");
        }
        placeholders.push('?');
    }
    placeholders
}
5654
5655fn delete_orphan_message_ids_bisecting_oom(conn: &FrankenConnection, ids: &[i64]) -> Result<usize> {
5656    let mut deleted = 0usize;
5657    for chunk in ids.chunks(ORPHAN_FK_ID_CHUNK_SIZE) {
5658        deleted = deleted.saturating_add(delete_orphan_message_id_chunk(conn, chunk)?);
5659    }
5660    Ok(deleted)
5661}
5662
5663fn delete_orphan_message_id_chunk(conn: &FrankenConnection, ids: &[i64]) -> Result<usize> {
5664    if ids.is_empty() {
5665        return Ok(0);
5666    }
5667
5668    match delete_orphan_message_id_chunk_once(conn, ids) {
5669        Ok(deleted) => Ok(deleted),
5670        Err(err) if is_out_of_memory_error(&err) && ids.len() > 1 => {
5671            let split_at = ids.len() / 2;
5672            tracing::warn!(
5673                target: "cass::fk_repair",
5674                rows = ids.len(),
5675                left = split_at,
5676                right = ids.len().saturating_sub(split_at),
5677                error = %err,
5678                "orphan-message cleanup ran out of memory; retrying as smaller batches"
5679            );
5680            let left = delete_orphan_message_id_chunk(conn, &ids[..split_at])?;
5681            let right = delete_orphan_message_id_chunk(conn, &ids[split_at..])?;
5682            Ok(left.saturating_add(right))
5683        }
5684        Err(err) => Err(err),
5685    }
5686}
5687
5688fn delete_orphan_message_id_chunk_once(conn: &FrankenConnection, ids: &[i64]) -> Result<usize> {
5689    let mut tx = conn.transaction()?;
5690    let mut deleted = 0usize;
5691    for entry in ORPHAN_MESSAGE_DEPENDENT_TABLES {
5692        match delete_rows_by_i64_chunks(&tx, entry.delete_many_sql_prefix, ids) {
5693            Ok(count) => {
5694                deleted = deleted.saturating_add(count);
5695            }
5696            Err(err) if error_indicates_missing_table(&err) => {
5697                tracing::debug!(
5698                    target: "cass::fk_repair",
5699                    child_table = entry.child_table,
5700                    error = %err,
5701                    "skipping orphan-message dependent cleanup (table unavailable)"
5702                );
5703            }
5704            Err(err) => {
5705                return Err(err).with_context(|| {
5706                    format!(
5707                        "deleting rows from {} that depend on orphan messages",
5708                        entry.child_table
5709                    )
5710                });
5711            }
5712        }
5713    }
5714    deleted = deleted.saturating_add(
5715        delete_rows_by_i64_chunks(&tx, "DELETE FROM messages WHERE id IN", ids)
5716            .context("deleting orphan rows from messages")?,
5717    );
5718    tx.commit()?;
5719    Ok(deleted)
5720}
5721
5722fn collect_direct_orphan_id_page(
5723    conn: &FrankenConnection,
5724    entry: &'static OrphanFkTable,
5725) -> Result<Vec<i64>> {
5726    Ok(conn.query_map_collect(
5727        entry.orphan_id_page_sql,
5728        fparams![i64::try_from(ORPHAN_FK_ID_CHUNK_SIZE).unwrap_or(i64::MAX)],
5729        |row| row.get_typed(0),
5730    )?)
5731}
5732
5733fn delete_direct_orphan_ids_bisecting_oom(
5734    conn: &FrankenConnection,
5735    entry: &'static OrphanFkTable,
5736    ids: &[i64],
5737) -> Result<usize> {
5738    let mut deleted = 0usize;
5739    for chunk in ids.chunks(ORPHAN_FK_ID_CHUNK_SIZE) {
5740        deleted = deleted.saturating_add(delete_direct_orphan_id_chunk(conn, entry, chunk)?);
5741    }
5742    Ok(deleted)
5743}
5744
5745fn delete_direct_orphan_id_chunk(
5746    conn: &FrankenConnection,
5747    entry: &'static OrphanFkTable,
5748    ids: &[i64],
5749) -> Result<usize> {
5750    if ids.is_empty() {
5751        return Ok(0);
5752    }
5753
5754    match delete_direct_orphan_id_chunk_once(conn, entry, ids) {
5755        Ok(deleted) => Ok(deleted),
5756        Err(err) if is_out_of_memory_error(&err) && ids.len() > 1 => {
5757            let split_at = ids.len() / 2;
5758            tracing::warn!(
5759                target: "cass::fk_repair",
5760                child_table = entry.child_table,
5761                rows = ids.len(),
5762                left = split_at,
5763                right = ids.len().saturating_sub(split_at),
5764                error = %err,
5765                "direct orphan cleanup ran out of memory; retrying as smaller batches"
5766            );
5767            let left = delete_direct_orphan_id_chunk(conn, entry, &ids[..split_at])?;
5768            let right = delete_direct_orphan_id_chunk(conn, entry, &ids[split_at..])?;
5769            Ok(left.saturating_add(right))
5770        }
5771        Err(err) => Err(err),
5772    }
5773}
5774
5775fn delete_direct_orphan_id_chunk_once(
5776    conn: &FrankenConnection,
5777    entry: &'static OrphanFkTable,
5778    ids: &[i64],
5779) -> Result<usize> {
5780    let mut tx = conn.transaction()?;
5781    let deleted = delete_rows_by_i64_chunks(&tx, entry.delete_many_sql_prefix, ids)?;
5782    tx.commit()?;
5783    Ok(deleted)
5784}
5785
/// One child table whose FK parent rows can go missing when an index
/// transaction is dropped mid-flight.
///
/// Both SQL strings are fully static (no table name is ever interpolated), so
/// they can be audited at a glance and cannot be subverted by an injected
/// identifier. The page query yields the integer FK key that the matching
/// chunked delete then consumes.
struct OrphanFkTable {
    /// Child table name, used for logging and reporting.
    child_table: &'static str,
    /// Query returning up to `?1` orphan FK keys, ordered by that key.
    orphan_id_page_sql: &'static str,
    /// `DELETE … IN` prefix; the caller appends the `(?, ?, …)` list.
    delete_many_sql_prefix: &'static str,
}

/// Child tables cleaned directly by FK key: three keyed on `message_id`
/// against `messages`, one keyed on `conversation_id` against `conversations`.
const ORPHAN_DIRECT_CHILD_TABLES: &[OrphanFkTable] = &[
    OrphanFkTable {
        child_table: "message_metrics",
        orphan_id_page_sql: concat!(
            "SELECT message_id FROM message_metrics ",
            "WHERE NOT EXISTS (",
            "SELECT 1 FROM messages ",
            "WHERE messages.id = message_metrics.message_id",
            ") ",
            "ORDER BY message_id ",
            "LIMIT ?1",
        ),
        delete_many_sql_prefix: "DELETE FROM message_metrics WHERE message_id IN",
    },
    OrphanFkTable {
        child_table: "token_usage",
        orphan_id_page_sql: concat!(
            "SELECT message_id FROM token_usage ",
            "WHERE NOT EXISTS (",
            "SELECT 1 FROM messages ",
            "WHERE messages.id = token_usage.message_id",
            ") ",
            "ORDER BY message_id ",
            "LIMIT ?1",
        ),
        delete_many_sql_prefix: "DELETE FROM token_usage WHERE message_id IN",
    },
    OrphanFkTable {
        child_table: "snippets",
        orphan_id_page_sql: concat!(
            "SELECT message_id FROM snippets ",
            "WHERE NOT EXISTS (",
            "SELECT 1 FROM messages ",
            "WHERE messages.id = snippets.message_id",
            ") ",
            "ORDER BY message_id ",
            "LIMIT ?1",
        ),
        delete_many_sql_prefix: "DELETE FROM snippets WHERE message_id IN",
    },
    OrphanFkTable {
        child_table: "conversation_tags",
        orphan_id_page_sql: concat!(
            "SELECT conversation_id FROM conversation_tags ",
            "WHERE NOT EXISTS (",
            "SELECT 1 FROM conversations ",
            "WHERE conversations.id = conversation_tags.conversation_id",
            ") ",
            "ORDER BY conversation_id ",
            "LIMIT ?1",
        ),
        delete_many_sql_prefix: "DELETE FROM conversation_tags WHERE conversation_id IN",
    },
];
5843
/// A table whose rows reference `messages.id` and must be removed before the
/// orphan message they point at is deleted.
struct OrphanMessageDependentTable {
    /// Dependent table name, used in logs and error context.
    child_table: &'static str,
    /// Static `DELETE … WHERE message_id IN` prefix; the caller appends the
    /// `(?, ?, …)` placeholder list.
    delete_many_sql_prefix: &'static str,
}

/// Dependent tables, in the order `delete_orphan_message_id_chunk_once`
/// clears them.
const ORPHAN_MESSAGE_DEPENDENT_TABLES: &[OrphanMessageDependentTable] = &[
    OrphanMessageDependentTable {
        child_table: "message_metrics",
        delete_many_sql_prefix: "DELETE FROM message_metrics WHERE message_id IN",
    },
    OrphanMessageDependentTable {
        child_table: "token_usage",
        delete_many_sql_prefix: "DELETE FROM token_usage WHERE message_id IN",
    },
    OrphanMessageDependentTable {
        child_table: "snippets",
        delete_many_sql_prefix: "DELETE FROM snippets WHERE message_id IN",
    },
];
5863
/// Summary of orphan rows detected and removed by `cleanup_orphan_fk_rows`.
///
/// Message-root counts are taken from the probe phase; direct child counts are
/// taken from the bounded page deletes. The intended caller is a single
/// indexer-startup pass that holds the index run lock, so there are no
/// concurrent writers and the counts match the primary orphan roots that were
/// identified and removed. Rows that hang off an orphan message
/// (`message_metrics` / `token_usage` / `snippets`) disappear as a consequence
/// of removing that root and are *not* counted separately in `total` or
/// `per_table`.
#[derive(Debug, Default, Clone)]
pub(crate) struct OrphanFkCleanupReport {
    /// Saturating sum of every count passed to `record`.
    pub total: i64,
    /// Per-table tallies, in first-recorded order, one entry per table.
    pub per_table: Vec<(&'static str, i64)>,
}

impl OrphanFkCleanupReport {
    /// Adds `count` to the tally for `child_table` (creating the entry on
    /// first use) and to `total`. Both additions saturate instead of wrapping.
    fn record(&mut self, child_table: &'static str, count: i64) {
        let existing_slot = self
            .per_table
            .iter()
            .position(|(table, _)| *table == child_table);
        match existing_slot {
            Some(slot) => {
                let tally = &mut self.per_table[slot].1;
                *tally = tally.saturating_add(count);
            }
            None => self.per_table.push((child_table, count)),
        }
        self.total = self.total.saturating_add(count);
    }
}
5894
/// Result of inserting a conversation and its messages.
// NOTE(review): field meanings below are inferred from names; the producer is
// outside this view — confirm against the insert path.
pub struct InsertOutcome {
    /// Id of the conversation the insert resolved to.
    pub conversation_id: i64,
    /// Presumably `true` when a new conversation row was created rather than
    /// an existing one reused.
    pub conversation_inserted: bool,
    /// Presumably the message `idx` values that were actually inserted.
    pub inserted_indices: Vec<i64>,
}
5900
/// Test-only timing breakdown of the message-insert stage.
#[cfg(test)]
#[derive(Debug, Clone, Default)]
struct MessageInsertSubstageProfile {
    single_row_calls: usize,
    batch_calls: usize,
    batch_rows: usize,
    payload_duration: Duration,
    sql_build_duration: Duration,
    param_build_duration: Duration,
    execute_duration: Duration,
    rowid_duration: Duration,
}

/// Test-only accumulated timings for the conversation-tree insert path, one
/// `Duration` per stage plus call and message counters.
#[cfg(test)]
#[derive(Debug, Clone, Default)]
struct InsertConversationTreePerfProfile {
    invocations: usize,
    messages: usize,
    inserted_messages: usize,
    total_duration: Duration,
    source_duration: Duration,
    tx_open_duration: Duration,
    existing_lookup_duration: Duration,
    existing_idx_lookup_duration: Duration,
    existing_replay_lookup_duration: Duration,
    dedupe_filter_duration: Duration,
    conversation_row_duration: Duration,
    message_insert_duration: Duration,
    message_insert_breakdown: MessageInsertSubstageProfile,
    snippet_insert_duration: Duration,
    fts_entry_duration: Duration,
    fts_flush_duration: Duration,
    analytics_duration: Duration,
    commit_duration: Duration,
}

#[cfg(test)]
impl InsertConversationTreePerfProfile {
    /// Converts a duration to fractional milliseconds.
    fn millis(duration: Duration) -> f64 {
        1000.0 * duration.as_secs_f64()
    }

    /// Prints a single `CASS_INSERT_TREE_STAGE_PROFILE` line to stderr with
    /// every stage in milliseconds, the unaccounted residual, and per-call
    /// averages (the divisor is `invocations`, floored at 1).
    fn log_summary(&self, label: &str) {
        let ms = Self::millis;
        let breakdown = &self.message_insert_breakdown;
        let call_count = self.invocations.max(1) as f64;
        let per_call_ms = |duration: Duration| Self::millis(duration) / call_count;

        // Top-level stages only; the message-insert breakdown is already part
        // of `message_insert_duration` and must not be counted twice.
        let accounted: Duration = [
            self.source_duration,
            self.tx_open_duration,
            self.existing_lookup_duration,
            self.existing_idx_lookup_duration,
            self.existing_replay_lookup_duration,
            self.dedupe_filter_duration,
            self.conversation_row_duration,
            self.message_insert_duration,
            self.snippet_insert_duration,
            self.fts_entry_duration,
            self.fts_flush_duration,
            self.analytics_duration,
            self.commit_duration,
        ]
        .iter()
        .sum();
        let residual = self.total_duration.saturating_sub(accounted);

        eprintln!(
            concat!(
                "CASS_INSERT_TREE_STAGE_PROFILE ",
                "label={} calls={} messages={} inserted_messages={} ",
                "total_ms={:.3} source_ms={:.3} tx_open_ms={:.3} existing_lookup_ms={:.3} ",
                "existing_idx_lookup_ms={:.3} existing_replay_lookup_ms={:.3} dedupe_filter_ms={:.3} ",
                "conversation_row_ms={:.3} message_insert_ms={:.3} snippet_insert_ms={:.3} ",
                "fts_entry_ms={:.3} fts_flush_ms={:.3} analytics_ms={:.3} commit_ms={:.3} ",
                "msg_payload_ms={:.3} msg_sql_ms={:.3} msg_param_ms={:.3} msg_execute_ms={:.3} msg_rowid_ms={:.3} ",
                "residual_ms={:.3} avg_total_ms={:.3} avg_message_insert_ms={:.3} ",
                "avg_msg_execute_ms={:.3} avg_msg_payload_ms={:.3} avg_snippet_insert_ms={:.3} avg_fts_entry_ms={:.3} avg_commit_ms={:.3}"
            ),
            label,
            self.invocations,
            self.messages,
            self.inserted_messages,
            ms(self.total_duration),
            ms(self.source_duration),
            ms(self.tx_open_duration),
            ms(self.existing_lookup_duration),
            ms(self.existing_idx_lookup_duration),
            ms(self.existing_replay_lookup_duration),
            ms(self.dedupe_filter_duration),
            ms(self.conversation_row_duration),
            ms(self.message_insert_duration),
            ms(self.snippet_insert_duration),
            ms(self.fts_entry_duration),
            ms(self.fts_flush_duration),
            ms(self.analytics_duration),
            ms(self.commit_duration),
            ms(breakdown.payload_duration),
            ms(breakdown.sql_build_duration),
            ms(breakdown.param_build_duration),
            ms(breakdown.execute_duration),
            ms(breakdown.rowid_duration),
            ms(residual),
            per_call_ms(self.total_duration),
            per_call_ms(self.message_insert_duration),
            per_call_ms(breakdown.execute_duration),
            per_call_ms(breakdown.payload_duration),
            per_call_ms(self.snippet_insert_duration),
            per_call_ms(self.fts_entry_duration),
            per_call_ms(self.commit_duration),
        );
    }
}
6005
/// Identity used to look up a conversation, either by the id the source
/// assigned to it or, failing that, by where and when it was found.
///
/// Consumed by `franken_find_existing_conversation_with_tail_by_key`.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
enum PendingConversationKey {
    /// Keyed by source, agent, and the conversation's external id.
    External {
        source_id: String,
        agent_id: i64,
        external_id: String,
    },
    /// Keyed by source, agent, source path, and an optional start timestamp.
    SourcePath {
        source_id: String,
        agent_id: i64,
        source_path: String,
        started_at: Option<i64>,
    },
}
6020
/// Builds the lookup key `"<len(source_id)>:<source_id>:<agent_id>:<len(external_id)>:<external_id>"`.
///
/// Lengths are counted in Unicode scalar values (`chars`), not bytes. The
/// length prefixes keep keys unambiguous when either id contains `:`.
fn conversation_external_lookup_key(source_id: &str, agent_id: i64, external_id: &str) -> String {
    let source_len = source_id.chars().count();
    let external_len = external_id.chars().count();
    format!("{source_len}:{source_id}:{agent_id}:{external_len}:{external_id}")
}
6028
6029fn conversation_external_lookup_key_for_conv(agent_id: i64, conv: &Conversation) -> Option<String> {
6030    conv.external_id
6031        .as_deref()
6032        .map(|external_id| conversation_external_lookup_key(&conv.source_id, agent_id, external_id))
6033}
6034
6035#[derive(Debug, Clone, PartialEq, Eq, Hash)]
6036struct MessageMergeFingerprint {
6037    idx: i64,
6038    created_at: Option<i64>,
6039    role: MessageRole,
6040    author: Option<String>,
6041    content_hash: [u8; 32],
6042}
6043
6044#[derive(Debug, Clone, PartialEq, Eq, Hash)]
6045struct MessageReplayFingerprint {
6046    created_at: Option<i64>,
6047    role: MessageRole,
6048    author: Option<String>,
6049    content_hash: [u8; 32],
6050}
6051
/// Evidence gathered when deciding whether two conversations are the same.
// NOTE(review): the code that fills this struct is outside this view; the
// field notes below are inferred from names — confirm against the producer.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
struct ConversationMergeEvidence {
    /// Presumably the number of shared `MessageMergeFingerprint`s.
    exact_overlap: usize,
    /// Presumably the number of shared `MessageReplayFingerprint`s.
    replay_overlap: usize,
    /// Presumably the size of the smaller of the two replay-fingerprint sets.
    smaller_replay_set: usize,
    /// Presumably whether the two start times are within a tolerance.
    started_close: bool,
    /// Presumably the distance between the two start times, in milliseconds.
    start_distance_ms: i64,
}
6060
6061struct ExistingConversationNewMessages<'a> {
6062    messages: Vec<&'a Message>,
6063    new_chars: i64,
6064    idx_collision_count: usize,
6065    first_collision_idx: Option<i64>,
6066}
6067
/// Known tail of a stored conversation: the highest message index and latest
/// message timestamp, plus the conversation's `ended_at` when one is recorded.
#[derive(Debug, Clone, Copy)]
struct ExistingConversationTailState {
    last_message_idx: i64,
    last_message_created_at: i64,
    ended_at: Option<i64>,
}
6074
6075#[derive(Debug, Clone, Copy)]
6076struct ExistingConversationWithTail {
6077    id: i64,
6078    tail_state: Option<ExistingConversationTailState>,
6079}
6080
6081fn conversation_effective_started_at(conv: &Conversation) -> Option<i64> {
6082    conv.started_at
6083        .or_else(|| conv.messages.iter().filter_map(|msg| msg.created_at).min())
6084}
6085
6086fn conversation_tail_state(conv: &Conversation) -> (Option<i64>, Option<i64>) {
6087    (
6088        conv.messages.iter().map(|msg| msg.idx).max(),
6089        conv.messages.iter().filter_map(|msg| msg.created_at).max(),
6090    )
6091}
6092
6093fn borrowed_messages_tail_state(messages: &[&Message]) -> (Option<i64>, Option<i64>) {
6094    (
6095        messages.iter().map(|msg| msg.idx).max(),
6096        messages.iter().filter_map(|msg| msg.created_at).max(),
6097    )
6098}
6099
6100fn role_from_str(role: &str) -> MessageRole {
6101    match role {
6102        "user" => MessageRole::User,
6103        "agent" | "assistant" => MessageRole::Agent,
6104        "tool" => MessageRole::Tool,
6105        "system" => MessageRole::System,
6106        other => MessageRole::Other(other.to_string()),
6107    }
6108}
6109
6110fn message_merge_fingerprint(msg: &Message) -> MessageMergeFingerprint {
6111    MessageMergeFingerprint {
6112        idx: msg.idx,
6113        created_at: msg.created_at,
6114        role: msg.role.clone(),
6115        author: msg.author.clone(),
6116        content_hash: *blake3::hash(msg.content.as_bytes()).as_bytes(),
6117    }
6118}
6119
6120fn message_replay_fingerprint(msg: &Message) -> MessageReplayFingerprint {
6121    MessageReplayFingerprint {
6122        created_at: msg.created_at,
6123        role: msg.role.clone(),
6124        author: msg.author.clone(),
6125        content_hash: *blake3::hash(msg.content.as_bytes()).as_bytes(),
6126    }
6127}
6128
6129fn conversation_message_fingerprints(conv: &Conversation) -> HashSet<MessageMergeFingerprint> {
6130    conv.messages
6131        .iter()
6132        .map(message_merge_fingerprint)
6133        .collect()
6134}
6135
6136fn conversation_message_replay_fingerprints(
6137    conv: &Conversation,
6138) -> HashSet<MessageReplayFingerprint> {
6139    conv.messages
6140        .iter()
6141        .map(message_replay_fingerprint)
6142        .collect()
6143}
6144
6145fn replay_fingerprint_from_merge(
6146    fingerprint: &MessageMergeFingerprint,
6147) -> MessageReplayFingerprint {
6148    MessageReplayFingerprint {
6149        created_at: fingerprint.created_at,
6150        role: fingerprint.role.clone(),
6151        author: fingerprint.author.clone(),
6152        content_hash: fingerprint.content_hash,
6153    }
6154}
6155
6156fn replay_fingerprints_from_merge_set(
6157    fingerprints: &HashSet<MessageMergeFingerprint>,
6158) -> HashSet<MessageReplayFingerprint> {
6159    fingerprints
6160        .iter()
6161        .map(replay_fingerprint_from_merge)
6162        .collect()
6163}
6164
6165fn collect_new_messages_for_existing_conversation<'a>(
6166    conversation_id: i64,
6167    conv: &'a Conversation,
6168    existing_messages: &mut HashMap<i64, MessageMergeFingerprint>,
6169    existing_replay_fingerprints: &mut HashSet<MessageReplayFingerprint>,
6170    replay_skip_log: &'static str,
6171) -> ExistingConversationNewMessages<'a> {
6172    let mut idx_collision_count = 0usize;
6173    let mut first_collision_idx: Option<i64> = None;
6174    let mut new_chars: i64 = 0;
6175    let mut messages = Vec::new();
6176
6177    for msg in &conv.messages {
6178        let incoming_fingerprint = message_merge_fingerprint(msg);
6179        if let Some(existing_fingerprint) = existing_messages.get(&msg.idx) {
6180            if existing_fingerprint != &incoming_fingerprint {
6181                idx_collision_count = idx_collision_count.saturating_add(1);
6182                first_collision_idx.get_or_insert(msg.idx);
6183            }
6184            continue;
6185        }
6186
6187        let incoming_replay = replay_fingerprint_from_merge(&incoming_fingerprint);
6188        if existing_replay_fingerprints.contains(&incoming_replay) {
6189            tracing::debug!(
6190                conversation_id,
6191                idx = msg.idx,
6192                source_path = %conv.source_path.display(),
6193                "{replay_skip_log}"
6194            );
6195            continue;
6196        }
6197
6198        existing_messages.insert(msg.idx, incoming_fingerprint);
6199        existing_replay_fingerprints.insert(incoming_replay);
6200        new_chars += msg.content.len() as i64;
6201        messages.push(msg);
6202    }
6203
6204    ExistingConversationNewMessages {
6205        messages,
6206        new_chars,
6207        idx_collision_count,
6208        first_collision_idx,
6209    }
6210}
6211
6212fn franken_existing_conversation_append_tail_state(
6213    tx: &FrankenTransaction<'_>,
6214    conversation_id: i64,
6215) -> Result<Option<ExistingConversationTailState>> {
6216    let cached: Option<(Option<i64>, Option<i64>, Option<i64>)> = tx
6217        .query_row_map(
6218            "SELECT last_message_idx, last_message_created_at, ended_at
6219             FROM conversation_tail_state
6220             WHERE conversation_id = ?1",
6221            fparams![conversation_id],
6222            |row| Ok((row.get_typed(0)?, row.get_typed(1)?, row.get_typed(2)?)),
6223        )
6224        .optional()?;
6225    if let Some(cached) = cached {
6226        let (_, _, cached_ended_at) = cached;
6227        if let Some(tail_state) =
6228            existing_conversation_tail_state_from_cached(cached.0, cached.1, cached_ended_at)
6229        {
6230            return Ok(Some(tail_state));
6231        }
6232    }
6233
6234    let legacy_cached: (Option<i64>, Option<i64>, Option<i64>) = tx.query_row_map(
6235        "SELECT last_message_idx, last_message_created_at, ended_at
6236         FROM conversations
6237         WHERE id = ?1",
6238        fparams![conversation_id],
6239        |row| Ok((row.get_typed(0)?, row.get_typed(1)?, row.get_typed(2)?)),
6240    )?;
6241    let (_, _, cached_ended_at) = legacy_cached;
6242    if let Some(tail_state) = existing_conversation_tail_state_from_cached(
6243        legacy_cached.0,
6244        legacy_cached.1,
6245        cached_ended_at,
6246    ) {
6247        franken_insert_conversation_tail_state(
6248            tx,
6249            conversation_id,
6250            cached_ended_at,
6251            Some(tail_state.last_message_idx),
6252            Some(tail_state.last_message_created_at),
6253        )?;
6254        return Ok(Some(tail_state));
6255    }
6256
6257    let (max_idx, max_created_at): (Option<i64>, Option<i64>) = tx.query_row_map(
6258        "SELECT MAX(idx), MAX(created_at)
6259         FROM messages
6260         WHERE conversation_id = ?1",
6261        fparams![conversation_id],
6262        |row| Ok((row.get_typed(0)?, row.get_typed(1)?)),
6263    )?;
6264    if let Some((last_message_idx, last_message_created_at)) = max_idx.zip(max_created_at) {
6265        franken_update_conversation_tail_state(
6266            tx,
6267            conversation_id,
6268            None,
6269            Some(last_message_idx),
6270            Some(last_message_created_at),
6271        )?;
6272        return Ok(Some(ExistingConversationTailState {
6273            last_message_idx,
6274            last_message_created_at,
6275            ended_at: cached_ended_at,
6276        }));
6277    }
6278    Ok(None)
6279}
6280
6281fn existing_conversation_tail_state_from_cached(
6282    last_message_idx: Option<i64>,
6283    last_message_created_at: Option<i64>,
6284    ended_at: Option<i64>,
6285) -> Option<ExistingConversationTailState> {
6286    let (last_message_idx, last_message_created_at) =
6287        last_message_idx.zip(last_message_created_at)?;
6288    Some(ExistingConversationTailState {
6289        last_message_idx,
6290        last_message_created_at,
6291        ended_at,
6292    })
6293}
6294
6295fn franken_find_existing_conversation_with_tail_by_key(
6296    tx: &FrankenTransaction<'_>,
6297    key: &PendingConversationKey,
6298    conv: Option<&Conversation>,
6299) -> Result<Option<ExistingConversationWithTail>> {
6300    if let PendingConversationKey::External {
6301        source_id,
6302        agent_id,
6303        external_id,
6304    } = key
6305    {
6306        let lookup_key = conversation_external_lookup_key(source_id, *agent_id, external_id);
6307        if let Some(existing) = franken_find_external_conversation_tail_lookup(tx, &lookup_key)? {
6308            return Ok(Some(existing));
6309        }
6310        return Ok(None);
6311    }
6312
6313    let Some(id) = franken_find_existing_conversation_by_key(tx, key, conv)? else {
6314        return Ok(None);
6315    };
6316    let tail_state = franken_existing_conversation_append_tail_state(tx, id)?;
6317    Ok(Some(ExistingConversationWithTail { id, tail_state }))
6318}
6319
6320fn franken_insert_conversation_tail_state(
6321    tx: &FrankenTransaction<'_>,
6322    conversation_id: i64,
6323    ended_at: Option<i64>,
6324    last_message_idx: Option<i64>,
6325    last_message_created_at: Option<i64>,
6326) -> Result<()> {
6327    if ended_at.is_none() && last_message_idx.is_none() && last_message_created_at.is_none() {
6328        return Ok(());
6329    }
6330    tx.execute_compat(
6331        "INSERT OR REPLACE INTO conversation_tail_state (
6332             conversation_id, ended_at, last_message_idx, last_message_created_at
6333         ) VALUES (?1, ?2, ?3, ?4)",
6334        fparams![
6335            conversation_id,
6336            ended_at,
6337            last_message_idx,
6338            last_message_created_at
6339        ],
6340    )?;
6341    Ok(())
6342}
6343
6344fn franken_update_conversation_tail_columns(
6345    tx: &FrankenTransaction<'_>,
6346    conversation_id: i64,
6347    ended_at_candidate: Option<i64>,
6348    last_message_idx_candidate: Option<i64>,
6349    last_message_created_at_candidate: Option<i64>,
6350) -> Result<()> {
6351    if ended_at_candidate.is_none()
6352        && last_message_idx_candidate.is_none()
6353        && last_message_created_at_candidate.is_none()
6354    {
6355        return Ok(());
6356    }
6357
6358    tx.execute_compat(
6359        "UPDATE conversations
6360         SET ended_at = CASE
6361                 WHEN ?1 IS NULL THEN ended_at
6362                 WHEN ended_at IS NULL OR ended_at < ?1 THEN ?1
6363                 ELSE ended_at
6364             END,
6365             last_message_idx = CASE
6366                 WHEN ?2 IS NULL THEN last_message_idx
6367                 WHEN last_message_idx IS NULL OR last_message_idx < ?2 THEN ?2
6368                 ELSE last_message_idx
6369             END,
6370             last_message_created_at = CASE
6371                 WHEN ?3 IS NULL THEN last_message_created_at
6372                 WHEN last_message_created_at IS NULL OR last_message_created_at < ?3 THEN ?3
6373                 ELSE last_message_created_at
6374             END
6375         WHERE id = ?4",
6376        fparams![
6377            ended_at_candidate,
6378            last_message_idx_candidate,
6379            last_message_created_at_candidate,
6380            conversation_id
6381        ],
6382    )?;
6383    Ok(())
6384}
6385
6386fn franken_tail_state_insert_ended_at(
6387    tx: &FrankenTransaction<'_>,
6388    conversation_id: i64,
6389    candidate: Option<i64>,
6390) -> Result<Option<i64>> {
6391    let canonical: Option<i64> = tx
6392        .query_row_map(
6393            "SELECT ended_at FROM conversations WHERE id = ?1",
6394            fparams![conversation_id],
6395            |row| row.get_typed(0),
6396        )
6397        .optional()?
6398        .flatten();
6399    Ok(canonical.max(candidate))
6400}
6401
6402fn franken_update_conversation_tail_state(
6403    tx: &FrankenTransaction<'_>,
6404    conversation_id: i64,
6405    ended_at_candidate: Option<i64>,
6406    last_message_idx_candidate: Option<i64>,
6407    last_message_created_at_candidate: Option<i64>,
6408) -> Result<()> {
6409    if ended_at_candidate.is_none()
6410        && last_message_idx_candidate.is_none()
6411        && last_message_created_at_candidate.is_none()
6412    {
6413        return Ok(());
6414    }
6415
6416    let changed = tx.execute_compat(
6417        "UPDATE conversation_tail_state
6418         SET ended_at = CASE
6419                 WHEN ?1 IS NULL THEN ended_at
6420                 ELSE MAX(IFNULL(ended_at, 0), ?1)
6421             END,
6422             last_message_idx = CASE
6423                 WHEN ?2 IS NULL THEN last_message_idx
6424                 WHEN last_message_idx IS NULL OR last_message_idx < ?2 THEN ?2
6425                 ELSE last_message_idx
6426             END,
6427             last_message_created_at = CASE
6428                 WHEN ?3 IS NULL THEN last_message_created_at
6429                 WHEN last_message_created_at IS NULL OR last_message_created_at < ?3 THEN ?3
6430                 ELSE last_message_created_at
6431             END
6432         WHERE conversation_id = ?4",
6433        fparams![
6434            ended_at_candidate,
6435            last_message_idx_candidate,
6436            last_message_created_at_candidate,
6437            conversation_id
6438        ],
6439    )?;
6440    if changed == 0 {
6441        let insert_ended_at =
6442            franken_tail_state_insert_ended_at(tx, conversation_id, ended_at_candidate)?;
6443        franken_insert_conversation_tail_state(
6444            tx,
6445            conversation_id,
6446            insert_ended_at,
6447            last_message_idx_candidate,
6448            last_message_created_at_candidate,
6449        )?;
6450    }
6451    franken_update_conversation_tail_columns(
6452        tx,
6453        conversation_id,
6454        ended_at_candidate,
6455        last_message_idx_candidate,
6456        last_message_created_at_candidate,
6457    )?;
6458    Ok(())
6459}
6460
6461fn franken_set_conversation_tail_state_after_append(
6462    tx: &FrankenTransaction<'_>,
6463    conversation_id: i64,
6464    ended_at: i64,
6465    last_message_idx: i64,
6466    last_message_created_at: i64,
6467) -> Result<()> {
6468    let changed = tx.execute_compat(
6469        "UPDATE conversation_tail_state
6470         SET ended_at = ?1,
6471             last_message_idx = ?2,
6472             last_message_created_at = ?3
6473         WHERE conversation_id = ?4",
6474        fparams![
6475            ended_at,
6476            last_message_idx,
6477            last_message_created_at,
6478            conversation_id
6479        ],
6480    )?;
6481    if changed == 0 {
6482        let insert_ended_at =
6483            franken_tail_state_insert_ended_at(tx, conversation_id, Some(ended_at))?;
6484        franken_insert_conversation_tail_state(
6485            tx,
6486            conversation_id,
6487            insert_ended_at,
6488            Some(last_message_idx),
6489            Some(last_message_created_at),
6490        )?;
6491    }
6492    franken_update_conversation_tail_columns(
6493        tx,
6494        conversation_id,
6495        Some(ended_at),
6496        Some(last_message_idx),
6497        Some(last_message_created_at),
6498    )?;
6499    Ok(())
6500}
6501
6502fn collect_append_only_tail_messages<'a>(
6503    conv: &'a Conversation,
6504    existing_max_idx: i64,
6505    existing_max_created_at: i64,
6506) -> Option<ExistingConversationNewMessages<'a>> {
6507    if conv.messages.is_empty() {
6508        return Some(ExistingConversationNewMessages {
6509            messages: Vec::new(),
6510            new_chars: 0,
6511            idx_collision_count: 0,
6512            first_collision_idx: None,
6513        });
6514    }
6515
6516    let mut split_idx = None;
6517    let mut prev_idx = None;
6518    for (pos, msg) in conv.messages.iter().enumerate() {
6519        if prev_idx.is_some_and(|prev| msg.idx < prev) {
6520            return None;
6521        }
6522        prev_idx = Some(msg.idx);
6523        if split_idx.is_none() && msg.idx > existing_max_idx {
6524            split_idx = Some(pos);
6525        }
6526    }
6527    let split_idx = split_idx?;
6528
6529    let mut seen_tail_idx = HashSet::new();
6530    let mut seen_tail_replay = HashSet::new();
6531    let mut new_chars = 0i64;
6532    let mut messages = Vec::new();
6533    for msg in &conv.messages[split_idx..] {
6534        let created_at = msg.created_at?;
6535        if created_at <= existing_max_created_at {
6536            return None;
6537        }
6538
6539        if !seen_tail_idx.insert(msg.idx) {
6540            return None;
6541        }
6542
6543        let replay_fingerprint = message_replay_fingerprint(msg);
6544        if !seen_tail_replay.insert(replay_fingerprint) {
6545            return None;
6546        }
6547
6548        new_chars += msg.content.len() as i64;
6549        messages.push(msg);
6550    }
6551
6552    Some(ExistingConversationNewMessages {
6553        messages,
6554        new_chars,
6555        idx_collision_count: 0,
6556        first_collision_idx: None,
6557    })
6558}
6559
/// Absolute distance between two optional timestamps.
///
/// Returns `i64::MAX` when either side is `None`, and also when the true
/// distance does not fit in an `i64`. The subtraction is carried out in
/// 128-bit arithmetic, so it cannot overflow.
fn start_distance_ms(left: Option<i64>, right: Option<i64>) -> i64 {
    left.zip(right).map_or(i64::MAX, |(lhs, rhs)| {
        let gap = i128::from(lhs).abs_diff(i128::from(rhs));
        i64::try_from(gap).unwrap_or(i64::MAX)
    })
}
6569
6570fn conversation_merge_evidence(
6571    incoming_exact: &HashSet<MessageMergeFingerprint>,
6572    incoming_replay: &HashSet<MessageReplayFingerprint>,
6573    existing_exact: &HashSet<MessageMergeFingerprint>,
6574    existing_replay: &HashSet<MessageReplayFingerprint>,
6575    incoming_started_at: Option<i64>,
6576    existing_started_at: Option<i64>,
6577) -> Option<ConversationMergeEvidence> {
6578    let exact_overlap = incoming_exact.intersection(existing_exact).count();
6579    let replay_overlap = incoming_replay.intersection(existing_replay).count();
6580    if exact_overlap == 0 && replay_overlap == 0 {
6581        return None;
6582    }
6583
6584    let smaller_replay_set = incoming_replay.len().min(existing_replay.len());
6585    let started_close = timestamps_within_tolerance(
6586        incoming_started_at,
6587        existing_started_at,
6588        SOURCE_PATH_MERGE_START_TOLERANCE_MS,
6589    );
6590    let full_replay_subset_match = smaller_replay_set >= 2 && replay_overlap == smaller_replay_set;
6591
6592    let merge_allowed = if started_close {
6593        exact_overlap >= 1 || replay_overlap >= 2
6594    } else {
6595        exact_overlap >= 2 || full_replay_subset_match
6596    };
6597
6598    merge_allowed.then_some(ConversationMergeEvidence {
6599        exact_overlap,
6600        replay_overlap,
6601        smaller_replay_set,
6602        started_close,
6603        start_distance_ms: start_distance_ms(incoming_started_at, existing_started_at),
6604    })
6605}
6606
/// Returns `true` when both timestamps are present and differ by at most
/// `tolerance_ms`.
///
/// A missing timestamp on either side yields `false`. The difference is
/// computed in 128-bit arithmetic, so extreme inputs cannot overflow.
fn timestamps_within_tolerance(left: Option<i64>, right: Option<i64>, tolerance_ms: i64) -> bool {
    left.zip(right).is_some_and(|(lhs, rhs)| {
        let gap = (i128::from(lhs) - i128::from(rhs)).abs();
        gap <= i128::from(tolerance_ms)
    })
}
6615
6616fn conversation_merge_key(agent_id: i64, conv: &Conversation) -> PendingConversationKey {
6617    if let Some(external_id) = conv.external_id.clone() {
6618        PendingConversationKey::External {
6619            source_id: conv.source_id.clone(),
6620            agent_id,
6621            external_id,
6622        }
6623    } else {
6624        PendingConversationKey::SourcePath {
6625            source_id: conv.source_id.clone(),
6626            agent_id,
6627            source_path: path_to_string(&conv.source_path),
6628            started_at: conversation_effective_started_at(conv),
6629        }
6630    }
6631}
6632
/// Message data needed for semantic embedding generation.
///
/// Plain data carrier: all fields are public and this type enforces no
/// invariants of its own.
pub struct MessageForEmbedding {
    /// Identifier of the message (presumably the `messages` row id — confirm against the producer).
    pub message_id: i64,
    /// Message timestamp, if known. NOTE(review): likely epoch milliseconds like other timestamps in this file — confirm.
    pub created_at: Option<i64>,
    /// Identifier of the owning agent (presumably `agents.id` — confirm).
    pub agent_id: i64,
    /// Identifier of the workspace, when the conversation has one (presumably `workspaces.id` — confirm).
    pub workspace_id: Option<i64>,
    /// 32-bit hash of the source id; the hash function is chosen by the producer and is not visible here.
    pub source_id_hash: u32,
    /// Message role as text (value set not constrained by this type).
    pub role: String,
    /// Message text to embed.
    pub content: String,
}
6643
6644// =========================================================================
6645// FrankenStorage CRUD operations
6646// =========================================================================
6647
6648impl FrankenStorage {
6649    /// Ensure an agent exists in the database, returning its ID.
6650    pub fn ensure_agent(&self, agent: &Agent) -> Result<i64> {
6651        let cache_key = EnsuredAgentKey::from_agent(agent);
6652        if let Some(id) = self.cached_agent_id(&cache_key) {
6653            return Ok(id);
6654        }
6655
6656        let now = Self::now_millis();
6657        self.conn.execute_compat(
6658            "INSERT INTO agents(slug, name, version, kind, created_at, updated_at)
6659             VALUES(?1, ?2, ?3, ?4, ?5, ?6)
6660             ON CONFLICT(slug) DO UPDATE SET
6661                 name = excluded.name,
6662                 version = excluded.version,
6663                 kind = excluded.kind,
6664                 updated_at = excluded.updated_at
6665             WHERE NOT (
6666                 agents.name IS excluded.name
6667                 AND agents.version IS excluded.version
6668                 AND agents.kind IS excluded.kind
6669             )",
6670            fparams![
6671                agent.slug.as_str(),
6672                agent.name.as_str(),
6673                agent.version.as_deref(),
6674                cache_key.kind.as_str(),
6675                now,
6676                now
6677            ],
6678        )?;
6679
6680        let id = self
6681            .conn
6682            .query_row_map(
6683                "SELECT id FROM agents WHERE slug = ?1 LIMIT 1",
6684                fparams![agent.slug.as_str()],
6685                |row| row.get_typed(0),
6686            )
6687            .with_context(|| format!("fetching agent id for {}", agent.slug))?;
6688        self.mark_agent_ensured(cache_key, id);
6689        Ok(id)
6690    }
6691
6692    /// Ensure a workspace exists in the database, returning its ID.
6693    pub fn ensure_workspace(&self, path: &Path, display_name: Option<&str>) -> Result<i64> {
6694        let path_str = path.to_string_lossy().to_string();
6695        let cache_key = EnsuredWorkspaceKey::new(path_str.clone(), display_name);
6696        if let Some(id) = self.cached_workspace_id(&cache_key) {
6697            return Ok(id);
6698        }
6699
6700        if let Some(display_name) = display_name {
6701            self.conn.execute_compat(
6702                "INSERT INTO workspaces(path, display_name)
6703                 VALUES(?1, ?2)
6704                 ON CONFLICT(path) DO UPDATE SET
6705                     display_name = excluded.display_name
6706                 WHERE NOT (workspaces.display_name IS excluded.display_name)",
6707                fparams![path_str.as_str(), display_name],
6708            )?;
6709        } else {
6710            self.conn.execute_compat(
6711                "INSERT OR IGNORE INTO workspaces(path, display_name) VALUES(?1, NULL)",
6712                fparams![path_str.as_str()],
6713            )?;
6714        }
6715
6716        let id = self
6717            .conn
6718            .query_row_map(
6719                "SELECT id FROM workspaces WHERE path = ?1 LIMIT 1",
6720                fparams![path_str.as_str()],
6721                |row| row.get_typed(0),
6722            )
6723            .with_context(|| format!("fetching workspace id for {path_str}"))?;
6724        self.mark_workspace_ensured(cache_key, id);
6725        Ok(id)
6726    }
6727
6728    /// Get current time as milliseconds since epoch.
6729    pub fn now_millis() -> i64 {
6730        SystemTime::now()
6731            .duration_since(UNIX_EPOCH)
6732            .map(|d| i64::try_from(d.as_millis()).unwrap_or(i64::MAX))
6733            .unwrap_or(0)
6734    }
6735
6736    /// Convert a millisecond timestamp to a day ID (days since 2020-01-01).
6737    pub fn day_id_from_millis(timestamp_ms: i64) -> i64 {
6738        const EPOCH_2020_SECS: i64 = 1_577_836_800;
6739        let secs = timestamp_ms.div_euclid(1000);
6740        (secs - EPOCH_2020_SECS).div_euclid(86400)
6741    }
6742
6743    /// Convert a millisecond timestamp to an hour ID (hours since 2020-01-01 00:00 UTC).
6744    pub fn hour_id_from_millis(timestamp_ms: i64) -> i64 {
6745        const EPOCH_2020_SECS: i64 = 1_577_836_800;
6746        let secs = timestamp_ms.div_euclid(1000);
6747        (secs - EPOCH_2020_SECS).div_euclid(3600)
6748    }
6749
6750    /// Convert a day ID back to milliseconds (start of day).
6751    pub fn millis_from_day_id(day_id: i64) -> i64 {
6752        const EPOCH_2020_SECS: i64 = 1_577_836_800;
6753        (EPOCH_2020_SECS + day_id * 86400) * 1000
6754    }
6755
6756    /// Convert an hour ID back to milliseconds (start of hour).
6757    pub fn millis_from_hour_id(hour_id: i64) -> i64 {
6758        const EPOCH_2020_SECS: i64 = 1_577_836_800;
6759        (EPOCH_2020_SECS + hour_id * 3600) * 1000
6760    }
6761
6762    /// Get the timestamp of the last successful scan.
6763    pub fn get_last_scan_ts(&self) -> Result<Option<i64>> {
6764        let result: Result<String, _> = self.conn.query_row_map(
6765            "SELECT value FROM meta WHERE key = 'last_scan_ts'",
6766            fparams![],
6767            |row| row.get_typed(0),
6768        );
6769        match result.optional() {
6770            Ok(Some(s)) => Ok(s.parse().ok()),
6771            Ok(None) => Ok(None),
6772            Err(e) => Err(e.into()),
6773        }
6774    }
6775
6776    /// Set the timestamp of the last successful scan (milliseconds since epoch).
6777    pub fn set_last_scan_ts(&self, ts: i64) -> Result<()> {
6778        self.conn.execute_compat(
6779            "INSERT OR REPLACE INTO meta(key, value) VALUES('last_scan_ts', ?1)",
6780            fparams![ts.to_string()],
6781        )?;
6782        Ok(())
6783    }
6784
6785    /// Get the timestamp of the last successful index completion.
6786    pub fn get_last_indexed_at(&self) -> Result<Option<i64>> {
6787        let result: Result<String, _> = self.conn.query_row_map(
6788            "SELECT value FROM meta WHERE key = 'last_indexed_at'",
6789            fparams![],
6790            |row| row.get_typed(0),
6791        );
6792        match result.optional() {
6793            Ok(Some(s)) => Ok(s.parse().ok()),
6794            Ok(None) => Ok(None),
6795            Err(e) => Err(e.into()),
6796        }
6797    }
6798
6799    /// Set the timestamp of the last successful index completion (milliseconds since epoch).
6800    pub fn set_last_indexed_at(&self, ts: i64) -> Result<()> {
6801        self.conn.execute_compat(
6802            "INSERT OR REPLACE INTO meta(key, value) VALUES('last_indexed_at', ?1)",
6803            fparams![ts.to_string()],
6804        )?;
6805        Ok(())
6806    }
6807
6808    /// List all registered agents.
6809    pub fn list_agents(&self) -> Result<Vec<Agent>> {
6810        self.conn
6811            .query_map_collect(
6812                "SELECT id, slug, name, version, kind FROM agents ORDER BY slug",
6813                fparams![],
6814                |row| {
6815                    let kind: String = row.get_typed(4)?;
6816                    Ok(Agent {
6817                        id: Some(row.get_typed(0)?),
6818                        slug: row.get_typed(1)?,
6819                        name: row.get_typed(2)?,
6820                        version: row.get_typed(3)?,
6821                        kind: match kind.as_str() {
6822                            "cli" => AgentKind::Cli,
6823                            "vscode" => AgentKind::VsCode,
6824                            _ => AgentKind::Hybrid,
6825                        },
6826                    })
6827                },
6828            )
6829            .with_context(|| "listing agents")
6830    }
6831
6832    /// Count all archived conversations.
6833    pub fn total_conversation_count(&self) -> Result<usize> {
6834        let count: i64 =
6835            self.conn
6836                .query_row_map("SELECT COUNT(*) FROM conversations", fparams![], |row| {
6837                    row.get_typed(0)
6838                })?;
6839        Ok(count.max(0) as usize)
6840    }
6841
6842    /// Count all archived messages.
6843    pub fn total_message_count(&self) -> Result<usize> {
6844        let count: i64 =
6845            self.conn
6846                .query_row_map("SELECT COUNT(*) FROM messages", fparams![], |row| {
6847                    row.get_typed(0)
6848                })?;
6849        Ok(count.max(0) as usize)
6850    }
6851
6852    /// Remove all archived conversations/messages for one agent slug.
6853    ///
6854    /// This only affects cass's local archive database. Source session files on
6855    /// disk are untouched.
6856    pub fn purge_agent_archive_data(&self, agent_slug: &str) -> Result<AgentArchivePurgeResult> {
6857        let normalized = agent_slug.trim().to_ascii_lowercase();
6858        if normalized.is_empty() {
6859            return Err(anyhow!("agent slug cannot be empty"));
6860        }
6861
6862        let Some(agent_id) = self
6863            .conn
6864            .query_row_map(
6865                "SELECT id FROM agents WHERE slug = ?1",
6866                fparams![normalized.as_str()],
6867                |row| row.get_typed::<i64>(0),
6868            )
6869            .optional()?
6870        else {
6871            return Ok(AgentArchivePurgeResult::default());
6872        };
6873
6874        let conversations_deleted: i64 = self.conn.query_row_map(
6875            "SELECT COUNT(*) FROM conversations WHERE agent_id = ?1",
6876            fparams![agent_id],
6877            |row| row.get_typed(0),
6878        )?;
6879        if conversations_deleted == 0 {
6880            return Ok(AgentArchivePurgeResult::default());
6881        }
6882
6883        let messages_deleted: i64 = self.conn.query_row_map(
6884            "SELECT COUNT(*)
6885             FROM messages
6886             WHERE conversation_id IN (
6887                 SELECT id FROM conversations WHERE agent_id = ?1
6888             )",
6889            fparams![agent_id],
6890            |row| row.get_typed(0),
6891        )?;
6892
6893        let mut tx = self.conn.transaction()?;
6894        tx.execute_compat(
6895            "DELETE FROM conversation_external_lookup
6896             WHERE conversation_id IN (
6897                 SELECT id FROM conversations WHERE agent_id = ?1
6898             )",
6899            fparams![agent_id],
6900        )?;
6901        tx.execute_compat(
6902            "DELETE FROM conversation_external_tail_lookup
6903             WHERE conversation_id IN (
6904                 SELECT id FROM conversations WHERE agent_id = ?1
6905             )",
6906            fparams![agent_id],
6907        )?;
6908        tx.execute_compat(
6909            "DELETE FROM conversations WHERE agent_id = ?1",
6910            fparams![agent_id],
6911        )?;
6912        tx.execute_compat(
6913            "DELETE FROM agents
6914             WHERE id = ?1
6915               AND NOT EXISTS (
6916                   SELECT 1 FROM conversations WHERE agent_id = ?1
6917               )",
6918            fparams![agent_id],
6919        )?;
6920        tx.commit()?;
6921
6922        Ok(AgentArchivePurgeResult {
6923            conversations_deleted: conversations_deleted.max(0) as usize,
6924            messages_deleted: messages_deleted.max(0) as usize,
6925        })
6926    }
6927
6928    /// List all registered workspaces.
6929    pub fn list_workspaces(&self) -> Result<Vec<crate::model::types::Workspace>> {
6930        self.conn
6931            .query_map_collect(
6932                "SELECT id, path, display_name FROM workspaces ORDER BY path",
6933                fparams![],
6934                |row| {
6935                    let path_str: String = row.get_typed(1)?;
6936                    Ok(crate::model::types::Workspace {
6937                        id: Some(row.get_typed(0)?),
6938                        path: Path::new(&path_str).to_path_buf(),
6939                        display_name: row.get_typed(2)?,
6940                    })
6941                },
6942            )
6943            .with_context(|| "listing workspaces")
6944    }
6945
6946    /// List conversations with pagination.
6947    pub fn list_conversations(&self, limit: i64, offset: i64) -> Result<Vec<Conversation>> {
6948        // Avoid the multi-table JOIN with LIMIT/OFFSET that triggers
6949        // frankensqlite's materialization fallback (see c38edcd9, 860acb12).
6950        // Use correlated subqueries for the tiny agents (~20 rows) and
6951        // workspaces (~30 rows) lookup tables and degrade NULL agent_id to
6952        // the same 'unknown' sentinel that 8a0c547c established for the
6953        // lexical rebuild path.
6954        self.conn
6955            .query_map_collect(
6956                r"SELECT c.id,
6957                         COALESCE((SELECT a.slug FROM agents a WHERE a.id = c.agent_id), 'unknown'),
6958                         (SELECT w.path FROM workspaces w WHERE w.id = c.workspace_id),
6959                         c.external_id, c.title, c.source_path,
6960                         c.started_at,
6961                         COALESCE(
6962                             (SELECT ts.ended_at
6963                              FROM conversation_tail_state ts
6964                              WHERE ts.conversation_id = c.id),
6965                             c.ended_at
6966                         ),
6967                         c.approx_tokens, c.metadata_json,
6968                         c.source_id, c.origin_host, c.metadata_bin
6969                FROM conversations c
6970                ORDER BY CASE WHEN c.started_at IS NULL THEN 1 ELSE 0 END, c.started_at DESC, c.id DESC
6971                LIMIT ?1 OFFSET ?2",
6972                fparams![limit, offset],
6973                |row| {
6974                    let workspace_path: Option<String> = row.get_typed(2)?;
6975                    let source_path: String = row.get_typed(5)?;
6976                    let raw_source_id: Option<String> = row.get_typed(10)?;
6977                    let raw_origin_host: Option<String> = row.get_typed(11)?;
6978                    let (source_id, _, origin_host) = normalized_storage_source_parts(
6979                        raw_source_id.as_deref(),
6980                        None,
6981                        raw_origin_host.as_deref(),
6982                    );
6983                    Ok(Conversation {
6984                        id: Some(row.get_typed(0)?),
6985                        agent_slug: row.get_typed(1)?,
6986                        workspace: workspace_path.map(|p| Path::new(&p).to_path_buf()),
6987                        external_id: row.get_typed(3)?,
6988                        title: row.get_typed(4)?,
6989                        source_path: Path::new(&source_path).to_path_buf(),
6990                        started_at: row.get_typed(6)?,
6991                        ended_at: row.get_typed(7)?,
6992                        approx_tokens: row.get_typed(8)?,
6993                        metadata_json: franken_read_metadata_compat(row, 9, 12),
6994                        messages: Vec::new(),
6995                        source_id,
6996                        origin_host,
6997                    })
6998                },
6999            )
7000            .with_context(|| "listing conversations")
7001    }
7002
7003    /// Build lookup maps for agents and workspaces to avoid JOINs in
7004    /// paged conversation queries.  Both tables are tiny (tens of rows)
7005    /// so this is effectively free.
7006    pub fn build_lexical_rebuild_lookups(
7007        &self,
7008    ) -> Result<(HashMap<i64, String>, HashMap<i64, PathBuf>)> {
7009        let agents: HashMap<i64, String> = self
7010            .conn
7011            .query_map_collect("SELECT id, slug FROM agents", fparams![], |row| {
7012                Ok((row.get_typed::<i64>(0)?, row.get_typed::<String>(1)?))
7013            })
7014            .with_context(|| "loading agent lookup for lexical rebuild")?
7015            .into_iter()
7016            .collect();
7017        let workspaces: HashMap<i64, PathBuf> = self
7018            .conn
7019            .query_map_collect("SELECT id, path FROM workspaces", fparams![], |row| {
7020                let path_str: String = row.get_typed(1)?;
7021                Ok((row.get_typed::<i64>(0)?, PathBuf::from(path_str)))
7022            })
7023            .with_context(|| "loading workspace lookup for lexical rebuild")?
7024            .into_iter()
7025            .collect();
7026        Ok((agents, workspaces))
7027    }
7028
7029    /// List per-conversation message footprints in primary-key order.
7030    ///
7031    /// This deliberately avoids rebuild-path JOINs. Instead we merge ordered
7032    /// single-table reads over `conversations` and the narrow
7033    /// `conversation_tail_state` cache in Rust, then use `last_message_idx + 1`
7034    /// as a planning estimate.
7035    ///
7036    /// The planner only needs a sizing heuristic; exact message and byte
7037    /// accounting is performed later by the rebuild packet pipeline as it reads
7038    /// message content for indexing. Rows missing both tail-cache sources fall
7039    /// back to `MAX(messages.idx) + 1`, which preserves legacy upgraded
7040    /// databases without treating populated conversations as empty.
7041    pub fn list_conversation_footprints_for_lexical_rebuild(
7042        &self,
7043    ) -> Result<Vec<LexicalRebuildConversationFootprintRow>> {
7044        let tail_state_rows: Vec<(i64, Option<i64>)> = match self.conn.query_map_collect(
7045            "SELECT conversation_id, last_message_idx
7046             FROM conversation_tail_state
7047             ORDER BY conversation_id ASC",
7048            fparams![],
7049            |row| Ok((row.get_typed::<i64>(0)?, row.get_typed::<Option<i64>>(1)?)),
7050        ) {
7051            Ok(rows) => rows,
7052            Err(err) if error_indicates_missing_table(&err) => Vec::new(),
7053            Err(err) => {
7054                return Err(err).with_context(|| "listing lexical rebuild tail-state estimates");
7055            }
7056        };
7057        let tail_state_by_conversation: HashMap<i64, Option<i64>> =
7058            tail_state_rows.into_iter().collect();
7059
7060        let rows: Vec<(i64, Option<i64>)> = match self.conn.query_map_collect(
7061            "SELECT id, last_message_idx
7062             FROM conversations
7063             ORDER BY id ASC",
7064            fparams![],
7065            |row| Ok((row.get_typed::<i64>(0)?, row.get_typed::<Option<i64>>(1)?)),
7066        ) {
7067            Ok(rows) => rows,
7068            Err(err) if error_indicates_missing_column(&err) => self
7069                .conn
7070                .query_map_collect(
7071                    "SELECT id
7072                     FROM conversations
7073                     ORDER BY id ASC",
7074                    fparams![],
7075                    |row| Ok((row.get_typed::<i64>(0)?, None)),
7076                )
7077                .with_context(|| {
7078                    "listing lexical rebuild conversation ids after missing tail column fallback"
7079                })?,
7080            Err(err) => {
7081                return Err(err)
7082                    .with_context(|| "listing lexical rebuild conversation footprint estimates");
7083            }
7084        };
7085
7086        let mut footprints = Vec::with_capacity(rows.len());
7087        let mut missing_tail_positions = HashMap::new();
7088        for (conversation_id, conversation_last_message_idx) in rows {
7089            let last_message_idx = tail_state_by_conversation
7090                .get(&conversation_id)
7091                .copied()
7092                .flatten()
7093                .or(conversation_last_message_idx);
7094            let Some(message_count) = lexical_rebuild_message_count_from_tail_idx(last_message_idx)
7095            else {
7096                missing_tail_positions.insert(conversation_id, footprints.len());
7097                footprints.push(LexicalRebuildConversationFootprintRow {
7098                    conversation_id,
7099                    message_count: 0,
7100                    message_bytes: 0,
7101                });
7102                continue;
7103            };
7104            footprints.push(lexical_rebuild_conversation_footprint_from_count(
7105                conversation_id,
7106                message_count,
7107            ));
7108        }
7109
7110        let every_footprint_was_missing_tail = missing_tail_positions.len() == footprints.len();
7111        if !missing_tail_positions.is_empty() {
7112            self.fill_missing_lexical_rebuild_footprint_tails(
7113                &mut footprints,
7114                &missing_tail_positions,
7115            )?;
7116        }
7117        if !every_footprint_was_missing_tail {
7118            self.raise_lexical_rebuild_footprints_to_exact_message_counts(&mut footprints)?;
7119        }
7120
7121        Ok(footprints)
7122    }
7123
7124    pub fn lexical_rebuild_has_tail_footprint_metadata(&self) -> Result<bool> {
7125        let total_conversations: i64 = self
7126            .conn
7127            .query_row_map("SELECT COUNT(*) FROM conversations", fparams![], |row| {
7128                row.get_typed(0)
7129            })
7130            .with_context(|| "counting conversations for lexical rebuild tail metadata coverage")?;
7131        let total_conversations = usize::try_from(total_conversations.max(0)).unwrap_or(usize::MAX);
7132        if total_conversations == 0 {
7133            return Ok(true);
7134        }
7135
7136        let conversation_columns = franken_table_column_names(&self.conn, "conversations")?;
7137        let conversations_have_tail_column = conversation_columns.contains("last_message_idx");
7138        let tail_state_has_tail_column =
7139            match franken_table_column_names(&self.conn, "conversation_tail_state") {
7140                Ok(columns) => columns.contains("last_message_idx"),
7141                Err(err) if error_indicates_missing_table(&err) => false,
7142                Err(err) => {
7143                    return Err(err)
7144                        .with_context(|| "reading lexical rebuild tail-state metadata columns");
7145                }
7146            };
7147        if !conversations_have_tail_column && !tail_state_has_tail_column {
7148            return Ok(false);
7149        }
7150
7151        let covered_sql = match (conversations_have_tail_column, tail_state_has_tail_column) {
7152            (true, true) => {
7153                "SELECT COUNT(*)
7154                 FROM conversations c
7155                 LEFT JOIN conversation_tail_state ts ON ts.conversation_id = c.id
7156                 WHERE c.last_message_idx IS NOT NULL
7157                    OR ts.last_message_idx IS NOT NULL"
7158            }
7159            (true, false) => {
7160                "SELECT COUNT(*)
7161                 FROM conversations
7162                 WHERE last_message_idx IS NOT NULL"
7163            }
7164            (false, true) => {
7165                "SELECT COUNT(*)
7166                 FROM conversations c
7167                 WHERE EXISTS (
7168                     SELECT 1
7169                     FROM conversation_tail_state ts
7170                     WHERE ts.conversation_id = c.id
7171                       AND ts.last_message_idx IS NOT NULL
7172                 )"
7173            }
7174            (false, false) => unreachable!("checked before covered_sql selection"),
7175        };
7176        let covered_conversations: i64 = self
7177            .conn
7178            .query_row_map(covered_sql, fparams![], |row| row.get_typed(0))
7179            .with_context(
7180                || "counting conversations covered by lexical rebuild tail footprint metadata",
7181            )?;
7182        let covered_conversations =
7183            usize::try_from(covered_conversations.max(0)).unwrap_or(usize::MAX);
7184
7185        Ok(lexical_rebuild_tail_metadata_coverage_is_sufficient(
7186            total_conversations,
7187            covered_conversations,
7188        ))
7189    }
7190
7191    fn raise_lexical_rebuild_footprints_to_exact_message_counts(
7192        &self,
7193        footprints: &mut [LexicalRebuildConversationFootprintRow],
7194    ) -> Result<()> {
7195        if footprints.is_empty() {
7196            return Ok(());
7197        }
7198
7199        let positions_by_conversation: HashMap<i64, usize> = footprints
7200            .iter()
7201            .enumerate()
7202            .map(|(position, footprint)| (footprint.conversation_id, position))
7203            .collect();
7204        self.conn
7205            .query_with_params_for_each(
7206                "SELECT conversation_id, COUNT(*) AS message_count
7207                 FROM messages
7208                 GROUP BY conversation_id
7209                 ORDER BY conversation_id ASC",
7210                &[] as &[SqliteValue],
7211                |row| {
7212                    let conversation_id: i64 = row.get_typed(0)?;
7213                    let exact_count: i64 = row.get_typed(1)?;
7214                    let Some(position) = positions_by_conversation.get(&conversation_id) else {
7215                        return Ok(());
7216                    };
7217                    let exact_count = usize::try_from(exact_count.max(0)).unwrap_or(usize::MAX);
7218                    let footprint = &mut footprints[*position];
7219                    if exact_count > footprint.message_count {
7220                        footprint.message_count = exact_count;
7221                        footprint.message_bytes =
7222                            footprint.message_bytes.max(exact_count.saturating_mul(
7223                                LEXICAL_REBUILD_PLANNER_ESTIMATED_BYTES_PER_MESSAGE,
7224                            ));
7225                    }
7226                    Ok(())
7227                },
7228            )
7229            .with_context(|| "raising lexical rebuild footprints to exact message counts")?;
7230        Ok(())
7231    }
7232
7233    fn fill_missing_lexical_rebuild_footprint_tails(
7234        &self,
7235        footprints: &mut [LexicalRebuildConversationFootprintRow],
7236        missing_tail_positions: &HashMap<i64, usize>,
7237    ) -> Result<()> {
7238        if missing_tail_positions.len() <= LEXICAL_REBUILD_FOOTPRINT_POINT_TAIL_FALLBACK_LIMIT {
7239            for (conversation_id, position) in missing_tail_positions {
7240                let last_message_idx: Option<i64> = self
7241                    .conn
7242                    .query_row_map(
7243                        "SELECT MAX(idx) FROM messages WHERE conversation_id = ?1",
7244                        fparams![*conversation_id],
7245                        |row| row.get_typed(0),
7246                    )
7247                    .with_context(|| {
7248                        format!(
7249                            "looking up missing lexical rebuild tail estimate for conversation {conversation_id}"
7250                        )
7251                    })?;
7252                if let Some(message_count) =
7253                    lexical_rebuild_message_count_from_tail_idx(last_message_idx)
7254                {
7255                    footprints[*position] = lexical_rebuild_conversation_footprint_from_count(
7256                        *conversation_id,
7257                        message_count,
7258                    );
7259                }
7260            }
7261            return Ok(());
7262        }
7263
7264        self.fill_missing_lexical_rebuild_footprint_tails_from_grouped_messages(
7265            footprints,
7266            missing_tail_positions,
7267            "SELECT conversation_id, MAX(idx) AS last_message_idx
7268             FROM messages INDEXED BY idx_messages_conv_idx
7269             GROUP BY conversation_id
7270             ORDER BY conversation_id ASC",
7271        )
7272        .or_else(|err| {
7273            if err
7274                .to_string()
7275                .contains("no such index: idx_messages_conv_idx")
7276            {
7277                return self.fill_missing_lexical_rebuild_footprint_tails_from_grouped_messages(
7278                    footprints,
7279                    missing_tail_positions,
7280                    "SELECT conversation_id, MAX(idx) AS last_message_idx
7281                     FROM messages
7282                     GROUP BY conversation_id
7283                     ORDER BY conversation_id ASC",
7284                );
7285            }
7286            Err(err)
7287        })
7288        .with_context(|| "grouping missing lexical rebuild tail estimates from messages")?;
7289
7290        Ok(())
7291    }
7292
7293    fn fill_missing_lexical_rebuild_footprint_tails_from_grouped_messages(
7294        &self,
7295        footprints: &mut [LexicalRebuildConversationFootprintRow],
7296        missing_tail_positions: &HashMap<i64, usize>,
7297        sql: &str,
7298    ) -> Result<()> {
7299        self.conn
7300            .query_with_params_for_each(sql, &[] as &[SqliteValue], |row| {
7301                let conversation_id: i64 = row.get_typed(0)?;
7302                let last_message_idx: Option<i64> = row.get_typed(1)?;
7303                let Some(position) = missing_tail_positions.get(&conversation_id) else {
7304                    return Ok(());
7305                };
7306                if let Some(message_count) =
7307                    lexical_rebuild_message_count_from_tail_idx(last_message_idx)
7308                {
7309                    footprints[*position] = lexical_rebuild_conversation_footprint_from_count(
7310                        conversation_id,
7311                        message_count,
7312                    );
7313                }
7314                Ok(())
7315            })
7316            .with_context(|| "grouping lexical rebuild missing tail estimates")
7317    }
7318
7319    /// List conversation ids in the stable order used by lexical rebuilds.
7320    pub fn list_conversation_ids_for_lexical_rebuild(&self) -> Result<Vec<i64>> {
7321        self.conn
7322            .query_map_collect(
7323                "SELECT id FROM conversations ORDER BY id ASC",
7324                fparams![],
7325                |row| row.get_typed(0),
7326            )
7327            .with_context(|| "listing conversation ids for lexical rebuild")
7328    }
7329    /// Legacy OFFSET-based traversal for one-time checkpoint migration only.
7330    ///
7331    /// New code must use `list_conversations_for_lexical_rebuild_after_id`
7332    /// for keyset pagination.
7333    pub fn list_conversations_for_lexical_rebuild_by_offset(
7334        &self,
7335        limit: i64,
7336        offset: i64,
7337        agent_slugs: &HashMap<i64, String>,
7338        workspace_paths: &HashMap<i64, PathBuf>,
7339    ) -> Result<Vec<LexicalRebuildConversationRow>> {
7340        // Single-table query avoids the 3-table JOIN that triggers
7341        // frankensqlite's full-materialization fallback path.
7342        self.conn
7343            .query_map_collect(
7344                r"SELECT id, agent_id, workspace_id, external_id, title, source_path,
7345                       started_at,
7346                       COALESCE(
7347                           (SELECT ts.ended_at
7348                            FROM conversation_tail_state ts
7349                            WHERE ts.conversation_id = conversations.id),
7350                           ended_at
7351                       ),
7352                       source_id, origin_host
7353                FROM conversations
7354                ORDER BY id ASC
7355                LIMIT ?1 OFFSET ?2",
7356                fparams![limit, offset],
7357                |row| {
7358                    let agent_id: Option<i64> = row.get_typed(1)?;
7359                    let workspace_id: Option<i64> = row.get_typed(2)?;
7360                    let source_path: String = row.get_typed(5)?;
7361                    let raw_source_id: Option<String> = row.get_typed(8)?;
7362                    let raw_origin_host: Option<String> = row.get_typed(9)?;
7363                    let (source_id, _, origin_host) = normalized_storage_source_parts(
7364                        raw_source_id.as_deref(),
7365                        None,
7366                        raw_origin_host.as_deref(),
7367                    );
7368                    Ok(LexicalRebuildConversationRow {
7369                        id: Some(row.get_typed(0)?),
7370                        agent_slug: agent_id
7371                            .and_then(|aid| agent_slugs.get(&aid).cloned())
7372                            .unwrap_or_else(|| "unknown".to_string()),
7373                        workspace: workspace_id.and_then(|wid| workspace_paths.get(&wid).cloned()),
7374                        external_id: row.get_typed(3)?,
7375                        title: row.get_typed(4)?,
7376                        source_path: Path::new(&source_path).to_path_buf(),
7377                        started_at: row.get_typed(6)?,
7378                        ended_at: row.get_typed(7)?,
7379                        source_id,
7380                        origin_host,
7381                    })
7382                },
7383            )
7384            .with_context(|| "listing conversations for lexical rebuild")
7385    }
7386
7387    /// List lexical rebuild conversations strictly after the given primary key.
7388    ///
7389    /// Keyset pagination keeps later rebuild pages as cheap as earlier ones,
7390    /// avoiding the ever-growing `OFFSET` scan cost during large rebuilds.
7391    pub fn list_conversations_for_lexical_rebuild_after_id(
7392        &self,
7393        limit: i64,
7394        after_conversation_id: i64,
7395        agent_slugs: &HashMap<i64, String>,
7396        workspace_paths: &HashMap<i64, PathBuf>,
7397    ) -> Result<Vec<LexicalRebuildConversationRow>> {
7398        self.conn
7399            .query_map_collect(
7400                r"SELECT id, agent_id, workspace_id, external_id, title, source_path,
7401                       started_at,
7402                       COALESCE(
7403                           (SELECT ts.ended_at
7404                            FROM conversation_tail_state ts
7405                            WHERE ts.conversation_id = conversations.id),
7406                           ended_at
7407                       ),
7408                       source_id, origin_host
7409                FROM conversations
7410                WHERE id > ?2
7411                ORDER BY id ASC
7412                LIMIT ?1",
7413                fparams![limit, after_conversation_id],
7414                |row| {
7415                    let agent_id: Option<i64> = row.get_typed(1)?;
7416                    let workspace_id: Option<i64> = row.get_typed(2)?;
7417                    let source_path: String = row.get_typed(5)?;
7418                    let raw_source_id: Option<String> = row.get_typed(8)?;
7419                    let raw_origin_host: Option<String> = row.get_typed(9)?;
7420                    let (source_id, _, origin_host) = normalized_storage_source_parts(
7421                        raw_source_id.as_deref(),
7422                        None,
7423                        raw_origin_host.as_deref(),
7424                    );
7425                    Ok(LexicalRebuildConversationRow {
7426                        id: Some(row.get_typed(0)?),
7427                        agent_slug: agent_id
7428                            .and_then(|aid| agent_slugs.get(&aid).cloned())
7429                            .unwrap_or_else(|| "unknown".to_string()),
7430                        workspace: workspace_id.and_then(|wid| workspace_paths.get(&wid).cloned()),
7431                        external_id: row.get_typed(3)?,
7432                        title: row.get_typed(4)?,
7433                        source_path: Path::new(&source_path).to_path_buf(),
7434                        started_at: row.get_typed(6)?,
7435                        ended_at: row.get_typed(7)?,
7436                        source_id,
7437                        origin_host,
7438                    })
7439                },
7440            )
7441            .with_context(|| {
7442                format!(
7443                    "listing conversations for lexical rebuild after id {after_conversation_id}"
7444                )
7445            })
7446    }
7447
7448    /// List lexical rebuild conversations inside an `(after_id, through_id]`
7449    /// primary-key window.
7450    ///
7451    /// This lets the rebuild producer respect planned shard boundaries without
7452    /// falling back to client-side trimming or multi-table joins.
7453    pub fn list_conversations_for_lexical_rebuild_after_id_through_id(
7454        &self,
7455        limit: i64,
7456        after_conversation_id: i64,
7457        through_conversation_id: i64,
7458        agent_slugs: &HashMap<i64, String>,
7459        workspace_paths: &HashMap<i64, PathBuf>,
7460    ) -> Result<Vec<LexicalRebuildConversationRow>> {
7461        if through_conversation_id <= after_conversation_id {
7462            return Ok(Vec::new());
7463        }
7464        self.conn
7465            .query_map_collect(
7466                r"SELECT id, agent_id, workspace_id, external_id, title, source_path,
7467                       started_at,
7468                       COALESCE(
7469                           (SELECT ts.ended_at
7470                            FROM conversation_tail_state ts
7471                            WHERE ts.conversation_id = conversations.id),
7472                           ended_at
7473                       ),
7474                       source_id, origin_host
7475                FROM conversations
7476                WHERE id > ?2 AND id <= ?3
7477                ORDER BY id ASC
7478                LIMIT ?1",
7479                fparams![limit, after_conversation_id, through_conversation_id],
7480                |row| {
7481                    let agent_id: Option<i64> = row.get_typed(1)?;
7482                    let workspace_id: Option<i64> = row.get_typed(2)?;
7483                    let source_path: String = row.get_typed(5)?;
7484                    let raw_source_id: Option<String> = row.get_typed(8)?;
7485                    let raw_origin_host: Option<String> = row.get_typed(9)?;
7486                    let (source_id, _, origin_host) = normalized_storage_source_parts(
7487                        raw_source_id.as_deref(),
7488                        None,
7489                        raw_origin_host.as_deref(),
7490                    );
7491                    Ok(LexicalRebuildConversationRow {
7492                        id: Some(row.get_typed(0)?),
7493                        agent_slug: agent_id
7494                            .and_then(|aid| agent_slugs.get(&aid).cloned())
7495                            .unwrap_or_else(|| "unknown".to_string()),
7496                        workspace: workspace_id.and_then(|wid| workspace_paths.get(&wid).cloned()),
7497                        external_id: row.get_typed(3)?,
7498                        title: row.get_typed(4)?,
7499                        source_path: Path::new(&source_path).to_path_buf(),
7500                        started_at: row.get_typed(6)?,
7501                        ended_at: row.get_typed(7)?,
7502                        source_id,
7503                        origin_host,
7504                    })
7505                },
7506            )
7507            .with_context(|| {
7508                format!(
7509                    "listing conversations for lexical rebuild after id {after_conversation_id} through id {through_conversation_id}"
7510                )
7511            })
7512    }
7513
7514    /// Fetch messages for a conversation.
7515    pub fn fetch_messages(&self, conversation_id: i64) -> Result<Vec<Message>> {
7516        let hinted_sql = "SELECT id, idx, role, author, created_at, content, extra_json, extra_bin \
7517             FROM messages INDEXED BY sqlite_autoindex_messages_1 \
7518             WHERE conversation_id = ?1 ORDER BY idx";
7519        let fallback_sql = "SELECT id, idx, role, author, created_at, content, extra_json, extra_bin \
7520             FROM messages \
7521             WHERE conversation_id = ?1 ORDER BY idx";
7522
7523        self.conn
7524            .query_map_collect(hinted_sql, fparams![conversation_id], |row| {
7525                let role: String = row.get_typed(2)?;
7526                Ok(Message {
7527                    id: Some(row.get_typed(0)?),
7528                    idx: row.get_typed(1)?,
7529                    role: match role.as_str() {
7530                        "user" => MessageRole::User,
7531                        "agent" | "assistant" => MessageRole::Agent,
7532                        "tool" => MessageRole::Tool,
7533                        "system" => MessageRole::System,
7534                        other => MessageRole::Other(other.to_string()),
7535                    },
7536                    author: row.get_typed(3)?,
7537                    created_at: row.get_typed(4)?,
7538                    content: row.get_typed(5)?,
7539                    extra_json: franken_read_message_extra_compat(row, 6, 7),
7540                    snippets: Vec::new(),
7541                })
7542            })
7543            .or_else(|err| {
7544                if err
7545                    .to_string()
7546                    .contains("no such index: sqlite_autoindex_messages_1")
7547                {
7548                    return self.conn.query_map_collect(
7549                        fallback_sql,
7550                        fparams![conversation_id],
7551                        |row| {
7552                            let role: String = row.get_typed(2)?;
7553                            Ok(Message {
7554                                id: Some(row.get_typed(0)?),
7555                                idx: row.get_typed(1)?,
7556                                role: match role.as_str() {
7557                                    "user" => MessageRole::User,
7558                                    "agent" | "assistant" => MessageRole::Agent,
7559                                    "tool" => MessageRole::Tool,
7560                                    "system" => MessageRole::System,
7561                                    other => MessageRole::Other(other.to_string()),
7562                                },
7563                                author: row.get_typed(3)?,
7564                                created_at: row.get_typed(4)?,
7565                                content: row.get_typed(5)?,
7566                                extra_json: franken_read_message_extra_compat(row, 6, 7),
7567                                snippets: Vec::new(),
7568                            })
7569                        },
7570                    );
7571                }
7572                Err(err)
7573            })
7574            .with_context(|| format!("fetching messages for conversation {conversation_id}"))
7575    }
7576
7577    /// Fetch messages for lexical index rebuilds without deserializing extra metadata.
7578    ///
7579    /// Tantivy only needs message text and core envelope fields, so avoiding
7580    /// `extra_json` here prevents rebuilds from rehydrating enormous historical
7581    /// payloads that are irrelevant to lexical search.
7582    pub fn fetch_messages_for_lexical_rebuild(&self, conversation_id: i64) -> Result<Vec<Message>> {
7583        let hinted_sql = "SELECT id, idx, role, author, created_at, content \
7584                 FROM messages INDEXED BY sqlite_autoindex_messages_1 \
7585                 WHERE conversation_id = ?1 ORDER BY idx";
7586        let fallback_sql = "SELECT id, idx, role, author, created_at, content \
7587                 FROM messages \
7588                 WHERE conversation_id = ?1 ORDER BY idx";
7589
7590        self.conn
7591            .query_map_collect(hinted_sql, fparams![conversation_id], |row| {
7592                let role: String = row.get_typed(2)?;
7593                Ok(Message {
7594                    id: Some(row.get_typed(0)?),
7595                    idx: row.get_typed(1)?,
7596                    role: match role.as_str() {
7597                        "user" => MessageRole::User,
7598                        "agent" | "assistant" => MessageRole::Agent,
7599                        "tool" => MessageRole::Tool,
7600                        "system" => MessageRole::System,
7601                        other => MessageRole::Other(other.to_string()),
7602                    },
7603                    author: row.get_typed(3)?,
7604                    created_at: row.get_typed(4)?,
7605                    content: row.get_typed(5)?,
7606                    extra_json: serde_json::Value::Null,
7607                    snippets: Vec::new(),
7608                })
7609            })
7610            .or_else(|err| {
7611                if err
7612                    .to_string()
7613                    .contains("no such index: sqlite_autoindex_messages_1")
7614                {
7615                    return self.conn.query_map_collect(
7616                        fallback_sql,
7617                        fparams![conversation_id],
7618                        |row| {
7619                            let role: String = row.get_typed(2)?;
7620                            Ok(Message {
7621                                id: Some(row.get_typed(0)?),
7622                                idx: row.get_typed(1)?,
7623                                role: match role.as_str() {
7624                                    "user" => MessageRole::User,
7625                                    "agent" | "assistant" => MessageRole::Agent,
7626                                    "tool" => MessageRole::Tool,
7627                                    "system" => MessageRole::System,
7628                                    other => MessageRole::Other(other.to_string()),
7629                                },
7630                                author: row.get_typed(3)?,
7631                                created_at: row.get_typed(4)?,
7632                                content: row.get_typed(5)?,
7633                                extra_json: serde_json::Value::Null,
7634                                snippets: Vec::new(),
7635                            })
7636                        },
7637                    );
7638                }
7639                Err(err)
7640            })
7641            .with_context(|| {
7642                format!("fetching messages for lexical rebuild of conversation {conversation_id}")
7643            })
7644    }
7645
7646    /// Fetch messages for multiple conversations during lexical rebuilds.
7647    ///
7648    /// This preserves the lightweight lexical-rebuild projection while avoiding
7649    /// one round-trip per conversation when rebuilding large canonical indexes.
7650    pub fn fetch_messages_for_lexical_rebuild_batch(
7651        &self,
7652        conversation_ids: &[i64],
7653        max_messages: Option<usize>,
7654        max_content_bytes: Option<usize>,
7655    ) -> Result<HashMap<i64, Vec<Message>>> {
7656        if conversation_ids.is_empty() {
7657            return Ok(HashMap::new());
7658        }
7659
7660        let mut grouped: HashMap<i64, Vec<Message>> =
7661            HashMap::with_capacity(conversation_ids.len());
7662        let mut fetched_conversation_ids = HashSet::with_capacity(conversation_ids.len());
7663        let mut total_messages = 0usize;
7664        let mut total_content_bytes = 0usize;
7665
7666        // The apparent single-query shape (`WHERE conversation_id IN (...) ORDER BY ...`)
7667        // is a bad frankensqlite plan for large live databases: it can
7668        // materialize far more of `messages` than the requested conversations.
7669        // Reuse the hinted per-conversation primary-key lookup instead.
7670        for conversation_id in conversation_ids {
7671            if !fetched_conversation_ids.insert(*conversation_id) {
7672                continue;
7673            }
7674
7675            let messages = self
7676                .fetch_messages_for_lexical_rebuild(*conversation_id)
7677                .with_context(|| {
7678                    format!("fetching lexical rebuild messages for conversation {conversation_id}")
7679                })?;
7680            total_messages = total_messages.saturating_add(messages.len());
7681            if let Some(limit) = max_messages
7682                && total_messages > limit
7683            {
7684                return Err(anyhow!(
7685                    "lexical rebuild batch fetch exceeded message guardrail: messages={total_messages} limit={limit} conversations={}",
7686                    conversation_ids.len()
7687                ));
7688            }
7689
7690            let message_bytes = messages
7691                .iter()
7692                .map(|message| message.content.len())
7693                .sum::<usize>();
7694            total_content_bytes = total_content_bytes.saturating_add(message_bytes);
7695            if let Some(limit) = max_content_bytes
7696                && total_content_bytes > limit
7697            {
7698                return Err(anyhow!(
7699                    "lexical rebuild batch fetch exceeded content-byte guardrail: bytes={total_content_bytes} limit={limit} conversations={}",
7700                    conversation_ids.len()
7701                ));
7702            }
7703
7704            if !messages.is_empty() {
7705                grouped.insert(*conversation_id, messages);
7706            }
7707        }
7708
7709        Ok(grouped)
7710    }
7711
7712    /// Stream lexical rebuild message rows in `(conversation_id, idx)` order
7713    /// without materializing the full result set.
7714    pub fn stream_messages_for_lexical_rebuild_between_conversation_ids<F>(
7715        &self,
7716        start_conversation_id: i64,
7717        end_conversation_id: i64,
7718        mut f: F,
7719    ) -> Result<()>
7720    where
7721        F: FnMut(LexicalRebuildMessageRow) -> Result<()>,
7722    {
7723        if end_conversation_id < start_conversation_id {
7724            return Ok(());
7725        }
7726
7727        let conversation_ids: Vec<i64> = self
7728            .conn
7729            .query_map_collect(
7730                "SELECT id FROM conversations WHERE id >= ?1 AND id <= ?2 ORDER BY id ASC",
7731                fparams![start_conversation_id, end_conversation_id],
7732                |row| row.get_typed(0),
7733            )
7734            .with_context(|| "listing conversation ids for streamed lexical rebuild")?;
7735
7736        for conversation_id in conversation_ids {
7737            let messages = self
7738                .fetch_messages_for_lexical_rebuild(conversation_id)
7739                .with_context(|| {
7740                    format!("streaming lexical rebuild messages for conversation {conversation_id}")
7741                })?;
7742
7743            for message in messages {
7744                let message_id = message.id.ok_or_else(|| {
7745                    anyhow!(
7746                        "lexical rebuild message missing id for conversation {conversation_id} idx {}",
7747                        message.idx
7748                    )
7749                })?;
7750                f(LexicalRebuildMessageRow {
7751                    conversation_id,
7752                    id: message_id,
7753                    idx: message.idx,
7754                    role: role_str(&message.role),
7755                    author: message.author,
7756                    created_at: message.created_at,
7757                    content: message.content,
7758                })?;
7759            }
7760        }
7761
7762        Ok(())
7763    }
7764
7765    /// Stream grouped lexical rebuild message rows in `(conversation_id, idx)`
7766    /// order by reusing the canonical per-message stream and coalescing rows
7767    /// per conversation.
7768    pub fn stream_grouped_messages_for_lexical_rebuild_between_conversation_ids<F>(
7769        &self,
7770        start_conversation_id: i64,
7771        end_conversation_id: i64,
7772        mut f: F,
7773    ) -> Result<()>
7774    where
7775        F: FnMut(i64, LexicalRebuildGroupedMessageRows, i64) -> Result<()>,
7776    {
7777        if end_conversation_id < start_conversation_id {
7778            return Ok(());
7779        }
7780
7781        let mut current_conversation_id: Option<i64> = None;
7782        let mut current_messages: LexicalRebuildGroupedMessageRows = SmallVec::new();
7783        let mut current_last_message_id = 0i64;
7784        let mut flush_current = |current_conversation_id: &mut Option<i64>,
7785                                 current_messages: &mut LexicalRebuildGroupedMessageRows,
7786                                 current_last_message_id: &mut i64|
7787         -> Result<()> {
7788            let Some(conversation_id) = current_conversation_id.take() else {
7789                return Ok(());
7790            };
7791            let messages = std::mem::take(current_messages);
7792            let last_message_id = std::mem::take(current_last_message_id);
7793            f(conversation_id, messages, last_message_id)
7794        };
7795
7796        self.stream_messages_for_lexical_rebuild_between_conversation_ids(
7797            start_conversation_id,
7798            end_conversation_id,
7799            |row| {
7800                if current_conversation_id != Some(row.conversation_id) {
7801                    flush_current(
7802                        &mut current_conversation_id,
7803                        &mut current_messages,
7804                        &mut current_last_message_id,
7805                    )?;
7806                    current_conversation_id = Some(row.conversation_id);
7807                }
7808                current_last_message_id = row.id;
7809                current_messages.push(LexicalRebuildGroupedMessageRow {
7810                    idx: row.idx,
7811                    is_tool_role: row.role == "tool",
7812                    created_at: row.created_at,
7813                    content: row.content,
7814                });
7815                Ok(())
7816            },
7817        )
7818        .with_context(|| "streaming grouped lexical rebuild messages")?;
7819
7820        flush_current(
7821            &mut current_conversation_id,
7822            &mut current_messages,
7823            &mut current_last_message_id,
7824        )
7825        .with_context(|| "flushing grouped lexical rebuild messages")
7826    }
7827
7828    /// Stream grouped lexical rebuild message rows from a starting conversation
7829    /// id to the end of the table.
7830    pub fn stream_grouped_messages_for_lexical_rebuild_from_conversation_id<F>(
7831        &self,
7832        start_conversation_id: i64,
7833        f: F,
7834    ) -> Result<()>
7835    where
7836        F: FnMut(i64, LexicalRebuildGroupedMessageRows, i64) -> Result<()>,
7837    {
7838        self.stream_grouped_messages_for_lexical_rebuild_between_conversation_ids(
7839            start_conversation_id,
7840            i64::MAX,
7841            f,
7842        )
7843    }
7844
7845    /// Stream lexical rebuild message rows from a starting conversation id to
7846    /// the end of the table.
7847    pub fn stream_messages_for_lexical_rebuild_from_conversation_id<F>(
7848        &self,
7849        start_conversation_id: i64,
7850        f: F,
7851    ) -> Result<()>
7852    where
7853        F: FnMut(LexicalRebuildMessageRow) -> Result<()>,
7854    {
7855        self.stream_messages_for_lexical_rebuild_between_conversation_ids(
7856            start_conversation_id,
7857            i64::MAX,
7858            f,
7859        )
7860    }
7861
7862    /// Get a source by ID.
7863    pub fn get_source(&self, id: &str) -> Result<Option<Source>> {
7864        let result = self.conn.query_row_map(
7865            "SELECT id, kind, host_label, machine_id, platform, config_json, created_at, updated_at FROM sources WHERE id = ?1",
7866            fparams![id],
7867            |row| {
7868                let kind_str: String = row.get_typed(1)?;
7869                let config_json_str: Option<String> = row.get_typed(5)?;
7870                Ok(Source {
7871                    id: row.get_typed(0)?,
7872                    kind: SourceKind::parse(&kind_str).unwrap_or_default(),
7873                    host_label: row.get_typed(2)?,
7874                    machine_id: row.get_typed(3)?,
7875                    platform: row.get_typed(4)?,
7876                    config_json: config_json_str.and_then(|s| serde_json::from_str(&s).ok()),
7877                    created_at: row.get_typed(6)?,
7878                    updated_at: row.get_typed(7)?,
7879                })
7880            },
7881        );
7882        Ok(result.optional()?)
7883    }
7884
7885    /// List all sources.
7886    pub fn list_sources(&self) -> Result<Vec<Source>> {
7887        self.conn
7888            .query_map_collect(
7889                "SELECT id, kind, host_label, machine_id, platform, config_json, created_at, updated_at FROM sources ORDER BY id",
7890                fparams![],
7891                |row| {
7892                    let kind_str: String = row.get_typed(1)?;
7893                    let config_json_str: Option<String> = row.get_typed(5)?;
7894                    Ok(Source {
7895                        id: row.get_typed(0)?,
7896                        kind: SourceKind::parse(&kind_str).unwrap_or_default(),
7897                        host_label: row.get_typed(2)?,
7898                        machine_id: row.get_typed(3)?,
7899                        platform: row.get_typed(4)?,
7900                        config_json: config_json_str.and_then(|s| serde_json::from_str(&s).ok()),
7901                        created_at: row.get_typed(6)?,
7902                        updated_at: row.get_typed(7)?,
7903                    })
7904                },
7905            )
7906            .with_context(|| "listing sources")
7907    }
7908
7909    /// Get IDs of all non-local sources.
7910    pub fn get_source_ids(&self) -> Result<Vec<String>> {
7911        self.conn
7912            .query_map_collect(
7913                "SELECT id FROM sources WHERE id != 'local' ORDER BY id",
7914                fparams![],
7915                |row| row.get_typed(0),
7916            )
7917            .with_context(|| "listing source ids")
7918    }
7919
7920    /// Create or update a source.
7921    pub fn upsert_source(&self, source: &Source) -> Result<()> {
7922        self.invalidate_conversation_source_cache(source.id.as_str());
7923        let now = Self::now_millis();
7924        let kind_str = source.kind.to_string();
7925        let config_json_str = source
7926            .config_json
7927            .as_ref()
7928            .map(serde_json::to_string)
7929            .transpose()?;
7930
7931        // Re-indexing commonly reuses the same normalized source metadata
7932        // across many conversations. Skip the write entirely when the row is
7933        // already identical so we avoid needless WAL churn and timestamp bumps.
7934        self.conn.execute_compat(
7935            "INSERT INTO sources(id, kind, host_label, machine_id, platform, config_json, created_at, updated_at)
7936             VALUES(?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8)
7937             ON CONFLICT(id) DO UPDATE SET
7938                 kind = excluded.kind,
7939                 host_label = excluded.host_label,
7940                 machine_id = excluded.machine_id,
7941                 platform = excluded.platform,
7942                 config_json = excluded.config_json,
7943                 updated_at = excluded.updated_at
7944             WHERE NOT (
7945                 sources.kind IS excluded.kind
7946                 AND sources.host_label IS excluded.host_label
7947                 AND sources.machine_id IS excluded.machine_id
7948                 AND sources.platform IS excluded.platform
7949                 AND sources.config_json IS excluded.config_json
7950             )",
7951            fparams![
7952                source.id.as_str(),
7953                kind_str.as_str(),
7954                source.host_label.as_deref(),
7955                source.machine_id.as_deref(),
7956                source.platform.as_deref(),
7957                config_json_str.as_deref(),
7958                source.created_at.unwrap_or(now),
7959                now
7960            ],
7961        )?;
7962        Ok(())
7963    }
7964
7965    fn historical_bundle_key_hash(
7966        version: u32,
7967        bundle: &HistoricalDatabaseBundle,
7968        include_bundle_stats: bool,
7969    ) -> String {
7970        let signature = if include_bundle_stats {
7971            format!(
7972                "{}:{}:{}:{}",
7973                version,
7974                bundle.root_path.display(),
7975                bundle.total_bytes,
7976                bundle.modified_at_ms
7977            )
7978        } else {
7979            format!("{}:{}", version, bundle.root_path.display())
7980        };
7981        blake3::hash(signature.as_bytes()).to_hex().to_string()
7982    }
7983
7984    fn historical_bundle_meta_key(bundle: &HistoricalDatabaseBundle) -> String {
7985        format!(
7986            "historical_bundle_salvaged:{}",
7987            Self::historical_bundle_key_hash(HISTORICAL_SALVAGE_LEDGER_VERSION, bundle, false)
7988        )
7989    }
7990
7991    fn historical_bundle_legacy_meta_key(bundle: &HistoricalDatabaseBundle) -> String {
7992        let signature = format!(
7993            "{}:{}:{}:{}",
7994            HISTORICAL_SALVAGE_LEDGER_VERSION,
7995            bundle.root_path.display(),
7996            bundle.total_bytes,
7997            bundle.modified_at_ms
7998        );
7999        format!(
8000            "historical_bundle_salvaged:{}",
8001            blake3::hash(signature.as_bytes()).to_hex()
8002        )
8003    }
8004
8005    fn historical_bundle_progress_key(bundle: &HistoricalDatabaseBundle) -> String {
8006        format!(
8007            "historical_bundle_progress:{}",
8008            Self::historical_bundle_key_hash(HISTORICAL_SALVAGE_PROGRESS_VERSION, bundle, false)
8009        )
8010    }
8011
8012    fn historical_bundle_legacy_progress_key(bundle: &HistoricalDatabaseBundle) -> String {
8013        let signature = format!(
8014            "{}:{}:{}:{}",
8015            HISTORICAL_SALVAGE_PROGRESS_VERSION,
8016            bundle.root_path.display(),
8017            bundle.total_bytes,
8018            bundle.modified_at_ms
8019        );
8020        format!(
8021            "historical_bundle_progress:{}",
8022            blake3::hash(signature.as_bytes()).to_hex()
8023        )
8024    }
8025
8026    fn historical_bundle_already_imported(
8027        &self,
8028        bundle: &HistoricalDatabaseBundle,
8029    ) -> Result<bool> {
8030        for key in [
8031            Self::historical_bundle_meta_key(bundle),
8032            Self::historical_bundle_legacy_meta_key(bundle),
8033        ] {
8034            let existing: Option<String> = self
8035                .conn
8036                .query_row_map(
8037                    "SELECT value FROM meta WHERE key = ?1",
8038                    fparams![key.as_str()],
8039                    |row| row.get_typed(0),
8040                )
8041                .optional()?;
8042            if existing.is_some() {
8043                return Ok(true);
8044            }
8045        }
8046        Ok(false)
8047    }
8048
8049    pub(crate) fn has_pending_historical_bundles(&self, canonical_db_path: &Path) -> Result<bool> {
8050        for bundle in discover_historical_database_bundles(canonical_db_path) {
8051            if !self.historical_bundle_already_imported(&bundle)? {
8052                return Ok(true);
8053            }
8054        }
8055        Ok(false)
8056    }
8057
8058    fn load_historical_bundle_progress(
8059        &self,
8060        bundle: &HistoricalDatabaseBundle,
8061    ) -> Result<Option<HistoricalBundleProgress>> {
8062        for key in [
8063            Self::historical_bundle_progress_key(bundle),
8064            Self::historical_bundle_legacy_progress_key(bundle),
8065        ] {
8066            let raw: Option<String> = self
8067                .conn
8068                .query_row_map(
8069                    "SELECT value FROM meta WHERE key = ?1",
8070                    fparams![key.as_str()],
8071                    |row| row.get_typed(0),
8072                )
8073                .optional()?;
8074            let Some(raw) = raw else {
8075                continue;
8076            };
8077            let parsed: HistoricalBundleProgress =
8078                serde_json::from_str(&raw).with_context(|| {
8079                    format!(
8080                        "parsing historical salvage progress checkpoint for {}",
8081                        bundle.root_path.display()
8082                    )
8083                })?;
8084            if parsed.progress_version == HISTORICAL_SALVAGE_PROGRESS_VERSION {
8085                return Ok(Some(parsed));
8086            }
8087        }
8088        Ok(None)
8089    }
8090
8091    fn record_historical_bundle_progress(
8092        &self,
8093        bundle: &HistoricalDatabaseBundle,
8094        method: &str,
8095        last_completed_source_row_id: i64,
8096        conversations_imported: usize,
8097        messages_imported: usize,
8098    ) -> Result<()> {
8099        let key = Self::historical_bundle_progress_key(bundle);
8100        let value = HistoricalBundleProgress {
8101            progress_version: HISTORICAL_SALVAGE_PROGRESS_VERSION,
8102            path: bundle.root_path.display().to_string(),
8103            bytes: bundle.total_bytes,
8104            modified_at_ms: bundle.modified_at_ms,
8105            method: method.to_string(),
8106            last_completed_source_row_id,
8107            conversations_imported,
8108            messages_imported,
8109            updated_at_ms: Self::now_millis(),
8110        };
8111        let value_str = serde_json::to_string(&value)?;
8112        self.conn.execute_compat(
8113            "INSERT OR REPLACE INTO meta(key, value) VALUES(?1, ?2)",
8114            fparams![key.as_str(), value_str.as_str()],
8115        )?;
8116        Ok(())
8117    }
8118
8119    fn clear_historical_bundle_progress(&self, bundle: &HistoricalDatabaseBundle) -> Result<()> {
8120        for key in [
8121            Self::historical_bundle_progress_key(bundle),
8122            Self::historical_bundle_legacy_progress_key(bundle),
8123        ] {
8124            self.conn
8125                .execute_compat("DELETE FROM meta WHERE key = ?1", fparams![key.as_str()])?;
8126        }
8127        Ok(())
8128    }
8129
8130    fn record_historical_bundle_import(
8131        &self,
8132        bundle: &HistoricalDatabaseBundle,
8133        method: &str,
8134        conversations_imported: usize,
8135        messages_imported: usize,
8136    ) -> Result<()> {
8137        let key = Self::historical_bundle_meta_key(bundle);
8138        let value = serde_json::json!({
8139            "salvage_version": HISTORICAL_SALVAGE_LEDGER_VERSION,
8140            "path": bundle.root_path.display().to_string(),
8141            "bytes": bundle.total_bytes,
8142            "modified_at_ms": bundle.modified_at_ms,
8143            "method": method,
8144            "conversations_imported": conversations_imported,
8145            "messages_imported": messages_imported,
8146            "recorded_at_ms": Self::now_millis(),
8147        });
8148        let value_str = serde_json::to_string(&value)?;
8149        self.conn.execute_compat(
8150            "INSERT OR REPLACE INTO meta(key, value) VALUES(?1, ?2)",
8151            fparams![key.as_str(), value_str.as_str()],
8152        )?;
8153        Ok(())
8154    }
8155
8156    fn historical_import_error_is_split_retryable(err: &anyhow::Error) -> bool {
8157        const RETRYABLE_PATTERNS: &[&str] = &[
8158            "out of memory",
8159            "string or blob too big",
8160            "too many sql variables",
8161        ];
8162        err.chain().any(|cause| {
8163            let rendered = cause.to_string().to_ascii_lowercase();
8164            RETRYABLE_PATTERNS
8165                .iter()
8166                .any(|pattern| rendered.contains(pattern))
8167        })
8168    }
8169
8170    fn split_historical_batch_entry_messages(
8171        entry: &HistoricalBatchEntry,
8172    ) -> Option<(HistoricalBatchEntry, HistoricalBatchEntry)> {
8173        if entry.conversation.messages.len() < 2 {
8174            return None;
8175        }
8176        let split_at = entry.conversation.messages.len() / 2;
8177        if split_at == 0 || split_at >= entry.conversation.messages.len() {
8178            return None;
8179        }
8180
8181        let mut left = entry.clone();
8182        left.conversation.messages = entry.conversation.messages[..split_at].to_vec();
8183
8184        let mut right = entry.clone();
8185        right.conversation.messages = entry.conversation.messages[split_at..].to_vec();
8186
8187        Some((left, right))
8188    }
8189
8190    fn import_historical_batch_with_retry<F>(
8191        entries: &[HistoricalBatchEntry],
8192        insert_batch: &mut F,
8193    ) -> Result<HistoricalBatchImportTotals>
8194    where
8195        F: FnMut(&[HistoricalBatchEntry]) -> Result<HistoricalBatchImportTotals>,
8196    {
8197        match insert_batch(entries) {
8198            Ok(totals) => Ok(totals),
8199            Err(err) if Self::historical_import_error_is_split_retryable(&err) => {
8200                if entries.len() > 1 {
8201                    let mid = entries.len() / 2;
8202                    tracing::warn!(
8203                        batch_entries = entries.len(),
8204                        split_left = mid,
8205                        split_right = entries.len() - mid,
8206                        error = %err,
8207                        "historical salvage batch failed; retrying in smaller sub-batches"
8208                    );
8209                    let left =
8210                        Self::import_historical_batch_with_retry(&entries[..mid], insert_batch)?;
8211                    let right =
8212                        Self::import_historical_batch_with_retry(&entries[mid..], insert_batch)?;
8213                    return Ok(HistoricalBatchImportTotals {
8214                        inserted_source_rows: left.inserted_source_rows
8215                            + right.inserted_source_rows,
8216                        inserted_messages: left.inserted_messages + right.inserted_messages,
8217                    });
8218                }
8219
8220                if let Some(entry) = entries.first()
8221                    && let Some((left, right)) = Self::split_historical_batch_entry_messages(entry)
8222                {
8223                    tracing::warn!(
8224                        source_row_id = entry.source_row_id,
8225                        message_count = entry.conversation.messages.len(),
8226                        error = %err,
8227                        "historical salvage conversation failed; retrying in smaller message slices"
8228                    );
8229                    let left_totals = Self::import_historical_batch_with_retry(
8230                        std::slice::from_ref(&left),
8231                        insert_batch,
8232                    )?;
8233                    let right_totals = Self::import_historical_batch_with_retry(
8234                        std::slice::from_ref(&right),
8235                        insert_batch,
8236                    )?;
8237                    return Ok(HistoricalBatchImportTotals {
8238                        inserted_source_rows: usize::from(
8239                            left_totals.inserted_source_rows > 0
8240                                || right_totals.inserted_source_rows > 0,
8241                        ),
8242                        inserted_messages: left_totals
8243                            .inserted_messages
8244                            .saturating_add(right_totals.inserted_messages),
8245                    });
8246                }
8247
8248                Err(err)
8249            }
8250            Err(err) => Err(err),
8251        }
8252    }
8253
8254    fn import_historical_sources(&self, source_conn: &FrankenConnection) -> Result<()> {
8255        let sources: Vec<Source> = match source_conn.query_map_collect(
8256            "SELECT id, kind, host_label, machine_id, platform, config_json, created_at, updated_at
8257             FROM sources",
8258            fparams![],
8259            |row| {
8260                let raw_source_id: String = row.get_typed(0)?;
8261                let kind_str: String = row.get_typed(1)?;
8262                let raw_host_label: Option<String> = row.get_typed(2)?;
8263                let config_json_raw: Option<String> = row.get_typed(5)?;
8264                let (source_id, source_kind, host_label) = normalized_storage_source_parts(
8265                    Some(raw_source_id.as_str()),
8266                    Some(kind_str.as_str()),
8267                    raw_host_label.as_deref(),
8268                );
8269                Ok(Source {
8270                    id: source_id,
8271                    kind: source_kind,
8272                    host_label,
8273                    machine_id: row.get_typed(3)?,
8274                    platform: row.get_typed(4)?,
8275                    config_json: config_json_raw.and_then(|raw| serde_json::from_str(&raw).ok()),
8276                    created_at: row.get_typed(6)?,
8277                    updated_at: row.get_typed(7)?,
8278                })
8279            },
8280        ) {
8281            Ok(rows) => rows,
8282            Err(err) => {
8283                tracing::warn!(error = %err, "historical sources table unavailable; skipping source import");
8284                return Ok(());
8285            }
8286        };
8287
8288        for source in sources {
8289            self.upsert_source(&source)?;
8290        }
8291        Ok(())
8292    }
8293
8294    fn import_historical_conversations(
8295        &self,
8296        bundle: &HistoricalDatabaseBundle,
8297        salvage_method: &str,
8298        source_conn: &FrankenConnection,
8299    ) -> Result<(usize, usize)> {
8300        let batch_limits = historical_import_batch_limits();
8301        let cache_enabled = IndexingCache::is_enabled();
8302        let mut indexing_cache = IndexingCache::new();
8303        let mut known_sources: HashSet<String> = self
8304            .list_sources()?
8305            .into_iter()
8306            .map(|source| source.id)
8307            .collect();
8308        let resume_progress = self.load_historical_bundle_progress(bundle)?;
8309        let resume_after_row_id = resume_progress
8310            .as_ref()
8311            .map(|progress| progress.last_completed_source_row_id)
8312            .filter(|row_id| *row_id > 0);
8313
8314        tracing::info!(
8315            target: "cass::historical_salvage",
8316            batch_conversations = batch_limits.conversations,
8317            batch_messages = batch_limits.messages,
8318            batch_payload_chars = batch_limits.payload_chars,
8319            cache_enabled,
8320            resume_after_row_id,
8321            "configured historical salvage batch limits"
8322        );
8323
8324        if let Some(progress) = &resume_progress {
8325            tracing::info!(
8326                target: "cass::historical_salvage",
8327                path = %bundle.root_path.display(),
8328                resume_after_row_id = progress.last_completed_source_row_id,
8329                prior_conversations_imported = progress.conversations_imported,
8330                prior_messages_imported = progress.messages_imported,
8331                "resuming historical salvage bundle from durable checkpoint"
8332            );
8333        }
8334
8335        // LEFT JOIN + COALESCE on agents so legacy source databases with NULL
8336        // agent_id (the V1 schema did not require NOT NULL) still have their
8337        // conversations imported, degrading to 'unknown' slug like the other
8338        // rebuild paths.  Using INNER JOIN here would silently drop those
8339        // conversations during historical salvage, which is data loss.
8340        let conv_sql = if resume_after_row_id.is_some() {
8341            "SELECT
8342                c.id,
8343                COALESCE(a.slug, 'unknown'),
8344                w.path,
8345                c.external_id,
8346                c.title,
8347                c.source_path,
8348                c.started_at,
8349                c.ended_at,
8350                c.approx_tokens,
8351                c.metadata_json,
8352                c.source_id,
8353                c.origin_host
8354             FROM conversations c
8355             LEFT JOIN agents a ON c.agent_id = a.id
8356             LEFT JOIN workspaces w ON c.workspace_id = w.id
8357             WHERE c.id > ?1
8358             ORDER BY c.id"
8359        } else {
8360            "SELECT
8361                c.id,
8362                COALESCE(a.slug, 'unknown'),
8363                w.path,
8364                c.external_id,
8365                c.title,
8366                c.source_path,
8367                c.started_at,
8368                c.ended_at,
8369                c.approx_tokens,
8370                c.metadata_json,
8371                c.source_id,
8372                c.origin_host
8373             FROM conversations c
8374             LEFT JOIN agents a ON c.agent_id = a.id
8375             LEFT JOIN workspaces w ON c.workspace_id = w.id
8376             ORDER BY c.id"
8377        };
8378        let conv_params: &[ParamValue] =
8379            if let Some(last_completed_source_row_id) = resume_after_row_id {
8380                &[ParamValue::from(last_completed_source_row_id)]
8381            } else {
8382                &[]
8383            };
8384
8385        #[allow(clippy::type_complexity)]
8386        let conv_rows: Vec<(
8387            i64,
8388            String,
8389            Option<String>,
8390            Option<String>,
8391            Option<String>,
8392            String,
8393            Option<i64>,
8394            Option<i64>,
8395            Option<i64>,
8396            Option<String>,
8397            Option<String>,
8398            Option<String>,
8399        )> = source_conn
8400            .query_map_collect(conv_sql, conv_params, |row| {
8401                Ok((
8402                    row.get_typed::<i64>(0)?,
8403                    row.get_typed::<String>(1)?,
8404                    row.get_typed::<Option<String>>(2)?,
8405                    row.get_typed::<Option<String>>(3)?,
8406                    row.get_typed::<Option<String>>(4)?,
8407                    row.get_typed::<String>(5)?,
8408                    row.get_typed::<Option<i64>>(6)?,
8409                    row.get_typed::<Option<i64>>(7)?,
8410                    row.get_typed::<Option<i64>>(8)?,
8411                    row.get_typed::<Option<String>>(9)?,
8412                    row.get_typed::<Option<String>>(10)?,
8413                    row.get_typed::<Option<String>>(11)?,
8414                ))
8415            })
8416            .context("querying historical conversations")?;
8417
8418        let msg_sql = "SELECT idx, role, author, created_at, content, extra_json
8419             FROM messages
8420             WHERE conversation_id = ?1
8421             ORDER BY idx";
8422
8423        let mut imported_conversations = resume_progress
8424            .as_ref()
8425            .map(|progress| progress.conversations_imported)
8426            .unwrap_or(0);
8427        let mut imported_messages = resume_progress
8428            .as_ref()
8429            .map(|progress| progress.messages_imported)
8430            .unwrap_or(0);
8431        let mut pending_batch: Vec<HistoricalBatchEntry> = Vec::new();
8432        let mut pending_batch_messages = 0usize;
8433        let mut pending_batch_chars = 0usize;
8434        let mut pending_batch_first_row_id: Option<i64> = None;
8435        let mut pending_batch_last_row_id: Option<i64> = None;
8436
8437        let flush_batch = |storage: &FrankenStorage,
8438                           batch: &mut Vec<HistoricalBatchEntry>,
8439                           pending_messages: &mut usize,
8440                           pending_chars: &mut usize,
8441                           first_row_id: &mut Option<i64>,
8442                           last_row_id: &mut Option<i64>,
8443                           imported_conversations: &mut usize,
8444                           imported_messages: &mut usize|
8445         -> Result<()> {
8446            if batch.is_empty() {
8447                return Ok(());
8448            }
8449
8450            let batch_first_row_id = *first_row_id;
8451            let batch_last_row_id = *last_row_id;
8452            if historical_salvage_debug_enabled() {
8453                eprintln!(
8454                    "[historical-salvage] flushing batch rows {:?}..{:?} conversations={} messages={} payload_chars={}",
8455                    batch_first_row_id,
8456                    batch_last_row_id,
8457                    batch.len(),
8458                    *pending_messages,
8459                    *pending_chars
8460                );
8461            }
8462            tracing::info!(
8463                target: "cass::historical_salvage",
8464                batch_conversations = batch.len(),
8465                batch_messages = *pending_messages,
8466                batch_payload_chars = *pending_chars,
8467                first_source_row_id = batch_first_row_id,
8468                last_source_row_id = batch_last_row_id,
8469                "flushing historical salvage batch"
8470            );
8471
8472            let mut insert_batch =
8473                |entries: &[HistoricalBatchEntry]| -> Result<HistoricalBatchImportTotals> {
8474                    let borrowed_batch: Vec<(i64, Option<i64>, &Conversation)> = entries
8475                        .iter()
8476                        .map(|entry| (entry.agent_id, entry.workspace_id, &entry.conversation))
8477                        .collect();
8478                    let outcomes = storage
8479                        .insert_conversations_batched(&borrowed_batch)
8480                        .with_context(|| {
8481                            let first_source_row_id =
8482                                entries.first().map(|entry| entry.source_row_id);
8483                            let last_source_row_id =
8484                                entries.last().map(|entry| entry.source_row_id);
8485                            format!(
8486                                "inserting historical salvage batch source rows {:?}..{:?}",
8487                                first_source_row_id, last_source_row_id
8488                            )
8489                        })?;
8490                    let mut totals = HistoricalBatchImportTotals::default();
8491                    for outcome in outcomes {
8492                        if !outcome.inserted_indices.is_empty() {
8493                            totals.inserted_source_rows += 1;
8494                            totals.inserted_messages += outcome.inserted_indices.len();
8495                        }
8496                    }
8497                    Ok(totals)
8498                };
8499            let totals =
8500                Self::import_historical_batch_with_retry(batch.as_slice(), &mut insert_batch)?;
8501            *imported_conversations =
8502                (*imported_conversations).saturating_add(totals.inserted_source_rows);
8503            *imported_messages = (*imported_messages).saturating_add(totals.inserted_messages);
8504            if let Some(last_completed_row_id) = batch_last_row_id {
8505                storage.record_historical_bundle_progress(
8506                    bundle,
8507                    salvage_method,
8508                    last_completed_row_id,
8509                    *imported_conversations,
8510                    *imported_messages,
8511                )?;
8512            }
8513            tracing::info!(
8514                target: "cass::historical_salvage",
8515                batch_conversations = batch.len(),
8516                batch_messages = *pending_messages,
8517                imported_conversations = *imported_conversations,
8518                imported_messages = *imported_messages,
8519                first_source_row_id = batch_first_row_id,
8520                last_source_row_id = batch_last_row_id,
8521                "historical salvage batch committed"
8522            );
8523            if historical_salvage_debug_enabled() {
8524                eprintln!(
8525                    "[historical-salvage] committed batch rows {:?}..{:?} imported_conversations={} imported_messages={}",
8526                    batch_first_row_id,
8527                    batch_last_row_id,
8528                    *imported_conversations,
8529                    *imported_messages
8530                );
8531            }
8532            batch.clear();
8533            *pending_messages = 0;
8534            *pending_chars = 0;
8535            *first_row_id = None;
8536            *last_row_id = None;
8537            Ok(())
8538        };
8539
8540        for (
8541            conversation_row_id,
8542            agent_slug,
8543            workspace_path,
8544            external_id,
8545            title,
8546            source_path,
8547            started_at,
8548            ended_at,
8549            approx_tokens,
8550            metadata_json_raw,
8551            raw_source_id,
8552            raw_origin_host,
8553        ) in conv_rows
8554        {
8555            let source_id = crate::search::tantivy::normalized_index_source_id(
8556                raw_source_id.as_deref(),
8557                None,
8558                raw_origin_host.as_deref(),
8559            );
8560            let origin_host =
8561                crate::search::tantivy::normalized_index_origin_host(raw_origin_host.as_deref());
8562
8563            let messages: Vec<Message> = source_conn
8564                .query_map_collect(msg_sql, fparams![conversation_row_id], |msg_row| {
8565                    let role: String = msg_row.get_typed(1)?;
8566                    Ok(Message {
8567                        id: None,
8568                        idx: msg_row.get_typed(0)?,
8569                        role: match role.as_str() {
8570                            "user" => MessageRole::User,
8571                            "agent" | "assistant" => MessageRole::Agent,
8572                            "tool" => MessageRole::Tool,
8573                            "system" => MessageRole::System,
8574                            other => MessageRole::Other(other.to_string()),
8575                        },
8576                        author: msg_row.get_typed(2)?,
8577                        created_at: msg_row.get_typed(3)?,
8578                        content: msg_row.get_typed(4)?,
8579                        extra_json: parse_historical_json_column(msg_row.get_typed(5)?),
8580                        snippets: Vec::new(),
8581                    })
8582                })
8583                .context("collecting historical message rows")?;
8584
8585            if messages.is_empty() {
8586                continue;
8587            }
8588
8589            let conversation_message_count = messages.len();
8590            let conversation_chars = messages
8591                .iter()
8592                .map(message_payload_size_hint)
8593                .sum::<usize>();
8594
8595            let conversation = Conversation {
8596                id: None,
8597                agent_slug: agent_slug.clone(),
8598                workspace: workspace_path.map(PathBuf::from),
8599                external_id,
8600                title,
8601                source_path: PathBuf::from(source_path),
8602                started_at,
8603                ended_at,
8604                approx_tokens,
8605                metadata_json: parse_json_column(metadata_json_raw),
8606                messages,
8607                source_id,
8608                origin_host,
8609            };
8610
8611            if !known_sources.contains(&conversation.source_id) {
8612                let placeholder = if conversation.source_id == LOCAL_SOURCE_ID {
8613                    Source::local()
8614                } else {
8615                    Source {
8616                        id: conversation.source_id.clone(),
8617                        kind: SourceKind::Ssh,
8618                        host_label: conversation.origin_host.clone(),
8619                        machine_id: None,
8620                        platform: None,
8621                        config_json: None,
8622                        created_at: None,
8623                        updated_at: None,
8624                    }
8625                };
8626                self.upsert_source(&placeholder)?;
8627                known_sources.insert(conversation.source_id.clone());
8628            }
8629
8630            let agent = Agent {
8631                id: None,
8632                slug: agent_slug.clone(),
8633                name: agent_slug,
8634                version: None,
8635                kind: AgentKind::Cli,
8636            };
8637            let agent_id = if cache_enabled {
8638                indexing_cache.get_or_insert_agent(self, &agent)?
8639            } else {
8640                self.ensure_agent(&agent)?
8641            };
8642            let workspace_id = if let Some(workspace) = &conversation.workspace {
8643                if cache_enabled {
8644                    Some(indexing_cache.get_or_insert_workspace(self, workspace, None)?)
8645                } else {
8646                    Some(self.ensure_workspace(workspace, None)?)
8647                }
8648            } else {
8649                None
8650            };
8651
8652            let exceeds_pending_limits = !pending_batch.is_empty()
8653                && (pending_batch.len() >= batch_limits.conversations
8654                    || pending_batch_messages.saturating_add(conversation_message_count)
8655                        > batch_limits.messages
8656                    || pending_batch_chars.saturating_add(conversation_chars)
8657                        > batch_limits.payload_chars);
8658            if exceeds_pending_limits {
8659                flush_batch(
8660                    self,
8661                    &mut pending_batch,
8662                    &mut pending_batch_messages,
8663                    &mut pending_batch_chars,
8664                    &mut pending_batch_first_row_id,
8665                    &mut pending_batch_last_row_id,
8666                    &mut imported_conversations,
8667                    &mut imported_messages,
8668                )?;
8669            }
8670
8671            if pending_batch_first_row_id.is_none() {
8672                pending_batch_first_row_id = Some(conversation_row_id);
8673            }
8674            pending_batch_last_row_id = Some(conversation_row_id);
8675            pending_batch_messages =
8676                pending_batch_messages.saturating_add(conversation_message_count);
8677            pending_batch_chars = pending_batch_chars.saturating_add(conversation_chars);
8678            pending_batch.push(HistoricalBatchEntry {
8679                source_row_id: conversation_row_id,
8680                agent_id,
8681                workspace_id,
8682                conversation,
8683            });
8684
8685            if pending_batch.len() >= batch_limits.conversations
8686                || pending_batch_messages >= batch_limits.messages
8687                || pending_batch_chars >= batch_limits.payload_chars
8688            {
8689                flush_batch(
8690                    self,
8691                    &mut pending_batch,
8692                    &mut pending_batch_messages,
8693                    &mut pending_batch_chars,
8694                    &mut pending_batch_first_row_id,
8695                    &mut pending_batch_last_row_id,
8696                    &mut imported_conversations,
8697                    &mut imported_messages,
8698                )?;
8699            }
8700        }
8701
8702        flush_batch(
8703            self,
8704            &mut pending_batch,
8705            &mut pending_batch_messages,
8706            &mut pending_batch_chars,
8707            &mut pending_batch_first_row_id,
8708            &mut pending_batch_last_row_id,
8709            &mut imported_conversations,
8710            &mut imported_messages,
8711        )?;
8712
8713        if cache_enabled {
8714            let (hits, misses, hit_rate) = indexing_cache.stats();
8715            tracing::info!(
8716                target: "cass::historical_salvage",
8717                hits,
8718                misses,
8719                hit_rate = format!("{:.1}%", hit_rate * 100.0),
8720                agents = indexing_cache.agent_count(),
8721                workspaces = indexing_cache.workspace_count(),
8722                sources = known_sources.len(),
8723                "historical salvage cache stats"
8724            );
8725        }
8726
8727        Ok((imported_conversations, imported_messages))
8728    }
8729
8730    pub fn salvage_historical_databases(
8731        &self,
8732        canonical_db_path: &Path,
8733    ) -> Result<HistoricalSalvageOutcome> {
8734        let ordered_bundles = discover_historical_database_bundles(canonical_db_path);
8735        let mut outcome = HistoricalSalvageOutcome {
8736            bundles_considered: ordered_bundles.len(),
8737            ..HistoricalSalvageOutcome::default()
8738        };
8739
8740        for bundle in ordered_bundles {
8741            if self.historical_bundle_already_imported(&bundle)? {
8742                self.clear_historical_bundle_progress(&bundle)?;
8743                continue;
8744            }
8745
8746            let source = match open_historical_bundle_for_salvage(&bundle).with_context(|| {
8747                format!(
8748                    "opening historical bundle {} for salvage",
8749                    bundle.root_path.display()
8750                )
8751            }) {
8752                Ok(source) => source,
8753                Err(err) => {
8754                    tracing::warn!(
8755                        path = %bundle.root_path.display(),
8756                        error = %err,
8757                        "skipping unreadable historical cass database bundle during salvage"
8758                    );
8759                    self.clear_historical_bundle_progress(&bundle)?;
8760                    continue;
8761                }
8762            };
8763
8764            // #247 (coding_agent_session_search-r8pcy): if a per-bundle progress
8765            // checkpoint already covers the backup's entire conversation row-id
8766            // space, the bundle was effectively fully imported but the daemon was
8767            // killed (e.g. OOM) before the completion ledger marker landed.
8768            // Re-scanning it is a pure O(n) no-op — every batch commits
8769            // imported=0 while taking 5-12 min. Detect it via the high-water
8770            // checkpoint, write the ledger marker, drop the checkpoint, and skip.
8771            if let Some(progress) = self.load_historical_bundle_progress(&bundle)? {
8772                let backup_max_conversation_id: i64 = source
8773                    .conn
8774                    .query_row_map(
8775                        "SELECT COALESCE(MAX(id), 0) FROM conversations",
8776                        fparams![],
8777                        |row| row.get_typed(0),
8778                    )
8779                    .unwrap_or(0);
8780                if backup_max_conversation_id > 0
8781                    && progress.last_completed_source_row_id >= backup_max_conversation_id
8782                {
8783                    self.record_historical_bundle_import(
8784                        &bundle,
8785                        source.method,
8786                        progress.conversations_imported,
8787                        progress.messages_imported,
8788                    )?;
8789                    self.clear_historical_bundle_progress(&bundle)?;
8790                    tracing::info!(
8791                        path = %bundle.root_path.display(),
8792                        last_completed_source_row_id = progress.last_completed_source_row_id,
8793                        backup_max_conversation_id,
8794                        conversations_imported = progress.conversations_imported,
8795                        messages_imported = progress.messages_imported,
8796                        "historical bundle already fully imported per checkpoint; marking salvaged and skipping O(n) re-scan"
8797                    );
8798                    continue;
8799                }
8800            }
8801
8802            self.import_historical_sources(&source.conn)?;
8803            let (imported_conversations, imported_messages) =
8804                self.import_historical_conversations(&bundle, source.method, &source.conn)?;
8805            self.record_historical_bundle_import(
8806                &bundle,
8807                source.method,
8808                imported_conversations,
8809                imported_messages,
8810            )?;
8811            self.clear_historical_bundle_progress(&bundle)?;
8812
8813            outcome.bundles_imported += 1;
8814            outcome.conversations_imported += imported_conversations;
8815            outcome.messages_imported += imported_messages;
8816
8817            tracing::info!(
8818                path = %bundle.root_path.display(),
8819                bytes = bundle.total_bytes,
8820                method = source.method,
8821                imported_conversations,
8822                imported_messages,
8823                "salvaged historical cass database bundle"
8824            );
8825        }
8826
8827        Ok(outcome)
8828    }
8829
8830    /// Delete a source by ID. Returns true if a row was deleted.
8831    pub fn delete_source(&self, id: &str, _cascade: bool) -> Result<bool> {
8832        if id == LOCAL_SOURCE_ID {
8833            anyhow::bail!("cannot delete the local source");
8834        }
8835        let count = self
8836            .conn
8837            .execute_compat("DELETE FROM sources WHERE id = ?1", fparams![id])?;
8838        if count > 0 {
8839            self.invalidate_conversation_source_cache(id);
8840        }
8841        Ok(count > 0)
8842    }
8843
8844    /// Insert a conversation tree (conversation + messages + snippets + FTS).
8845    pub fn insert_conversation_tree(
8846        &self,
8847        agent_id: i64,
8848        workspace_id: Option<i64>,
8849        conv: &Conversation,
8850    ) -> Result<InsertOutcome> {
8851        let normalized_conv = normalized_conversation_for_storage(conv);
8852        let conv = normalized_conv.as_ref();
8853        self.ensure_source_for_conversation(conv)?;
8854        let defer_lexical_updates = defer_storage_lexical_updates_enabled();
8855        let defer_analytics_updates = defer_analytics_updates_enabled();
8856        let conversation_key = conversation_merge_key(agent_id, conv);
8857        let mut tx = self.conn.transaction()?;
8858        let existing = franken_find_existing_conversation_with_tail_by_key(
8859            &tx,
8860            &conversation_key,
8861            Some(conv),
8862        )?;
8863        if let Some(existing) = existing {
8864            let outcome = self.franken_append_messages_with_tail_in_tx(
8865                &tx,
8866                agent_id,
8867                existing.id,
8868                conv,
8869                existing.tail_state,
8870                defer_lexical_updates,
8871                defer_analytics_updates,
8872            )?;
8873            tx.commit()?;
8874            return Ok(outcome);
8875        }
8876
8877        let conv_id = match franken_insert_conversation_or_get_existing_after_miss(
8878            &tx,
8879            agent_id,
8880            workspace_id,
8881            conv,
8882            &conversation_key,
8883        )? {
8884            ConversationInsertStatus::Inserted(conv_id) => conv_id,
8885            ConversationInsertStatus::Existing(existing_id) => {
8886                let ExistingMessageLookup {
8887                    by_idx: mut existing_messages,
8888                    replay: mut existing_replay_fingerprints,
8889                } = franken_existing_message_lookup(&tx, existing_id, &conv.messages)?;
8890                let ExistingConversationNewMessages {
8891                    messages: new_messages,
8892                    new_chars,
8893                    idx_collision_count,
8894                    first_collision_idx,
8895                } = collect_new_messages_for_existing_conversation(
8896                    existing_id,
8897                    conv,
8898                    &mut existing_messages,
8899                    &mut existing_replay_fingerprints,
8900                    "skipping replay-equivalent recovered message with shifted idx",
8901                );
8902                let (inserted_last_idx, inserted_last_created_at) =
8903                    borrowed_messages_tail_state(&new_messages);
8904                let mut inserted_indices = Vec::new();
8905                let mut fts_entries = Vec::new();
8906                let mut fts_pending_chars = 0usize;
8907                let mut _fts_inserted_total = 0usize;
8908                let inserted_message_ids =
8909                    franken_append_insert_new_messages(&tx, existing_id, &new_messages)?;
8910                for (msg_id, msg) in inserted_message_ids.into_iter().zip(new_messages) {
8911                    franken_insert_snippets(&tx, msg_id, &msg.snippets)?;
8912                    if !defer_lexical_updates {
8913                        fts_entries.push(FtsEntry::from_message(msg_id, msg, conv));
8914                        fts_pending_chars = fts_pending_chars.saturating_add(msg.content.len());
8915                        if fts_entries.len() >= FTS_ENTRY_BATCH_MAX_DOCS
8916                            || fts_pending_chars >= FTS_ENTRY_BATCH_MAX_CHARS
8917                        {
8918                            flush_pending_fts_entries(
8919                                self,
8920                                &tx,
8921                                &mut fts_entries,
8922                                &mut fts_pending_chars,
8923                                &mut _fts_inserted_total,
8924                            )?;
8925                        }
8926                    }
8927                    inserted_indices.push(msg.idx);
8928                }
8929
8930                if idx_collision_count > 0 {
8931                    tracing::warn!(
8932                        conversation_id = existing_id,
8933                        collision_count = idx_collision_count,
8934                        first_idx = first_collision_idx,
8935                        source_path = %conv.source_path.display(),
8936                        "message idx collisions encountered while merging recovered conversation; retaining canonical message variants"
8937                    );
8938                }
8939
8940                if !defer_lexical_updates {
8941                    flush_pending_fts_entries(
8942                        self,
8943                        &tx,
8944                        &mut fts_entries,
8945                        &mut fts_pending_chars,
8946                        &mut _fts_inserted_total,
8947                    )?;
8948                }
8949
8950                let conv_last_ts = conv.messages.iter().filter_map(|m| m.created_at).max();
8951                franken_update_conversation_tail_state(
8952                    &tx,
8953                    existing_id,
8954                    conv_last_ts,
8955                    inserted_last_idx,
8956                    inserted_last_created_at,
8957                )?;
8958                if let Some(lookup_key) = conversation_external_lookup_key_for_conv(agent_id, conv)
8959                {
8960                    franken_update_external_conversation_tail_lookup_key(
8961                        &tx,
8962                        &lookup_key,
8963                        conv_last_ts,
8964                        inserted_last_idx,
8965                        inserted_last_created_at,
8966                    )?;
8967                }
8968
8969                if !defer_analytics_updates && !inserted_indices.is_empty() {
8970                    franken_update_daily_stats_in_tx(
8971                        self,
8972                        &tx,
8973                        &conv.agent_slug,
8974                        &conv.source_id,
8975                        conversation_effective_started_at(conv),
8976                        StatsDelta {
8977                            session_count_delta: 0,
8978                            message_count_delta: inserted_indices.len() as i64,
8979                            total_chars_delta: new_chars,
8980                        },
8981                    )?;
8982                }
8983
8984                tx.commit()?;
8985                return Ok(InsertOutcome {
8986                    conversation_id: existing_id,
8987                    conversation_inserted: false,
8988                    inserted_indices,
8989                });
8990            }
8991        };
8992        let mut fts_entries = Vec::new();
8993        let mut fts_pending_chars = 0usize;
8994        let mut _fts_inserted_total = 0usize;
8995        let mut total_chars: i64 = 0;
8996        let mut inserted_indices = Vec::new();
8997        let mut pending_messages = HashMap::new();
8998        let mut pending_replay_fingerprints = HashSet::new();
8999        let mut idx_collision_count = 0usize;
9000        let mut first_collision_idx: Option<i64> = None;
9001        let mut new_messages = Vec::new();
9002        for msg in &conv.messages {
9003            let incoming_fingerprint = message_merge_fingerprint(msg);
9004            if let Some(existing_fingerprint) = pending_messages.get(&msg.idx) {
9005                if existing_fingerprint != &incoming_fingerprint {
9006                    idx_collision_count = idx_collision_count.saturating_add(1);
9007                    first_collision_idx.get_or_insert(msg.idx);
9008                }
9009                continue;
9010            }
9011            let incoming_replay = message_replay_fingerprint(msg);
9012            if pending_replay_fingerprints.contains(&incoming_replay) {
9013                tracing::debug!(
9014                    conversation_id = conv_id,
9015                    idx = msg.idx,
9016                    source_path = %conv.source_path.display(),
9017                    "skipping replay-equivalent duplicate message within new conversation insert"
9018                );
9019                continue;
9020            }
9021            pending_messages.insert(msg.idx, incoming_fingerprint);
9022            pending_replay_fingerprints.insert(incoming_replay);
9023            new_messages.push(msg);
9024        }
9025        let inserted_message_ids = franken_batch_insert_new_messages(&tx, conv_id, &new_messages)?;
9026        for (msg_id, msg) in inserted_message_ids.into_iter().zip(new_messages) {
9027            franken_insert_snippets(&tx, msg_id, &msg.snippets)?;
9028            if !defer_lexical_updates {
9029                fts_entries.push(FtsEntry::from_message(msg_id, msg, conv));
9030                fts_pending_chars = fts_pending_chars.saturating_add(msg.content.len());
9031                if fts_entries.len() >= FTS_ENTRY_BATCH_MAX_DOCS
9032                    || fts_pending_chars >= FTS_ENTRY_BATCH_MAX_CHARS
9033                {
9034                    flush_pending_fts_entries(
9035                        self,
9036                        &tx,
9037                        &mut fts_entries,
9038                        &mut fts_pending_chars,
9039                        &mut _fts_inserted_total,
9040                    )?;
9041                }
9042            }
9043            total_chars += msg.content.len() as i64;
9044            inserted_indices.push(msg.idx);
9045        }
9046        if idx_collision_count > 0 {
9047            tracing::warn!(
9048                conversation_id = conv_id,
9049                collision_count = idx_collision_count,
9050                first_idx = first_collision_idx,
9051                source_path = %conv.source_path.display(),
9052                "message idx collisions encountered while inserting a new conversation; retaining the first canonical variant per idx"
9053            );
9054        }
9055        if !defer_lexical_updates {
9056            flush_pending_fts_entries(
9057                self,
9058                &tx,
9059                &mut fts_entries,
9060                &mut fts_pending_chars,
9061                &mut _fts_inserted_total,
9062            )?;
9063        }
9064
9065        if !defer_analytics_updates {
9066            franken_update_daily_stats_in_tx(
9067                self,
9068                &tx,
9069                &conv.agent_slug,
9070                &conv.source_id,
9071                conversation_effective_started_at(conv),
9072                StatsDelta {
9073                    session_count_delta: 1,
9074                    message_count_delta: inserted_indices.len() as i64,
9075                    total_chars_delta: total_chars,
9076                },
9077            )?;
9078        }
9079
9080        tx.commit()?;
9081        Ok(InsertOutcome {
9082            conversation_id: conv_id,
9083            conversation_inserted: true,
9084            inserted_indices,
9085        })
9086    }
9087
9088    #[cfg(test)]
9089    fn insert_conversation_tree_with_profile(
9090        &self,
9091        agent_id: i64,
9092        workspace_id: Option<i64>,
9093        conv: &Conversation,
9094        profile: &mut InsertConversationTreePerfProfile,
9095    ) -> Result<InsertOutcome> {
9096        let total_start = Instant::now();
9097        let normalized_conv = normalized_conversation_for_storage(conv);
9098        let conv = normalized_conv.as_ref();
9099
9100        let source_start = Instant::now();
9101        self.ensure_source_for_conversation(conv)?;
9102        profile.source_duration += source_start.elapsed();
9103
9104        let defer_lexical_updates = defer_storage_lexical_updates_enabled();
9105        let defer_analytics_updates = defer_analytics_updates_enabled();
9106        let conversation_key = conversation_merge_key(agent_id, conv);
9107
9108        let tx_open_start = Instant::now();
9109        let mut tx = self.conn.transaction()?;
9110        profile.tx_open_duration += tx_open_start.elapsed();
9111
9112        let existing_lookup_start = Instant::now();
9113        let existing =
9114            franken_find_existing_conversation_by_key(&tx, &conversation_key, Some(conv))?;
9115        profile.existing_lookup_duration += existing_lookup_start.elapsed();
9116        if let Some(existing_id) = existing {
9117            return Err(anyhow!(
9118                "profile helper expects new conversation path, found existing id {existing_id}"
9119            ));
9120        }
9121
9122        let conversation_row_start = Instant::now();
9123        let conv_id = match franken_insert_conversation_or_get_existing_after_miss(
9124            &tx,
9125            agent_id,
9126            workspace_id,
9127            conv,
9128            &conversation_key,
9129        )? {
9130            ConversationInsertStatus::Inserted(conv_id) => conv_id,
9131            ConversationInsertStatus::Existing(existing_id) => {
9132                return Err(anyhow!(
9133                    "profile helper expected inserted conversation row, reused existing id {existing_id}"
9134                ));
9135            }
9136        };
9137        profile.conversation_row_duration += conversation_row_start.elapsed();
9138
9139        let mut fts_entries = Vec::new();
9140        let mut fts_pending_chars = 0usize;
9141        let mut fts_inserted_total = 0usize;
9142        let mut total_chars: i64 = 0;
9143        let mut inserted_indices = Vec::new();
9144        let mut pending_messages = HashMap::new();
9145        let mut pending_replay_fingerprints = HashSet::new();
9146        let mut idx_collision_count = 0usize;
9147        let mut first_collision_idx: Option<i64> = None;
9148        let mut new_messages = Vec::new();
9149
9150        for msg in &conv.messages {
9151            let incoming_fingerprint = message_merge_fingerprint(msg);
9152            if let Some(existing_fingerprint) = pending_messages.get(&msg.idx) {
9153                if existing_fingerprint != &incoming_fingerprint {
9154                    idx_collision_count = idx_collision_count.saturating_add(1);
9155                    first_collision_idx.get_or_insert(msg.idx);
9156                }
9157                continue;
9158            }
9159
9160            let incoming_replay = message_replay_fingerprint(msg);
9161            if pending_replay_fingerprints.contains(&incoming_replay) {
9162                tracing::debug!(
9163                    conversation_id = conv_id,
9164                    idx = msg.idx,
9165                    source_path = %conv.source_path.display(),
9166                    "skipping replay-equivalent duplicate message within profiled new conversation insert"
9167                );
9168                continue;
9169            }
9170
9171            pending_messages.insert(msg.idx, incoming_fingerprint);
9172            pending_replay_fingerprints.insert(incoming_replay);
9173            new_messages.push(msg);
9174        }
9175
9176        let message_insert_start = Instant::now();
9177        let inserted_message_ids = franken_batch_insert_new_messages_with_profile(
9178            &tx,
9179            conv_id,
9180            &new_messages,
9181            &mut profile.message_insert_breakdown,
9182        )?;
9183        profile.message_insert_duration += message_insert_start.elapsed();
9184
9185        for (msg_id, msg) in inserted_message_ids.into_iter().zip(new_messages) {
9186            let snippet_insert_start = Instant::now();
9187            franken_insert_snippets(&tx, msg_id, &msg.snippets)?;
9188            profile.snippet_insert_duration += snippet_insert_start.elapsed();
9189
9190            if !defer_lexical_updates {
9191                let fts_entry_start = Instant::now();
9192                fts_entries.push(FtsEntry::from_message(msg_id, msg, conv));
9193                fts_pending_chars = fts_pending_chars.saturating_add(msg.content.len());
9194                profile.fts_entry_duration += fts_entry_start.elapsed();
9195                if fts_entries.len() >= FTS_ENTRY_BATCH_MAX_DOCS
9196                    || fts_pending_chars >= FTS_ENTRY_BATCH_MAX_CHARS
9197                {
9198                    let fts_flush_start = Instant::now();
9199                    flush_pending_fts_entries(
9200                        self,
9201                        &tx,
9202                        &mut fts_entries,
9203                        &mut fts_pending_chars,
9204                        &mut fts_inserted_total,
9205                    )?;
9206                    profile.fts_flush_duration += fts_flush_start.elapsed();
9207                }
9208            }
9209
9210            total_chars += msg.content.len() as i64;
9211            inserted_indices.push(msg.idx);
9212        }
9213
9214        if idx_collision_count > 0 {
9215            tracing::warn!(
9216                conversation_id = conv_id,
9217                collision_count = idx_collision_count,
9218                first_idx = first_collision_idx,
9219                source_path = %conv.source_path.display(),
9220                "message idx collisions encountered while profiling a new conversation insert; retaining the first canonical variant per idx"
9221            );
9222        }
9223
9224        if !defer_lexical_updates {
9225            let fts_flush_start = Instant::now();
9226            flush_pending_fts_entries(
9227                self,
9228                &tx,
9229                &mut fts_entries,
9230                &mut fts_pending_chars,
9231                &mut fts_inserted_total,
9232            )?;
9233            profile.fts_flush_duration += fts_flush_start.elapsed();
9234        }
9235
9236        if !defer_analytics_updates {
9237            let analytics_start = Instant::now();
9238            franken_update_daily_stats_in_tx(
9239                self,
9240                &tx,
9241                &conv.agent_slug,
9242                &conv.source_id,
9243                conversation_effective_started_at(conv),
9244                StatsDelta {
9245                    session_count_delta: 1,
9246                    message_count_delta: inserted_indices.len() as i64,
9247                    total_chars_delta: total_chars,
9248                },
9249            )?;
9250            profile.analytics_duration += analytics_start.elapsed();
9251        }
9252
9253        let commit_start = Instant::now();
9254        tx.commit()?;
9255        profile.commit_duration += commit_start.elapsed();
9256        profile.invocations += 1;
9257        profile.messages += conv.messages.len();
9258        profile.inserted_messages += inserted_indices.len();
9259        profile.total_duration += total_start.elapsed();
9260
9261        Ok(InsertOutcome {
9262            conversation_id: conv_id,
9263            conversation_inserted: true,
9264            inserted_indices,
9265        })
9266    }
9267
9268    #[cfg(test)]
9269    fn append_existing_conversation_with_profile(
9270        &self,
9271        agent_id: i64,
9272        _workspace_id: Option<i64>,
9273        conv: &Conversation,
9274        profile: &mut InsertConversationTreePerfProfile,
9275    ) -> Result<InsertOutcome> {
9276        let total_start = Instant::now();
9277        let normalized_conv = normalized_conversation_for_storage(conv);
9278        let conv = normalized_conv.as_ref();
9279
9280        let source_start = Instant::now();
9281        self.ensure_source_for_conversation(conv)?;
9282        profile.source_duration += source_start.elapsed();
9283
9284        let defer_lexical_updates = defer_storage_lexical_updates_enabled();
9285        let defer_analytics_updates = defer_analytics_updates_enabled();
9286        let conversation_key = conversation_merge_key(agent_id, conv);
9287
9288        let tx_open_start = Instant::now();
9289        let mut tx = self.conn.transaction()?;
9290        profile.tx_open_duration += tx_open_start.elapsed();
9291
9292        let existing_lookup_start = Instant::now();
9293        let existing = franken_find_existing_conversation_with_tail_by_key(
9294            &tx,
9295            &conversation_key,
9296            Some(conv),
9297        )?;
9298        profile.existing_lookup_duration += existing_lookup_start.elapsed();
9299        let existing = existing.ok_or_else(|| {
9300            anyhow!("append profile helper expects existing conversation for {conversation_key:?}")
9301        })?;
9302        let existing_id = existing.id;
9303
9304        let existing_idx_lookup_start = Instant::now();
9305        let append_tail_state = existing.tail_state;
9306        let append_tail_ended_at = append_tail_state.and_then(|state| state.ended_at);
9307        let existing_plan = append_tail_state.as_ref().and_then(|state| {
9308            collect_append_only_tail_messages(
9309                conv,
9310                state.last_message_idx,
9311                state.last_message_created_at,
9312            )
9313        });
9314        let used_append_tail_plan = existing_plan.is_some();
9315        profile.existing_idx_lookup_duration += existing_idx_lookup_start.elapsed();
9316
9317        let dedupe_filter_start = Instant::now();
9318        let ExistingConversationNewMessages {
9319            messages: new_messages,
9320            new_chars,
9321            idx_collision_count,
9322            first_collision_idx,
9323        } = if let Some(existing_plan) = existing_plan {
9324            existing_plan
9325        } else {
9326            let ExistingMessageLookup {
9327                by_idx: mut existing_messages,
9328                replay: mut existing_replay_fingerprints,
9329            } = franken_existing_message_lookup(&tx, existing_id, &conv.messages)?;
9330            collect_new_messages_for_existing_conversation(
9331                existing_id,
9332                conv,
9333                &mut existing_messages,
9334                &mut existing_replay_fingerprints,
9335                "skipping replay-equivalent profiled append message with shifted idx",
9336            )
9337        };
9338        profile.dedupe_filter_duration += dedupe_filter_start.elapsed();
9339
9340        let mut inserted_indices = Vec::new();
9341        let mut fts_entries = Vec::new();
9342        let mut fts_pending_chars = 0usize;
9343        let mut fts_inserted_total = 0usize;
9344        let (inserted_last_idx, inserted_last_created_at) =
9345            borrowed_messages_tail_state(&new_messages);
9346
9347        let message_insert_start = Instant::now();
9348        let inserted_message_ids = franken_append_insert_new_messages_with_profile(
9349            &tx,
9350            existing_id,
9351            &new_messages,
9352            &mut profile.message_insert_breakdown,
9353        )?;
9354        profile.message_insert_duration += message_insert_start.elapsed();
9355
9356        for (msg_id, msg) in inserted_message_ids.into_iter().zip(new_messages) {
9357            let snippet_insert_start = Instant::now();
9358            franken_insert_snippets(&tx, msg_id, &msg.snippets)?;
9359            profile.snippet_insert_duration += snippet_insert_start.elapsed();
9360
9361            if !defer_lexical_updates {
9362                let fts_entry_start = Instant::now();
9363                fts_entries.push(FtsEntry::from_message(msg_id, msg, conv));
9364                fts_pending_chars = fts_pending_chars.saturating_add(msg.content.len());
9365                profile.fts_entry_duration += fts_entry_start.elapsed();
9366                if fts_entries.len() >= FTS_ENTRY_BATCH_MAX_DOCS
9367                    || fts_pending_chars >= FTS_ENTRY_BATCH_MAX_CHARS
9368                {
9369                    let fts_flush_start = Instant::now();
9370                    flush_pending_fts_entries(
9371                        self,
9372                        &tx,
9373                        &mut fts_entries,
9374                        &mut fts_pending_chars,
9375                        &mut fts_inserted_total,
9376                    )?;
9377                    profile.fts_flush_duration += fts_flush_start.elapsed();
9378                }
9379            }
9380
9381            inserted_indices.push(msg.idx);
9382        }
9383
9384        if idx_collision_count > 0 {
9385            tracing::warn!(
9386                conversation_id = existing_id,
9387                collision_count = idx_collision_count,
9388                first_idx = first_collision_idx,
9389                source_path = %conv.source_path.display(),
9390                "message idx collisions encountered while profiling append merge; retaining canonical message variants"
9391            );
9392        }
9393
9394        if !defer_lexical_updates {
9395            let fts_flush_start = Instant::now();
9396            flush_pending_fts_entries(
9397                self,
9398                &tx,
9399                &mut fts_entries,
9400                &mut fts_pending_chars,
9401                &mut fts_inserted_total,
9402            )?;
9403            profile.fts_flush_duration += fts_flush_start.elapsed();
9404        }
9405
9406        let conversation_row_start = Instant::now();
9407        let mut exact_append_tail_set = false;
9408        if used_append_tail_plan {
9409            if let (Some(last_message_idx), Some(last_message_created_at)) =
9410                (inserted_last_idx, inserted_last_created_at)
9411            {
9412                if append_tail_ended_at.is_none_or(|ended_at| ended_at <= last_message_created_at) {
9413                    franken_set_conversation_tail_state_after_append(
9414                        &tx,
9415                        existing_id,
9416                        last_message_created_at,
9417                        last_message_idx,
9418                        last_message_created_at,
9419                    )?;
9420                    exact_append_tail_set = true;
9421                } else {
9422                    franken_update_conversation_tail_state(
9423                        &tx,
9424                        existing_id,
9425                        Some(last_message_created_at),
9426                        inserted_last_idx,
9427                        inserted_last_created_at,
9428                    )?;
9429                }
9430            }
9431        } else {
9432            let conv_last_ts = conv.messages.iter().filter_map(|m| m.created_at).max();
9433            franken_update_conversation_tail_state(
9434                &tx,
9435                existing_id,
9436                conv_last_ts,
9437                inserted_last_idx,
9438                inserted_last_created_at,
9439            )?;
9440        }
9441        franken_update_external_conversation_tail_after_append(
9442            &tx,
9443            agent_id,
9444            conv,
9445            used_append_tail_plan,
9446            exact_append_tail_set,
9447            inserted_last_idx,
9448            inserted_last_created_at,
9449        )?;
9450        profile.conversation_row_duration += conversation_row_start.elapsed();
9451
9452        if !defer_analytics_updates && !inserted_indices.is_empty() {
9453            let analytics_start = Instant::now();
9454            franken_update_daily_stats_in_tx(
9455                self,
9456                &tx,
9457                &conv.agent_slug,
9458                &conv.source_id,
9459                conversation_effective_started_at(conv),
9460                StatsDelta {
9461                    session_count_delta: 0,
9462                    message_count_delta: inserted_indices.len() as i64,
9463                    total_chars_delta: new_chars,
9464                },
9465            )?;
9466            profile.analytics_duration += analytics_start.elapsed();
9467        }
9468
9469        let commit_start = Instant::now();
9470        tx.commit()?;
9471        profile.commit_duration += commit_start.elapsed();
9472        profile.invocations += 1;
9473        profile.messages += conv.messages.len();
9474        profile.inserted_messages += inserted_indices.len();
9475        profile.total_duration += total_start.elapsed();
9476
9477        Ok(InsertOutcome {
9478            conversation_id: existing_id,
9479            conversation_inserted: false,
9480            inserted_indices,
9481        })
9482    }
9483
    /// Append new messages to an existing conversation within an active transaction.
    ///
    /// The caller owns `tx`; this method never commits or rolls it back.
    ///
    /// Work performed, in order:
    /// 1. Decide which messages of `conv` are new. If `append_tail_state` is
    ///    present and `collect_append_only_tail_messages` yields a plan, that
    ///    plan is used as-is; otherwise the existing rows are looked up and
    ///    `collect_new_messages_for_existing_conversation` filters `conv`.
    /// 2. Insert the new message rows and their snippets.
    /// 3. Unless `defer_lexical_updates` is set, queue and flush FTS entries in
    ///    batches bounded by `FTS_ENTRY_BATCH_MAX_DOCS` / `FTS_ENTRY_BATCH_MAX_CHARS`.
    /// 4. Update the conversation tail state (and the external tail via
    ///    `franken_update_external_conversation_tail_after_append`).
    /// 5. Unless `defer_analytics_updates` is set, and only if something was
    ///    inserted, apply a daily-stats delta.
    ///
    /// Returns an [`InsertOutcome`] with `conversation_inserted: false` and the
    /// `idx` of every message inserted by this call.
    ///
    /// # Errors
    /// Propagates the first error returned by any of the helper calls.
    #[allow(clippy::too_many_arguments)]
    fn franken_append_messages_with_tail_in_tx(
        &self,
        tx: &FrankenTransaction<'_>,
        agent_id: i64,
        conversation_id: i64,
        conv: &Conversation,
        append_tail_state: Option<ExistingConversationTailState>,
        defer_lexical_updates: bool,
        defer_analytics_updates: bool,
    ) -> Result<InsertOutcome> {
        // `ended_at` as recorded in the supplied tail state, if any.
        let append_tail_ended_at = append_tail_state.and_then(|state| state.ended_at);
        // Plan built from the stored tail position alone; `None` means it could
        // not be used and the lookup-based path below runs instead.
        let append_plan = append_tail_state.as_ref().and_then(|state| {
            collect_append_only_tail_messages(
                conv,
                state.last_message_idx,
                state.last_message_created_at,
            )
        });
        let used_append_tail_plan = append_plan.is_some();
        let ExistingConversationNewMessages {
            messages: new_messages,
            new_chars,
            idx_collision_count,
            first_collision_idx,
        } = if let Some(append_plan) = append_plan {
            append_plan
        } else {
            // Fallback: load what is already stored for this conversation and
            // filter `conv.messages` against it.
            let ExistingMessageLookup {
                by_idx: mut existing_messages,
                replay: mut existing_replay_fingerprints,
            } = franken_existing_message_lookup(tx, conversation_id, &conv.messages)?;
            collect_new_messages_for_existing_conversation(
                conversation_id,
                conv,
                &mut existing_messages,
                &mut existing_replay_fingerprints,
                "skipping replay-equivalent recovered message with shifted idx",
            )
        };

        let mut inserted_indices = Vec::new();
        let mut fts_entries = Vec::new();
        let mut fts_pending_chars = 0usize;
        // Running total maintained by the flush helper; not read in this method.
        let mut _fts_inserted_total = 0usize;
        // Captured before `new_messages` is consumed by the loop below.
        let (inserted_last_idx, inserted_last_created_at) =
            borrowed_messages_tail_state(&new_messages);
        let inserted_message_ids =
            franken_append_insert_new_messages(tx, conversation_id, &new_messages)?;
        // Pair each returned row id with the message it was inserted for.
        for (msg_id, msg) in inserted_message_ids.into_iter().zip(new_messages) {
            franken_insert_snippets(tx, msg_id, &msg.snippets)?;
            if !defer_lexical_updates {
                fts_entries.push(FtsEntry::from_message(msg_id, msg, conv));
                fts_pending_chars = fts_pending_chars.saturating_add(msg.content.len());
                // Flush early once either the document or character budget is hit.
                if fts_entries.len() >= FTS_ENTRY_BATCH_MAX_DOCS
                    || fts_pending_chars >= FTS_ENTRY_BATCH_MAX_CHARS
                {
                    flush_pending_fts_entries(
                        self,
                        tx,
                        &mut fts_entries,
                        &mut fts_pending_chars,
                        &mut _fts_inserted_total,
                    )?;
                }
            }
            inserted_indices.push(msg.idx);
        }

        if idx_collision_count > 0 {
            tracing::warn!(
                conversation_id,
                collision_count = idx_collision_count,
                first_idx = first_collision_idx,
                source_path = %conv.source_path.display(),
                "message idx collisions encountered while appending to an existing conversation; retaining canonical message variants"
            );
        }

        // Final flush for whatever remained below the batch thresholds.
        if !defer_lexical_updates {
            flush_pending_fts_entries(
                self,
                tx,
                &mut fts_entries,
                &mut fts_pending_chars,
                &mut _fts_inserted_total,
            )?;
        }

        // Set only when the tail state was written through
        // `franken_set_conversation_tail_state_after_append`.
        let mut exact_append_tail_set = false;
        if used_append_tail_plan {
            // With the append-only plan the tail is touched only when the
            // inserted batch has both a last idx and a last timestamp.
            if let (Some(last_message_idx), Some(last_message_created_at)) =
                (inserted_last_idx, inserted_last_created_at)
            {
                // Stored `ended_at` is absent or not later than the newest
                // inserted message: write the tail directly from that message.
                if append_tail_ended_at.is_none_or(|ended_at| ended_at <= last_message_created_at) {
                    franken_set_conversation_tail_state_after_append(
                        tx,
                        conversation_id,
                        last_message_created_at,
                        last_message_idx,
                        last_message_created_at,
                    )?;
                    exact_append_tail_set = true;
                } else {
                    // Stored `ended_at` is later than the newest inserted
                    // message: go through the general update helper instead.
                    franken_update_conversation_tail_state(
                        tx,
                        conversation_id,
                        Some(last_message_created_at),
                        inserted_last_idx,
                        inserted_last_created_at,
                    )?;
                }
            }
        } else {
            // Lookup-based path: use the latest timestamp across all of
            // `conv.messages`, not just the ones inserted here.
            let conv_last_ts = conv.messages.iter().filter_map(|m| m.created_at).max();
            franken_update_conversation_tail_state(
                tx,
                conversation_id,
                conv_last_ts,
                inserted_last_idx,
                inserted_last_created_at,
            )?;
        }
        franken_update_external_conversation_tail_after_append(
            tx,
            agent_id,
            conv,
            used_append_tail_plan,
            exact_append_tail_set,
            inserted_last_idx,
            inserted_last_created_at,
        )?;

        // Appending never adds a session, so only message and char counts move.
        if !defer_analytics_updates && !inserted_indices.is_empty() {
            let message_count = inserted_indices.len() as i64;
            franken_update_daily_stats_in_tx(
                self,
                tx,
                &conv.agent_slug,
                &conv.source_id,
                conversation_effective_started_at(conv),
                StatsDelta {
                    session_count_delta: 0,
                    message_count_delta: message_count,
                    total_chars_delta: new_chars,
                },
            )?;
        }

        Ok(InsertOutcome {
            conversation_id,
            conversation_inserted: false,
            inserted_indices,
        })
    }
9640
9641    /// Rebuild the FTS5 index from scratch (chunked to avoid OOM on large databases, #110).
9642    pub fn rebuild_fts(&self) -> Result<()> {
9643        self.rebuild_fts_via_frankensqlite().map(|_| ())
9644    }
9645
9646    /// Best-effort repair for the derived SQLite FTS fallback index.
9647    ///
9648    /// The canonical archive and Tantivy index remain authoritative, so callers
9649    /// should invoke this from maintenance paths rather than ordinary opens.
9650    pub(crate) fn ensure_search_fallback_fts_consistency(&self) -> Result<FtsConsistencyRepair> {
9651        self.ensure_fts_consistency_via_frankensqlite()
9652    }
9653
9654    pub(crate) fn validate_fts_messages_integrity(&self) -> Result<()> {
9655        validate_fts_messages_integrity_for_connection(&self.conn)
9656    }
9657
9658    pub(crate) fn fallback_fts_is_known_healthy_for_archive_fingerprint(
9659        &self,
9660        archive_fingerprint: &str,
9661    ) -> Result<bool> {
9662        Ok(
9663            self.read_fts_franken_rebuild_generation()? == Some(FTS_FRANKEN_REBUILD_GENERATION)
9664                && self
9665                    .read_fts_franken_rebuild_archive_fingerprint()?
9666                    .as_deref()
9667                    == Some(archive_fingerprint),
9668        )
9669    }
9670
9671    pub(crate) fn record_search_fallback_fts_archive_fingerprint(
9672        &self,
9673        archive_fingerprint: &str,
9674    ) -> Result<()> {
9675        self.conn
9676            .execute_compat(
9677                "INSERT OR REPLACE INTO meta(key, value) VALUES(?1, ?2)",
9678                fparams![
9679                    FTS_FRANKEN_REBUILD_FINGERPRINT_META_KEY,
9680                    archive_fingerprint.to_string()
9681                ],
9682            )
9683            .with_context(|| "recording frankensqlite FTS archive fingerprint")?;
9684        Ok(())
9685    }
9686
9687    pub(crate) fn daily_stats_is_known_healthy_for_archive_fingerprint(
9688        &self,
9689        archive_fingerprint: &str,
9690    ) -> Result<bool> {
9691        Ok(
9692            self.read_daily_stats_health_generation()? == Some(DAILY_STATS_HEALTH_GENERATION)
9693                && self.read_daily_stats_archive_fingerprint()?.as_deref()
9694                    == Some(archive_fingerprint),
9695        )
9696    }
9697
9698    pub(crate) fn record_daily_stats_archive_fingerprint(
9699        &self,
9700        archive_fingerprint: &str,
9701    ) -> Result<()> {
9702        self.conn
9703            .execute_compat(
9704                "INSERT OR REPLACE INTO meta(key, value) VALUES(?1, ?2)",
9705                fparams![
9706                    DAILY_STATS_HEALTH_GENERATION_META_KEY,
9707                    DAILY_STATS_HEALTH_GENERATION.to_string()
9708                ],
9709            )
9710            .with_context(|| "recording daily_stats health generation")?;
9711        self.conn
9712            .execute_compat(
9713                "INSERT OR REPLACE INTO meta(key, value) VALUES(?1, ?2)",
9714                fparams![DAILY_STATS_HEALTH_META_KEY, archive_fingerprint.to_string()],
9715            )
9716            .with_context(|| "recording daily_stats archive fingerprint")?;
9717        Ok(())
9718    }
9719
9720    fn read_fts_franken_rebuild_generation(&self) -> Result<Option<i64>> {
9721        let value: Option<String> = self
9722            .conn
9723            .query_row_map(
9724                "SELECT value FROM meta WHERE key = ?1",
9725                fparams![FTS_FRANKEN_REBUILD_META_KEY],
9726                |row| row.get_typed(0),
9727            )
9728            .optional()?;
9729        Ok(value.and_then(|v| v.parse::<i64>().ok()))
9730    }
9731
9732    fn read_fts_franken_rebuild_archive_fingerprint(&self) -> Result<Option<String>> {
9733        Ok(self
9734            .conn
9735            .query_row_map(
9736                "SELECT value FROM meta WHERE key = ?1",
9737                fparams![FTS_FRANKEN_REBUILD_FINGERPRINT_META_KEY],
9738                |row| row.get_typed(0),
9739            )
9740            .optional()?)
9741    }
9742
9743    fn read_daily_stats_health_generation(&self) -> Result<Option<i64>> {
9744        let value: Option<String> = self
9745            .conn
9746            .query_row_map(
9747                "SELECT value FROM meta WHERE key = ?1",
9748                fparams![DAILY_STATS_HEALTH_GENERATION_META_KEY],
9749                |row| row.get_typed(0),
9750            )
9751            .optional()?;
9752        Ok(value.and_then(|value| value.parse::<i64>().ok()))
9753    }
9754
9755    fn read_daily_stats_archive_fingerprint(&self) -> Result<Option<String>> {
9756        Ok(self
9757            .conn
9758            .query_row_map(
9759                "SELECT value FROM meta WHERE key = ?1",
9760                fparams![DAILY_STATS_HEALTH_META_KEY],
9761                |row| row.get_typed(0),
9762            )
9763            .optional()?)
9764    }
9765
9766    fn record_fts_franken_rebuild_generation(&self) -> Result<()> {
9767        self.conn
9768            .execute_compat(
9769                "INSERT OR REPLACE INTO meta(key, value) VALUES(?1, ?2)",
9770                fparams![
9771                    FTS_FRANKEN_REBUILD_META_KEY,
9772                    FTS_FRANKEN_REBUILD_GENERATION.to_string()
9773                ],
9774            )
9775            .with_context(|| "recording frankensqlite FTS rebuild generation")?;
9776        Ok(())
9777    }
9778
9779    fn ensure_fts_consistency_via_frankensqlite(&self) -> Result<FtsConsistencyRepair> {
9780        if self.read_fts_franken_rebuild_generation()? != Some(FTS_FRANKEN_REBUILD_GENERATION) {
9781            // Before triggering an expensive full rebuild, probe whether
9782            // fts_messages is already populated and consistent.  On large
9783            // databases the rebuild can take hours and OOM — skip it when
9784            // the only thing missing is the generation marker (#184).
9785            let fts_already_healthy = (|| -> Result<bool> {
9786                let fts_exists: i64 = self.conn.query_row_map(
9787                    "SELECT COUNT(*) FROM sqlite_master WHERE name = 'fts_messages'",
9788                    fparams![],
9789                    |row| row.get_typed(0),
9790                )?;
9791                if fts_exists != 1 {
9792                    return Ok(false);
9793                }
9794                let total: i64 = self.conn.query_row_map(
9795                    "SELECT COUNT(*) FROM messages",
9796                    fparams![],
9797                    |row| row.get_typed(0),
9798                )?;
9799                if total == 0 {
9800                    return Ok(false);
9801                }
9802                let indexed: i64 = self.conn.query_row_map(
9803                    "SELECT COUNT(*) FROM fts_messages",
9804                    fparams![],
9805                    |row| row.get_typed(0),
9806                )?;
9807                // Consider healthy if >=90% of messages are indexed
9808                Ok(indexed > 0 && indexed * 100 >= total * 90)
9809            })()
9810            .unwrap_or(false);
9811
9812            if fts_already_healthy {
9813                tracing::info!(
9814                    target: "cass::fts_rebuild",
9815                    "FTS already populated and consistent; setting generation marker without rebuild"
9816                );
9817                self.record_fts_franken_rebuild_generation()?;
9818                self.set_fts_messages_present_cache(true);
9819            } else {
9820                let inserted_rows = self.rebuild_fts_via_frankensqlite()?;
9821                self.record_fts_franken_rebuild_generation()?;
9822                return Ok(FtsConsistencyRepair::Rebuilt { inserted_rows });
9823            }
9824        }
9825
9826        let inspection = (|| -> Result<(i64, bool)> {
9827            let fts_schema_rows = self.conn.query_row_map(
9828                "SELECT COUNT(*) FROM sqlite_master WHERE name = 'fts_messages'",
9829                fparams![],
9830                |row| row.get_typed::<i64>(0),
9831            )?;
9832            let fts_queryable = fts_schema_rows == 1
9833                && self.conn.query("SELECT COUNT(*) FROM fts_messages").is_ok();
9834            Ok((fts_schema_rows, fts_queryable))
9835        })();
9836
9837        let (fts_schema_rows, fts_queryable) = match inspection {
9838            Ok(result) => result,
9839            Err(err) => {
9840                tracing::warn!(
9841                    error = %err,
9842                    "frankensqlite FTS consistency probe failed; rebuilding authoritative FTS"
9843                );
9844                let inserted_rows = self.rebuild_fts_via_frankensqlite()?;
9845                self.record_fts_franken_rebuild_generation()?;
9846                return Ok(FtsConsistencyRepair::Rebuilt { inserted_rows });
9847            }
9848        };
9849
9850        if fts_schema_rows != 1 || !fts_queryable {
9851            let inserted_rows = self.rebuild_fts_via_frankensqlite()?;
9852            self.record_fts_franken_rebuild_generation()?;
9853            return Ok(FtsConsistencyRepair::Rebuilt { inserted_rows });
9854        }
9855
9856        let total_messages =
9857            self.conn
9858                .query_row_map("SELECT COUNT(*) FROM messages", fparams![], |row| {
9859                    row.get_typed::<i64>(0)
9860                })?;
9861        let indexed_messages =
9862            self.conn
9863                .query_row_map("SELECT COUNT(*) FROM fts_messages", fparams![], |row| {
9864                    row.get_typed::<i64>(0)
9865                })?;
9866
9867        if indexed_messages == total_messages {
9868            self.set_fts_messages_present_cache(true);
9869            return Ok(FtsConsistencyRepair::AlreadyHealthy {
9870                rows: usize::try_from(total_messages.max(0)).unwrap_or(usize::MAX),
9871            });
9872        }
9873
9874        if indexed_messages > total_messages {
9875            let inserted_rows = self.rebuild_fts_via_frankensqlite()?;
9876            self.record_fts_franken_rebuild_generation()?;
9877            return Ok(FtsConsistencyRepair::Rebuilt { inserted_rows });
9878        }
9879
9880        let inserted_rows = self
9881            .stream_fts_rows_via_frankensqlite(true)
9882            .with_context(|| "incrementally repairing missing FTS rows via frankensqlite")?;
9883        let repaired_rows =
9884            self.conn
9885                .query_row_map("SELECT COUNT(*) FROM fts_messages", fparams![], |row| {
9886                    row.get_typed::<i64>(0)
9887                })?;
9888        if repaired_rows == total_messages {
9889            self.set_fts_messages_present_cache(true);
9890            return Ok(FtsConsistencyRepair::IncrementalCatchUp {
9891                inserted_rows,
9892                total_rows: usize::try_from(repaired_rows.max(0)).unwrap_or(usize::MAX),
9893            });
9894        }
9895
9896        // The incremental catch-up found nothing to insert, yet the gap
9897        // between total_messages (all rows, including orphans) and
9898        // indexed_messages (only rows with valid conversation_id, since the
9899        // FTS INSERT inner-joins on conversations) remains.  A full rebuild
9900        // cannot close this gap either — the orphaned messages will be
9901        // excluded again — so falling through to one would just re-do ~5 min
9902        // of work on every startup.  Accept the current state.
9903        if inserted_rows == 0 {
9904            tracing::debug!(
9905                target: "cass::fts_rebuild",
9906                indexed_messages = repaired_rows,
9907                total_messages,
9908                un_indexable_gap = total_messages.saturating_sub(repaired_rows),
9909                "FTS catch-up inserted 0 rows; remaining gap is un-indexable (likely orphaned messages with dangling conversation_id)"
9910            );
9911            self.set_fts_messages_present_cache(true);
9912            return Ok(FtsConsistencyRepair::IncrementalCatchUp {
9913                inserted_rows: 0,
9914                total_rows: usize::try_from(repaired_rows.max(0)).unwrap_or(usize::MAX),
9915            });
9916        }
9917
9918        // Incremental made progress but didn't fully close the gap — something
9919        // is genuinely inconsistent, so do a full rebuild.
9920        let inserted_rows = self.rebuild_fts_via_frankensqlite()?;
9921        self.record_fts_franken_rebuild_generation()?;
9922        Ok(FtsConsistencyRepair::Rebuilt { inserted_rows })
9923    }
9924
9925    pub(crate) fn rebuild_fts_via_frankensqlite(&self) -> Result<usize> {
9926        self.invalidate_fts_messages_present_cache();
9927        self.conn
9928            .execute("DROP TABLE IF EXISTS fts_messages;")
9929            .with_context(|| "dropping derived fts_messages before frankensqlite rebuild")?;
9930        self.conn
9931            .execute_compat(FTS5_REGISTER_SQL, fparams![])
9932            .with_context(|| "creating derived fts_messages via frankensqlite rebuild")?;
9933        self.set_fts_messages_present_cache(true);
9934
9935        self.stream_fts_rows_via_frankensqlite(false)
9936    }
9937
    /// Walk the `messages` table in rowid order and insert FTS entries for it.
    ///
    /// Messages are fetched in pages of `fts_rebuild_batch_size()` rows (at
    /// least 1) through `fetch_fts_rebuild_message_rows`, joined in memory with
    /// preloaded conversation / agent / workspace maps, and written in
    /// sub-batches bounded by `FTS_ENTRY_BATCH_MAX_DOCS` and
    /// `FTS_ENTRY_BATCH_MAX_CHARS`.
    ///
    /// When `missing_only` is true, the rowids currently in `fts_messages` are
    /// loaded up front and any message whose id is in that set is skipped.
    /// Messages whose `conversation_id` has no entry in the conversation map
    /// are skipped as orphans in either mode.
    ///
    /// Returns the total reported by `franken_batch_insert_fts_on_connection`
    /// across all sub-batches.
    fn stream_fts_rows_via_frankensqlite(&self, missing_only: bool) -> Result<usize> {
        let batch_size = fts_rebuild_batch_size().max(1);
        let batch_limit = i64::try_from(batch_size).unwrap_or(i64::MAX);
        let mut total_inserted: usize = 0;
        let mut total_skipped_orphans: usize = 0;
        let mut total_skipped_existing: usize = 0;
        // Pagination cursor: each page asks for rows with rowid above this.
        let mut last_rowid: i64 = 0;
        // Lookup tables loaded once so the per-message work needs no joins.
        let conversation_by_id = self.load_fts_conversation_projection_map()?;
        let agent_slug_by_id = self.load_fts_agent_slug_map()?;
        let workspace_path_by_id = self.load_fts_workspace_path_map()?;
        let existing_fts_rowids = if missing_only {
            Some(self.load_fts_message_rowid_set()?)
        } else {
            None
        };
        // Pending sub-batch and its accumulated content size in bytes.
        let mut entries = Vec::new();
        let mut pending_chars = 0usize;

        loop {
            let rows = self.fetch_fts_rebuild_message_rows(last_rowid, batch_limit)?;
            let fetched_count = rows.len();
            if fetched_count == 0 {
                break;
            }

            // Snapshots used to report per-page deltas in the log line below.
            let inserted_before_batch = total_inserted;
            let skipped_before_batch = total_skipped_orphans;
            let existing_before_batch = total_skipped_existing;

            for row in rows {
                // Advance the cursor even for rows that end up skipped.
                last_rowid = row.rowid;
                if existing_fts_rowids
                    .as_ref()
                    .is_some_and(|rowids| rowids.contains(&row.message_id))
                {
                    total_skipped_existing = total_skipped_existing.saturating_add(1);
                    continue;
                }
                let Some(conversation) = conversation_by_id.get(&row.conversation_id) else {
                    total_skipped_orphans = total_skipped_orphans.saturating_add(1);
                    continue;
                };
                // A missing or empty agent slug is indexed as "unknown".
                let agent = conversation
                    .agent_id
                    .and_then(|agent_id| agent_slug_by_id.get(&agent_id))
                    .filter(|slug| !slug.is_empty())
                    .cloned()
                    .unwrap_or_else(|| "unknown".to_string());
                // A missing workspace falls back to the type's default value.
                let workspace = conversation
                    .workspace_id
                    .and_then(|workspace_id| workspace_path_by_id.get(&workspace_id))
                    .cloned()
                    .unwrap_or_default();
                pending_chars = pending_chars.saturating_add(row.content.len());
                entries.push(FtsEntry {
                    content: row.content,
                    title: conversation.title.clone(),
                    agent,
                    workspace,
                    source_path: conversation.source_path.clone(),
                    created_at: row.created_at,
                    message_id: row.message_id,
                });
                // Flush as soon as either the document or size budget is hit.
                if entries.len() >= FTS_ENTRY_BATCH_MAX_DOCS
                    || pending_chars >= FTS_ENTRY_BATCH_MAX_CHARS
                {
                    total_inserted = total_inserted.saturating_add(
                        franken_batch_insert_fts_on_connection(&self.conn, &entries)?,
                    );
                    entries.clear();
                    pending_chars = 0;
                }
            }

            // Flush the remainder so no entries carry over to the next page.
            if !entries.is_empty() {
                total_inserted = total_inserted.saturating_add(
                    franken_batch_insert_fts_on_connection(&self.conn, &entries)?,
                );
                entries.clear();
                pending_chars = 0;
            }

            tracing::debug!(
                target: "cass::fts_rebuild",
                batch_rows = fetched_count,
                batch_inserted = total_inserted.saturating_sub(inserted_before_batch),
                batch_skipped_orphans = total_skipped_orphans.saturating_sub(skipped_before_batch),
                batch_skipped_existing = total_skipped_existing.saturating_sub(existing_before_batch),
                total_inserted,
                total_skipped_orphans,
                total_skipped_existing,
                last_rowid,
                missing_only,
                "FTS streaming maintenance batch complete"
            );

            // A short page means the table is exhausted; skip the extra query.
            if fetched_count < batch_size {
                break;
            }
        }

        Ok(total_inserted)
    }
10041
10042    fn fetch_fts_rebuild_message_rows(
10043        &self,
10044        last_rowid: i64,
10045        batch_limit: i64,
10046    ) -> Result<Vec<FtsRebuildMessageRow>> {
10047        self.conn
10048            .query_map_collect(
10049                "SELECT m.rowid, m.id, m.conversation_id, m.content, m.created_at
10050                 FROM messages m
10051                 WHERE m.rowid > ?1
10052                 ORDER BY m.rowid
10053                 LIMIT ?2",
10054                fparams![last_rowid, batch_limit],
10055                |row| {
10056                    Ok(FtsRebuildMessageRow {
10057                        rowid: row.get_typed(0)?,
10058                        message_id: row.get_typed(1)?,
10059                        conversation_id: row.get_typed(2)?,
10060                        content: row.get_typed::<Option<String>>(3)?.unwrap_or_default(),
10061                        created_at: row.get_typed(4)?,
10062                    })
10063                },
10064            )
10065            .with_context(|| format!("fetching FTS maintenance messages after rowid {last_rowid}"))
10066    }
10067
10068    fn load_fts_message_rowid_set(&self) -> Result<HashSet<i64>> {
10069        let rows: Vec<i64> = self
10070            .conn
10071            .query_map_collect("SELECT rowid FROM fts_messages", fparams![], |row| {
10072                row.get_typed(0)
10073            })
10074            .with_context(|| "loading existing FTS message rowids")?;
10075        Ok(rows.into_iter().collect())
10076    }
10077
10078    fn load_fts_conversation_projection_map(
10079        &self,
10080    ) -> Result<HashMap<i64, FtsConversationProjection>> {
10081        let rows: Vec<(i64, FtsConversationProjection)> = self
10082            .conn
10083            .query_map_collect(
10084                "SELECT id, title, agent_id, workspace_id, source_path
10085                 FROM conversations",
10086                fparams![],
10087                |row| {
10088                    Ok((
10089                        row.get_typed(0)?,
10090                        FtsConversationProjection {
10091                            title: row.get_typed::<Option<String>>(1)?.unwrap_or_default(),
10092                            agent_id: row.get_typed(2)?,
10093                            workspace_id: row.get_typed(3)?,
10094                            source_path: row.get_typed::<Option<String>>(4)?.unwrap_or_default(),
10095                        },
10096                    ))
10097                },
10098            )
10099            .with_context(|| "loading FTS conversation projection map")?;
10100        Ok(rows.into_iter().collect())
10101    }
10102
10103    fn load_fts_agent_slug_map(&self) -> Result<HashMap<i64, String>> {
10104        let rows: Vec<(i64, String)> = self
10105            .conn
10106            .query_map_collect("SELECT id, slug FROM agents", fparams![], |row| {
10107                Ok((
10108                    row.get_typed(0)?,
10109                    row.get_typed::<Option<String>>(1)?
10110                        .unwrap_or_else(|| "unknown".to_string()),
10111                ))
10112            })
10113            .with_context(|| "loading FTS agent slug map")?;
10114        Ok(rows.into_iter().collect())
10115    }
10116
10117    fn load_fts_workspace_path_map(&self) -> Result<HashMap<i64, String>> {
10118        let rows: Vec<(i64, String)> = self
10119            .conn
10120            .query_map_collect("SELECT id, path FROM workspaces", fparams![], |row| {
10121                Ok((
10122                    row.get_typed(0)?,
10123                    row.get_typed::<Option<String>>(1)?.unwrap_or_default(),
10124                ))
10125            })
10126            .with_context(|| "loading FTS workspace path map")?;
10127        Ok(rows.into_iter().collect())
10128    }
10129
10130    /// Fetch all messages for embedding generation.
10131    pub fn fetch_messages_for_embedding(&self) -> Result<Vec<MessageForEmbedding>> {
10132        // COALESCE(c.agent_id, 0) so legacy V1 conversations with NULL
10133        // agent_id don't cause a runtime row-decode failure (agent_id in
10134        // MessageForEmbedding is i64).  saturating_u32_from_i64 downstream
10135        // turns 0 into the "unknown agent" sentinel for doc-id hashing.
10136        self.conn
10137            .query_map_collect(
10138                "SELECT m.id, m.created_at, COALESCE(c.agent_id, 0), c.workspace_id, c.source_id, m.role, m.content
10139                 FROM messages m
10140                 JOIN conversations c ON m.conversation_id = c.id
10141                 ORDER BY m.id",
10142                fparams![],
10143                |row| {
10144                    let source_id: String = row.get_typed::<Option<String>>(4)?
10145                        .unwrap_or_else(|| "local".to_string());
10146                    Ok(MessageForEmbedding {
10147                        message_id: row.get_typed(0)?,
10148                        created_at: row.get_typed(1)?,
10149                        agent_id: row.get_typed(2)?,
10150                        workspace_id: row.get_typed(3)?,
10151                        source_id_hash: crc32fast::hash(source_id.as_bytes()),
10152                        role: row.get_typed(5)?,
10153                        content: row.get_typed(6)?,
10154                    })
10155                },
10156            )
10157            .with_context(|| "fetching messages for embedding")
10158    }
10159
10160    /// Get the watermark for incremental semantic embedding.
10161    pub fn get_last_embedded_message_id(&self) -> Result<Option<i64>> {
10162        let result: Result<String, _> = self.conn.query_row_map(
10163            "SELECT value FROM meta WHERE key = 'last_embedded_message_id'",
10164            fparams![],
10165            |row| row.get_typed(0),
10166        );
10167        match result.optional() {
10168            Ok(Some(s)) => Ok(s.parse().ok()),
10169            Ok(None) => Ok(None),
10170            Err(e) => Err(e.into()),
10171        }
10172    }
10173
10174    /// Set the watermark for incremental semantic embedding.
10175    pub fn set_last_embedded_message_id(&self, id: i64) -> Result<()> {
10176        self.conn.execute_compat(
10177            "INSERT OR REPLACE INTO meta(key, value) VALUES('last_embedded_message_id', ?1)",
10178            fparams![id.to_string()],
10179        )?;
10180        Ok(())
10181    }
10182
10183    /// Get embedding jobs for a database path.
10184    pub fn get_embedding_jobs(&self, db_path: &str) -> Result<Vec<EmbeddingJobRow>> {
10185        self.conn
10186            .query_map_collect(
10187                "SELECT id, db_path, model_id, status, total_docs, completed_docs, error_message, created_at, started_at, completed_at
10188                 FROM embedding_jobs WHERE db_path = ?1 ORDER BY id DESC",
10189                fparams![db_path],
10190                |row| {
10191                    Ok(EmbeddingJobRow {
10192                        id: row.get_typed(0)?,
10193                        db_path: row.get_typed(1)?,
10194                        model_id: row.get_typed(2)?,
10195                        status: row.get_typed(3)?,
10196                        total_docs: row.get_typed(4)?,
10197                        completed_docs: row.get_typed(5)?,
10198                        error_message: row.get_typed(6)?,
10199                        created_at: row.get_typed(7)?,
10200                        started_at: row.get_typed(8)?,
10201                        completed_at: row.get_typed(9)?,
10202                    })
10203                },
10204            )
10205            .with_context(|| format!("fetching embedding jobs for {db_path}"))
10206    }
10207
10208    /// Create or update an embedding job.
10209    pub fn upsert_embedding_job(
10210        &self,
10211        db_path: &str,
10212        model_id: &str,
10213        total_docs: i64,
10214    ) -> Result<i64> {
10215        let updated = self.conn.execute_compat(
10216            "UPDATE embedding_jobs
10217             SET total_docs = ?3
10218             WHERE db_path = ?1 AND model_id = ?2 AND status IN ('pending', 'running')",
10219            fparams![db_path, model_id, total_docs],
10220        )?;
10221        if updated == 0 {
10222            let insert_result = self.conn.execute_compat(
10223                "INSERT INTO embedding_jobs(db_path, model_id, total_docs) VALUES(?1,?2,?3)",
10224                fparams![db_path, model_id, total_docs],
10225            );
10226            if let Err(err) = insert_result {
10227                if !matches!(err, frankensqlite::FrankenError::UniqueViolation { .. }) {
10228                    return Err(err.into());
10229                }
10230                self.conn.execute_compat(
10231                    "UPDATE embedding_jobs
10232                     SET total_docs = ?3
10233                     WHERE db_path = ?1 AND model_id = ?2 AND status IN ('pending', 'running')",
10234                    fparams![db_path, model_id, total_docs],
10235                )?;
10236            }
10237        }
10238        self.conn
10239            .query_row_map(
10240                "SELECT id FROM embedding_jobs
10241                 WHERE db_path = ?1 AND model_id = ?2 AND status IN ('pending', 'running')
10242                 ORDER BY id DESC
10243                 LIMIT 1",
10244                fparams![db_path, model_id],
10245                |row| row.get_typed(0),
10246            )
10247            .with_context(|| "resolving embedding job id after upsert")
10248    }
10249
10250    /// Mark an embedding job as started.
10251    pub fn start_embedding_job(&self, job_id: i64) -> Result<()> {
10252        self.conn.execute_compat(
10253            "UPDATE embedding_jobs SET status = 'running', started_at = datetime('now') WHERE id = ?1",
10254            fparams![job_id],
10255        )?;
10256        Ok(())
10257    }
10258
10259    /// Mark an embedding job as completed.
10260    pub fn complete_embedding_job(&self, job_id: i64) -> Result<()> {
10261        self.conn.execute_compat(
10262            "UPDATE embedding_jobs SET status = 'completed', completed_at = datetime('now') WHERE id = ?1",
10263            fparams![job_id],
10264        )?;
10265        Ok(())
10266    }
10267
10268    /// Mark an embedding job as failed.
10269    pub fn fail_embedding_job(&self, job_id: i64, error: &str) -> Result<()> {
10270        self.conn.execute_compat(
10271            "UPDATE embedding_jobs SET status = 'failed', error_message = ?2, completed_at = datetime('now') WHERE id = ?1",
10272            fparams![job_id, error],
10273        )?;
10274        Ok(())
10275    }
10276
10277    /// Cancel embedding jobs for a database path.
10278    pub fn cancel_embedding_jobs(&self, db_path: &str, model_id: Option<&str>) -> Result<usize> {
10279        if let Some(mid) = model_id {
10280            Ok(self.conn.execute_compat(
10281                "UPDATE embedding_jobs SET status = 'cancelled' WHERE db_path = ?1 AND model_id = ?2 AND status IN ('pending', 'running')",
10282                fparams![db_path, mid],
10283            )?)
10284        } else {
10285            Ok(self.conn.execute_compat(
10286                "UPDATE embedding_jobs SET status = 'cancelled' WHERE db_path = ?1 AND status IN ('pending', 'running')",
10287                fparams![db_path],
10288            )?)
10289        }
10290    }
10291
10292    /// Update embedding job progress.
10293    pub fn update_job_progress(&self, job_id: i64, completed_docs: i64) -> Result<()> {
10294        self.conn.execute_compat(
10295            "UPDATE embedding_jobs SET completed_docs = ?2 WHERE id = ?1",
10296            fparams![job_id, completed_docs],
10297        )?;
10298        Ok(())
10299    }
10300
10301    // =====================================================================
10302    // Analytics query methods
10303    // =====================================================================
10304
10305    /// Get session count for a date range using materialized stats.
10306    /// Returns (count, is_from_cache) where is_from_cache is true if from daily_stats.
10307    ///
10308    /// Falls back to COUNT(*) query when daily_stats table is empty or stale.
10309    pub fn count_sessions_in_range(
10310        &self,
10311        start_ts_ms: Option<i64>,
10312        end_ts_ms: Option<i64>,
10313        agent_slug: Option<&str>,
10314        source_id: Option<&str>,
10315    ) -> Result<(i64, bool)> {
10316        let agent = agent_slug.unwrap_or("all");
10317        let source = source_id.unwrap_or("all");
10318
10319        // Check if we have materialized stats
10320        let stats_count: i64 = self
10321            .conn
10322            .query_row_map("SELECT COUNT(*) FROM daily_stats", fparams![], |row| {
10323                row.get_typed(0)
10324            })
10325            .unwrap_or(0);
10326
10327        if stats_count == 0 {
10328            return self.count_sessions_direct(start_ts_ms, end_ts_ms, agent_slug, source_id);
10329        }
10330
10331        // Use materialized stats
10332        let start_day = start_ts_ms.map(Self::day_id_from_millis);
10333        let end_day = end_ts_ms.map(Self::day_id_from_millis);
10334
10335        let count: i64 = match (start_day, end_day) {
10336            (Some(start), Some(end)) => self.conn.query_row_map(
10337                "SELECT COALESCE(SUM(session_count), 0) FROM daily_stats
10338                 WHERE day_id BETWEEN ?1 AND ?2 AND agent_slug = ?3 AND source_id = ?4",
10339                fparams![start, end, agent, source],
10340                |row| row.get_typed(0),
10341            )?,
10342            (Some(start), None) => self.conn.query_row_map(
10343                "SELECT COALESCE(SUM(session_count), 0) FROM daily_stats
10344                 WHERE day_id >= ?1 AND agent_slug = ?2 AND source_id = ?3",
10345                fparams![start, agent, source],
10346                |row| row.get_typed(0),
10347            )?,
10348            (None, Some(end)) => self.conn.query_row_map(
10349                "SELECT COALESCE(SUM(session_count), 0) FROM daily_stats
10350                 WHERE day_id <= ?1 AND agent_slug = ?2 AND source_id = ?3",
10351                fparams![end, agent, source],
10352                |row| row.get_typed(0),
10353            )?,
10354            (None, None) => self.conn.query_row_map(
10355                "SELECT COALESCE(SUM(session_count), 0) FROM daily_stats
10356                 WHERE agent_slug = ?1 AND source_id = ?2",
10357                fparams![agent, source],
10358                |row| row.get_typed(0),
10359            )?,
10360        };
10361
10362        Ok((count, true))
10363    }
10364
10365    /// Direct COUNT(*) query as fallback when daily_stats is empty.
10366    fn count_sessions_direct(
10367        &self,
10368        start_ts_ms: Option<i64>,
10369        end_ts_ms: Option<i64>,
10370        agent_slug: Option<&str>,
10371        source_id: Option<&str>,
10372    ) -> Result<(i64, bool)> {
10373        // Build dynamic SQL with positional params.  Single-table scan of
10374        // conversations; filter on agent slug via an EXISTS subquery only
10375        // when that filter is actually requested.  This avoids the unneeded
10376        // 2-table JOIN (which also silently dropped legacy conversations
10377        // with NULL agent_id) and sidesteps frankensqlite's materialization
10378        // fallback entirely.
10379        let mut sql = "SELECT COUNT(*) FROM conversations c WHERE 1=1".to_string();
10380        let mut param_values: Vec<ParamValue> = Vec::new();
10381        let mut idx = 1;
10382
10383        if let Some(start) = start_ts_ms {
10384            sql.push_str(&format!(" AND c.started_at >= ?{idx}"));
10385            param_values.push(ParamValue::from(start));
10386            idx += 1;
10387        }
10388        if let Some(end) = end_ts_ms {
10389            sql.push_str(&format!(" AND c.started_at <= ?{idx}"));
10390            param_values.push(ParamValue::from(end));
10391            idx += 1;
10392        }
10393        if let Some(agent) = agent_slug
10394            && agent != "all"
10395        {
10396            sql.push_str(&format!(
10397                " AND EXISTS (SELECT 1 FROM agents a WHERE a.id = c.agent_id AND a.slug = ?{idx})"
10398            ));
10399            param_values.push(ParamValue::from(agent));
10400            idx += 1;
10401        }
10402        if let Some(source) = source_id
10403            && source != "all"
10404        {
10405            sql.push_str(&format!(" AND c.source_id = ?{idx}"));
10406            param_values.push(ParamValue::from(source));
10407            let _ = idx; // suppress unused warning
10408        }
10409
10410        let count: i64 = self
10411            .conn
10412            .query_row_map(&sql, &param_values, |row| row.get_typed(0))?;
10413        Ok((count, false))
10414    }
10415
10416    /// Get daily histogram data for a date range.
10417    pub fn get_daily_histogram(
10418        &self,
10419        start_ts_ms: i64,
10420        end_ts_ms: i64,
10421        agent_slug: Option<&str>,
10422        source_id: Option<&str>,
10423    ) -> Result<Vec<DailyCount>> {
10424        let start_day = Self::day_id_from_millis(start_ts_ms);
10425        let end_day = Self::day_id_from_millis(end_ts_ms);
10426        let agent = agent_slug.unwrap_or("all");
10427        let source = source_id.unwrap_or("all");
10428
10429        let rows = self.conn.query_map_collect(
10430            "SELECT day_id, session_count, message_count, total_chars
10431             FROM daily_stats
10432             WHERE day_id BETWEEN ?1 AND ?2 AND agent_slug = ?3 AND source_id = ?4
10433             ORDER BY day_id",
10434            fparams![start_day, end_day, agent, source],
10435            |row| {
10436                Ok(DailyCount {
10437                    day_id: row.get_typed(0)?,
10438                    sessions: row.get_typed(1)?,
10439                    messages: row.get_typed(2)?,
10440                    chars: row.get_typed(3)?,
10441                })
10442            },
10443        )?;
10444
10445        Ok(rows)
10446    }
10447
10448    /// Check health of daily stats table.
10449    pub fn daily_stats_health(&self) -> Result<DailyStatsHealth> {
10450        let row_count: i64 =
10451            self.conn
10452                .query_row_map("SELECT COUNT(*) FROM daily_stats", fparams![], |row| {
10453                    row.get_typed(0)
10454                })?;
10455
10456        let oldest_update: Option<i64> = self.conn.query_row_map(
10457            "SELECT MIN(last_updated) FROM daily_stats",
10458            fparams![],
10459            |row| row.get_typed(0),
10460        )?;
10461
10462        let conversation_count: i64 =
10463            self.conn
10464                .query_row_map("SELECT COUNT(*) FROM conversations", fparams![], |row| {
10465                    row.get_typed(0)
10466                })?;
10467
10468        let materialized_total: i64 = self.conn.query_row_map(
10469            "SELECT COALESCE(SUM(session_count), 0) FROM daily_stats
10470                 WHERE agent_slug = 'all' AND source_id = 'all'",
10471            fparams![],
10472            |row| row.get_typed(0),
10473        )?;
10474
10475        Ok(DailyStatsHealth {
10476            populated: row_count > 0,
10477            row_count,
10478            oldest_update_ms: oldest_update,
10479            conversation_count,
10480            materialized_total,
10481            drift: (conversation_count - materialized_total).abs(),
10482        })
10483    }
10484
10485    /// Batch insert multiple conversations with full analytics (token usage,
10486    /// message metrics, rollups).  Frankensqlite equivalent of
10487    /// `SqliteStorage::insert_conversations_batched`.
10488    pub fn insert_conversations_batched(
10489        &self,
10490        conversations: &[(i64, Option<i64>, &Conversation)],
10491    ) -> Result<Vec<InsertOutcome>> {
10492        if conversations.is_empty() {
10493            return Ok(Vec::new());
10494        }
10495
10496        self.ensure_sources_for_batch(conversations)?;
10497
10498        let defer_lexical_updates = defer_storage_lexical_updates_enabled();
10499        let defer_analytics_updates = defer_analytics_updates_enabled();
10500
10501        let pricing_table = PricingTable::franken_load(&self.conn).unwrap_or_else(|e| {
10502            tracing::warn!(target: "cass::analytics::pricing", error = %e, "failed to load pricing table");
10503            PricingTable { entries: Vec::new() }
10504        });
10505        let mut pricing_diag = PricingDiagnostics::default();
10506
10507        let mut tx = self.conn.transaction()?;
10508
10509        // Bug #167: Ensure all referenced agents, workspaces, and sources
10510        // exist inside the transaction so FK checks pass.  The caller resolves
10511        // IDs via ensure_agent / ensure_workspace / ensure_sources_for_batch
10512        // outside the transaction, but those autocommit writes may not be
10513        // visible inside the transaction snapshot in frankensqlite.  Re-verify
10514        // (and insert if missing) within the tx.
10515        ensure_agents_in_tx(&tx, conversations)?;
10516        ensure_workspaces_in_tx(&tx, conversations)?;
10517        ensure_sources_in_tx(&tx, conversations)?;
10518
10519        let mut outcomes = Vec::with_capacity(conversations.len());
10520        let mut fts_entries = Vec::new();
10521        let mut fts_pending_chars = 0usize;
10522        let mut fts_inserted_total = 0usize;
10523        let mut fts_count_total = 0usize;
10524        let mut stats = StatsAggregator::new();
10525        let mut token_stats = TokenStatsAggregator::new();
10526        let mut token_entries: Vec<TokenUsageEntry> = Vec::new();
10527        let mut metrics_entries: Vec<MessageMetricsEntry> = Vec::new();
10528        let mut rollup_agg = AnalyticsRollupAggregator::new();
10529        let mut conv_ids_to_summarize: Vec<i64> = Vec::new();
10530        let mut pending_conversation_ids: HashMap<PendingConversationKey, i64> = HashMap::new();
10531        let mut pending_message_fingerprints: HashMap<i64, HashMap<i64, MessageMergeFingerprint>> =
10532            HashMap::new();
10533        let mut pending_message_replay_fingerprints: HashMap<
10534            i64,
10535            HashSet<MessageReplayFingerprint>,
10536        > = HashMap::new();
10537
10538        for &(agent_id, workspace_id, raw_conv) in conversations {
10539            let normalized_conv = normalized_conversation_for_storage(raw_conv);
10540            let conv = normalized_conv.as_ref();
10541            let mut total_chars: i64 = 0;
10542            let mut inserted_indices = Vec::with_capacity(conv.messages.len());
10543            let mut inserted_messages: Vec<(i64, &Message)> =
10544                Vec::with_capacity(conv.messages.len());
10545            let mut session_count_delta = 1_i64;
10546            let conversation_key = conversation_merge_key(agent_id, conv);
10547
10548            let existing_conv_id = if let Some(existing_id) =
10549                pending_conversation_ids.get(&conversation_key)
10550            {
10551                Some(*existing_id)
10552            } else {
10553                let existing_id =
10554                    franken_find_existing_conversation_by_key(&tx, &conversation_key, Some(conv))?;
10555                if let Some(existing_id) = existing_id {
10556                    pending_conversation_ids.insert(conversation_key.clone(), existing_id);
10557                }
10558                existing_id
10559            };
10560
10561            let conv_id = if let Some(existing_id) = existing_conv_id {
10562                session_count_delta = 0;
10563                let ExistingMessageLookup {
10564                    by_idx: mut existing_messages,
10565                    replay: mut existing_replay_fingerprints,
10566                } = franken_existing_message_lookup_with_pending(
10567                    &tx,
10568                    existing_id,
10569                    &conv.messages,
10570                    &mut pending_message_fingerprints,
10571                    &mut pending_message_replay_fingerprints,
10572                )?;
10573                let ExistingConversationNewMessages {
10574                    messages: new_messages,
10575                    new_chars,
10576                    idx_collision_count,
10577                    first_collision_idx,
10578                } = collect_new_messages_for_existing_conversation(
10579                    existing_id,
10580                    conv,
10581                    &mut existing_messages,
10582                    &mut existing_replay_fingerprints,
10583                    "skipping replay-equivalent recovered message with shifted idx during batched merge",
10584                );
10585                let (inserted_last_idx, inserted_last_created_at) =
10586                    borrowed_messages_tail_state(&new_messages);
10587                let inserted_message_ids =
10588                    franken_append_insert_new_messages(&tx, existing_id, &new_messages)?;
10589                total_chars += new_chars;
10590                for (msg_id, msg) in inserted_message_ids.into_iter().zip(new_messages) {
10591                    franken_insert_snippets(&tx, msg_id, &msg.snippets)?;
10592                    if !defer_lexical_updates {
10593                        fts_entries.push(FtsEntry::from_message(msg_id, msg, conv));
10594                        fts_count_total += 1;
10595                        fts_pending_chars = fts_pending_chars.saturating_add(msg.content.len());
10596                        if fts_entries.len() >= FTS_ENTRY_BATCH_MAX_DOCS
10597                            || fts_pending_chars >= FTS_ENTRY_BATCH_MAX_CHARS
10598                        {
10599                            flush_pending_fts_entries(
10600                                self,
10601                                &tx,
10602                                &mut fts_entries,
10603                                &mut fts_pending_chars,
10604                                &mut fts_inserted_total,
10605                            )?;
10606                        }
10607                    }
10608                    inserted_indices.push(msg.idx);
10609                    inserted_messages.push((msg_id, msg));
10610                }
10611
10612                if idx_collision_count > 0 {
10613                    tracing::warn!(
10614                        conversation_id = existing_id,
10615                        collision_count = idx_collision_count,
10616                        first_idx = first_collision_idx,
10617                        source_path = %conv.source_path.display(),
10618                        "message idx collisions encountered during batched conversation merge; retaining canonical message variants"
10619                    );
10620                }
10621
10622                let conv_last_ts = conv.messages.iter().filter_map(|m| m.created_at).max();
10623                franken_update_conversation_tail_state(
10624                    &tx,
10625                    existing_id,
10626                    conv_last_ts,
10627                    inserted_last_idx,
10628                    inserted_last_created_at,
10629                )?;
10630                if let Some(lookup_key) = conversation_external_lookup_key_for_conv(agent_id, conv)
10631                {
10632                    franken_update_external_conversation_tail_lookup_key(
10633                        &tx,
10634                        &lookup_key,
10635                        conv_last_ts,
10636                        inserted_last_idx,
10637                        inserted_last_created_at,
10638                    )?;
10639                }
10640
10641                pending_message_fingerprints.insert(existing_id, existing_messages);
10642                pending_message_replay_fingerprints
10643                    .insert(existing_id, existing_replay_fingerprints);
10644
10645                existing_id
10646            } else {
10647                match franken_insert_conversation_or_get_existing(
10648                    &tx,
10649                    agent_id,
10650                    workspace_id,
10651                    conv,
10652                )? {
10653                    ConversationInsertStatus::Inserted(new_conv_id) => {
10654                        pending_conversation_ids.insert(conversation_key.clone(), new_conv_id);
10655                        let pending_messages =
10656                            pending_message_fingerprints.entry(new_conv_id).or_default();
10657                        let pending_replay_fingerprints = pending_message_replay_fingerprints
10658                            .entry(new_conv_id)
10659                            .or_default();
10660                        let mut new_messages = Vec::new();
10661                        for msg in &conv.messages {
10662                            let incoming_replay = message_replay_fingerprint(msg);
10663                            if pending_messages.contains_key(&msg.idx)
10664                                || pending_replay_fingerprints.contains(&incoming_replay)
10665                            {
10666                                continue;
10667                            }
10668                            pending_messages.insert(msg.idx, message_merge_fingerprint(msg));
10669                            pending_replay_fingerprints.insert(incoming_replay);
10670                            new_messages.push(msg);
10671                        }
10672                        let inserted_message_ids =
10673                            franken_batch_insert_new_messages(&tx, new_conv_id, &new_messages)?;
10674                        for (msg_id, msg) in inserted_message_ids.into_iter().zip(new_messages) {
10675                            franken_insert_snippets(&tx, msg_id, &msg.snippets)?;
10676                            if !defer_lexical_updates {
10677                                fts_entries.push(FtsEntry::from_message(msg_id, msg, conv));
10678                                fts_count_total += 1;
10679                                fts_pending_chars =
10680                                    fts_pending_chars.saturating_add(msg.content.len());
10681                                if fts_entries.len() >= FTS_ENTRY_BATCH_MAX_DOCS
10682                                    || fts_pending_chars >= FTS_ENTRY_BATCH_MAX_CHARS
10683                                {
10684                                    flush_pending_fts_entries(
10685                                        self,
10686                                        &tx,
10687                                        &mut fts_entries,
10688                                        &mut fts_pending_chars,
10689                                        &mut fts_inserted_total,
10690                                    )?;
10691                                }
10692                            }
10693                            total_chars += msg.content.len() as i64;
10694                            inserted_indices.push(msg.idx);
10695                            inserted_messages.push((msg_id, msg));
10696                        }
10697                        new_conv_id
10698                    }
10699                    ConversationInsertStatus::Existing(existing_id) => {
10700                        session_count_delta = 0;
10701                        pending_conversation_ids.insert(conversation_key.clone(), existing_id);
10702                        let ExistingMessageLookup {
10703                            by_idx: mut existing_messages,
10704                            replay: mut existing_replay_fingerprints,
10705                        } = franken_existing_message_lookup_with_pending(
10706                            &tx,
10707                            existing_id,
10708                            &conv.messages,
10709                            &mut pending_message_fingerprints,
10710                            &mut pending_message_replay_fingerprints,
10711                        )?;
10712                        let ExistingConversationNewMessages {
10713                            messages: new_messages,
10714                            new_chars,
10715                            idx_collision_count,
10716                            first_collision_idx,
10717                        } = collect_new_messages_for_existing_conversation(
10718                            existing_id,
10719                            conv,
10720                            &mut existing_messages,
10721                            &mut existing_replay_fingerprints,
10722                            "skipping replay-equivalent recovered message with shifted idx after duplicate conversation recovery",
10723                        );
10724                        let (inserted_last_idx, inserted_last_created_at) =
10725                            borrowed_messages_tail_state(&new_messages);
10726                        let inserted_message_ids =
10727                            franken_append_insert_new_messages(&tx, existing_id, &new_messages)?;
10728                        total_chars += new_chars;
10729                        for (msg_id, msg) in inserted_message_ids.into_iter().zip(new_messages) {
10730                            franken_insert_snippets(&tx, msg_id, &msg.snippets)?;
10731                            if !defer_lexical_updates {
10732                                fts_entries.push(FtsEntry::from_message(msg_id, msg, conv));
10733                                fts_count_total += 1;
10734                                fts_pending_chars =
10735                                    fts_pending_chars.saturating_add(msg.content.len());
10736                                if fts_entries.len() >= FTS_ENTRY_BATCH_MAX_DOCS
10737                                    || fts_pending_chars >= FTS_ENTRY_BATCH_MAX_CHARS
10738                                {
10739                                    flush_pending_fts_entries(
10740                                        self,
10741                                        &tx,
10742                                        &mut fts_entries,
10743                                        &mut fts_pending_chars,
10744                                        &mut fts_inserted_total,
10745                                    )?;
10746                                }
10747                            }
10748                            inserted_indices.push(msg.idx);
10749                            inserted_messages.push((msg_id, msg));
10750                        }
10751
10752                        if idx_collision_count > 0 {
10753                            tracing::warn!(
10754                                conversation_id = existing_id,
10755                                collision_count = idx_collision_count,
10756                                first_idx = first_collision_idx,
10757                                source_path = %conv.source_path.display(),
10758                                "message idx collisions encountered after duplicate conversation recovery; retaining canonical message variants"
10759                            );
10760                        }
10761
10762                        let conv_last_ts = conv.messages.iter().filter_map(|m| m.created_at).max();
10763                        franken_update_conversation_tail_state(
10764                            &tx,
10765                            existing_id,
10766                            conv_last_ts,
10767                            inserted_last_idx,
10768                            inserted_last_created_at,
10769                        )?;
10770                        if let Some(lookup_key) =
10771                            conversation_external_lookup_key_for_conv(agent_id, conv)
10772                        {
10773                            franken_update_external_conversation_tail_lookup_key(
10774                                &tx,
10775                                &lookup_key,
10776                                conv_last_ts,
10777                                inserted_last_idx,
10778                                inserted_last_created_at,
10779                            )?;
10780                        }
10781
10782                        pending_message_fingerprints.insert(existing_id, existing_messages);
10783                        pending_message_replay_fingerprints
10784                            .insert(existing_id, existing_replay_fingerprints);
10785
10786                        existing_id
10787                    }
10788                }
10789            };
10790
10791            if !defer_analytics_updates {
10792                let delta = StatsDelta {
10793                    session_count_delta,
10794                    message_count_delta: inserted_messages.len() as i64,
10795                    total_chars_delta: total_chars,
10796                };
10797
10798                let effective_started_at = conversation_effective_started_at(conv);
10799                let day_id = effective_started_at
10800                    .map(FrankenStorage::day_id_from_millis)
10801                    .unwrap_or(0);
10802                stats.record_delta(
10803                    &conv.agent_slug,
10804                    &conv.source_id,
10805                    day_id,
10806                    delta.session_count_delta,
10807                    delta.message_count_delta,
10808                    delta.total_chars_delta,
10809                );
10810
10811                let conv_day_id = day_id;
10812                let mut session_model_family = String::from("unknown");
10813                let mut has_any_tokens = false;
10814
10815                for &(message_id, msg) in &inserted_messages {
10816                    let role_s = role_str(&msg.role);
10817                    let usage = if historical_raw_json(&msg.extra_json).is_some() {
10818                        crate::connectors::extract_tokens_for_agent(
10819                            &conv.agent_slug,
10820                            &serde_json::Value::Null,
10821                            &msg.content,
10822                            &role_s,
10823                        )
10824                    } else {
10825                        crate::connectors::extract_tokens_for_agent(
10826                            &conv.agent_slug,
10827                            &msg.extra_json,
10828                            &msg.content,
10829                            &role_s,
10830                        )
10831                    };
10832
10833                    let msg_ts = msg
10834                        .created_at
10835                        .or(conversation_effective_started_at(conv))
10836                        .unwrap_or(0);
10837                    let msg_day_id = if msg_ts > 0 {
10838                        FrankenStorage::day_id_from_millis(msg_ts)
10839                    } else {
10840                        conv_day_id
10841                    };
10842
10843                    let model_info = usage
10844                        .model_name
10845                        .as_deref()
10846                        .map(crate::connectors::normalize_model);
10847
10848                    let model_family = model_info
10849                        .as_ref()
10850                        .map(|i| i.family.clone())
10851                        .unwrap_or_else(|| "unknown".into());
10852                    let model_tier = model_info
10853                        .as_ref()
10854                        .map(|i| i.tier.clone())
10855                        .unwrap_or_else(|| "unknown".into());
10856                    let provider = usage
10857                        .provider
10858                        .clone()
10859                        .or_else(|| model_info.as_ref().map(|i| i.provider.clone()))
10860                        .unwrap_or_else(|| "unknown".into());
10861
10862                    if model_family != "unknown" {
10863                        session_model_family = model_family.clone();
10864                    }
10865
10866                    let estimated_cost = pricing_table.compute_cost(
10867                        usage.model_name.as_deref(),
10868                        msg_day_id,
10869                        usage.input_tokens,
10870                        usage.output_tokens,
10871                        usage.cache_read_tokens,
10872                        usage.cache_creation_tokens,
10873                    );
10874                    if estimated_cost.is_some() {
10875                        pricing_diag.record_priced();
10876                    } else if usage.has_token_data() {
10877                        pricing_diag.record_unpriced(usage.model_name.as_deref());
10878                    }
10879
10880                    token_stats.record(
10881                        &conv.agent_slug,
10882                        &conv.source_id,
10883                        msg_day_id,
10884                        &model_family,
10885                        &role_s,
10886                        &usage,
10887                        msg.content.len() as i64,
10888                        estimated_cost.unwrap_or(0.0),
10889                    );
10890
10891                    if usage.has_token_data() {
10892                        has_any_tokens = true;
10893                    }
10894
10895                    let content_chars = msg.content.len() as i64;
10896                    let content_tokens_est = content_chars / 4;
10897                    let msg_hour_id = FrankenStorage::hour_id_from_millis(msg_ts);
10898                    let has_plan = has_plan_for_role(&role_s, &msg.content);
10899
10900                    token_entries.push(TokenUsageEntry {
10901                        message_id,
10902                        conversation_id: conv_id,
10903                        agent_id,
10904                        workspace_id,
10905                        source_id: conv.source_id.clone(),
10906                        timestamp_ms: msg_ts,
10907                        day_id: msg_day_id,
10908                        model_name: usage.model_name.clone(),
10909                        model_family: Some(model_family.clone()),
10910                        model_tier: Some(model_tier.clone()),
10911                        service_tier: usage.service_tier.clone(),
10912                        provider: Some(provider.clone()),
10913                        input_tokens: usage.input_tokens,
10914                        output_tokens: usage.output_tokens,
10915                        cache_read_tokens: usage.cache_read_tokens,
10916                        cache_creation_tokens: usage.cache_creation_tokens,
10917                        thinking_tokens: usage.thinking_tokens,
10918                        total_tokens: usage.total_tokens(),
10919                        estimated_cost_usd: estimated_cost,
10920                        role: role_s.to_string(),
10921                        content_chars,
10922                        has_tool_calls: usage.has_tool_calls,
10923                        tool_call_count: usage.tool_call_count,
10924                        data_source: usage.data_source.as_str().to_string(),
10925                    });
10926
10927                    let mm = MessageMetricsEntry {
10928                        message_id,
10929                        created_at_ms: msg_ts,
10930                        hour_id: msg_hour_id,
10931                        day_id: msg_day_id,
10932                        agent_slug: conv.agent_slug.clone(),
10933                        workspace_id: workspace_id.unwrap_or(0),
10934                        source_id: conv.source_id.clone(),
10935                        role: role_s.to_string(),
10936                        content_chars,
10937                        content_tokens_est,
10938                        model_name: usage.model_name.clone(),
10939                        model_family: model_family.clone(),
10940                        model_tier: model_tier.clone(),
10941                        provider,
10942                        api_input_tokens: usage.input_tokens,
10943                        api_output_tokens: usage.output_tokens,
10944                        api_cache_read_tokens: usage.cache_read_tokens,
10945                        api_cache_creation_tokens: usage.cache_creation_tokens,
10946                        api_thinking_tokens: usage.thinking_tokens,
10947                        api_service_tier: usage.service_tier.clone(),
10948                        api_data_source: usage.data_source.as_str().to_string(),
10949                        tool_call_count: usage.tool_call_count as i64,
10950                        has_tool_calls: usage.has_tool_calls,
10951                        has_plan,
10952                    };
10953                    rollup_agg.record(&mm);
10954                    metrics_entries.push(mm);
10955                }
10956
10957                if session_count_delta > 0 {
10958                    token_stats.record_session(
10959                        &conv.agent_slug,
10960                        &conv.source_id,
10961                        conv_day_id,
10962                        &session_model_family,
10963                    );
10964                }
10965
10966                if has_any_tokens {
10967                    conv_ids_to_summarize.push(conv_id);
10968                }
10969            }
10970
10971            outcomes.push(InsertOutcome {
10972                conversation_id: conv_id,
10973                conversation_inserted: session_count_delta > 0,
10974                inserted_indices,
10975            });
10976        }
10977
10978        // Batch insert all FTS entries at once
10979        if !defer_lexical_updates {
10980            flush_pending_fts_entries(
10981                self,
10982                &tx,
10983                &mut fts_entries,
10984                &mut fts_pending_chars,
10985                &mut fts_inserted_total,
10986            )?;
10987        }
10988        if !defer_lexical_updates && fts_count_total > 0 {
10989            tracing::debug!(
10990                target: "cass::perf::fts5",
10991                total = fts_count_total,
10992                inserted = fts_inserted_total,
10993                conversations = conversations.len(),
10994                "franken_batch_fts_insert_complete"
10995            );
10996        }
10997
10998        // Batched daily_stats update
10999        if !defer_analytics_updates && !stats.is_empty() {
11000            let entries = stats.expand();
11001            let affected = franken_update_daily_stats_batched_in_tx(&tx, &entries)?;
11002            tracing::debug!(
11003                target: "cass::perf::daily_stats",
11004                raw = stats.raw_entry_count(),
11005                expanded = entries.len(),
11006                affected = affected,
11007                "franken_batched_stats_update_complete"
11008            );
11009        }
11010
11011        // Batch insert token_usage rows
11012        if !defer_analytics_updates && !token_entries.is_empty() {
11013            let token_count = token_entries.len();
11014            let inserted = franken_insert_token_usage_batched_in_tx(&tx, &token_entries)?;
11015            tracing::debug!(
11016                target: "cass::perf::token_usage",
11017                total = token_count,
11018                inserted = inserted,
11019                "franken_batch_token_usage_insert_complete"
11020            );
11021        }
11022
11023        // Batched token_daily_stats update
11024        if !defer_analytics_updates && !token_stats.is_empty() {
11025            let entries = token_stats.expand();
11026            let affected = franken_update_token_daily_stats_batched_in_tx(&tx, &entries)?;
11027            tracing::debug!(
11028                target: "cass::perf::token_daily_stats",
11029                raw = token_stats.raw_entry_count(),
11030                expanded = entries.len(),
11031                affected = affected,
11032                "franken_batched_token_stats_update_complete"
11033            );
11034        }
11035
11036        // Batch insert message_metrics rows
11037        if !defer_analytics_updates && !metrics_entries.is_empty() {
11038            let mm_count = metrics_entries.len();
11039            let inserted = franken_insert_message_metrics_batched_in_tx(&tx, &metrics_entries)?;
11040            tracing::debug!(
11041                target: "cass::perf::message_metrics",
11042                total = mm_count,
11043                inserted = inserted,
11044                "franken_batch_message_metrics_insert_complete"
11045            );
11046        }
11047
11048        // Flush usage_hourly + usage_daily rollups
11049        if !defer_analytics_updates && !rollup_agg.is_empty() {
11050            let (hourly, daily, models_daily) =
11051                franken_flush_analytics_rollups_in_tx(&tx, &rollup_agg)?;
11052            tracing::debug!(
11053                target: "cass::perf::usage_rollups",
11054                hourly_buckets = rollup_agg.hourly_entry_count(),
11055                daily_buckets = rollup_agg.daily_entry_count(),
11056                models_daily_buckets = rollup_agg.models_daily_entry_count(),
11057                hourly_affected = hourly,
11058                daily_affected = daily,
11059                models_daily_affected = models_daily,
11060                "franken_batched_usage_rollups_complete"
11061            );
11062        }
11063
11064        // Update conversation-level token summaries
11065        if !defer_analytics_updates {
11066            for conv_id in &conv_ids_to_summarize {
11067                franken_update_conversation_token_summaries_in_tx(&tx, *conv_id)?;
11068            }
11069        }
11070
11071        tx.commit()?;
11072
11073        pricing_diag.log_summary();
11074
11075        Ok(outcomes)
11076    }
11077}
11078
11079fn normalized_storage_source_parts(
11080    source_id: Option<&str>,
11081    origin_kind: Option<&str>,
11082    origin_host: Option<&str>,
11083) -> (String, SourceKind, Option<String>) {
11084    let host_label = crate::search::tantivy::normalized_index_origin_host(origin_host);
11085    let source_id = crate::search::tantivy::normalized_index_source_id(
11086        source_id,
11087        origin_kind,
11088        host_label.as_deref(),
11089    );
11090
11091    if source_id == LOCAL_SOURCE_ID {
11092        (source_id, SourceKind::Local, None)
11093    } else {
11094        (source_id, SourceKind::Ssh, host_label)
11095    }
11096}
11097
11098fn normalized_source_for_conversation(conv: &Conversation) -> Source {
11099    let (id, kind, host_label) = normalized_storage_source_parts(
11100        Some(conv.source_id.as_str()),
11101        None,
11102        conv.origin_host.as_deref(),
11103    );
11104    Source {
11105        id,
11106        kind,
11107        host_label,
11108        machine_id: None,
11109        platform: None,
11110        config_json: None,
11111        created_at: None,
11112        updated_at: None,
11113    }
11114}
11115
11116fn is_bootstrap_local_source(source: &Source) -> bool {
11117    source.id == LOCAL_SOURCE_ID
11118        && matches!(source.kind, SourceKind::Local)
11119        && source.host_label.is_none()
11120        && source.machine_id.is_none()
11121        && source.platform.is_none()
11122        && source.config_json.is_none()
11123        && source.created_at.is_none()
11124        && source.updated_at.is_none()
11125}
11126
11127fn normalized_conversation_for_storage<'a>(conv: &'a Conversation) -> Cow<'a, Conversation> {
11128    let normalized_source = normalized_source_for_conversation(conv);
11129    if normalized_source.id == conv.source_id && normalized_source.host_label == conv.origin_host {
11130        Cow::Borrowed(conv)
11131    } else {
11132        let mut normalized = conv.clone();
11133        normalized.source_id = normalized_source.id;
11134        normalized.origin_host = normalized_source.host_label;
11135        Cow::Owned(normalized)
11136    }
11137}
11138
11139impl FrankenStorage {
11140    fn ensure_source_for_conversation(&self, conv: &Conversation) -> Result<()> {
11141        let source = normalized_source_for_conversation(conv);
11142        if is_bootstrap_local_source(&source) {
11143            // `open()` and schema repair always seed the canonical local source row.
11144            // Avoid an autocommit UPDATE on every local conversation insert.
11145            return Ok(());
11146        }
11147        let cache_key = EnsuredConversationSourceKey::from_source(&source);
11148        if self.conversation_source_already_ensured(&cache_key) {
11149            return Ok(());
11150        }
11151        self.upsert_source(&source)?;
11152        self.mark_conversation_source_ensured(cache_key);
11153        Ok(())
11154    }
11155
11156    fn ensure_sources_for_batch(
11157        &self,
11158        conversations: &[(i64, Option<i64>, &Conversation)],
11159    ) -> Result<()> {
11160        let mut seen = HashSet::with_capacity(conversations.len());
11161        for &(_, _, conv) in conversations {
11162            let source = normalized_source_for_conversation(conv);
11163            if seen.insert(source.id.clone()) {
11164                if is_bootstrap_local_source(&source) {
11165                    continue;
11166                }
11167                self.upsert_source(&source)?;
11168                self.mark_conversation_source_ensured(EnsuredConversationSourceKey::from_source(
11169                    &source,
11170                ));
11171            }
11172        }
11173        Ok(())
11174    }
11175}
11176
11177// =========================================================================
11178// FrankenStorage transaction helper functions
11179// =========================================================================
11180
11181/// Get last_insert_rowid from a frankensqlite transaction.
11182fn franken_last_rowid(tx: &FrankenTransaction<'_>) -> Result<i64> {
11183    tx.last_insert_rowid()
11184        .ok()
11185        .filter(|&id| id > 0)
11186        .with_context(|| "last_insert_rowid() returned NULL or 0 after INSERT")
11187}
11188
11189/// Bug #167: Ensure all agents referenced by a batch exist within the
11190/// transaction.  The caller already resolved `agent_id` values via
11191/// `ensure_agent` outside the transaction, but those autocommit writes may
11192/// not be visible inside a frankensqlite transaction snapshot.  This function
11193/// checks each unique agent_id and creates a stub row if it's missing.
11194fn ensure_agents_in_tx(
11195    tx: &FrankenTransaction<'_>,
11196    conversations: &[(i64, Option<i64>, &Conversation)],
11197) -> Result<()> {
11198    let mut seen = HashSet::new();
11199    let now = FrankenStorage::now_millis();
11200    for &(agent_id, _, conv) in conversations {
11201        if !seen.insert(agent_id) {
11202            continue;
11203        }
11204        let exists: i64 = tx.query_row_map(
11205            "SELECT COUNT(*) FROM agents WHERE id = ?1",
11206            fparams![agent_id],
11207            |row| row.get_typed(0),
11208        )?;
11209        if exists == 0 {
11210            tracing::debug!(
11211                target: "cass::fk_guard",
11212                agent_id,
11213                slug = %conv.agent_slug,
11214                "inserting agent row inside transaction to satisfy FK constraint"
11215            );
11216            // INSERT OR IGNORE: the slug might already exist with a different
11217            // id from a concurrent writer.  If the slug row exists, the FK
11218            // constraint is already satisfied (the caller just got a stale id).
11219            tx.execute_compat(
11220                "INSERT OR IGNORE INTO agents(id, slug, name, kind, created_at, updated_at)
11221                 VALUES(?1, ?2, ?3, 'cli', ?4, ?5)",
11222                fparams![
11223                    agent_id,
11224                    conv.agent_slug.as_str(),
11225                    conv.agent_slug.as_str(),
11226                    now,
11227                    now
11228                ],
11229            )?;
11230        }
11231    }
11232    Ok(())
11233}
11234
11235/// Bug #167: Ensure all workspaces referenced by a batch exist within the
11236/// transaction.  Same rationale as `ensure_agents_in_tx`.
11237fn ensure_workspaces_in_tx(
11238    tx: &FrankenTransaction<'_>,
11239    conversations: &[(i64, Option<i64>, &Conversation)],
11240) -> Result<()> {
11241    let mut seen = HashSet::new();
11242    for &(_, workspace_id, conv) in conversations {
11243        let ws_id = match workspace_id {
11244            Some(id) => id,
11245            None => continue,
11246        };
11247        if !seen.insert(ws_id) {
11248            continue;
11249        }
11250        let exists: i64 = tx.query_row_map(
11251            "SELECT COUNT(*) FROM workspaces WHERE id = ?1",
11252            fparams![ws_id],
11253            |row| row.get_typed(0),
11254        )?;
11255        if exists == 0 {
11256            let path_str = conv
11257                .workspace
11258                .as_ref()
11259                .map(|p| p.to_string_lossy().to_string())
11260                .unwrap_or_default();
11261            tracing::debug!(
11262                target: "cass::fk_guard",
11263                workspace_id = ws_id,
11264                path = %path_str,
11265                "inserting workspace row inside transaction to satisfy FK constraint"
11266            );
11267            tx.execute_compat(
11268                "INSERT OR IGNORE INTO workspaces(id, path) VALUES(?1, ?2)",
11269                fparams![ws_id, path_str.as_str()],
11270            )?;
11271        }
11272    }
11273    Ok(())
11274}
11275
11276/// Bug #167: Ensure all sources referenced by a batch exist within the
11277/// transaction.  Same rationale as `ensure_agents_in_tx` — source_id is a
11278/// TEXT FK on the conversations table.
11279fn ensure_sources_in_tx(
11280    tx: &FrankenTransaction<'_>,
11281    conversations: &[(i64, Option<i64>, &Conversation)],
11282) -> Result<()> {
11283    let mut seen = HashSet::new();
11284    for &(_, _, conv) in conversations {
11285        let (source_id, source_kind, host_label) = normalized_storage_source_parts(
11286            Some(conv.source_id.as_str()),
11287            None,
11288            conv.origin_host.as_deref(),
11289        );
11290        if !seen.insert(source_id.clone()) {
11291            continue;
11292        }
11293        let exists: i64 = tx.query_row_map(
11294            "SELECT COUNT(*) FROM sources WHERE id = ?1",
11295            fparams![source_id.as_str()],
11296            |row| row.get_typed(0),
11297        )?;
11298        if exists == 0 {
11299            let kind_str = source_kind.to_string();
11300            let now = FrankenStorage::now_millis();
11301            tracing::debug!(
11302                target: "cass::fk_guard",
11303                source_id = %source_id,
11304                kind = kind_str.as_str(),
11305                "inserting source row inside transaction to satisfy FK constraint"
11306            );
11307            tx.execute_compat(
11308                "INSERT OR IGNORE INTO sources(id, kind, host_label, created_at, updated_at)
11309                 VALUES(?1, ?2, ?3, ?4, ?5)",
11310                fparams![
11311                    source_id.as_str(),
11312                    kind_str.as_str(),
11313                    host_label.as_deref(),
11314                    now,
11315                    now
11316                ],
11317            )?;
11318        }
11319    }
11320    Ok(())
11321}
11322
11323fn env_flag_enabled(name: &str) -> bool {
11324    dotenvy::var(name)
11325        .map(|v| !(v == "0" || v.eq_ignore_ascii_case("false")))
11326        .unwrap_or(false)
11327}
11328
11329fn defer_storage_lexical_updates_enabled() -> bool {
11330    env_flag_enabled("CASS_DEFER_LEXICAL_UPDATES")
11331}
11332
11333fn defer_analytics_updates_enabled() -> bool {
11334    env_flag_enabled("CASS_DEFER_ANALYTICS_UPDATES")
11335}
11336
/// Result of trying to store a conversation: either a row was created or a
/// matching one was already present. Each variant carries an `i64` id.
enum ConversationInsertStatus {
    /// A new conversation row was written.
    /// NOTE(review): payload is presumably the new row's conversation id —
    /// confirm at the construction site.
    Inserted(i64),
    /// The conversation was already stored; the batch insert path treats the
    /// payload as that existing conversation's id and appends only new messages.
    Existing(i64),
}
11341
11342fn franken_find_external_conversation_tail_lookup(
11343    tx: &FrankenTransaction<'_>,
11344    lookup_key: &str,
11345) -> Result<Option<ExistingConversationWithTail>> {
11346    let params = [SqliteValue::from(lookup_key)];
11347    let row = tx
11348        .query_row_with_params(
11349            "SELECT conversation_id, ended_at, last_message_idx, last_message_created_at
11350             FROM conversation_external_tail_lookup
11351             WHERE lookup_key = ?1",
11352            &params,
11353        )
11354        .optional()?;
11355    let Some(row) = row else {
11356        return Ok(None);
11357    };
11358    let id = row.get_typed(0)?;
11359    let ended_at = row.get_typed(1)?;
11360    let last_message_idx = row.get_typed(2)?;
11361    let last_message_created_at = row.get_typed(3)?;
11362    Ok(Some(ExistingConversationWithTail {
11363        id,
11364        tail_state: existing_conversation_tail_state_from_cached(
11365            last_message_idx,
11366            last_message_created_at,
11367            ended_at,
11368        ),
11369    }))
11370}
11371
11372fn franken_find_external_conversation_lookup(
11373    tx: &FrankenTransaction<'_>,
11374    lookup_key: &str,
11375) -> Result<Option<i64>> {
11376    Ok(franken_find_external_conversation_tail_lookup(tx, lookup_key)?.map(|existing| existing.id))
11377}
11378
11379fn franken_insert_external_conversation_tail_lookup_key(
11380    tx: &FrankenTransaction<'_>,
11381    lookup_key: &str,
11382    conversation_id: i64,
11383    ended_at: Option<i64>,
11384    last_message_idx: Option<i64>,
11385    last_message_created_at: Option<i64>,
11386) -> Result<()> {
11387    let params = [
11388        SqliteValue::from(lookup_key),
11389        SqliteValue::from(conversation_id),
11390        SqliteValue::from(ended_at),
11391        SqliteValue::from(last_message_idx),
11392        SqliteValue::from(last_message_created_at),
11393    ];
11394    tx.execute_with_params(
11395        "INSERT OR REPLACE INTO conversation_external_tail_lookup(
11396             lookup_key, conversation_id, ended_at, last_message_idx, last_message_created_at
11397         ) VALUES(?1, ?2, ?3, ?4, ?5)",
11398        &params,
11399    )?;
11400    Ok(())
11401}
11402
11403fn franken_insert_external_conversation_tail_lookup(
11404    tx: &FrankenTransaction<'_>,
11405    source_id: &str,
11406    agent_id: i64,
11407    external_id: &str,
11408    existing: ExistingConversationWithTail,
11409) -> Result<()> {
11410    let lookup_key = conversation_external_lookup_key(source_id, agent_id, external_id);
11411    let ended_at = existing.tail_state.and_then(|state| state.ended_at);
11412    let last_message_idx = existing.tail_state.map(|state| state.last_message_idx);
11413    let last_message_created_at = existing
11414        .tail_state
11415        .map(|state| state.last_message_created_at);
11416    franken_insert_external_conversation_tail_lookup_key(
11417        tx,
11418        &lookup_key,
11419        existing.id,
11420        ended_at,
11421        last_message_idx,
11422        last_message_created_at,
11423    )
11424}
11425
11426fn franken_update_external_conversation_tail_lookup_key(
11427    tx: &FrankenTransaction<'_>,
11428    lookup_key: &str,
11429    ended_at_candidate: Option<i64>,
11430    last_message_idx_candidate: Option<i64>,
11431    last_message_created_at_candidate: Option<i64>,
11432) -> Result<()> {
11433    if ended_at_candidate.is_none()
11434        && last_message_idx_candidate.is_none()
11435        && last_message_created_at_candidate.is_none()
11436    {
11437        return Ok(());
11438    }
11439    tx.execute_compat(
11440        "UPDATE conversation_external_tail_lookup
11441         SET ended_at = CASE
11442                 WHEN ?1 IS NULL THEN ended_at
11443                 ELSE MAX(IFNULL(ended_at, 0), ?1)
11444             END,
11445             last_message_idx = CASE
11446                 WHEN ?2 IS NULL THEN last_message_idx
11447                 WHEN last_message_idx IS NULL OR last_message_idx < ?2 THEN ?2
11448                 ELSE last_message_idx
11449             END,
11450             last_message_created_at = CASE
11451                 WHEN ?3 IS NULL THEN last_message_created_at
11452                 WHEN last_message_created_at IS NULL OR last_message_created_at < ?3 THEN ?3
11453                 ELSE last_message_created_at
11454             END
11455         WHERE lookup_key = ?4",
11456        fparams![
11457            ended_at_candidate,
11458            last_message_idx_candidate,
11459            last_message_created_at_candidate,
11460            lookup_key
11461        ],
11462    )?;
11463    Ok(())
11464}
11465
11466fn franken_set_external_conversation_tail_lookup_after_append(
11467    tx: &FrankenTransaction<'_>,
11468    lookup_key: &str,
11469    ended_at: i64,
11470    last_message_idx: i64,
11471    last_message_created_at: i64,
11472) -> Result<()> {
11473    tx.execute_compat(
11474        "UPDATE conversation_external_tail_lookup
11475         SET ended_at = ?1,
11476             last_message_idx = ?2,
11477             last_message_created_at = ?3
11478         WHERE lookup_key = ?4",
11479        fparams![
11480            ended_at,
11481            last_message_idx,
11482            last_message_created_at,
11483            lookup_key
11484        ],
11485    )?;
11486    Ok(())
11487}
11488
11489fn franken_update_external_conversation_tail_after_append(
11490    tx: &FrankenTransaction<'_>,
11491    agent_id: i64,
11492    conv: &Conversation,
11493    used_append_tail_plan: bool,
11494    exact_append_set: bool,
11495    inserted_last_idx: Option<i64>,
11496    inserted_last_created_at: Option<i64>,
11497) -> Result<()> {
11498    let Some(lookup_key) = conversation_external_lookup_key_for_conv(agent_id, conv) else {
11499        return Ok(());
11500    };
11501
11502    if exact_append_set
11503        && let (Some(last_message_idx), Some(last_message_created_at)) =
11504            (inserted_last_idx, inserted_last_created_at)
11505    {
11506        return franken_set_external_conversation_tail_lookup_after_append(
11507            tx,
11508            &lookup_key,
11509            last_message_created_at,
11510            last_message_idx,
11511            last_message_created_at,
11512        );
11513    }
11514
11515    let ended_at_candidate = if used_append_tail_plan {
11516        inserted_last_created_at
11517    } else {
11518        conv.messages.iter().filter_map(|m| m.created_at).max()
11519    };
11520    franken_update_external_conversation_tail_lookup_key(
11521        tx,
11522        &lookup_key,
11523        ended_at_candidate,
11524        inserted_last_idx,
11525        inserted_last_created_at,
11526    )
11527}
11528
11529fn franken_find_existing_conversation_by_key(
11530    tx: &FrankenTransaction<'_>,
11531    key: &PendingConversationKey,
11532    conv: Option<&Conversation>,
11533) -> Result<Option<i64>> {
11534    franken_find_existing_conversation_by_key_impl(tx, key, conv, false)
11535}
11536
11537fn franken_find_existing_conversation_by_key_after_conflict(
11538    tx: &FrankenTransaction<'_>,
11539    key: &PendingConversationKey,
11540    conv: Option<&Conversation>,
11541) -> Result<Option<i64>> {
11542    franken_find_existing_conversation_by_key_impl(tx, key, conv, true)
11543}
11544
11545fn franken_find_existing_conversation_by_key_impl(
11546    tx: &FrankenTransaction<'_>,
11547    key: &PendingConversationKey,
11548    conv: Option<&Conversation>,
11549    allow_legacy_external_scan: bool,
11550) -> Result<Option<i64>> {
11551    match key {
11552        PendingConversationKey::External {
11553            source_id,
11554            agent_id,
11555            external_id,
11556        } => {
11557            let lookup_key = conversation_external_lookup_key(source_id, *agent_id, external_id);
11558            if let Some(existing_id) = franken_find_external_conversation_lookup(tx, &lookup_key)? {
11559                return Ok(Some(existing_id));
11560            }
11561            if !allow_legacy_external_scan {
11562                return Ok(None);
11563            }
11564
11565            let existing_id = tx
11566                .query_row_map(
11567                    "SELECT id
11568                 FROM conversations
11569                 WHERE source_id = ?1 AND agent_id = ?2 AND external_id = ?3",
11570                    fparams![source_id.as_str(), *agent_id, external_id.as_str()],
11571                    |row| row.get_typed(0),
11572                )
11573                .optional()?;
11574            if let Some(existing_id) = existing_id {
11575                let tail_state = franken_existing_conversation_append_tail_state(tx, existing_id)?;
11576                franken_insert_external_conversation_tail_lookup_key(
11577                    tx,
11578                    &lookup_key,
11579                    existing_id,
11580                    tail_state.and_then(|state| state.ended_at),
11581                    tail_state.map(|state| state.last_message_idx),
11582                    tail_state.map(|state| state.last_message_created_at),
11583                )?;
11584                Ok(Some(existing_id))
11585            } else {
11586                Ok(None)
11587            }
11588        }
11589        PendingConversationKey::SourcePath {
11590            source_id,
11591            agent_id,
11592            source_path,
11593            started_at,
11594        } => {
11595            let exact_match = tx
11596                .query_row_map(
11597                    "SELECT c.id
11598                     FROM conversations c
11599                     WHERE c.source_id = ?1
11600                       AND c.agent_id = ?2
11601                       AND c.source_path = ?3
11602                       AND ((
11603                            COALESCE(
11604                                c.started_at,
11605                                (SELECT MIN(created_at)
11606                                 FROM messages
11607                                 WHERE conversation_id = c.id
11608                                   AND created_at IS NOT NULL)
11609                            ) IS NULL
11610                            AND ?4 IS NULL
11611                       ) OR COALESCE(
11612                            c.started_at,
11613                            (SELECT MIN(created_at)
11614                             FROM messages
11615                             WHERE conversation_id = c.id
11616                               AND created_at IS NOT NULL)
11617                       ) = ?4)
11618                     ORDER BY c.id
11619                     LIMIT 1",
11620                    fparams![
11621                        source_id.as_str(),
11622                        *agent_id,
11623                        source_path.as_str(),
11624                        *started_at
11625                    ],
11626                    |row| row.get_typed(0),
11627                )
11628                .optional()?;
11629            if exact_match.is_some() {
11630                return Ok(exact_match);
11631            }
11632
11633            let Some(conv) = conv else {
11634                return Ok(None);
11635            };
11636            let incoming_fingerprints = conversation_message_fingerprints(conv);
11637            if incoming_fingerprints.is_empty() {
11638                return Ok(None);
11639            }
11640            let incoming_replay_fingerprints = conversation_message_replay_fingerprints(conv);
11641
11642            let candidates: Vec<(i64, Option<i64>)> = tx.query_map_collect(
11643                "SELECT
11644                     c.id,
11645                     COALESCE(
11646                         c.started_at,
11647                         (SELECT MIN(created_at)
11648                          FROM messages
11649                          WHERE conversation_id = c.id
11650                            AND created_at IS NOT NULL)
11651                     ) AS effective_started_at
11652                 FROM conversations c
11653                 WHERE c.source_id = ?1
11654                   AND c.agent_id = ?2
11655                   AND c.source_path = ?3
11656                 ORDER BY c.id",
11657                fparams![source_id.as_str(), *agent_id, source_path.as_str()],
11658                |row| Ok((row.get_typed(0)?, row.get_typed(1)?)),
11659            )?;
11660
11661            let mut best_candidate: Option<(i64, ConversationMergeEvidence)> = None;
11662            for (candidate_id, candidate_started_at) in candidates {
11663                let existing_fingerprints =
11664                    franken_existing_message_fingerprints(tx, candidate_id)?;
11665                let existing_replay_fingerprints =
11666                    replay_fingerprints_from_merge_set(&existing_fingerprints);
11667                let Some(evidence) = conversation_merge_evidence(
11668                    &incoming_fingerprints,
11669                    &incoming_replay_fingerprints,
11670                    &existing_fingerprints,
11671                    &existing_replay_fingerprints,
11672                    *started_at,
11673                    candidate_started_at,
11674                ) else {
11675                    continue;
11676                };
11677
11678                let candidate_key = (
11679                    evidence.exact_overlap,
11680                    evidence.replay_overlap,
11681                    evidence.started_close,
11682                    evidence.smaller_replay_set,
11683                    std::cmp::Reverse(evidence.start_distance_ms),
11684                );
11685                let should_replace = best_candidate
11686                    .as_ref()
11687                    .map(|(_, best_evidence)| {
11688                        candidate_key
11689                            > (
11690                                best_evidence.exact_overlap,
11691                                best_evidence.replay_overlap,
11692                                best_evidence.started_close,
11693                                best_evidence.smaller_replay_set,
11694                                std::cmp::Reverse(best_evidence.start_distance_ms),
11695                            )
11696                    })
11697                    .unwrap_or(true);
11698
11699                if should_replace {
11700                    best_candidate = Some((candidate_id, evidence));
11701                }
11702            }
11703
11704            Ok(best_candidate.map(|(candidate_id, _)| candidate_id))
11705        }
11706    }
11707}
11708
11709fn franken_insert_conversation_or_get_existing(
11710    tx: &FrankenTransaction<'_>,
11711    agent_id: i64,
11712    workspace_id: Option<i64>,
11713    conv: &Conversation,
11714) -> Result<ConversationInsertStatus> {
11715    let conversation_key = conversation_merge_key(agent_id, conv);
11716    if let Some(existing_id) =
11717        franken_find_existing_conversation_by_key(tx, &conversation_key, Some(conv))?
11718    {
11719        return Ok(ConversationInsertStatus::Existing(existing_id));
11720    }
11721
11722    franken_insert_conversation_or_get_existing_after_miss(
11723        tx,
11724        agent_id,
11725        workspace_id,
11726        conv,
11727        &conversation_key,
11728    )
11729}
11730
11731fn franken_insert_conversation_or_get_existing_after_miss(
11732    tx: &FrankenTransaction<'_>,
11733    agent_id: i64,
11734    workspace_id: Option<i64>,
11735    conv: &Conversation,
11736    conversation_key: &PendingConversationKey,
11737) -> Result<ConversationInsertStatus> {
11738    match franken_insert_conversation(tx, agent_id, workspace_id, conv) {
11739        Ok(Some(conv_id)) => Ok(ConversationInsertStatus::Inserted(conv_id)),
11740        Ok(None) => {
11741            // A concurrent writer won the unique-provenance race. Resolve the
11742            // canonical row so callers can merge messages into it.
11743            let existing_id =
11744                franken_find_existing_conversation_by_key_after_conflict(
11745                    tx,
11746                    conversation_key,
11747                    Some(conv),
11748                )?
11749                    .with_context(|| {
11750                        format!(
11751                            "conversation INSERT produced a duplicate conflict but existing row was not found for source_id={} agent_id={} external_id={:?} source_path={}",
11752                            conv.source_id,
11753                            agent_id,
11754                            conv.external_id,
11755                            conv.source_path.display()
11756                        )
11757                    })?;
11758            tracing::warn!(
11759                source_id = %conv.source_id,
11760                agent_id,
11761                external_id = ?conv.external_id,
11762                existing_id,
11763                source_path = %conv.source_path.display(),
11764                "conversation INSERT: duplicate gracefully recovered, reusing existing row"
11765            );
11766            Ok(ConversationInsertStatus::Existing(existing_id))
11767        }
11768        Err(error) => {
11769            tracing::error!(
11770                source_id = %conv.source_id,
11771                agent_id,
11772                external_id = ?conv.external_id,
11773                error = %error,
11774                source_path = %conv.source_path.display(),
11775                "franken_insert_conversation failed"
11776            );
11777            Err(error)
11778        }
11779    }
11780}
11781
11782/// Insert a conversation into the DB within a frankensqlite transaction.
11783///
11784/// Uses a plain `INSERT` so the common miss path stays on the slim direct
11785/// insert lane. Duplicate provenance conflicts are converted into `Ok(None)`
11786/// so callers can recover the canonical row and merge messages into it.
11787fn franken_insert_conversation(
11788    tx: &FrankenTransaction<'_>,
11789    agent_id: i64,
11790    workspace_id: Option<i64>,
11791    conv: &Conversation,
11792) -> Result<Option<i64>> {
11793    let (metadata_json_str, metadata_bin) = franken_metadata_insert_payload(&conv.metadata_json)?;
11794    let (last_message_idx, last_message_created_at) = conversation_tail_state(conv);
11795    let metadata_bin_bytes = metadata_bin.as_deref();
11796
11797    match tx.execute_compat(
11798        "INSERT INTO conversations(
11799            agent_id, workspace_id, source_id, external_id, title, source_path,
11800            started_at, ended_at, approx_tokens, metadata_json, origin_host, metadata_bin,
11801            last_message_idx, last_message_created_at
11802        ) VALUES(?1,?2,?3,?4,?5,?6,?7,?8,?9,?10,?11,?12,?13,?14)",
11803        fparams![
11804            agent_id,
11805            workspace_id,
11806            conv.source_id.as_str(),
11807            conv.external_id.as_deref(),
11808            conv.title.as_deref(),
11809            path_to_string(&conv.source_path),
11810            conv.started_at,
11811            conv.ended_at,
11812            conv.approx_tokens,
11813            metadata_json_str.as_deref(),
11814            conv.origin_host.as_deref(),
11815            metadata_bin_bytes,
11816            last_message_idx,
11817            last_message_created_at
11818        ],
11819    ) {
11820        Ok(_) => {
11821            let conv_id = franken_last_rowid(tx)?;
11822            franken_insert_conversation_tail_state(
11823                tx,
11824                conv_id,
11825                conv.ended_at,
11826                last_message_idx,
11827                last_message_created_at,
11828            )?;
11829            if let Some(external_id) = conv.external_id.as_deref() {
11830                franken_insert_external_conversation_tail_lookup(
11831                    tx,
11832                    conv.source_id.as_str(),
11833                    agent_id,
11834                    external_id,
11835                    ExistingConversationWithTail {
11836                        id: conv_id,
11837                        tail_state: existing_conversation_tail_state_from_cached(
11838                            last_message_idx,
11839                            last_message_created_at,
11840                            conv.ended_at,
11841                        ),
11842                    },
11843                )?;
11844            }
11845            Ok(Some(conv_id))
11846        }
11847        Err(frankensqlite::FrankenError::UniqueViolation { .. }) => {
11848            tracing::debug!(
11849                source_id = %conv.source_id,
11850                agent_id,
11851                external_id = ?conv.external_id,
11852                source_path = %conv.source_path.display(),
11853                "conversation INSERT: duplicate provenance conflict"
11854            );
11855            Ok(None)
11856        }
11857        Err(error) => Err(error.into()),
11858    }
11859}
11860
11861type MetadataInsertPayload<'a> = (Option<Cow<'a, str>>, Option<Vec<u8>>);
11862
11863fn franken_metadata_insert_payload(value: &serde_json::Value) -> Result<MetadataInsertPayload<'_>> {
11864    if let Some(raw) = historical_raw_json(value) {
11865        Ok((Some(Cow::Borrowed(raw)), None))
11866    } else if value.is_null() {
11867        Ok((Some(Cow::Borrowed("null")), None))
11868    } else if value.as_object().is_some_and(|object| object.is_empty()) {
11869        Ok((None, None))
11870    } else if let Some(metadata_bin) = serialize_json_to_msgpack(value) {
11871        Ok((None, Some(metadata_bin)))
11872    } else {
11873        Ok((Some(Cow::Owned(serde_json::to_string(value)?)), None))
11874    }
11875}
11876
11877fn franken_insert_new_message(
11878    tx: &FrankenTransaction<'_>,
11879    conversation_id: i64,
11880    msg: &Message,
11881) -> Result<i64> {
11882    let (extra_json_str, extra_bin) = franken_message_insert_payload(msg)?;
11883    let extra_bin_bytes = extra_bin.as_deref();
11884
11885    tx.execute_compat(
11886        "INSERT INTO messages(conversation_id, idx, role, author, created_at, content, extra_json, extra_bin)
11887         VALUES(?1,?2,?3,?4,?5,?6,?7,?8)",
11888            fparams![
11889                conversation_id,
11890                msg.idx,
11891                role_as_str(&msg.role),
11892                msg.author.as_deref(),
11893                msg.created_at,
11894                msg.content.as_str(),
11895                extra_json_str.as_deref(),
11896                extra_bin_bytes
11897        ],
11898    )?;
11899    franken_last_rowid(tx)
11900}
11901
11902type MessageInsertPayload<'a> = (Option<Cow<'a, str>>, Option<Vec<u8>>);
11903
11904fn franken_message_insert_payload(msg: &Message) -> Result<MessageInsertPayload<'_>> {
11905    if let Some(raw) = historical_raw_json(&msg.extra_json) {
11906        Ok((Some(Cow::Borrowed(raw)), None))
11907    } else if msg.extra_json.is_null() {
11908        Ok((None, None))
11909    } else {
11910        let extra_bin = serialize_json_to_msgpack(&msg.extra_json);
11911        if extra_bin.is_some() {
11912            Ok((None, extra_bin))
11913        } else {
11914            Ok((
11915                Some(Cow::Owned(serde_json::to_string(&msg.extra_json)?)),
11916                None,
11917            ))
11918        }
11919    }
11920}
11921
/// Rows per statement when inserting messages already proven to be new.
///
/// Every row binds 8 parameters, so a full batch binds 800 — below the 999
/// default of `SQLITE_MAX_VARIABLE_NUMBER` — while still amortizing statement
/// parse cost over many rows.
const MESSAGE_INSERT_BATCH_SIZE: usize = 100;

/// Rows per statement on the append path.
///
/// Profiling on current frankensqlite favours larger chunks here: once the
/// tail-state hot table took conversation-row rewrites out of the append
/// path, 50-row chunks outperformed the previous 20-row setting on the
/// append-merge profile, while 100-row chunks slightly regress the
/// 20-message workload.
const APPEND_MESSAGE_INSERT_BATCH_SIZE: usize = 50;

/// Returns the cached multi-row `INSERT INTO messages` statement for
/// `row_count` rows.
///
/// Statements for every row count from 1 up to the larger of the two batch
/// size constants are built once, on first use. Slot 0 holds an empty string.
///
/// # Panics
/// Panics when `row_count` exceeds the largest cached batch size.
fn message_insert_batch_sql(row_count: usize) -> &'static str {
    static STATEMENTS_BY_ROW_COUNT: std::sync::OnceLock<Vec<String>> = std::sync::OnceLock::new();

    let statements = STATEMENTS_BY_ROW_COUNT.get_or_init(|| {
        let largest_batch = MESSAGE_INSERT_BATCH_SIZE.max(APPEND_MESSAGE_INSERT_BATCH_SIZE);
        // Index == row count, so index 0 is a placeholder entry.
        std::iter::once(String::new())
            .chain((1..=largest_batch).map(|rows| {
                let placeholders = (0..rows)
                    .map(|row| {
                        // Parameters are numbered consecutively, 8 per row.
                        let columns: Vec<String> = (1..=8)
                            .map(|column| format!("?{}", row * 8 + column))
                            .collect();
                        format!("({})", columns.join(","))
                    })
                    .collect::<Vec<_>>()
                    .join(",");
                format!(
                    "INSERT INTO messages(conversation_id, idx, role, author, created_at, content, extra_json, extra_bin) VALUES {placeholders}"
                )
            }))
            .collect()
    });

    statements
        .get(row_count)
        .map(String::as_str)
        .expect("message insert batch size must be covered by the cached SQL table")
}
11972
11973fn franken_batch_insert_new_messages(
11974    tx: &FrankenTransaction<'_>,
11975    conversation_id: i64,
11976    messages: &[&Message],
11977) -> Result<Vec<i64>> {
11978    franken_batch_insert_new_messages_with_batch_size(
11979        tx,
11980        conversation_id,
11981        messages,
11982        MESSAGE_INSERT_BATCH_SIZE,
11983    )
11984}
11985
11986fn franken_append_insert_new_messages(
11987    tx: &FrankenTransaction<'_>,
11988    conversation_id: i64,
11989    messages: &[&Message],
11990) -> Result<Vec<i64>> {
11991    franken_batch_insert_new_messages_with_batch_size(
11992        tx,
11993        conversation_id,
11994        messages,
11995        APPEND_MESSAGE_INSERT_BATCH_SIZE,
11996    )
11997}
11998
11999fn franken_batch_insert_new_messages_with_batch_size(
12000    tx: &FrankenTransaction<'_>,
12001    conversation_id: i64,
12002    messages: &[&Message],
12003    batch_size: usize,
12004) -> Result<Vec<i64>> {
12005    let batch_size = batch_size.max(1);
12006    let mut inserted_ids = Vec::with_capacity(messages.len());
12007    for chunk in messages.chunks(batch_size) {
12008        if chunk.len() == 1 {
12009            inserted_ids.push(franken_insert_new_message(tx, conversation_id, chunk[0])?);
12010            continue;
12011        }
12012        let sql = message_insert_batch_sql(chunk.len());
12013
12014        let mut param_values: Vec<SqliteValue> = Vec::with_capacity(chunk.len() * 8);
12015        for msg in chunk {
12016            let (extra_json_str, extra_bin) = franken_message_insert_payload(msg)?;
12017            param_values.push(SqliteValue::from(conversation_id));
12018            param_values.push(SqliteValue::from(msg.idx));
12019            param_values.push(SqliteValue::from(role_as_str(&msg.role)));
12020            param_values.push(SqliteValue::from(msg.author.as_deref()));
12021            param_values.push(SqliteValue::from(msg.created_at));
12022            param_values.push(SqliteValue::from(msg.content.as_str()));
12023            param_values.push(SqliteValue::from(extra_json_str.as_deref()));
12024            param_values.push(SqliteValue::from(extra_bin.as_deref()));
12025        }
12026
12027        tx.execute_with_params(sql, &param_values)?;
12028
12029        let last_id = franken_last_rowid(tx)?;
12030        let first_id = last_id
12031            .checked_sub((chunk.len() - 1) as i64)
12032            .with_context(|| {
12033                format!(
12034                    "inferring rowid range for {}-row message batch ending at {last_id}",
12035                    chunk.len()
12036                )
12037            })?;
12038        inserted_ids.extend((0..chunk.len()).map(|offset| first_id + offset as i64));
12039    }
12040
12041    Ok(inserted_ids)
12042}
12043
/// Test-only single-row message insert that also records timings.
///
/// Increments `single_row_calls` and `batch_rows` by one, then adds the time
/// spent building the payload, executing the insert, and reading the rowid to
/// the matching duration fields of `profile`.
///
/// # Errors
/// Propagates payload serialization, insert, and rowid lookup failures.
#[cfg(test)]
fn franken_insert_new_message_with_profile(
    tx: &FrankenTransaction<'_>,
    conversation_id: i64,
    msg: &Message,
    profile: &mut MessageInsertSubstageProfile,
) -> Result<i64> {
    profile.single_row_calls += 1;
    profile.batch_rows += 1;

    let payload_timer = Instant::now();
    let (extra_text, extra_blob) = franken_message_insert_payload(msg)?;
    profile.payload_duration += payload_timer.elapsed();
    let extra_blob_bytes = extra_blob.as_deref();

    let sql = "INSERT INTO messages(conversation_id, idx, role, author, created_at, content, extra_json, extra_bin)
         VALUES(?1,?2,?3,?4,?5,?6,?7,?8)";
    let execute_timer = Instant::now();
    tx.execute_compat(
        sql,
        fparams![
            conversation_id,
            msg.idx,
            role_as_str(&msg.role),
            msg.author.as_deref(),
            msg.created_at,
            msg.content.as_str(),
            extra_text.as_deref(),
            extra_blob_bytes
        ],
    )?;
    profile.execute_duration += execute_timer.elapsed();

    let rowid_timer = Instant::now();
    let inserted_rowid = franken_last_rowid(tx)?;
    profile.rowid_duration += rowid_timer.elapsed();
    Ok(inserted_rowid)
}
12081
/// Test-only profiled batch insert using `MESSAGE_INSERT_BATCH_SIZE` as the
/// chunk size.
#[cfg(test)]
fn franken_batch_insert_new_messages_with_profile(
    tx: &FrankenTransaction<'_>,
    conversation_id: i64,
    messages: &[&Message],
    profile: &mut MessageInsertSubstageProfile,
) -> Result<Vec<i64>> {
    let batch_size = MESSAGE_INSERT_BATCH_SIZE;
    franken_batch_insert_new_messages_with_profile_batch_size(
        tx,
        conversation_id,
        messages,
        profile,
        batch_size,
    )
}
12097
/// Test-only profiled batch insert using `APPEND_MESSAGE_INSERT_BATCH_SIZE`
/// as the chunk size.
#[cfg(test)]
fn franken_append_insert_new_messages_with_profile(
    tx: &FrankenTransaction<'_>,
    conversation_id: i64,
    messages: &[&Message],
    profile: &mut MessageInsertSubstageProfile,
) -> Result<Vec<i64>> {
    let batch_size = APPEND_MESSAGE_INSERT_BATCH_SIZE;
    franken_batch_insert_new_messages_with_profile_batch_size(
        tx,
        conversation_id,
        messages,
        profile,
        batch_size,
    )
}
12113
/// Test-only, profiled twin of the chunked message insert.
///
/// Inserts `messages` for `conversation_id` in chunks of at most `batch_size`
/// rows, returning one row id per message in input order, while accumulating
/// per-stage counters and durations in `profile`.
///
/// `batch_size` is clamped to `1..=max(MESSAGE_INSERT_BATCH_SIZE,
/// APPEND_MESSAGE_INSERT_BATCH_SIZE)` because `message_insert_batch_sql` only
/// caches statements up to that row count and panics beyond it.
///
/// A chunk of exactly one message goes through the profiled single-row
/// insert. Larger chunks use one multi-row `INSERT`; their ids are inferred
/// as the `chunk.len()` consecutive values ending at the last inserted rowid,
/// which relies on the rows of one statement receiving consecutive rowids.
///
/// # Errors
/// Propagates payload serialization, insert, and rowid lookup failures, and
/// fails if the inferred first rowid would underflow.
#[cfg(test)]
fn franken_batch_insert_new_messages_with_profile_batch_size(
    tx: &FrankenTransaction<'_>,
    conversation_id: i64,
    messages: &[&Message],
    profile: &mut MessageInsertSubstageProfile,
    batch_size: usize,
) -> Result<Vec<i64>> {
    // Keep every chunk within what the cached SQL table can serve.
    let max_cached_batch = MESSAGE_INSERT_BATCH_SIZE.max(APPEND_MESSAGE_INSERT_BATCH_SIZE);
    let batch_size = batch_size.clamp(1, max_cached_batch);

    let mut inserted_ids = Vec::with_capacity(messages.len());
    for chunk in messages.chunks(batch_size) {
        if chunk.len() == 1 {
            inserted_ids.push(franken_insert_new_message_with_profile(
                tx,
                conversation_id,
                chunk[0],
                profile,
            )?);
            continue;
        }

        profile.batch_calls += 1;
        profile.batch_rows += chunk.len();

        let sql_build_start = Instant::now();
        let sql = message_insert_batch_sql(chunk.len());
        profile.sql_build_duration += sql_build_start.elapsed();

        // Eight bound values per row, in the column order of the statement.
        let mut param_values: Vec<SqliteValue> = Vec::with_capacity(chunk.len() * 8);
        for msg in chunk {
            let payload_start = Instant::now();
            let (extra_json_str, extra_bin) = franken_message_insert_payload(msg)?;
            profile.payload_duration += payload_start.elapsed();

            let param_build_start = Instant::now();
            param_values.push(SqliteValue::from(conversation_id));
            param_values.push(SqliteValue::from(msg.idx));
            param_values.push(SqliteValue::from(role_as_str(&msg.role)));
            param_values.push(SqliteValue::from(msg.author.as_deref()));
            param_values.push(SqliteValue::from(msg.created_at));
            param_values.push(SqliteValue::from(msg.content.as_str()));
            param_values.push(SqliteValue::from(extra_json_str.as_deref()));
            param_values.push(SqliteValue::from(extra_bin.as_deref()));
            profile.param_build_duration += param_build_start.elapsed();
        }

        let execute_start = Instant::now();
        tx.execute_with_params(sql, &param_values)?;
        profile.execute_duration += execute_start.elapsed();

        // The rowid timing covers both the lookup and the range expansion.
        let rowid_start = Instant::now();
        let last_id = franken_last_rowid(tx)?;
        let first_id = last_id
            .checked_sub((chunk.len() - 1) as i64)
            .with_context(|| {
                format!(
                    "inferring rowid range for {}-row message batch ending at {last_id}",
                    chunk.len()
                )
            })?;
        // `first_id..=last_id` spans exactly `chunk.len()` ids.
        inserted_ids.extend(first_id..=last_id);
        profile.rowid_duration += rowid_start.elapsed();
    }

    Ok(inserted_ids)
}
12180
12181/// Insert snippets within a frankensqlite transaction.
12182fn franken_insert_snippets(
12183    tx: &FrankenTransaction<'_>,
12184    message_id: i64,
12185    snippets: &[Snippet],
12186) -> Result<()> {
12187    for snip in snippets {
12188        let file_path_str = snip.file_path.as_ref().map(path_to_string);
12189        tx.execute_compat(
12190            "INSERT INTO snippets(message_id, file_path, start_line, end_line, language, snippet_text)
12191             VALUES(?1,?2,?3,?4,?5,?6)",
12192            fparams![
12193                message_id,
12194                file_path_str.as_deref(),
12195                snip.start_line,
12196                snip.end_line,
12197                snip.language.as_deref(),
12198                snip.snippet_text.as_deref()
12199            ],
12200        )?;
12201    }
12202    Ok(())
12203}
12204
12205fn franken_existing_message_fingerprints(
12206    tx: &FrankenTransaction<'_>,
12207    conversation_id: i64,
12208) -> Result<HashSet<MessageMergeFingerprint>> {
12209    let rows = tx.query_params(
12210        "SELECT idx, role, author, created_at, content
12211         FROM messages
12212         WHERE conversation_id = ?1",
12213        fparams![conversation_id],
12214    )?;
12215    let mut fingerprints = HashSet::with_capacity(rows.len());
12216    for row in rows {
12217        let role: String = row.get_typed(1)?;
12218        let content: String = row.get_typed(4)?;
12219        fingerprints.insert(MessageMergeFingerprint {
12220            idx: row.get_typed(0)?,
12221            created_at: row.get_typed(3)?,
12222            role: role_from_str(&role),
12223            author: row.get_typed(2)?,
12224            content_hash: *blake3::hash(content.as_bytes()).as_bytes(),
12225        });
12226    }
12227    Ok(fingerprints)
12228}
12229
/// Fingerprints of messages already stored for one conversation, kept in the
/// two shapes used when comparing them against incoming messages.
struct ExistingMessageLookup {
    /// Merge fingerprint of a stored message, keyed by that message's `idx`.
    by_idx: HashMap<i64, MessageMergeFingerprint>,
    /// Fingerprints without an `idx` component (timestamp, role, author and
    /// content hash only).
    replay: HashSet<MessageReplayFingerprint>,
}
12234
12235fn franken_existing_message_lookup(
12236    tx: &FrankenTransaction<'_>,
12237    conversation_id: i64,
12238    incoming_messages: &[Message],
12239) -> Result<ExistingMessageLookup> {
12240    if incoming_messages.is_empty() {
12241        return Ok(ExistingMessageLookup {
12242            by_idx: HashMap::new(),
12243            replay: HashSet::new(),
12244        });
12245    }
12246
12247    let min_idx = incoming_messages
12248        .iter()
12249        .map(|msg| msg.idx)
12250        .min()
12251        .unwrap_or(0);
12252    let max_idx = incoming_messages
12253        .iter()
12254        .map(|msg| msg.idx)
12255        .max()
12256        .unwrap_or(min_idx);
12257    let requires_full_scan = incoming_messages.iter().any(|msg| msg.created_at.is_none());
12258    let created_bounds = incoming_messages
12259        .iter()
12260        .filter_map(|msg| msg.created_at)
12261        .fold(None, |bounds: Option<(i64, i64)>, created_at| {
12262            Some(match bounds {
12263                Some((min_created_at, max_created_at)) => (
12264                    min_created_at.min(created_at),
12265                    max_created_at.max(created_at),
12266                ),
12267                None => (created_at, created_at),
12268            })
12269        });
12270
12271    let mut indexed_by_idx = HashMap::with_capacity(incoming_messages.len());
12272    let mut indexed_replay = HashSet::with_capacity(incoming_messages.len());
12273    let mut exact_idx_match = true;
12274    for msg in incoming_messages {
12275        record_message_lookup_exact_idx_probe();
12276        let Some((role, author, created_at, content)) = tx
12277            .query_row_map(
12278                "SELECT role, author, created_at, content
12279                 FROM messages INDEXED BY sqlite_autoindex_messages_1
12280                 WHERE conversation_id = ?1 AND idx = ?2
12281                 LIMIT 1",
12282                fparams![conversation_id, msg.idx],
12283                |row| {
12284                    Ok((
12285                        row.get_typed::<String>(0)?,
12286                        row.get_typed::<Option<String>>(1)?,
12287                        row.get_typed::<Option<i64>>(2)?,
12288                        row.get_typed::<String>(3)?,
12289                    ))
12290                },
12291            )
12292            .optional()?
12293        else {
12294            exact_idx_match = false;
12295            break;
12296        };
12297        let role = role_from_str(&role);
12298        let content_hash = *blake3::hash(content.as_bytes()).as_bytes();
12299        let fingerprint = MessageMergeFingerprint {
12300            idx: msg.idx,
12301            created_at,
12302            role: role.clone(),
12303            author: author.clone(),
12304            content_hash,
12305        };
12306        if fingerprint != message_merge_fingerprint(msg) {
12307            exact_idx_match = false;
12308            break;
12309        }
12310        indexed_by_idx.insert(msg.idx, fingerprint);
12311        indexed_replay.insert(MessageReplayFingerprint {
12312            created_at,
12313            role,
12314            author,
12315            content_hash,
12316        });
12317    }
12318
12319    if exact_idx_match {
12320        return Ok(ExistingMessageLookup {
12321            by_idx: indexed_by_idx,
12322            replay: indexed_replay,
12323        });
12324    }
12325
12326    let (rows, replay_full_scan) = if requires_full_scan {
12327        let rows = tx.query_params(
12328            "SELECT idx, role, author, created_at, content
12329             FROM messages INDEXED BY sqlite_autoindex_messages_1
12330             WHERE conversation_id = ?1",
12331            fparams![conversation_id],
12332        )?;
12333        record_message_lookup_full_scan_query(rows.len());
12334        (rows, true)
12335    } else if let Some((min_created_at, max_created_at)) = created_bounds {
12336        let mut rows = tx.query_params(
12337            "SELECT idx, role, author, created_at, content
12338             FROM messages INDEXED BY sqlite_autoindex_messages_1
12339             WHERE conversation_id = ?1
12340               AND idx >= ?2
12341               AND idx <= ?3",
12342            fparams![conversation_id, min_idx, max_idx],
12343        )?;
12344        rows.extend(tx.query_params(
12345            "SELECT idx, role, author, created_at, content
12346             FROM messages INDEXED BY sqlite_autoindex_messages_1
12347             WHERE conversation_id = ?1
12348               AND created_at IS NOT NULL
12349               AND created_at >= ?2
12350               AND created_at <= ?3",
12351            fparams![conversation_id, min_created_at, max_created_at],
12352        )?);
12353        record_message_lookup_bounded_queries(2, rows.len());
12354        (rows, false)
12355    } else {
12356        let rows = tx.query_params(
12357            "SELECT idx, role, author, created_at, content
12358             FROM messages INDEXED BY sqlite_autoindex_messages_1
12359             WHERE conversation_id = ?1",
12360            fparams![conversation_id],
12361        )?;
12362        record_message_lookup_full_scan_query(rows.len());
12363        (rows, true)
12364    };
12365
12366    let mut by_idx = HashMap::with_capacity(rows.len());
12367    let mut replay = HashSet::with_capacity(rows.len());
12368    for row in rows {
12369        let idx: i64 = row.get_typed(0)?;
12370        let role: String = row.get_typed(1)?;
12371        let author: Option<String> = row.get_typed(2)?;
12372        let created_at: Option<i64> = row.get_typed(3)?;
12373        let content: String = row.get_typed(4)?;
12374        let role = role_from_str(&role);
12375        let content_hash = *blake3::hash(content.as_bytes()).as_bytes();
12376
12377        if idx >= min_idx && idx <= max_idx {
12378            by_idx.insert(
12379                idx,
12380                MessageMergeFingerprint {
12381                    idx,
12382                    created_at,
12383                    role: role.clone(),
12384                    author: author.clone(),
12385                    content_hash,
12386                },
12387            );
12388        }
12389
12390        let replay_matches = if replay_full_scan {
12391            true
12392        } else if let Some((min_created_at, max_created_at)) = created_bounds {
12393            created_at.is_some_and(|ts| ts >= min_created_at && ts <= max_created_at)
12394        } else {
12395            true
12396        };
12397        if replay_matches {
12398            replay.insert(MessageReplayFingerprint {
12399                created_at,
12400                role,
12401                author,
12402                content_hash,
12403            });
12404        }
12405    }
12406
12407    Ok(ExistingMessageLookup { by_idx, replay })
12408}
12409
12410fn franken_existing_message_lookup_with_pending(
12411    tx: &FrankenTransaction<'_>,
12412    conversation_id: i64,
12413    incoming_messages: &[Message],
12414    pending_message_fingerprints: &mut HashMap<i64, HashMap<i64, MessageMergeFingerprint>>,
12415    pending_message_replay_fingerprints: &mut HashMap<i64, HashSet<MessageReplayFingerprint>>,
12416) -> Result<ExistingMessageLookup> {
12417    if let (Some(by_idx), Some(replay)) = (
12418        pending_message_fingerprints.get(&conversation_id),
12419        pending_message_replay_fingerprints.get(&conversation_id),
12420    ) {
12421        if incoming_messages.iter().all(|msg| {
12422            by_idx.contains_key(&msg.idx) || replay.contains(&message_replay_fingerprint(msg))
12423        }) {
12424            return Ok(ExistingMessageLookup {
12425                by_idx: by_idx.clone(),
12426                replay: replay.clone(),
12427            });
12428        }
12429
12430        let fresh = franken_existing_message_lookup(tx, conversation_id, incoming_messages)?;
12431        let mut merged_by_idx = by_idx.clone();
12432        let mut merged_replay = replay.clone();
12433        merged_by_idx.extend(fresh.by_idx);
12434        merged_replay.extend(fresh.replay);
12435        pending_message_fingerprints.insert(conversation_id, merged_by_idx.clone());
12436        pending_message_replay_fingerprints.insert(conversation_id, merged_replay.clone());
12437        return Ok(ExistingMessageLookup {
12438            by_idx: merged_by_idx,
12439            replay: merged_replay,
12440        });
12441    }
12442
12443    let lookup = franken_existing_message_lookup(tx, conversation_id, incoming_messages)?;
12444    pending_message_fingerprints.insert(conversation_id, lookup.by_idx.clone());
12445    pending_message_replay_fingerprints.insert(conversation_id, lookup.replay.clone());
12446    Ok(lookup)
12447}
12448
12449/// Batch insert FTS5 entries within a frankensqlite transaction.
12450fn franken_batch_insert_fts(tx: &FrankenTransaction<'_>, entries: &[FtsEntry]) -> Result<usize> {
12451    if entries.is_empty() {
12452        return Ok(0);
12453    }
12454
12455    let mut inserted = 0;
12456
12457    for chunk in entries.chunks(FTS5_BATCH_SIZE) {
12458        let placeholders: String = chunk
12459            .iter()
12460            .enumerate()
12461            .map(|(i, _)| {
12462                let base = i * 7 + 1; // +1 for 1-indexed params
12463                format!(
12464                    "(?{},?{},?{},?{},?{},?{},?{})",
12465                    base,
12466                    base + 1,
12467                    base + 2,
12468                    base + 3,
12469                    base + 4,
12470                    base + 5,
12471                    base + 6
12472                )
12473            })
12474            .collect::<Vec<_>>()
12475            .join(",");
12476
12477        let sql = format!(
12478            "INSERT INTO fts_messages(rowid, content, title, agent, workspace, source_path, created_at) VALUES {placeholders}"
12479        );
12480
12481        let mut param_values: Vec<SqliteValue> = Vec::with_capacity(chunk.len() * 7);
12482        for entry in chunk {
12483            param_values.push(SqliteValue::from(entry.message_id));
12484            param_values.push(SqliteValue::from(entry.content.as_str()));
12485            param_values.push(SqliteValue::from(entry.title.as_str()));
12486            param_values.push(SqliteValue::from(entry.agent.as_str()));
12487            param_values.push(SqliteValue::from(entry.workspace.as_str()));
12488            param_values.push(SqliteValue::from(entry.source_path.as_str()));
12489            param_values.push(SqliteValue::from(entry.created_at));
12490        }
12491
12492        match tx.execute_with_params(&sql, &param_values) {
12493            Ok(_) => {
12494                inserted += chunk.len();
12495            }
12496            Err(err) => {
12497                tracing::warn!(
12498                    error = %err,
12499                    chunk_docs = chunk.len(),
12500                    "frankensqlite FTS batch insert failed; skipping db-resident FTS maintenance because Tantivy is authoritative"
12501                );
12502                return Ok(inserted);
12503            }
12504        }
12505    }
12506
12507    Ok(inserted)
12508}
12509
12510fn franken_batch_insert_fts_on_connection(
12511    conn: &FrankenConnection,
12512    entries: &[FtsEntry],
12513) -> Result<usize> {
12514    if entries.is_empty() {
12515        return Ok(0);
12516    }
12517
12518    let mut inserted = 0;
12519
12520    for chunk in entries.chunks(FTS5_BATCH_SIZE) {
12521        let placeholders: String = chunk
12522            .iter()
12523            .enumerate()
12524            .map(|(i, _)| {
12525                let base = i * 7 + 1;
12526                format!(
12527                    "(?{},?{},?{},?{},?{},?{},?{})",
12528                    base,
12529                    base + 1,
12530                    base + 2,
12531                    base + 3,
12532                    base + 4,
12533                    base + 5,
12534                    base + 6
12535                )
12536            })
12537            .collect::<Vec<_>>()
12538            .join(",");
12539
12540        let sql = format!(
12541            "INSERT INTO fts_messages(rowid, content, title, agent, workspace, source_path, created_at) VALUES {placeholders}"
12542        );
12543
12544        let mut param_values: Vec<SqliteValue> = Vec::with_capacity(chunk.len() * 7);
12545        for entry in chunk {
12546            param_values.push(SqliteValue::from(entry.message_id));
12547            param_values.push(SqliteValue::from(entry.content.as_str()));
12548            param_values.push(SqliteValue::from(entry.title.as_str()));
12549            param_values.push(SqliteValue::from(entry.agent.as_str()));
12550            param_values.push(SqliteValue::from(entry.workspace.as_str()));
12551            param_values.push(SqliteValue::from(entry.source_path.as_str()));
12552            param_values.push(SqliteValue::from(entry.created_at));
12553        }
12554
12555        conn.execute_with_params(&sql, &param_values)
12556            .with_context(|| {
12557                format!(
12558                    "inserting {} rows into fts_messages during streaming FTS maintenance",
12559                    chunk.len()
12560                )
12561            })?;
12562        inserted += chunk.len();
12563    }
12564
12565    Ok(inserted)
12566}
12567
12568/// Update daily stats within a frankensqlite transaction.
12569fn franken_update_daily_stats_in_tx(
12570    storage: &FrankenStorage,
12571    tx: &FrankenTransaction<'_>,
12572    agent_slug: &str,
12573    source_id: &str,
12574    started_at: Option<i64>,
12575    delta: StatsDelta,
12576) -> Result<()> {
12577    let day_id = started_at
12578        .map(FrankenStorage::day_id_from_millis)
12579        .unwrap_or(0);
12580    let now = FrankenStorage::now_millis();
12581
12582    let targets = [
12583        DailyStatsTarget {
12584            day_id,
12585            agent_slug,
12586            source_id,
12587        },
12588        DailyStatsTarget {
12589            day_id,
12590            agent_slug: "all",
12591            source_id,
12592        },
12593        DailyStatsTarget {
12594            day_id,
12595            agent_slug,
12596            source_id: "all",
12597        },
12598        DailyStatsTarget {
12599            day_id,
12600            agent_slug: "all",
12601            source_id: "all",
12602        },
12603    ];
12604
12605    if agent_slug != "all"
12606        && source_id != "all"
12607        && franken_update_ensured_daily_stats_targets_in_tx(storage, tx, &targets, now, delta)?
12608    {
12609        return Ok(());
12610    }
12611
12612    for target in targets {
12613        franken_apply_daily_stats_delta_in_tx(storage, tx, target, now, delta)?;
12614    }
12615
12616    Ok(())
12617}
12618
/// Identifies one `daily_stats` row: a day plus an agent slug and a source id.
///
/// The string fields are borrowed, so a target is cheap to copy around.
#[derive(Copy, Clone)]
struct DailyStatsTarget<'a> {
    /// Day bucket the row belongs to.
    day_id: i64,
    /// Agent slug of the row; the literal `"all"` is used for aggregate rows.
    agent_slug: &'a str,
    /// Source id of the row; the literal `"all"` is used for aggregate rows.
    source_id: &'a str,
}
12625
12626fn franken_update_ensured_daily_stats_targets_in_tx(
12627    storage: &FrankenStorage,
12628    tx: &FrankenTransaction<'_>,
12629    targets: &[DailyStatsTarget<'_>; 4],
12630    now: i64,
12631    delta: StatsDelta,
12632) -> Result<bool> {
12633    let cache_keys = targets.map(|target| {
12634        EnsuredDailyStatsKey::new(target.day_id, target.agent_slug, target.source_id)
12635    });
12636    if !storage.daily_stats_keys_already_ensured(&cache_keys) {
12637        return Ok(false);
12638    }
12639
12640    let primary = targets[0];
12641    let rows_changed = tx.execute_compat(
12642        "UPDATE daily_stats
12643         SET session_count = session_count + ?4,
12644             message_count = message_count + ?5,
12645             total_chars = total_chars + ?6,
12646             last_updated = ?7
12647         WHERE day_id = ?1
12648           AND ((agent_slug = ?2 AND source_id = ?3)
12649                OR (agent_slug = 'all' AND source_id = ?3)
12650                OR (agent_slug = ?2 AND source_id = 'all')
12651                OR (agent_slug = 'all' AND source_id = 'all'))",
12652        fparams![
12653            primary.day_id,
12654            primary.agent_slug,
12655            primary.source_id,
12656            delta.session_count_delta,
12657            delta.message_count_delta,
12658            delta.total_chars_delta,
12659            now
12660        ],
12661    )?;
12662    if rows_changed == targets.len() {
12663        return Ok(true);
12664    }
12665
12666    for (target, cache_key) in targets.iter().copied().zip(cache_keys) {
12667        let exists = tx
12668            .query_row_map(
12669                "SELECT 1 FROM daily_stats
12670                 WHERE day_id = ?1 AND agent_slug = ?2 AND source_id = ?3
12671                 LIMIT 1",
12672                fparams![target.day_id, target.agent_slug, target.source_id],
12673                |row| row.get_typed::<i64>(0),
12674            )
12675            .optional()?
12676            .is_some();
12677        if exists {
12678            continue;
12679        }
12680
12681        tx.execute_compat(
12682            "INSERT INTO daily_stats(day_id, agent_slug, source_id, session_count, message_count, total_chars, last_updated)
12683             VALUES(?1,?2,?3,?4,?5,?6,?7)",
12684            fparams![
12685                target.day_id,
12686                target.agent_slug,
12687                target.source_id,
12688                delta.session_count_delta,
12689                delta.message_count_delta,
12690                delta.total_chars_delta,
12691                now
12692            ],
12693        )?;
12694        storage.mark_daily_stats_key_ensured(cache_key);
12695    }
12696
12697    Ok(true)
12698}
12699
12700fn franken_apply_daily_stats_delta_in_tx(
12701    storage: &FrankenStorage,
12702    tx: &FrankenTransaction<'_>,
12703    target: DailyStatsTarget<'_>,
12704    now: i64,
12705    delta: StatsDelta,
12706) -> Result<()> {
12707    let cache_key = EnsuredDailyStatsKey::new(target.day_id, target.agent_slug, target.source_id);
12708    if storage.daily_stats_key_already_ensured(&cache_key) {
12709        let rows_changed = tx.execute_compat(
12710            "UPDATE daily_stats
12711             SET session_count = session_count + ?4,
12712                 message_count = message_count + ?5,
12713                 total_chars = total_chars + ?6,
12714                 last_updated = ?7
12715             WHERE day_id = ?1 AND agent_slug = ?2 AND source_id = ?3",
12716            fparams![
12717                target.day_id,
12718                target.agent_slug,
12719                target.source_id,
12720                delta.session_count_delta,
12721                delta.message_count_delta,
12722                delta.total_chars_delta,
12723                now
12724            ],
12725        )?;
12726        if rows_changed > 0 {
12727            return Ok(());
12728        }
12729    }
12730
12731    tx.execute_compat(
12732        "INSERT INTO daily_stats(day_id, agent_slug, source_id, session_count, message_count, total_chars, last_updated)
12733         VALUES(?1,?2,?3,?4,?5,?6,?7)
12734         ON CONFLICT(day_id, agent_slug, source_id) DO UPDATE SET
12735            session_count = session_count + excluded.session_count,
12736            message_count = message_count + excluded.message_count,
12737            total_chars = total_chars + excluded.total_chars,
12738            last_updated = excluded.last_updated",
12739        fparams![
12740            target.day_id,
12741            target.agent_slug,
12742            target.source_id,
12743            delta.session_count_delta,
12744            delta.message_count_delta,
12745            delta.total_chars_delta,
12746            now
12747        ],
12748    )?;
12749    storage.mark_daily_stats_key_ensured(cache_key);
12750    Ok(())
12751}
12752
12753// -------------------------------------------------------------------------
12754// Frankensqlite batch helpers
12755// -------------------------------------------------------------------------
12756
12757/// Batch upsert daily_stats within a frankensqlite transaction.
12758fn franken_update_daily_stats_batched_in_tx(
12759    tx: &FrankenTransaction<'_>,
12760    entries: &[(i64, String, String, StatsDelta)],
12761) -> Result<usize> {
12762    if entries.is_empty() {
12763        return Ok(0);
12764    }
12765
12766    let now = FrankenStorage::now_millis();
12767    let mut total_affected = 0;
12768
12769    // Keep frankensqlite UPSERTs row-wise inside the transaction. The
12770    // multi-row VALUES ... ON CONFLICT form still falls back through
12771    // INSERT...SELECT in fsqlite-core, which rejects UPSERT/RETURNING during
12772    // real cass indexing.
12773    for (day_id, agent, source, delta) in entries {
12774        total_affected += tx.execute_compat(
12775            "INSERT INTO daily_stats (day_id, agent_slug, source_id, session_count, message_count, total_chars, last_updated)
12776             VALUES(?1,?2,?3,?4,?5,?6,?7)
12777             ON CONFLICT(day_id, agent_slug, source_id) DO UPDATE SET
12778                 session_count = session_count + excluded.session_count,
12779                 message_count = message_count + excluded.message_count,
12780                 total_chars = total_chars + excluded.total_chars,
12781                 last_updated = excluded.last_updated",
12782            fparams![
12783                *day_id,
12784                agent.as_str(),
12785                source.as_str(),
12786                delta.session_count_delta,
12787                delta.message_count_delta,
12788                delta.total_chars_delta,
12789                now
12790            ],
12791        )?;
12792    }
12793
12794    Ok(total_affected)
12795}
12796
12797/// Batch insert token_usage rows within a frankensqlite transaction.
12798///
12799/// Uses row-wise INSERT OR IGNORE to avoid the frankensqlite limitation where
12800/// multi-row VALUES lists fall through to INSERT...SELECT, which rejects
12801/// UPSERT/OR IGNORE conflict clauses.
12802fn franken_insert_token_usage_batched_in_tx(
12803    tx: &FrankenTransaction<'_>,
12804    entries: &[TokenUsageEntry],
12805) -> Result<usize> {
12806    if entries.is_empty() {
12807        return Ok(0);
12808    }
12809
12810    let mut total_inserted = 0;
12811
12812    for e in entries {
12813        let params_vec: Vec<ParamValue> = vec![
12814            ParamValue::from(e.message_id),
12815            ParamValue::from(e.conversation_id),
12816            ParamValue::from(e.agent_id),
12817            ParamValue::from(e.workspace_id),
12818            ParamValue::from(e.source_id.clone()),
12819            ParamValue::from(e.timestamp_ms),
12820            ParamValue::from(e.day_id),
12821            ParamValue::from(e.model_name.clone()),
12822            ParamValue::from(e.model_family.clone()),
12823            ParamValue::from(e.model_tier.clone()),
12824            ParamValue::from(e.service_tier.clone()),
12825            ParamValue::from(e.provider.clone()),
12826            ParamValue::from(e.input_tokens),
12827            ParamValue::from(e.output_tokens),
12828            ParamValue::from(e.cache_read_tokens),
12829            ParamValue::from(e.cache_creation_tokens),
12830            ParamValue::from(e.thinking_tokens),
12831            ParamValue::from(e.total_tokens),
12832            ParamValue::from(e.estimated_cost_usd),
12833            ParamValue::from(e.role.clone()),
12834            ParamValue::from(e.content_chars),
12835            ParamValue::from(e.has_tool_calls as i64),
12836            ParamValue::from(e.tool_call_count as i64),
12837            ParamValue::from(e.data_source.clone()),
12838        ];
12839
12840        let values = param_slice_to_values(&params_vec);
12841        total_inserted += tx.execute_with_params(
12842            "INSERT OR IGNORE INTO token_usage (
12843                message_id, conversation_id, agent_id, workspace_id, source_id,
12844                timestamp_ms, day_id,
12845                model_name, model_family, model_tier, service_tier, provider,
12846                input_tokens, output_tokens, cache_read_tokens, cache_creation_tokens,
12847                thinking_tokens, total_tokens, estimated_cost_usd,
12848                role, content_chars, has_tool_calls, tool_call_count, data_source
12849            )
12850            VALUES(?1,?2,?3,?4,?5,?6,?7,?8,?9,?10,?11,?12,?13,?14,?15,?16,?17,?18,?19,?20,?21,?22,?23,?24)",
12851            &values,
12852        )?;
12853    }
12854
12855    Ok(total_inserted)
12856}
12857
12858/// Batch upsert token_daily_stats within a frankensqlite transaction.
12859fn franken_update_token_daily_stats_batched_in_tx(
12860    tx: &FrankenTransaction<'_>,
12861    entries: &[(i64, String, String, String, TokenStatsDelta)],
12862) -> Result<usize> {
12863    if entries.is_empty() {
12864        return Ok(0);
12865    }
12866
12867    let now = FrankenStorage::now_millis();
12868    let mut total_affected = 0;
12869
12870    for (day_id, agent, source, model, delta) in entries {
12871        total_affected += tx.execute_compat(
12872            "INSERT INTO token_daily_stats (
12873                day_id, agent_slug, source_id, model_family,
12874                api_call_count, user_message_count, assistant_message_count, tool_message_count,
12875                total_input_tokens, total_output_tokens, total_cache_read_tokens,
12876                total_cache_creation_tokens, total_thinking_tokens, grand_total_tokens,
12877                total_content_chars, total_tool_calls, estimated_cost_usd, session_count,
12878                last_updated
12879            )
12880            VALUES(?1,?2,?3,?4,?5,?6,?7,?8,?9,?10,?11,?12,?13,?14,?15,?16,?17,?18,?19)
12881            ON CONFLICT(day_id, agent_slug, source_id, model_family) DO UPDATE SET
12882                api_call_count = api_call_count + excluded.api_call_count,
12883                user_message_count = user_message_count + excluded.user_message_count,
12884                assistant_message_count = assistant_message_count + excluded.assistant_message_count,
12885                tool_message_count = tool_message_count + excluded.tool_message_count,
12886                total_input_tokens = total_input_tokens + excluded.total_input_tokens,
12887                total_output_tokens = total_output_tokens + excluded.total_output_tokens,
12888                total_cache_read_tokens = total_cache_read_tokens + excluded.total_cache_read_tokens,
12889                total_cache_creation_tokens = total_cache_creation_tokens + excluded.total_cache_creation_tokens,
12890                total_thinking_tokens = total_thinking_tokens + excluded.total_thinking_tokens,
12891                grand_total_tokens = grand_total_tokens + excluded.grand_total_tokens,
12892                total_content_chars = total_content_chars + excluded.total_content_chars,
12893                total_tool_calls = total_tool_calls + excluded.total_tool_calls,
12894                estimated_cost_usd = estimated_cost_usd + excluded.estimated_cost_usd,
12895                session_count = session_count + excluded.session_count,
12896                last_updated = excluded.last_updated",
12897            fparams![
12898                *day_id,
12899                agent.as_str(),
12900                source.as_str(),
12901                model.as_str(),
12902                delta.api_call_count,
12903                delta.user_message_count,
12904                delta.assistant_message_count,
12905                delta.tool_message_count,
12906                delta.total_input_tokens,
12907                delta.total_output_tokens,
12908                delta.total_cache_read_tokens,
12909                delta.total_cache_creation_tokens,
12910                delta.total_thinking_tokens,
12911                delta.grand_total_tokens,
12912                delta.total_content_chars,
12913                delta.total_tool_calls,
12914                delta.estimated_cost_usd,
12915                delta.session_count,
12916                now
12917            ],
12918        )?;
12919    }
12920
12921    Ok(total_affected)
12922}
12923
12924/// Batch insert message_metrics rows within a frankensqlite transaction.
12925///
12926/// Uses row-wise INSERT OR IGNORE to avoid the frankensqlite limitation where
12927/// multi-row VALUES lists fall through to INSERT...SELECT, which rejects
12928/// UPSERT/OR IGNORE conflict clauses.
12929fn franken_insert_message_metrics_batched_in_tx(
12930    tx: &FrankenTransaction<'_>,
12931    entries: &[MessageMetricsEntry],
12932) -> Result<usize> {
12933    if entries.is_empty() {
12934        return Ok(0);
12935    }
12936
12937    let mut total_inserted = 0;
12938
12939    for e in entries {
12940        let params_vec: Vec<ParamValue> = vec![
12941            ParamValue::from(e.message_id),
12942            ParamValue::from(e.created_at_ms),
12943            ParamValue::from(e.hour_id),
12944            ParamValue::from(e.day_id),
12945            ParamValue::from(e.agent_slug.clone()),
12946            ParamValue::from(e.workspace_id),
12947            ParamValue::from(e.source_id.clone()),
12948            ParamValue::from(e.role.clone()),
12949            ParamValue::from(e.content_chars),
12950            ParamValue::from(e.content_tokens_est),
12951            ParamValue::from(e.model_name.clone()),
12952            ParamValue::from(e.model_family.clone()),
12953            ParamValue::from(e.model_tier.clone()),
12954            ParamValue::from(e.provider.clone()),
12955            ParamValue::from(e.api_input_tokens),
12956            ParamValue::from(e.api_output_tokens),
12957            ParamValue::from(e.api_cache_read_tokens),
12958            ParamValue::from(e.api_cache_creation_tokens),
12959            ParamValue::from(e.api_thinking_tokens),
12960            ParamValue::from(e.api_service_tier.clone()),
12961            ParamValue::from(e.api_data_source.clone()),
12962            ParamValue::from(e.tool_call_count),
12963            ParamValue::from(e.has_tool_calls as i64),
12964            ParamValue::from(e.has_plan as i64),
12965        ];
12966
12967        let values = param_slice_to_values(&params_vec);
12968        total_inserted += tx.execute_with_params(
12969            "INSERT OR IGNORE INTO message_metrics (
12970                message_id, created_at_ms, hour_id, day_id,
12971                agent_slug, workspace_id, source_id, role,
12972                content_chars, content_tokens_est,
12973                model_name, model_family, model_tier, provider,
12974                api_input_tokens, api_output_tokens, api_cache_read_tokens,
12975                api_cache_creation_tokens, api_thinking_tokens,
12976                api_service_tier, api_data_source,
12977                tool_call_count, has_tool_calls, has_plan
12978            )
12979            VALUES(?1,?2,?3,?4,?5,?6,?7,?8,?9,?10,?11,?12,?13,?14,?15,?16,?17,?18,?19,?20,?21,?22,?23,?24)",
12980            &values,
12981        )?;
12982    }
12983
12984    Ok(total_inserted)
12985}
12986
12987/// Flush one rollup table (shared logic for hourly + daily) within a frankensqlite transaction.
12988fn franken_flush_rollup_table(
12989    tx: &FrankenTransaction<'_>,
12990    table: &str,
12991    bucket_col: &str,
12992    deltas: &HashMap<(i64, String, i64, String), UsageRollupDelta>,
12993    now: i64,
12994) -> Result<usize> {
12995    if deltas.is_empty() {
12996        return Ok(0);
12997    }
12998
12999    let mut total_affected = 0;
13000
13001    for ((bucket_id, agent, workspace_id, source), d) in deltas {
13002        let sql = format!(
13003            "INSERT INTO {table} (
13004                {bucket_col}, agent_slug, workspace_id, source_id,
13005                message_count, user_message_count, assistant_message_count,
13006                tool_call_count, plan_message_count, plan_content_tokens_est_total,
13007                plan_api_tokens_total, api_coverage_message_count,
13008                content_tokens_est_total, content_tokens_est_user, content_tokens_est_assistant,
13009                api_tokens_total, api_input_tokens_total, api_output_tokens_total,
13010                api_cache_read_tokens_total, api_cache_creation_tokens_total,
13011                api_thinking_tokens_total, last_updated
13012            )
13013            VALUES(?1,?2,?3,?4,?5,?6,?7,?8,?9,?10,?11,?12,?13,?14,?15,?16,?17,?18,?19,?20,?21,?22)
13014            ON CONFLICT({bucket_col}, agent_slug, workspace_id, source_id) DO UPDATE SET
13015                message_count = message_count + excluded.message_count,
13016                user_message_count = user_message_count + excluded.user_message_count,
13017                assistant_message_count = assistant_message_count + excluded.assistant_message_count,
13018                tool_call_count = tool_call_count + excluded.tool_call_count,
13019                plan_message_count = plan_message_count + excluded.plan_message_count,
13020                plan_content_tokens_est_total = plan_content_tokens_est_total + excluded.plan_content_tokens_est_total,
13021                plan_api_tokens_total = plan_api_tokens_total + excluded.plan_api_tokens_total,
13022                api_coverage_message_count = api_coverage_message_count + excluded.api_coverage_message_count,
13023                content_tokens_est_total = content_tokens_est_total + excluded.content_tokens_est_total,
13024                content_tokens_est_user = content_tokens_est_user + excluded.content_tokens_est_user,
13025                content_tokens_est_assistant = content_tokens_est_assistant + excluded.content_tokens_est_assistant,
13026                api_tokens_total = api_tokens_total + excluded.api_tokens_total,
13027                api_input_tokens_total = api_input_tokens_total + excluded.api_input_tokens_total,
13028                api_output_tokens_total = api_output_tokens_total + excluded.api_output_tokens_total,
13029                api_cache_read_tokens_total = api_cache_read_tokens_total + excluded.api_cache_read_tokens_total,
13030                api_cache_creation_tokens_total = api_cache_creation_tokens_total + excluded.api_cache_creation_tokens_total,
13031                api_thinking_tokens_total = api_thinking_tokens_total + excluded.api_thinking_tokens_total,
13032                last_updated = excluded.last_updated"
13033        );
13034
13035        total_affected += tx.execute_compat(
13036            &sql,
13037            fparams![
13038                *bucket_id,
13039                agent.as_str(),
13040                *workspace_id,
13041                source.as_str(),
13042                d.message_count,
13043                d.user_message_count,
13044                d.assistant_message_count,
13045                d.tool_call_count,
13046                d.plan_message_count,
13047                d.plan_content_tokens_est_total,
13048                d.plan_api_tokens_total,
13049                d.api_coverage_message_count,
13050                d.content_tokens_est_total,
13051                d.content_tokens_est_user,
13052                d.content_tokens_est_assistant,
13053                d.api_tokens_total,
13054                d.api_input_tokens_total,
13055                d.api_output_tokens_total,
13056                d.api_cache_read_tokens_total,
13057                d.api_cache_creation_tokens_total,
13058                d.api_thinking_tokens_total,
13059                now
13060            ],
13061        )?;
13062    }
13063
13064    Ok(total_affected)
13065}
13066
13067/// Flush usage_models_daily rollup within a frankensqlite transaction.
13068fn franken_flush_model_daily_rollup_table(
13069    tx: &FrankenTransaction<'_>,
13070    deltas: &HashMap<(i64, String, i64, String, String, String), UsageRollupDelta>,
13071    now: i64,
13072) -> Result<usize> {
13073    if deltas.is_empty() {
13074        return Ok(0);
13075    }
13076
13077    let mut total_affected = 0;
13078
13079    for ((day_id, agent, workspace_id, source, model_family, model_tier), d) in deltas {
13080        total_affected += tx.execute_compat(
13081            "INSERT INTO usage_models_daily (
13082                day_id, agent_slug, workspace_id, source_id, model_family, model_tier,
13083                message_count, user_message_count, assistant_message_count,
13084                tool_call_count, plan_message_count, api_coverage_message_count,
13085                content_tokens_est_total, content_tokens_est_user, content_tokens_est_assistant,
13086                api_tokens_total, api_input_tokens_total, api_output_tokens_total,
13087                api_cache_read_tokens_total, api_cache_creation_tokens_total,
13088                api_thinking_tokens_total, last_updated
13089            )
13090            VALUES(?1,?2,?3,?4,?5,?6,?7,?8,?9,?10,?11,?12,?13,?14,?15,?16,?17,?18,?19,?20,?21,?22)
13091            ON CONFLICT(day_id, agent_slug, workspace_id, source_id, model_family, model_tier) DO UPDATE SET
13092                message_count = message_count + excluded.message_count,
13093                user_message_count = user_message_count + excluded.user_message_count,
13094                assistant_message_count = assistant_message_count + excluded.assistant_message_count,
13095                tool_call_count = tool_call_count + excluded.tool_call_count,
13096                plan_message_count = plan_message_count + excluded.plan_message_count,
13097                api_coverage_message_count = api_coverage_message_count + excluded.api_coverage_message_count,
13098                content_tokens_est_total = content_tokens_est_total + excluded.content_tokens_est_total,
13099                content_tokens_est_user = content_tokens_est_user + excluded.content_tokens_est_user,
13100                content_tokens_est_assistant = content_tokens_est_assistant + excluded.content_tokens_est_assistant,
13101                api_tokens_total = api_tokens_total + excluded.api_tokens_total,
13102                api_input_tokens_total = api_input_tokens_total + excluded.api_input_tokens_total,
13103                api_output_tokens_total = api_output_tokens_total + excluded.api_output_tokens_total,
13104                api_cache_read_tokens_total = api_cache_read_tokens_total + excluded.api_cache_read_tokens_total,
13105                api_cache_creation_tokens_total = api_cache_creation_tokens_total + excluded.api_cache_creation_tokens_total,
13106                api_thinking_tokens_total = api_thinking_tokens_total + excluded.api_thinking_tokens_total,
13107                last_updated = excluded.last_updated",
13108            fparams![
13109                *day_id,
13110                agent.as_str(),
13111                *workspace_id,
13112                source.as_str(),
13113                model_family.as_str(),
13114                model_tier.as_str(),
13115                d.message_count,
13116                d.user_message_count,
13117                d.assistant_message_count,
13118                d.tool_call_count,
13119                d.plan_message_count,
13120                d.api_coverage_message_count,
13121                d.content_tokens_est_total,
13122                d.content_tokens_est_user,
13123                d.content_tokens_est_assistant,
13124                d.api_tokens_total,
13125                d.api_input_tokens_total,
13126                d.api_output_tokens_total,
13127                d.api_cache_read_tokens_total,
13128                d.api_cache_creation_tokens_total,
13129                d.api_thinking_tokens_total,
13130                now
13131            ],
13132        )?;
13133    }
13134
13135    Ok(total_affected)
13136}
13137
13138/// Flush AnalyticsRollupAggregator deltas via frankensqlite transaction.
13139fn franken_flush_analytics_rollups_in_tx(
13140    tx: &FrankenTransaction<'_>,
13141    agg: &AnalyticsRollupAggregator,
13142) -> Result<(usize, usize, usize)> {
13143    let now = FrankenStorage::now_millis();
13144
13145    let hourly_affected =
13146        franken_flush_rollup_table(tx, "usage_hourly", "hour_id", &agg.hourly, now)?;
13147    let daily_affected = franken_flush_rollup_table(tx, "usage_daily", "day_id", &agg.daily, now)?;
13148    let models_daily_affected = franken_flush_model_daily_rollup_table(tx, &agg.models_daily, now)?;
13149
13150    Ok((hourly_affected, daily_affected, models_daily_affected))
13151}
13152
13153/// Update conversation-level token summary columns via frankensqlite transaction.
13154fn franken_update_conversation_token_summaries_in_tx(
13155    tx: &FrankenTransaction<'_>,
13156    conversation_id: i64,
13157) -> Result<()> {
13158    tx.execute_compat(
13159        "UPDATE conversations SET
13160            total_input_tokens = (SELECT SUM(input_tokens) FROM token_usage WHERE conversation_id = ?1),
13161            total_output_tokens = (SELECT SUM(output_tokens) FROM token_usage WHERE conversation_id = ?1),
13162            total_cache_read_tokens = (SELECT SUM(cache_read_tokens) FROM token_usage WHERE conversation_id = ?1),
13163            total_cache_creation_tokens = (SELECT SUM(cache_creation_tokens) FROM token_usage WHERE conversation_id = ?1),
13164            grand_total_tokens = (SELECT SUM(total_tokens) FROM token_usage WHERE conversation_id = ?1),
13165            estimated_cost_usd = (SELECT SUM(estimated_cost_usd) FROM token_usage WHERE conversation_id = ?1),
13166            primary_model = (SELECT model_name FROM token_usage WHERE conversation_id = ?1
13167                             AND model_name IS NOT NULL
13168                             GROUP BY model_name ORDER BY COUNT(*) DESC LIMIT 1),
13169            api_call_count = (SELECT COUNT(*) FROM token_usage WHERE conversation_id = ?1
13170                              AND data_source = 'api'),
13171            tool_call_count = (SELECT SUM(tool_call_count) FROM token_usage WHERE conversation_id = ?1),
13172            user_message_count = (SELECT COUNT(*) FROM token_usage WHERE conversation_id = ?1
13173                                  AND role = 'user'),
13174            assistant_message_count = (SELECT COUNT(*) FROM token_usage WHERE conversation_id = ?1
13175                                       AND role IN ('assistant', 'agent'))
13176         WHERE id = ?1",
13177        fparams![conversation_id],
13178    )?;
13179    Ok(())
13180}
13181
13182impl FrankenStorage {
    /// Rebuild token_daily_stats from the token_usage ledger.
    ///
    /// Inside a single transaction the table is emptied and then refilled:
    /// conversations are walked in ascending `id` order in pages of
    /// `CONVERSATION_BATCH_SIZE`, and each conversation's `token_usage` rows are
    /// walked in ascending `id` order in pages of `TOKEN_USAGE_BATCH_SIZE`.
    /// Every ledger row is fed to a `TokenStatsAggregator`, one session is
    /// recorded per conversation, and the aggregator is written out once per
    /// conversation page.
    ///
    /// Returns the number of aggregated entries produced, summed over all
    /// conversation pages.
    ///
    /// # Errors
    /// Propagates any query, write, or commit failure; the transaction is not
    /// committed in that case.
    pub fn rebuild_token_daily_stats(&self) -> Result<usize> {
        const CONVERSATION_BATCH_SIZE: usize = 1_000;
        const TOKEN_USAGE_BATCH_SIZE: usize = 10_000;

        // Counted only for the start-of-rebuild log line below.
        let total_usage_rows: i64 =
            self.conn
                .query_row_map("SELECT COUNT(*) FROM token_usage", fparams![], |row| {
                    row.get_typed(0)
                })?;
        tracing::info!(
            target: "cass::analytics",
            total_usage_rows,
            "token_daily_stats_rebuild_start"
        );

        let mut tx = self.conn.transaction()?;
        tx.execute("DELETE FROM token_daily_stats")?;

        // Keyset cursor over conversations: each page resumes after the last id seen.
        let mut last_conversation_id = 0_i64;
        let mut rows_created = 0_usize;

        loop {
            let conversation_rows = tx.query_map_collect(
                "SELECT c.id, c.started_at, c.source_id,
                        COALESCE((SELECT a.slug FROM agents a WHERE a.id = c.agent_id), 'unknown')
                 FROM conversations c
                 WHERE c.id > ?1
                 ORDER BY c.id
                 LIMIT ?2",
                fparams![last_conversation_id, CONVERSATION_BATCH_SIZE as i64],
                |row| {
                    Ok((
                        row.get_typed::<i64>(0)?,
                        row.get_typed::<Option<i64>>(1)?,
                        row.get_typed::<String>(2)?,
                        row.get_typed::<String>(3)?,
                    ))
                },
            )?;
            if conversation_rows.is_empty() {
                break;
            }

            // One aggregator per conversation page; it is flushed after the page.
            let mut aggregate = TokenStatsAggregator::new();

            for (conversation_id, started_at, source_id, agent_slug) in conversation_rows {
                last_conversation_id = conversation_id;
                // Conversations without a start time are attributed to day id 0.
                let conversation_day_id = started_at.map(Self::day_id_from_millis).unwrap_or(0);
                // Keyset cursor over this conversation's ledger rows.
                let mut last_token_usage_id = 0_i64;
                // Ends up as the model family of the last ledger row (by id)
                // whose family is not 'unknown'; stays "unknown" otherwise.
                let mut session_model_family = String::from("unknown");

                loop {
                    let usage_rows = tx.query_map_collect(
                        "SELECT id, day_id, role,
                                COALESCE(model_family, 'unknown'),
                                input_tokens, output_tokens, cache_read_tokens,
                                cache_creation_tokens, thinking_tokens,
                                has_tool_calls, tool_call_count,
                                content_chars, estimated_cost_usd
                         FROM token_usage
                         WHERE conversation_id = ?1
                           AND id > ?2
                         ORDER BY id
                         LIMIT ?3",
                        fparams![
                            conversation_id,
                            last_token_usage_id,
                            TOKEN_USAGE_BATCH_SIZE as i64
                        ],
                        |row| {
                            Ok((
                                row.get_typed::<i64>(0)?,
                                row.get_typed::<i64>(1)?,
                                row.get_typed::<String>(2)?,
                                row.get_typed::<String>(3)?,
                                row.get_typed::<Option<i64>>(4)?,
                                row.get_typed::<Option<i64>>(5)?,
                                row.get_typed::<Option<i64>>(6)?,
                                row.get_typed::<Option<i64>>(7)?,
                                row.get_typed::<Option<i64>>(8)?,
                                row.get_typed::<i64>(9)?,
                                row.get_typed::<i64>(10)?,
                                row.get_typed::<i64>(11)?,
                                row.get_typed::<Option<f64>>(12)?,
                            ))
                        },
                    )?;
                    if usage_rows.is_empty() {
                        break;
                    }

                    for (
                        token_usage_id,
                        day_id,
                        role,
                        model_family,
                        input_tokens,
                        output_tokens,
                        cache_read_tokens,
                        cache_creation_tokens,
                        thinking_tokens,
                        has_tool_calls,
                        tool_call_count,
                        content_chars,
                        estimated_cost_usd,
                    ) in usage_rows
                    {
                        last_token_usage_id = token_usage_id;
                        if model_family != "unknown" {
                            session_model_family = model_family.clone();
                        }
                        // Only the numeric token fields are read back from the
                        // ledger; model name, provider and service tier are left
                        // unset and the data source is always set to `Api`.
                        let usage = crate::connectors::ExtractedTokenUsage {
                            model_name: None,
                            provider: None,
                            input_tokens,
                            output_tokens,
                            cache_read_tokens,
                            cache_creation_tokens,
                            thinking_tokens,
                            service_tier: None,
                            has_tool_calls: has_tool_calls != 0,
                            // Negative counts clamp to 0; a count too large for
                            // u32 also falls back to 0.
                            tool_call_count: u32::try_from(tool_call_count.max(0)).unwrap_or(0),
                            data_source: franken_agent_detection::TokenDataSource::Api,
                        };
                        aggregate.record(
                            &agent_slug,
                            &source_id,
                            day_id,
                            &model_family,
                            &role,
                            &usage,
                            content_chars,
                            // A NULL cost is treated as zero.
                            estimated_cost_usd.unwrap_or(0.0),
                        );
                    }
                }

                // Recorded once per conversation, including conversations that
                // have no ledger rows at all.
                aggregate.record_session(
                    &agent_slug,
                    &source_id,
                    conversation_day_id,
                    &session_model_family,
                );
            }

            let entries = aggregate.expand();
            // NOTE(review): entries are counted per conversation page, so a stats
            // key that occurs in more than one page is presumably counted once
            // per page and the total may exceed the table's final row count —
            // confirm against franken_update_token_daily_stats_batched_in_tx.
            rows_created = rows_created.saturating_add(entries.len());
            franken_update_token_daily_stats_batched_in_tx(&tx, &entries)?;
        }

        tx.commit()?;

        tracing::info!(
            target: "cass::analytics",
            rows_created,
            "token_daily_stats_rebuild_complete"
        );

        Ok(rows_created)
    }
13344
13345    /// Rebuild analytics tables (message_metrics + rollups) from existing
13346    /// messages in the database. Does NOT re-parse raw agent session files.
13347    pub fn rebuild_analytics(&self) -> Result<AnalyticsRebuildResult> {
13348        let start = Instant::now();
13349
13350        let total_messages: i64 =
13351            self.conn
13352                .query_row_map("SELECT COUNT(*) FROM messages", fparams![], |row| {
13353                    row.get_typed(0)
13354                })?;
13355        tracing::info!(
13356            target: "cass::analytics",
13357            total_messages,
13358            "analytics_rebuild_start"
13359        );
13360
13361        let mut tx = self.conn.transaction()?;
13362
13363        tx.execute("DELETE FROM message_metrics")?;
13364        tx.execute("DELETE FROM usage_hourly")?;
13365        tx.execute("DELETE FROM usage_daily")?;
13366        tx.execute("DELETE FROM usage_models_daily")?;
13367
13368        const CHUNK_SIZE: i64 = 10_000;
13369        let mut offset: i64 = 0;
13370        let mut total_inserted: usize = 0;
13371        let mut usage_hourly_rows: usize = 0;
13372        let mut usage_daily_rows: usize = 0;
13373        let mut usage_models_daily_rows: usize = 0;
13374
13375        loop {
13376            #[allow(clippy::type_complexity)]
13377            let rows: Vec<(
13378                i64,
13379                String,
13380                String,
13381                Option<serde_json::Value>,
13382                Option<i64>,
13383                Option<i64>,
13384                String,
13385                Option<i64>,
13386                String,
13387            )> = tx.query_map_collect(
13388                // Avoid the 3-table JOIN with LIMIT/OFFSET that triggers
13389                // frankensqlite's materialization fallback (see 860acb12).
13390                // Inline the agent slug lookup as a correlated subquery and
13391                // fall back to 'unknown' for NULL agent_id, matching the
13392                // FTS / lexical rebuild paths.
13393                "SELECT m.id, m.idx, m.role, m.content, m.extra_json, m.extra_bin,
13394                        m.created_at,
13395                        c.id AS conv_id, c.started_at AS conv_started_at,
13396                        c.source_id, c.workspace_id,
13397                        COALESCE((SELECT a.slug FROM agents a WHERE a.id = c.agent_id), 'unknown') AS agent_slug
13398                 FROM messages m
13399                 JOIN conversations c ON m.conversation_id = c.id
13400                 ORDER BY m.id
13401                 LIMIT ?1 OFFSET ?2",
13402                fparams![CHUNK_SIZE, offset],
13403                |row| {
13404                    let msg_id: i64 = row.get_typed(0)?;
13405                    let role: String = row.get_typed(2)?;
13406                    let content: String = row.get_typed(3)?;
13407                    let extra_json = row
13408                        .get_typed::<Option<String>>(4)?
13409                        .and_then(|s| serde_json::from_str(&s).ok())
13410                        .or_else(|| {
13411                            row.get_typed::<Option<Vec<u8>>>(5)
13412                                .ok()
13413                                .flatten()
13414                                .and_then(|b| rmp_serde::from_slice(&b).ok())
13415                        });
13416                    let msg_ts: Option<i64> = row.get_typed(6)?;
13417                    let conv_started_at: Option<i64> = row.get_typed(8)?;
13418                    let source_id: String = row.get_typed(9)?;
13419                    let workspace_id: Option<i64> = row.get_typed(10)?;
13420                    let agent_slug: String = row.get_typed(11)?;
13421                    let effective_ts = msg_ts.or(conv_started_at).unwrap_or(0);
13422
13423                    Ok((
13424                        msg_id,
13425                        role,
13426                        content,
13427                        extra_json,
13428                        Some(effective_ts),
13429                        workspace_id,
13430                        source_id,
13431                        conv_started_at,
13432                        agent_slug,
13433                    ))
13434                },
13435            )?;
13436
13437            if rows.is_empty() {
13438                break;
13439            }
13440
13441            let chunk_len = rows.len();
13442            let mut entries = Vec::with_capacity(chunk_len);
13443            let mut rollup_agg = AnalyticsRollupAggregator::new();
13444
13445            for (
13446                msg_id,
13447                role,
13448                content,
13449                extra_json,
13450                effective_ts,
13451                workspace_id,
13452                source_id,
13453                _conv_started_at,
13454                agent_slug,
13455            ) in &rows
13456            {
13457                let ts = effective_ts.unwrap_or(0);
13458                let day_id = Self::day_id_from_millis(ts);
13459                let hour_id = Self::hour_id_from_millis(ts);
13460                let content_chars = content.len() as i64;
13461                let content_tokens_est = content_chars / 4;
13462                let extra = extra_json
13463                    .as_ref()
13464                    .cloned()
13465                    .unwrap_or(serde_json::Value::Null);
13466                let usage =
13467                    crate::connectors::extract_tokens_for_agent(agent_slug, &extra, content, role);
13468                let model_info = usage
13469                    .model_name
13470                    .as_deref()
13471                    .map(crate::connectors::normalize_model);
13472                let model_family = model_info
13473                    .as_ref()
13474                    .map(|i| i.family.clone())
13475                    .unwrap_or_else(|| "unknown".into());
13476                let model_tier = model_info
13477                    .as_ref()
13478                    .map(|i| i.tier.clone())
13479                    .unwrap_or_else(|| "unknown".into());
13480                let provider = usage
13481                    .provider
13482                    .clone()
13483                    .or_else(|| model_info.as_ref().map(|i| i.provider.clone()))
13484                    .unwrap_or_else(|| "unknown".into());
13485
13486                let entry = MessageMetricsEntry {
13487                    message_id: *msg_id,
13488                    created_at_ms: ts,
13489                    hour_id,
13490                    day_id,
13491                    agent_slug: agent_slug.clone(),
13492                    workspace_id: workspace_id.unwrap_or(0),
13493                    source_id: source_id.clone(),
13494                    role: role.clone(),
13495                    content_chars,
13496                    content_tokens_est,
13497                    model_name: usage.model_name.clone(),
13498                    model_family,
13499                    model_tier,
13500                    provider,
13501                    api_input_tokens: usage.input_tokens,
13502                    api_output_tokens: usage.output_tokens,
13503                    api_cache_read_tokens: usage.cache_read_tokens,
13504                    api_cache_creation_tokens: usage.cache_creation_tokens,
13505                    api_thinking_tokens: usage.thinking_tokens,
13506                    api_service_tier: usage.service_tier,
13507                    api_data_source: usage.data_source.as_str().to_string(),
13508                    tool_call_count: usage.tool_call_count as i64,
13509                    has_tool_calls: usage.has_tool_calls,
13510                    has_plan: has_plan_for_role(role, content),
13511                };
13512                rollup_agg.record(&entry);
13513                entries.push(entry);
13514            }
13515
13516            total_inserted += franken_insert_message_metrics_batched_in_tx(&tx, &entries)?;
13517            let (hourly, daily, models_daily) =
13518                franken_flush_analytics_rollups_in_tx(&tx, &rollup_agg)?;
13519            usage_hourly_rows += hourly;
13520            usage_daily_rows += daily;
13521            usage_models_daily_rows += models_daily;
13522            offset += chunk_len as i64;
13523
13524            tracing::debug!(
13525                target: "cass::analytics",
13526                offset,
13527                chunk = chunk_len,
13528                inserted = entries.len(),
13529                total = total_inserted,
13530                "analytics_rebuild_chunk"
13531            );
13532
13533            if (chunk_len as i64) < CHUNK_SIZE {
13534                break;
13535            }
13536        }
13537
13538        tx.commit()?;
13539
13540        let elapsed = start.elapsed();
13541        let elapsed_ms = elapsed.as_millis() as u64;
13542        let msgs_per_sec = if elapsed_ms > 0 {
13543            (total_inserted as f64) / (elapsed_ms as f64 / 1000.0)
13544        } else {
13545            0.0
13546        };
13547
13548        tracing::info!(
13549            target: "cass::analytics",
13550            message_metrics_rows = total_inserted,
13551            usage_hourly_rows,
13552            usage_daily_rows,
13553            usage_models_daily_rows,
13554            elapsed_ms,
13555            messages_per_sec = format!("{:.0}", msgs_per_sec),
13556            "analytics_rebuild_complete"
13557        );
13558
13559        Ok(AnalyticsRebuildResult {
13560            message_metrics_rows: total_inserted,
13561            usage_hourly_rows,
13562            usage_daily_rows,
13563            usage_models_daily_rows,
13564            elapsed_ms,
13565            messages_per_sec: msgs_per_sec,
13566        })
13567    }
13568
13569    /// Rebuild all daily stats from scratch.
13570    pub fn rebuild_daily_stats(&self) -> Result<DailyStatsRebuildResult> {
13571        const DAILY_STATS_REBUILD_CONVERSATION_BATCH_SIZE: usize = 1_000;
13572        const DAILY_STATS_REBUILD_MESSAGE_BATCH_SIZE: usize = 10_000;
13573
13574        let mut conversation_batch_size = rebuild_batch_size_env(
13575            "CASS_DAILY_STATS_REBUILD_CONVERSATION_BATCH_SIZE",
13576            DAILY_STATS_REBUILD_CONVERSATION_BATCH_SIZE,
13577        );
13578        let mut message_batch_size = rebuild_batch_size_env(
13579            "CASS_DAILY_STATS_REBUILD_MESSAGE_BATCH_SIZE",
13580            DAILY_STATS_REBUILD_MESSAGE_BATCH_SIZE,
13581        );
13582
13583        let total_messages: i64 =
13584            self.conn
13585                .query_row_map("SELECT COUNT(*) FROM messages", fparams![], |row| {
13586                    row.get_typed(0)
13587                })?;
13588        let message_metrics_rows: i64 =
13589            self.conn
13590                .query_row_map("SELECT COUNT(*) FROM message_metrics", fparams![], |row| {
13591                    row.get_typed(0)
13592                })?;
13593        let use_message_metrics = total_messages > 0 && total_messages == message_metrics_rows;
13594
13595        tracing::info!(
13596            target: "cass::perf::daily_stats",
13597            total_messages,
13598            message_metrics_rows,
13599            use_message_metrics,
13600            "daily_stats rebuild selected message source"
13601        );
13602
13603        let mut tx = self.conn.transaction()?;
13604        tx.execute("DELETE FROM daily_stats")?;
13605
13606        let mut last_conversation_id = 0_i64;
13607        let mut conversation_batch_count = 0_usize;
13608        let mut conversations_processed = 0_usize;
13609        let mut messages_processed = 0_usize;
13610        let mut message_batch_count = 0_usize;
13611        let mut raw_entries_flushed = 0_usize;
13612        let mut expanded_entries_flushed = 0_usize;
13613        let message_scan_sql = if use_message_metrics {
13614            "SELECT m.idx, mm.content_chars
13615             FROM messages m
13616             JOIN message_metrics mm ON mm.message_id = m.id
13617             WHERE m.conversation_id = ?1
13618               AND m.idx > ?2
13619             ORDER BY m.conversation_id, m.idx
13620             LIMIT ?3"
13621        } else {
13622            "SELECT m.idx, COALESCE(LENGTH(CAST(m.content AS BLOB)), 0)
13623             FROM messages m
13624             WHERE m.conversation_id = ?1
13625               AND m.idx > ?2
13626             ORDER BY m.conversation_id, m.idx
13627             LIMIT ?3"
13628        };
13629
13630        loop {
13631            // Avoid the 2-table JOIN with LIMIT that triggers frankensqlite's
13632            // materialization fallback (which is what the OOM retry below is
13633            // defending against — see 860acb12).  Inline agent slug via
13634            // correlated subquery and degrade NULL agent_id to 'unknown' for
13635            // consistency with the lexical/FTS rebuild paths.
13636            let conversation_rows = match self.conn.query_with_params(
13637                "SELECT c.id, c.started_at,
13638                        COALESCE((SELECT a.slug FROM agents a WHERE a.id = c.agent_id), 'unknown'),
13639                        c.source_id
13640                 FROM conversations c
13641                 WHERE c.id > ?1
13642                 ORDER BY c.id
13643                 LIMIT ?2",
13644                &params_from_iter([
13645                    ParamValue::from(last_conversation_id),
13646                    ParamValue::from(conversation_batch_size as i64),
13647                ]),
13648            ) {
13649                Ok(rows) => rows,
13650                Err(err) if is_out_of_memory_error(&err) && conversation_batch_size > 1 => {
13651                    let previous_batch_size = conversation_batch_size;
13652                    conversation_batch_size = (conversation_batch_size / 2).max(1);
13653                    tracing::warn!(
13654                        previous_batch_size,
13655                        conversation_batch_size,
13656                        last_conversation_id,
13657                        "daily_stats conversation scan ran out of memory; retrying with smaller batch"
13658                    );
13659                    continue;
13660                }
13661                Err(err) => return Err(err.into()),
13662            };
13663            if conversation_rows.is_empty() {
13664                break;
13665            }
13666
13667            let mut aggregate = StatsAggregator::new();
13668            let mut conversation_batch_meta: Vec<(i64, i64, String, String)> =
13669                Vec::with_capacity(conversation_rows.len());
13670            for row in &conversation_rows {
13671                let conversation_id: i64 = row.get_typed(0)?;
13672                let started_at: Option<i64> = row.get_typed(1)?;
13673                let agent_slug: String = row.get_typed(2)?;
13674                let source_id: String = row.get_typed(3)?;
13675                last_conversation_id = conversation_id;
13676                let day_id = started_at.map(Self::day_id_from_millis).unwrap_or(0);
13677                aggregate.record_delta(&agent_slug, &source_id, day_id, 1, 0, 0);
13678                conversation_batch_meta.push((conversation_id, day_id, agent_slug, source_id));
13679                conversations_processed += 1;
13680            }
13681
13682            conversation_batch_count += 1;
13683            raw_entries_flushed += aggregate.raw_entry_count();
13684            let entries = aggregate.expand();
13685            expanded_entries_flushed += entries.len();
13686            if !entries.is_empty() {
13687                franken_update_daily_stats_batched_in_tx(&tx, &entries)?;
13688            }
13689            if conversation_batch_count.is_multiple_of(25) {
13690                tracing::info!(
13691                    target: "cass::perf::daily_stats",
13692                    conversations_processed,
13693                    batches = conversation_batch_count,
13694                    batch_size = conversation_batch_size,
13695                    last_conversation_id,
13696                    "daily_stats rebuild conversation scan progress"
13697                );
13698            }
13699            if conversation_batch_meta.is_empty() {
13700                continue;
13701            }
13702
13703            for (conversation_id, day_id, agent_slug, source_id) in conversation_batch_meta {
13704                let mut cursor_message_idx = -1_i64;
13705                loop {
13706                    let message_rows = match self.conn.query_with_params(
13707                        message_scan_sql,
13708                        &params_from_iter([
13709                            ParamValue::from(conversation_id),
13710                            ParamValue::from(cursor_message_idx),
13711                            ParamValue::from(message_batch_size as i64),
13712                        ]),
13713                    ) {
13714                        Ok(rows) => rows,
13715                        Err(err) if is_out_of_memory_error(&err) && message_batch_size > 1 => {
13716                            let previous_batch_size = message_batch_size;
13717                            message_batch_size = (message_batch_size / 2).max(1);
13718                            tracing::warn!(
13719                                previous_batch_size,
13720                                message_batch_size,
13721                                conversation_id,
13722                                cursor_message_idx,
13723                                "daily_stats message scan ran out of memory; retrying with smaller batch"
13724                            );
13725                            continue;
13726                        }
13727                        Err(err) => return Err(err.into()),
13728                    };
13729                    if message_rows.is_empty() {
13730                        break;
13731                    }
13732
13733                    let mut aggregate = StatsAggregator::new();
13734                    for row in &message_rows {
13735                        let message_idx: i64 = row.get_typed(0)?;
13736                        let content_len: i64 = row.get_typed(1)?;
13737                        cursor_message_idx = message_idx;
13738                        aggregate.record_delta(&agent_slug, &source_id, day_id, 0, 1, content_len);
13739                        messages_processed += 1;
13740                    }
13741
13742                    message_batch_count += 1;
13743                    raw_entries_flushed += aggregate.raw_entry_count();
13744                    let entries = aggregate.expand();
13745                    expanded_entries_flushed += entries.len();
13746                    if !entries.is_empty() {
13747                        franken_update_daily_stats_batched_in_tx(&tx, &entries)?;
13748                    }
13749                    if message_batch_count.is_multiple_of(50) {
13750                        tracing::info!(
13751                            target: "cass::perf::daily_stats",
13752                            messages_processed,
13753                            batches = message_batch_count,
13754                            batch_size = message_batch_size,
13755                            source = if use_message_metrics {
13756                                "message_metrics"
13757                            } else {
13758                                "messages"
13759                            },
13760                            conversation_id,
13761                            cursor_message_idx,
13762                            "daily_stats rebuild message scan progress"
13763                        );
13764                    }
13765                }
13766            }
13767        }
13768
13769        let rows_created: i64 =
13770            tx.query_row_map("SELECT COUNT(*) FROM daily_stats", fparams![], |row| {
13771                row.get_typed(0)
13772            })?;
13773        let total_sessions: i64 = tx.query_row_map(
13774            "SELECT COALESCE(SUM(session_count), 0) FROM daily_stats WHERE agent_slug = 'all' AND source_id = 'all'",
13775            fparams![],
13776            |row| row.get_typed(0),
13777        )?;
13778
13779        tx.commit()?;
13780
13781        tracing::info!(
13782            target: "cass::perf::daily_stats",
13783            rows_created,
13784            total_sessions,
13785            conversations_processed,
13786            conversation_batches = conversation_batch_count,
13787            conversation_batch_size,
13788            message_batches = message_batch_count,
13789            message_batch_size,
13790            messages_processed,
13791            use_message_metrics,
13792            raw_entries_flushed,
13793            expanded_entries_flushed,
13794            "Daily stats rebuilt from conversations"
13795        );
13796
13797        Ok(DailyStatsRebuildResult {
13798            rows_created,
13799            total_sessions,
13800        })
13801    }
13802}
13803
13804// SqliteStorage impl block removed: SqliteStorage is now a type alias for FrankenStorage.
13805// All methods are available through FrankenStorage.
13806
13807// -------------------------------------------------------------------------
13808// IndexingCache (Opt 7.2) - N+1 Prevention for Agent/Workspace IDs
13809// -------------------------------------------------------------------------
13810
13811/// Cache for agent and workspace IDs during batch indexing.
13812///
13813/// Prevents N+1 database queries by caching the results of ensure_agent
13814/// and ensure_workspace calls within a batch. This is per-batch and
13815/// single-threaded, so no synchronization is needed.
13816///
13817/// # Usage
13818/// ```ignore
13819/// let mut cache = IndexingCache::new();
13820/// for conv in conversations {
13821///     let agent_id = cache.get_or_insert_agent(storage, &agent)?;
13822///     let workspace_id = cache.get_or_insert_workspace(storage, workspace)?;
13823///     // ... use agent_id and workspace_id
13824/// }
13825/// ```
13826///
13827/// # Rollback
13828/// Set environment variable `CASS_SQLITE_CACHE=0` to bypass caching
13829/// and use direct DB calls (useful for debugging).
13830#[derive(Debug, Default)]
13831pub struct IndexingCache {
13832    agent_ids: HashMap<String, i64>,
13833    workspace_ids: HashMap<PathBuf, i64>,
13834    hits: u64,
13835    misses: u64,
13836}
13837
13838pub trait IndexingCacheStorage {
13839    fn ensure_indexing_agent(&self, agent: &Agent) -> Result<i64>;
13840    fn ensure_indexing_workspace(&self, path: &Path, display_name: Option<&str>) -> Result<i64>;
13841}
13842
13843impl IndexingCacheStorage for FrankenStorage {
13844    fn ensure_indexing_agent(&self, agent: &Agent) -> Result<i64> {
13845        self.ensure_agent(agent)
13846    }
13847
13848    fn ensure_indexing_workspace(&self, path: &Path, display_name: Option<&str>) -> Result<i64> {
13849        self.ensure_workspace(path, display_name)
13850    }
13851}
13852
13853// IndexingCacheStorage for SqliteStorage removed: SqliteStorage is a type alias for FrankenStorage.
13854
13855impl IndexingCache {
13856    /// Create a new empty cache.
13857    pub fn new() -> Self {
13858        Self {
13859            agent_ids: HashMap::new(),
13860            workspace_ids: HashMap::new(),
13861            hits: 0,
13862            misses: 0,
13863        }
13864    }
13865
13866    /// Check if caching is enabled via environment variable.
13867    /// Returns true unless CASS_SQLITE_CACHE is set to "0" or "false".
13868    pub fn is_enabled() -> bool {
13869        dotenvy::var("CASS_SQLITE_CACHE")
13870            .map(|v| v != "0" && v.to_lowercase() != "false")
13871            .unwrap_or(true)
13872    }
13873
13874    /// Get or insert an agent ID, using cache if available.
13875    ///
13876    /// Returns the cached ID if present, otherwise calls ensure_agent
13877    /// and caches the result.
13878    pub fn get_or_insert_agent<S>(&mut self, storage: &S, agent: &Agent) -> Result<i64>
13879    where
13880        S: IndexingCacheStorage + ?Sized,
13881    {
13882        if let Some(&cached) = self.agent_ids.get(&agent.slug) {
13883            self.hits += 1;
13884            return Ok(cached);
13885        }
13886
13887        self.misses += 1;
13888        let id = storage.ensure_indexing_agent(agent)?;
13889        self.agent_ids.insert(agent.slug.clone(), id);
13890        Ok(id)
13891    }
13892
13893    /// Get or insert a workspace ID, using cache if available.
13894    ///
13895    /// Returns the cached ID if present, otherwise calls ensure_workspace
13896    /// and caches the result.
13897    pub fn get_or_insert_workspace(
13898        &mut self,
13899        storage: &(impl IndexingCacheStorage + ?Sized),
13900        path: &Path,
13901        display_name: Option<&str>,
13902    ) -> Result<i64> {
13903        if let Some(&cached) = self.workspace_ids.get(path) {
13904            self.hits += 1;
13905            return Ok(cached);
13906        }
13907
13908        self.misses += 1;
13909        let id = storage.ensure_indexing_workspace(path, display_name)?;
13910        self.workspace_ids.insert(path.to_path_buf(), id);
13911        Ok(id)
13912    }
13913
13914    /// Get cache statistics: (hits, misses, hit_rate).
13915    pub fn stats(&self) -> (u64, u64, f64) {
13916        let total = self.hits + self.misses;
13917        let hit_rate = if total > 0 {
13918            self.hits as f64 / total as f64
13919        } else {
13920            0.0
13921        };
13922        (self.hits, self.misses, hit_rate)
13923    }
13924
13925    /// Clear the cache, resetting all state.
13926    pub fn clear(&mut self) {
13927        self.agent_ids.clear();
13928        self.workspace_ids.clear();
13929        self.hits = 0;
13930        self.misses = 0;
13931    }
13932
13933    /// Number of cached agents.
13934    pub fn agent_count(&self) -> usize {
13935        self.agent_ids.len()
13936    }
13937
13938    /// Number of cached workspaces.
13939    pub fn workspace_count(&self) -> usize {
13940        self.workspace_ids.len()
13941    }
13942}
13943
13944// -------------------------------------------------------------------------
13945// StatsAggregator (kzxu) - Batched Daily Stats Updates
13946// -------------------------------------------------------------------------
13947// Aggregates daily stats in memory during batch ingestion, then flushes
13948// to the database in a single batched INSERT...ON CONFLICT operation.
13949// This prevents N×4 database writes (4 permutations per conversation).
13950
/// Running totals for one (day_id, agent, source) bucket.
#[derive(Clone, Copy, Debug, Default)]
pub struct StatsDelta {
    /// Change in the number of sessions.
    pub session_count_delta: i64,
    /// Change in the number of messages.
    pub message_count_delta: i64,
    /// Change in the total character count.
    pub total_chars_delta: i64,
}

/// Collects daily-stats deltas in memory so they can be written in one batch.
///
/// Deltas are accumulated per (day_id, agent, source) key while a batch is
/// ingested. Once the batch is done, `expand()` produces the 4 rollup
/// permutations for each raw key, ready to be flushed via
/// `SqliteStorage::update_daily_stats_batched`.
///
/// # Example
/// ```ignore
/// let mut agg = StatsAggregator::new();
/// for conv in conversations {
///     agg.record(&conv.agent_slug, source_id, day_id, msg_count, char_count);
/// }
/// let entries = agg.expand();
/// storage.update_daily_stats_batched(&entries)?;
/// ```
#[derive(Debug, Default)]
pub struct StatsAggregator {
    /// Raw deltas keyed by (day_id, agent_slug, source_id), exactly as
    /// passed to `record` / `record_delta`; rollups are derived in `expand`.
    deltas: HashMap<(i64, String, String), StatsDelta>,
}

impl StatsAggregator {
    /// Build an aggregator holding no deltas.
    pub fn new() -> Self {
        Self::default()
    }

    /// Count one whole conversation: one session plus its messages and chars.
    ///
    /// Shorthand for `record_delta` with a session delta of 1.
    ///
    /// # Arguments
    /// * `agent_slug` - The specific agent slug (not "all")
    /// * `source_id` - The specific source ID (not "all")
    /// * `day_id` - Days since 2020-01-01 (from `SqliteStorage::day_id_from_millis`)
    /// * `message_count` - Number of messages in the conversation
    /// * `total_chars` - Total character count across all messages
    pub fn record(
        &mut self,
        agent_slug: &str,
        source_id: &str,
        day_id: i64,
        message_count: i64,
        total_chars: i64,
    ) {
        self.record_delta(agent_slug, source_id, day_id, 1, message_count, total_chars);
    }

    /// Add an arbitrary delta to the (day_id, agent, source) bucket.
    ///
    /// Suited to append-only updates, where the session delta is 0 but
    /// message/char deltas are not. A delta that is zero in all three
    /// components is dropped without creating a bucket.
    pub fn record_delta(
        &mut self,
        agent_slug: &str,
        source_id: &str,
        day_id: i64,
        session_count_delta: i64,
        message_count_delta: i64,
        total_chars_delta: i64,
    ) {
        if (session_count_delta, message_count_delta, total_chars_delta) == (0, 0, 0) {
            return;
        }
        let bucket = self
            .deltas
            .entry((day_id, agent_slug.to_owned(), source_id.to_owned()))
            .or_default();
        bucket.session_count_delta += session_count_delta;
        bucket.message_count_delta += message_count_delta;
        bucket.total_chars_delta += total_chars_delta;
    }

    /// Fan every raw delta out to its 4 rollup keys:
    /// - (agent, source) - specific both
    /// - ("all", source) - all agents, specific source
    /// - (agent, "all") - specific agent, all sources
    /// - ("all", "all") - totals
    ///
    /// The result is ordered by (day_id, agent_slug, source_id) so batches
    /// are deterministic.
    pub fn expand(&self) -> Vec<(i64, String, String, StatsDelta)> {
        let mut rollups: HashMap<(i64, String, String), StatsDelta> = HashMap::new();

        for ((day_id, agent, source), delta) in &self.deltas {
            let (agent, source) = (agent.as_str(), source.as_str());
            let candidates = [
                (agent, source),
                ("all", source),
                (agent, "all"),
                ("all", "all"),
            ];

            for (position, pair) in candidates.iter().enumerate() {
                // A raw key that already uses "all" yields repeated pairs;
                // only the first occurrence may receive the delta.
                if candidates[..position].contains(pair) {
                    continue;
                }
                let slot = rollups
                    .entry((*day_id, pair.0.to_owned(), pair.1.to_owned()))
                    .or_default();
                slot.session_count_delta += delta.session_count_delta;
                slot.message_count_delta += delta.message_count_delta;
                slot.total_chars_delta += delta.total_chars_delta;
            }
        }

        let mut rows: Vec<(i64, String, String, StatsDelta)> = rollups
            .into_iter()
            .map(|((day, agent, source), delta)| (day, agent, source, delta))
            .collect();
        rows.sort_by(|lhs, rhs| (lhs.0, &lhs.1, &lhs.2).cmp(&(rhs.0, &rhs.1, &rhs.2)));
        rows
    }

    /// True when nothing has been recorded.
    pub fn is_empty(&self) -> bool {
        self.deltas.is_empty()
    }

    /// Number of distinct raw (day, agent, source) buckets recorded so far.
    pub fn raw_entry_count(&self) -> usize {
        self.deltas.len()
    }
}
14083
14084// -------------------------------------------------------------------------
14085// TokenStatsAggregator — Batched Token Analytics Daily Stats
14086// -------------------------------------------------------------------------
14087// Mirrors StatsAggregator pattern for token-level metrics.
14088// Aggregates token usage in memory during batch ingestion, then flushes
14089// to token_daily_stats in a single batched INSERT...ON CONFLICT operation.
14090
14091/// Accumulated token statistics delta for a single (day_id, agent, source, model_family) combination.
14092#[derive(Clone, Debug, Default)]
14093pub struct TokenStatsDelta {
14094    pub api_call_count: i64,
14095    pub user_message_count: i64,
14096    pub assistant_message_count: i64,
14097    pub tool_message_count: i64,
14098    pub total_input_tokens: i64,
14099    pub total_output_tokens: i64,
14100    pub total_cache_read_tokens: i64,
14101    pub total_cache_creation_tokens: i64,
14102    pub total_thinking_tokens: i64,
14103    pub grand_total_tokens: i64,
14104    pub total_content_chars: i64,
14105    pub total_tool_calls: i64,
14106    pub estimated_cost_usd: f64,
14107    pub session_count: i64,
14108}
14109
14110/// In-memory aggregator for batched token daily stats updates.
14111///
14112/// During batch ingestion, accumulate token deltas per (day_id, agent, source, model_family) key.
14113/// After processing, call `expand()` to generate the 5 permutation keys, then flush via
14114/// `update_token_daily_stats_batched_in_tx`.
14115#[derive(Debug, Default)]
14116pub struct TokenStatsAggregator {
14117    /// Raw deltas keyed by (day_id, agent_slug, source_id, model_family).
14118    deltas: HashMap<(i64, String, String, String), TokenStatsDelta>,
14119}
14120
14121impl TokenStatsAggregator {
14122    pub fn new() -> Self {
14123        Self {
14124            deltas: HashMap::new(),
14125        }
14126    }
14127
14128    /// Record a single message's token contribution.
14129    #[allow(clippy::too_many_arguments)]
14130    pub fn record(
14131        &mut self,
14132        agent_slug: &str,
14133        source_id: &str,
14134        day_id: i64,
14135        model_family: &str,
14136        role: &str,
14137        usage: &crate::connectors::ExtractedTokenUsage,
14138        content_chars: i64,
14139        estimated_cost_usd: f64,
14140    ) {
14141        let key = (
14142            day_id,
14143            agent_slug.to_owned(),
14144            source_id.to_owned(),
14145            model_family.to_owned(),
14146        );
14147        let delta = self.deltas.entry(key).or_default();
14148
14149        delta.api_call_count += 1;
14150        match role {
14151            "user" => delta.user_message_count += 1,
14152            "assistant" | "agent" => delta.assistant_message_count += 1,
14153            "tool" => delta.tool_message_count += 1,
14154            _ => {}
14155        }
14156
14157        delta.total_input_tokens += usage.input_tokens.unwrap_or(0);
14158        delta.total_output_tokens += usage.output_tokens.unwrap_or(0);
14159        delta.total_cache_read_tokens += usage.cache_read_tokens.unwrap_or(0);
14160        delta.total_cache_creation_tokens += usage.cache_creation_tokens.unwrap_or(0);
14161        delta.total_thinking_tokens += usage.thinking_tokens.unwrap_or(0);
14162        delta.grand_total_tokens += usage.total_tokens().unwrap_or(0);
14163        delta.total_content_chars += content_chars;
14164        delta.total_tool_calls += usage.tool_call_count as i64;
14165        delta.estimated_cost_usd += estimated_cost_usd;
14166    }
14167
14168    /// Record a session count bump for a given day/agent/source/model.
14169    pub fn record_session(
14170        &mut self,
14171        agent_slug: &str,
14172        source_id: &str,
14173        day_id: i64,
14174        model_family: &str,
14175    ) {
14176        let key = (
14177            day_id,
14178            agent_slug.to_owned(),
14179            source_id.to_owned(),
14180            model_family.to_owned(),
14181        );
14182        self.deltas.entry(key).or_default().session_count += 1;
14183    }
14184
14185    /// Expand raw deltas into 5 permutation keys for the 4-dimensional composite PK:
14186    /// - (agent, source, model)  — specific all three
14187    /// - ("all", source, model)  — all agents
14188    /// - (agent, "all", model)   — all sources
14189    /// - (agent, source, "all")  — all models
14190    /// - ("all", "all", "all")   — global total
14191    pub fn expand(&self) -> Vec<(i64, String, String, String, TokenStatsDelta)> {
14192        let mut expanded: HashMap<(i64, String, String, String), TokenStatsDelta> = HashMap::new();
14193
14194        for ((day_id, agent, source, model), delta) in &self.deltas {
14195            let permutations = [
14196                (agent.as_str(), source.as_str(), model.as_str()),
14197                ("all", source.as_str(), model.as_str()),
14198                (agent.as_str(), "all", model.as_str()),
14199                (agent.as_str(), source.as_str(), "all"),
14200                ("all", "all", "all"),
14201            ];
14202
14203            for idx in 0..permutations.len() {
14204                let (a, s, m) = permutations[idx];
14205                // Deduplicate if agent/source/model is already "all"
14206                if permutations[..idx].contains(&(a, s, m)) {
14207                    continue;
14208                }
14209                let key = (*day_id, a.to_owned(), s.to_owned(), m.to_owned());
14210                let entry = expanded.entry(key).or_default();
14211                entry.api_call_count += delta.api_call_count;
14212                entry.user_message_count += delta.user_message_count;
14213                entry.assistant_message_count += delta.assistant_message_count;
14214                entry.tool_message_count += delta.tool_message_count;
14215                entry.total_input_tokens += delta.total_input_tokens;
14216                entry.total_output_tokens += delta.total_output_tokens;
14217                entry.total_cache_read_tokens += delta.total_cache_read_tokens;
14218                entry.total_cache_creation_tokens += delta.total_cache_creation_tokens;
14219                entry.total_thinking_tokens += delta.total_thinking_tokens;
14220                entry.grand_total_tokens += delta.grand_total_tokens;
14221                entry.total_content_chars += delta.total_content_chars;
14222                entry.total_tool_calls += delta.total_tool_calls;
14223                entry.estimated_cost_usd += delta.estimated_cost_usd;
14224                entry.session_count += delta.session_count;
14225            }
14226        }
14227
14228        let mut out: Vec<(i64, String, String, String, TokenStatsDelta)> = expanded
14229            .into_iter()
14230            .map(|((d, a, s, m), delta)| (d, a, s, m, delta))
14231            .collect();
14232        out.sort_by(|(d1, a1, s1, m1, _), (d2, a2, s2, m2, _)| {
14233            d1.cmp(d2)
14234                .then_with(|| a1.cmp(a2))
14235                .then_with(|| s1.cmp(s2))
14236                .then_with(|| m1.cmp(m2))
14237        });
14238        out
14239    }
14240
14241    pub fn is_empty(&self) -> bool {
14242        self.deltas.is_empty()
14243    }
14244
14245    pub fn raw_entry_count(&self) -> usize {
14246        self.deltas.len()
14247    }
14248}
14249
14250// -------------------------------------------------------------------------
14251// AnalyticsRollupAggregator — Batched usage_hourly + usage_daily Updates
14252// -------------------------------------------------------------------------
14253// Accumulates per-message deltas in memory, then flushes to both
14254// usage_hourly and usage_daily in a single batched operation.
14255
/// Accumulated usage for one (bucket, agent_slug, workspace_id, source_id) rollup key.
///
/// The per-field notes describe how `AnalyticsRollupAggregator::record`
/// fills the struct; "API message" means a message whose `api_data_source`
/// is `"api"`.
#[derive(Clone, Debug, Default)]
pub struct UsageRollupDelta {
    /// Messages recorded into this bucket.
    pub message_count: i64,
    /// Messages whose role is `"user"`.
    pub user_message_count: i64,
    /// Messages whose role is `"assistant"` or `"agent"`.
    pub assistant_message_count: i64,
    /// Sum of the per-message tool call counts.
    pub tool_call_count: i64,
    /// Messages flagged `has_plan`.
    pub plan_message_count: i64,
    /// Estimated content tokens of plan messages.
    pub plan_content_tokens_est_total: i64,
    /// API token total of plan messages that are API messages.
    pub plan_api_tokens_total: i64,
    /// Number of API messages.
    pub api_coverage_message_count: i64,
    /// Estimated content tokens over all messages.
    pub content_tokens_est_total: i64,
    /// Estimated content tokens of user messages.
    pub content_tokens_est_user: i64,
    /// Estimated content tokens of assistant/agent messages.
    pub content_tokens_est_assistant: i64,
    /// Sum of input, output, cache-read, cache-creation and thinking tokens of API messages.
    pub api_tokens_total: i64,
    /// Input tokens of API messages.
    pub api_input_tokens_total: i64,
    /// Output tokens of API messages.
    pub api_output_tokens_total: i64,
    /// Cache-read tokens of API messages.
    pub api_cache_read_tokens_total: i64,
    /// Cache-creation tokens of API messages.
    pub api_cache_creation_tokens_total: i64,
    /// Thinking tokens of API messages.
    pub api_thinking_tokens_total: i64,
}
14277
/// One message_metrics row waiting to be inserted as part of a batch.
///
/// The same entry is what `AnalyticsRollupAggregator::record` consumes to
/// build the usage rollups.
#[derive(Debug, Clone)]
pub struct MessageMetricsEntry {
    pub message_id: i64,
    pub created_at_ms: i64,
    /// Bucket ID used for the hourly rollup.
    pub hour_id: i64,
    /// Bucket ID used for the daily and per-model daily rollups.
    pub day_id: i64,
    pub agent_slug: String,
    pub workspace_id: i64,
    pub source_id: String,
    /// Rollups treat `"user"` as user and `"assistant"`/`"agent"` as assistant.
    pub role: String,
    pub content_chars: i64,
    /// Estimated token count of the message content.
    pub content_tokens_est: i64,
    pub model_name: Option<String>,
    pub model_family: String,
    pub model_tier: String,
    pub provider: String,
    pub api_input_tokens: Option<i64>,
    pub api_output_tokens: Option<i64>,
    pub api_cache_read_tokens: Option<i64>,
    pub api_cache_creation_tokens: Option<i64>,
    pub api_thinking_tokens: Option<i64>,
    pub api_service_tier: Option<String>,
    /// Rollups only add the `api_*` token fields when this equals `"api"`.
    pub api_data_source: String,
    pub tool_call_count: i64,
    pub has_tool_calls: bool,
    pub has_plan: bool,
}
14306
14307/// In-memory aggregator for batched usage_hourly and usage_daily rollup updates.
14308///
14309/// Keyed by (bucket_id, agent_slug, workspace_id, source_id).
14310/// Maintains separate hourly and daily delta maps.
14311#[derive(Debug, Default)]
14312pub struct AnalyticsRollupAggregator {
14313    hourly: HashMap<(i64, String, i64, String), UsageRollupDelta>,
14314    daily: HashMap<(i64, String, i64, String), UsageRollupDelta>,
14315    models_daily: HashMap<(i64, String, i64, String, String, String), UsageRollupDelta>,
14316}
14317
14318impl AnalyticsRollupAggregator {
14319    pub fn new() -> Self {
14320        Self::default()
14321    }
14322
14323    /// Record a single message's contribution to both hourly and daily rollups.
14324    pub fn record(&mut self, entry: &MessageMetricsEntry) {
14325        let content_est = entry.content_tokens_est;
14326        let api_total = entry.api_input_tokens.unwrap_or(0)
14327            + entry.api_output_tokens.unwrap_or(0)
14328            + entry.api_cache_read_tokens.unwrap_or(0)
14329            + entry.api_cache_creation_tokens.unwrap_or(0)
14330            + entry.api_thinking_tokens.unwrap_or(0);
14331        let is_api = entry.api_data_source == "api";
14332        let is_user = entry.role == "user";
14333        let is_assistant = entry.role == "assistant" || entry.role == "agent";
14334
14335        // Apply to both hourly and daily
14336        for (map, bucket_id) in [
14337            (&mut self.hourly, entry.hour_id),
14338            (&mut self.daily, entry.day_id),
14339        ] {
14340            let key = (
14341                bucket_id,
14342                entry.agent_slug.clone(),
14343                entry.workspace_id,
14344                entry.source_id.clone(),
14345            );
14346            let d = map.entry(key).or_default();
14347            d.message_count += 1;
14348            if is_user {
14349                d.user_message_count += 1;
14350                d.content_tokens_est_user += content_est;
14351            }
14352            if is_assistant {
14353                d.assistant_message_count += 1;
14354                d.content_tokens_est_assistant += content_est;
14355            }
14356            d.tool_call_count += entry.tool_call_count;
14357            if entry.has_plan {
14358                d.plan_message_count += 1;
14359                d.plan_content_tokens_est_total += content_est;
14360                if is_api {
14361                    d.plan_api_tokens_total += api_total;
14362                }
14363            }
14364            if is_api {
14365                d.api_coverage_message_count += 1;
14366                d.api_tokens_total += api_total;
14367                d.api_input_tokens_total += entry.api_input_tokens.unwrap_or(0);
14368                d.api_output_tokens_total += entry.api_output_tokens.unwrap_or(0);
14369                d.api_cache_read_tokens_total += entry.api_cache_read_tokens.unwrap_or(0);
14370                d.api_cache_creation_tokens_total += entry.api_cache_creation_tokens.unwrap_or(0);
14371                d.api_thinking_tokens_total += entry.api_thinking_tokens.unwrap_or(0);
14372            }
14373            d.content_tokens_est_total += content_est;
14374        }
14375
14376        let model_key = (
14377            entry.day_id,
14378            entry.agent_slug.clone(),
14379            entry.workspace_id,
14380            entry.source_id.clone(),
14381            entry.model_family.clone(),
14382            entry.model_tier.clone(),
14383        );
14384        let d = self.models_daily.entry(model_key).or_default();
14385        d.message_count += 1;
14386        if is_user {
14387            d.user_message_count += 1;
14388            d.content_tokens_est_user += content_est;
14389        }
14390        if is_assistant {
14391            d.assistant_message_count += 1;
14392            d.content_tokens_est_assistant += content_est;
14393        }
14394        d.tool_call_count += entry.tool_call_count;
14395        if entry.has_plan {
14396            d.plan_message_count += 1;
14397            d.plan_content_tokens_est_total += content_est;
14398            if is_api {
14399                d.plan_api_tokens_total += api_total;
14400            }
14401        }
14402        if is_api {
14403            d.api_coverage_message_count += 1;
14404            d.api_tokens_total += api_total;
14405            d.api_input_tokens_total += entry.api_input_tokens.unwrap_or(0);
14406            d.api_output_tokens_total += entry.api_output_tokens.unwrap_or(0);
14407            d.api_cache_read_tokens_total += entry.api_cache_read_tokens.unwrap_or(0);
14408            d.api_cache_creation_tokens_total += entry.api_cache_creation_tokens.unwrap_or(0);
14409            d.api_thinking_tokens_total += entry.api_thinking_tokens.unwrap_or(0);
14410        }
14411        d.content_tokens_est_total += content_est;
14412    }
14413
14414    pub fn is_empty(&self) -> bool {
14415        self.hourly.is_empty() && self.daily.is_empty() && self.models_daily.is_empty()
14416    }
14417
14418    pub fn hourly_entry_count(&self) -> usize {
14419        self.hourly.len()
14420    }
14421
14422    pub fn daily_entry_count(&self) -> usize {
14423        self.daily.len()
14424    }
14425
14426    pub fn models_daily_entry_count(&self) -> usize {
14427        self.models_daily.len()
14428    }
14429}
14430
14431/// Whether the current role should be considered for plan attribution.
14432///
14433/// Plan attribution v2 defaults to assistant/agent messages only.
14434fn has_plan_for_role(role: &str, content: &str) -> bool {
14435    let role = role.trim();
14436    (role.eq_ignore_ascii_case("assistant") || role.eq_ignore_ascii_case("agent"))
14437        && has_plan_heuristic(content)
14438}
14439
14440/// Heuristic to detect "plan" messages.
14441///
14442/// v2 behavior:
14443/// - Require an explicit plan marker near the top of the message.
14444/// - Require structured steps (numbered or bullets) to reduce false positives.
14445/// - Avoid classifying tool-output blobs as plans.
14446fn has_plan_heuristic(content: &str) -> bool {
14447    if content.len() < 24 {
14448        return false;
14449    }
14450
14451    let lower = content.to_lowercase();
14452
14453    // Ignore tool-output-like blobs unless they also have a strong plan header.
14454    let looks_like_tool_blob = lower.contains("```")
14455        || lower.contains("\"tool\"")
14456        || lower.contains("stdout:")
14457        || lower.contains("stderr:")
14458        || lower.contains("exit code:");
14459
14460    let mut lines: Vec<&str> = Vec::with_capacity(60);
14461    let mut in_fenced_code = false;
14462    for raw in lower.lines() {
14463        let line = raw.trim();
14464        if line.starts_with("```") {
14465            in_fenced_code = !in_fenced_code;
14466            continue;
14467        }
14468        if in_fenced_code || line.is_empty() {
14469            continue;
14470        }
14471        lines.push(line);
14472        if lines.len() >= 60 {
14473            break;
14474        }
14475    }
14476
14477    let header_pos = lines.iter().position(|line| {
14478        line.starts_with("## plan")
14479            || line.starts_with("# plan")
14480            || line.starts_with("plan:")
14481            || line.starts_with("implementation plan")
14482            || line.starts_with("next steps:")
14483            || line.starts_with("action plan:")
14484    });
14485    let preview_top = lines.iter().take(8).copied().collect::<Vec<_>>().join("\n");
14486    let header_near_top = header_pos.is_some_and(|idx| idx <= 6) || preview_top.contains("plan:");
14487
14488    if !header_near_top {
14489        return false;
14490    }
14491    if looks_like_tool_blob && header_pos.is_none() {
14492        return false;
14493    }
14494
14495    let numbered_steps = lines
14496        .iter()
14497        .filter(|line| is_numbered_step_line(line))
14498        .count();
14499    let bullet_steps = lines
14500        .iter()
14501        .filter(|line| {
14502            line.starts_with("- ")
14503                || line.starts_with("* ")
14504                || line.starts_with("+ ")
14505                || line.starts_with("- [ ] ")
14506                || line.starts_with("- [x] ")
14507        })
14508        .count();
14509
14510    numbered_steps >= 2 || (numbered_steps >= 1 && bullet_steps >= 1) || bullet_steps >= 3
14511}
14512
/// Returns `true` for lines shaped like `1. step` or `12) step`: after leading
/// whitespace, one to three ASCII digits followed by `". "` or `") "`.
fn is_numbered_step_line(line: &str) -> bool {
    let candidate = line.trim_start();
    // ASCII digits are single bytes, so the byte count doubles as a slice offset.
    let digit_run = candidate
        .bytes()
        .take_while(u8::is_ascii_digit)
        .count();
    (1..=3).contains(&digit_run)
        && matches!(
            &candidate.as_bytes()[digit_run..],
            [b'.' | b')', b' ', ..]
        )
}
14522
/// Pending token_usage row to be batch-inserted.
///
/// Plain data carrier: all fields are public and the type only derives
/// `Debug` and `Clone`.
///
/// NOTE(review): the per-field notes below are inferred from field names and
/// sibling types in this file; confirm against the `token_usage` schema.
#[derive(Debug, Clone)]
pub struct TokenUsageEntry {
    // Identity / attribution — presumably row ids of the related tables (TODO confirm).
    pub message_id: i64,
    pub conversation_id: i64,
    pub agent_id: i64,
    pub workspace_id: Option<i64>,
    pub source_id: String,
    // Timing — `day_id` presumably uses the same "days since 2020-01-01"
    // convention documented on `PricingEntry::effective_day_id` (TODO confirm).
    pub timestamp_ms: i64,
    pub day_id: i64,
    // Model classification; every part is optional.
    pub model_name: Option<String>,
    pub model_family: Option<String>,
    pub model_tier: Option<String>,
    pub service_tier: Option<String>,
    pub provider: Option<String>,
    // Token counts; `None` presumably means "not reported" rather than zero (TODO confirm).
    pub input_tokens: Option<i64>,
    pub output_tokens: Option<i64>,
    pub cache_read_tokens: Option<i64>,
    pub cache_creation_tokens: Option<i64>,
    pub thinking_tokens: Option<i64>,
    pub total_tokens: Option<i64>,
    pub estimated_cost_usd: Option<f64>,
    // Message shape.
    pub role: String,
    pub content_chars: i64,
    pub has_tool_calls: bool,
    pub tool_call_count: u32,
    // NOTE(review): presumably the provenance of the token numbers (e.g. "api") — confirm.
    pub data_source: String,
}
14551
14552// -------------------------------------------------------------------------
14553// PricingTable — In-memory cache for model_pricing lookups (bead z9fse.10)
14554// -------------------------------------------------------------------------
14555
/// One pricing row loaded from the `model_pricing` table.
///
/// Built by `PricingTable::franken_load`, which reads the columns in the
/// order `model_pattern, provider, input_cost_per_mtok, output_cost_per_mtok,
/// cache_read_cost_per_mtok, cache_creation_cost_per_mtok, effective_date`.
#[derive(Debug, Clone)]
pub struct PricingEntry {
    /// SQL LIKE pattern (`%` / `_` wildcards) matched against model names by
    /// `PricingTable::lookup`.
    pub model_pattern: String,
    pub provider: String,
    /// Rate applied per 1,000,000 non-cache input tokens in `compute_cost`.
    pub input_cost_per_mtok: f64,
    /// Rate applied per 1,000,000 output tokens in `compute_cost`.
    pub output_cost_per_mtok: f64,
    /// Rate per 1,000,000 cache-read tokens; when `None`, `compute_cost`
    /// adds nothing for cache reads.
    pub cache_read_cost_per_mtok: Option<f64>,
    /// Rate per 1,000,000 cache-creation tokens; when `None`, `compute_cost`
    /// adds nothing for cache creation.
    pub cache_creation_cost_per_mtok: Option<f64>,
    /// Effective date as day_id (days since 2020-01-01).
    pub effective_day_id: i64,
}
14568
14569/// Diagnostics for pricing coverage during a batch operation.
14570#[derive(Debug, Clone, Default)]
14571pub struct PricingDiagnostics {
14572    pub priced_count: u64,
14573    pub unpriced_count: u64,
14574    /// Top unknown model names → count.
14575    pub unknown_models: HashMap<String, u64>,
14576}
14577
14578impl PricingDiagnostics {
14579    fn record_priced(&mut self) {
14580        self.priced_count += 1;
14581    }
14582
14583    fn record_unpriced(&mut self, model_name: Option<&str>) {
14584        self.unpriced_count += 1;
14585        let key = model_name.unwrap_or("(none)").to_string();
14586        *self.unknown_models.entry(key).or_insert(0) += 1;
14587    }
14588
14589    /// Log a summary of pricing coverage.
14590    pub fn log_summary(&self) {
14591        let total = self.priced_count + self.unpriced_count;
14592        if total == 0 {
14593            return;
14594        }
14595        let pct = (self.priced_count as f64 / total as f64) * 100.0;
14596        tracing::info!(
14597            target: "cass::analytics::pricing",
14598            priced = self.priced_count,
14599            unpriced = self.unpriced_count,
14600            total = total,
14601            coverage_pct = format!("{pct:.1}%"),
14602            "pricing coverage"
14603        );
14604        if !self.unknown_models.is_empty() {
14605            let mut sorted: Vec<_> = self.unknown_models.iter().collect();
14606            sorted.sort_by(|a, b| b.1.cmp(a.1));
14607            for (model, count) in sorted.iter().take(5) {
14608                tracing::debug!(
14609                    target: "cass::analytics::pricing",
14610                    model = model.as_str(),
14611                    count = count,
14612                    "unknown model (no pricing)"
14613                );
14614            }
14615        }
14616    }
14617}
14618
14619/// In-memory pricing table loaded from `model_pricing` for fast lookups.
14620#[derive(Debug, Clone)]
14621pub struct PricingTable {
14622    entries: Vec<PricingEntry>,
14623}
14624
14625impl PricingTable {
14626    /// Load all pricing entries from the database.
14627    pub fn load(conn: &FrankenConnection) -> Result<Self> {
14628        Self::franken_load(conn)
14629    }
14630
14631    /// Load all pricing entries from a frankensqlite connection.
14632    pub fn franken_load(conn: &FrankenConnection) -> Result<Self> {
14633        let rows = conn.query(
14634            "SELECT model_pattern, provider, input_cost_per_mtok, output_cost_per_mtok,
14635                    cache_read_cost_per_mtok, cache_creation_cost_per_mtok, effective_date
14636             FROM model_pricing
14637             ORDER BY effective_date DESC",
14638        )?;
14639        let mut entries = Vec::with_capacity(rows.len());
14640        for row in &rows {
14641            let effective_date: String = row.get_typed(6)?;
14642            let effective_day_id = date_str_to_day_id(&effective_date)?;
14643            entries.push(PricingEntry {
14644                model_pattern: row.get_typed(0)?,
14645                provider: row.get_typed(1)?,
14646                input_cost_per_mtok: row.get_typed(2)?,
14647                output_cost_per_mtok: row.get_typed(3)?,
14648                cache_read_cost_per_mtok: row.get_typed(4)?,
14649                cache_creation_cost_per_mtok: row.get_typed(5)?,
14650                effective_day_id,
14651            });
14652        }
14653        Ok(Self { entries })
14654    }
14655
14656    /// Look up the best pricing entry for a given model name and date.
14657    ///
14658    /// Selection rules:
14659    /// 1. Pattern must match model_name (SQL LIKE semantics).
14660    /// 2. effective_day_id must be <= message_day_id.
14661    /// 3. Among matches, prefer the most recent effective_date.
14662    /// 4. Tie-break by pattern specificity (longest pattern wins).
14663    pub fn lookup(&self, model_name: &str, message_day_id: i64) -> Option<&PricingEntry> {
14664        let mut best: Option<&PricingEntry> = None;
14665
14666        for entry in &self.entries {
14667            if entry.effective_day_id > message_day_id {
14668                continue;
14669            }
14670            if !sql_like_match(model_name, &entry.model_pattern) {
14671                continue;
14672            }
14673
14674            match best {
14675                None => best = Some(entry),
14676                Some(current) => {
14677                    if entry.effective_day_id > current.effective_day_id
14678                        || (entry.effective_day_id == current.effective_day_id
14679                            && entry.model_pattern.len() > current.model_pattern.len())
14680                    {
14681                        best = Some(entry);
14682                    }
14683                }
14684            }
14685        }
14686
14687        best
14688    }
14689
14690    /// Compute estimated cost in USD for a set of token counts.
14691    ///
14692    /// Returns `None` if no pricing entry matches or if no token counts are available.
14693    pub fn compute_cost(
14694        &self,
14695        model_name: Option<&str>,
14696        message_day_id: i64,
14697        input_tokens: Option<i64>,
14698        output_tokens: Option<i64>,
14699        cache_read_tokens: Option<i64>,
14700        cache_creation_tokens: Option<i64>,
14701    ) -> Option<f64> {
14702        let model = model_name?;
14703        let pricing = self.lookup(model, message_day_id)?;
14704
14705        if input_tokens.is_none() && output_tokens.is_none() {
14706            return None;
14707        }
14708
14709        let mut cost = 0.0;
14710        let cache_read = cache_read_tokens.unwrap_or(0);
14711        let cache_creation = cache_creation_tokens.unwrap_or(0);
14712        // input_tokens includes cache tokens as a subset; subtract them
14713        // so we don't charge at both the full input rate AND the cache rate.
14714        let non_cache_input = input_tokens
14715            .unwrap_or(0)
14716            .saturating_sub(cache_read)
14717            .saturating_sub(cache_creation)
14718            .max(0);
14719        cost += non_cache_input as f64 * pricing.input_cost_per_mtok / 1_000_000.0;
14720        cost += output_tokens.unwrap_or(0) as f64 * pricing.output_cost_per_mtok / 1_000_000.0;
14721
14722        if let Some(cache_price) = pricing.cache_read_cost_per_mtok {
14723            cost += cache_read as f64 * cache_price / 1_000_000.0;
14724        }
14725        if let Some(cache_price) = pricing.cache_creation_cost_per_mtok {
14726            cost += cache_creation as f64 * cache_price / 1_000_000.0;
14727        }
14728
14729        Some(cost)
14730    }
14731
14732    /// Whether the pricing table has any entries.
14733    pub fn is_empty(&self) -> bool {
14734        self.entries.is_empty()
14735    }
14736}
14737
14738/// Convert "YYYY-MM-DD" date string to day_id (days since 2020-01-01),
14739/// matching the format produced by `day_id_from_millis`.
14740fn date_str_to_day_id(s: &str) -> Result<i64> {
14741    use chrono::NaiveDate;
14742    const EPOCH_2020: NaiveDate = match NaiveDate::from_ymd_opt(2020, 1, 1) {
14743        Some(d) => d,
14744        None => unreachable!(),
14745    };
14746    NaiveDate::parse_from_str(s, "%Y-%m-%d")
14747        .map(|d| (d - EPOCH_2020).num_days())
14748        .with_context(|| format!("invalid effective_date '{s}'"))
14749}
14750
/// SQL LIKE pattern matcher (ASCII case-insensitive). `%` = any sequence, `_` = any single char.
///
/// Both arguments are ASCII-lowercased before matching; non-ASCII characters
/// are compared byte-for-byte.
fn sql_like_match(value: &str, pattern: &str) -> bool {
    sql_like_match_bytes(
        value.to_ascii_lowercase().as_bytes(),
        pattern.to_ascii_lowercase().as_bytes(),
    )
}

/// Determine the byte length of the UTF-8 character starting at `b`.
///
/// `b` is expected to be the first byte of a character; continuation bytes
/// are not detected and are reported as length 2.
fn utf8_char_len(b: u8) -> usize {
    if b < 0x80 {
        1
    } else if b < 0xE0 {
        2
    } else if b < 0xF0 {
        3
    } else {
        4
    }
}

/// Byte-level LIKE matcher over already case-folded, valid UTF-8 input.
///
/// Iterative with single-point backtracking: on a mismatch the matcher only
/// rewinds to the most recent `%` run and lets it absorb one more character.
/// That is sufficient because everything between two `%` runs consumes a
/// fixed number of characters. Worst-case work is O(len(val) * len(pat)) with
/// no recursion, where naive recursive backtracking is exponential for
/// patterns such as `%a%a%a%b`.
fn sql_like_match_bytes(val: &[u8], pat: &[u8]) -> bool {
    let mut v = 0usize;
    let mut p = 0usize;
    // (pattern index just past the latest `%` run, value index that run currently ends at)
    let mut resume: Option<(usize, usize)> = None;

    while v < val.len() {
        match pat.get(p).copied() {
            Some(b'%') => {
                // Consecutive `%` are equivalent to one.
                while pat.get(p) == Some(&b'%') {
                    p += 1;
                }
                if p == pat.len() {
                    // A trailing `%` absorbs whatever is left of the value.
                    return true;
                }
                resume = Some((p, v));
                continue;
            }
            Some(b'_') => {
                // Match one full UTF-8 character, not just one byte.
                let width = utf8_char_len(val[v]);
                if v + width <= val.len() {
                    v += width;
                    p += 1;
                    continue;
                }
            }
            Some(literal) if literal == val[v] => {
                v += 1;
                p += 1;
                continue;
            }
            _ => {}
        }

        // Mismatch: let the latest `%` swallow one more character (advancing
        // only at UTF-8 character boundaries) and retry the rest of the pattern.
        let Some((after_wildcard, absorbed_until)) = resume else {
            return false;
        };
        let next = absorbed_until + utf8_char_len(val[absorbed_until]);
        if next > val.len() {
            return false;
        }
        resume = Some((after_wildcard, next));
        p = after_wildcard;
        v = next;
    }

    // Value exhausted: only `%` may remain in the pattern.
    pat[p..].iter().all(|&b| b == b'%')
}
14808
14809fn rebuild_batch_size_env(var: &str, default: usize) -> usize {
14810    dotenvy::var(var)
14811        .ok()
14812        .and_then(|raw| raw.parse::<usize>().ok())
14813        .filter(|value| *value > 0)
14814        .unwrap_or(default)
14815}
14816
14817/// Returns true when the error chain represents a real `FrankenError::OutOfMemory`
14818/// (typed variant) or a bare "out of memory" / "not enough memory" message.
14819///
14820/// We *deliberately* do not do substring matching on the rendered chain: frankensqlite's
14821/// `FrankenError::OutOfMemory` renders as the literal "out of memory" and is also emitted
14822/// for several non-process-OOM internal conditions (VFS buffer / VDBE register allocation).
14823/// Contextual messages like "connector parse failed: not enough memory in record" must not
14824/// be promoted into the OOM-bisect/quarantine path. See `retryable_franken_anyhow` above
14825/// for the same downcast idiom.
14826fn is_out_of_memory_error<E: OutOfMemoryProbe + ?Sized>(err: &E) -> bool {
14827    err.is_out_of_memory()
14828}
14829
14830trait OutOfMemoryProbe {
14831    fn is_out_of_memory(&self) -> bool;
14832}
14833
14834impl OutOfMemoryProbe for anyhow::Error {
14835    fn is_out_of_memory(&self) -> bool {
14836        self.chain().any(|cause| {
14837            if cause
14838                .downcast_ref::<frankensqlite::FrankenError>()
14839                .is_some_and(|err| matches!(err, frankensqlite::FrankenError::OutOfMemory))
14840            {
14841                return true;
14842            }
14843            is_exact_out_of_memory_message(&cause.to_string())
14844        })
14845    }
14846}
14847
14848impl OutOfMemoryProbe for frankensqlite::FrankenError {
14849    fn is_out_of_memory(&self) -> bool {
14850        matches!(self, frankensqlite::FrankenError::OutOfMemory)
14851    }
14852}
14853
/// `true` when `message`, after trimming surrounding whitespace, is exactly
/// "out of memory" or "not enough memory" (ASCII case-insensitive).
fn is_exact_out_of_memory_message(message: &str) -> bool {
    let bare = message.trim();
    ["out of memory", "not enough memory"]
        .iter()
        .any(|phrase| bare.eq_ignore_ascii_case(phrase))
}
14860
14861// Second SqliteStorage impl block removed: SqliteStorage is now a type alias for FrankenStorage.
14862// All methods (insert_conversation_tree, list_agents, list_conversations, etc.) are
14863// available through FrankenStorage.
14864
/// Daily count data for histogram display.
///
/// NOTE(review): `day_id` presumably follows the "days since 2020-01-01"
/// convention used elsewhere in this file; the three counters are presumably
/// totals for that day — confirm against the producing query.
#[derive(Debug, Clone)]
pub struct DailyCount {
    pub day_id: i64,
    pub sessions: i64,
    pub messages: i64,
    pub chars: i64,
}
14873
/// Result of an analytics rebuild operation.
///
/// NOTE(review): the `*_rows` fields presumably report how many rows were
/// written to the correspondingly named analytics tables, and
/// `messages_per_sec` is presumably derived from the message count and
/// `elapsed_ms` — confirm against the rebuild implementation.
#[derive(Debug, Clone)]
pub struct AnalyticsRebuildResult {
    pub message_metrics_rows: usize,
    pub usage_hourly_rows: usize,
    pub usage_daily_rows: usize,
    pub usage_models_daily_rows: usize,
    pub elapsed_ms: u64,
    pub messages_per_sec: f64,
}
14884
/// Result of rebuilding daily stats.
///
/// NOTE(review): presumably the number of daily-stats rows produced and the
/// total session count they cover — confirm against the rebuild routine.
#[derive(Debug, Clone)]
pub struct DailyStatsRebuildResult {
    pub rows_created: i64,
    pub total_sessions: i64,
}
14891
/// Result of purging archived data for a single agent.
///
/// `Default` yields zero for both counters.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub struct AgentArchivePurgeResult {
    /// Count of deleted conversations.
    pub conversations_deleted: usize,
    /// Count of deleted messages.
    pub messages_deleted: usize,
}
14898
/// Health status of daily stats table.
///
/// NOTE(review): `drift` presumably measures the disagreement between
/// `conversation_count` and `materialized_total`; its sign convention is not
/// visible here — confirm against the code that fills this struct.
#[derive(Debug, Clone)]
pub struct DailyStatsHealth {
    pub populated: bool,
    pub row_count: i64,
    /// `None` presumably means no row carries an update timestamp (TODO confirm).
    pub oldest_update_ms: Option<i64>,
    pub conversation_count: i64,
    pub materialized_total: i64,
    pub drift: i64,
}
14909
14910// -------------------------------------------------------------------------
14911// FTS5 Batch Insert (P2 Opt 2.1)
14912// -------------------------------------------------------------------------
14913
/// Batch size for FTS5 inserts. With 7 columns per row (rowid + 6 cols) and
/// SQLite's SQLITE_MAX_VARIABLE_NUMBER default of 999, max batch is ~142 rows
/// (999 / 7 rounded down).
/// Using 100 for safety margin and memory efficiency.
const FTS5_BATCH_SIZE: usize = 100;
14918
/// One message row as carried through the FTS rebuild.
///
/// NOTE(review): `rowid` and `message_id` are separate fields; whether they
/// can differ is not visible here — confirm against the rebuild query.
#[derive(Debug, Clone)]
struct FtsRebuildMessageRow {
    rowid: i64,
    message_id: i64,
    conversation_id: i64,
    content: String,
    created_at: Option<i64>,
}
14927
/// Conversation-level columns needed alongside a message for FTS indexing.
///
/// NOTE(review): presumably looked up per `conversation_id` during the FTS
/// rebuild; agent and workspace are kept as optional ids here rather than as
/// the string forms stored on `FtsEntry` — confirm where they are resolved.
#[derive(Debug, Clone)]
struct FtsConversationProjection {
    title: String,
    agent_id: Option<i64>,
    workspace_id: Option<i64>,
    source_path: String,
}
14935
14936/// Entry for pending FTS5 insert.
14937#[derive(Debug, Clone)]
14938pub struct FtsEntry {
14939    pub content: String,
14940    pub title: String,
14941    pub agent: String,
14942    pub workspace: String,
14943    pub source_path: String,
14944    pub created_at: Option<i64>,
14945    pub message_id: i64,
14946}
14947
14948impl FtsEntry {
14949    /// Create an FTS entry from a message and conversation.
14950    pub fn from_message(message_id: i64, msg: &Message, conv: &Conversation) -> Self {
14951        FtsEntry {
14952            content: msg.content.clone(),
14953            title: conv.title.clone().unwrap_or_default(),
14954            agent: conv.agent_slug.clone(),
14955            workspace: conv
14956                .workspace
14957                .as_ref()
14958                .map(|p| p.to_string_lossy().into_owned())
14959                .unwrap_or_default(),
14960            source_path: path_to_string(&conv.source_path),
14961            created_at: msg.created_at.or(conv.started_at),
14962            message_id,
14963        }
14964    }
14965}
14966
// NOTE(review): presumably the flush thresholds for the pending `FtsEntry`
// buffer — at most 512 entries or 1 MiB (1024 * 1024) of accumulated content
// per batch; the units of the second limit (chars vs bytes) should be
// confirmed at the call site.
const FTS_ENTRY_BATCH_MAX_DOCS: usize = 512;
const FTS_ENTRY_BATCH_MAX_CHARS: usize = 1024 * 1024;
14969
14970/// Default batch size for the FTS rebuild INSERT (Bug #168).  When
14971/// `fts_messages` is empty but `messages` has 100K+ rows, a single unbounded
14972/// INSERT-SELECT OOMs.  This constant caps each batch so peak memory stays
14973/// bounded.  Override via `CASS_FTS_REBUILD_BATCH_SIZE` for tuning.
14974const FTS_REBUILD_BATCH_SIZE_DEFAULT: usize = 5_000;
14975
14976/// Read the FTS rebuild batch size from the environment, falling back to the
14977/// compiled-in default.
14978fn fts_rebuild_batch_size() -> usize {
14979    dotenvy::var("CASS_FTS_REBUILD_BATCH_SIZE")
14980        .ok()
14981        .and_then(|v| v.parse::<usize>().ok())
14982        .filter(|&n| n > 0)
14983        .unwrap_or(FTS_REBUILD_BATCH_SIZE_DEFAULT)
14984}
14985
14986fn flush_pending_fts_entries(
14987    storage: &FrankenStorage,
14988    tx: &FrankenTransaction<'_>,
14989    entries: &mut Vec<FtsEntry>,
14990    pending_chars: &mut usize,
14991    inserted_total: &mut usize,
14992) -> Result<()> {
14993    if entries.is_empty() {
14994        return Ok(());
14995    }
14996
14997    if storage.fts_messages_present_cached(tx) {
14998        *inserted_total += franken_batch_insert_fts(tx, entries)?;
14999    }
15000    entries.clear();
15001    *pending_chars = 0;
15002    Ok(())
15003}
15004
/// Renders a path as an owned `String`, converting lossily if the path is not valid Unicode.
fn path_to_string<P: AsRef<Path>>(p: P) -> String {
    let path: &Path = p.as_ref();
    String::from(path.to_string_lossy())
}
15008
15009fn role_str(role: &MessageRole) -> String {
15010    role_as_str(role).to_owned()
15011}
15012
15013fn role_as_str(role: &MessageRole) -> &str {
15014    match role {
15015        MessageRole::User => "user",
15016        MessageRole::Agent => "agent",
15017        MessageRole::Tool => "tool",
15018        MessageRole::System => "system",
15019        MessageRole::Other(v) => v.as_str(),
15020    }
15021}
15022
15023fn agent_kind_str(kind: AgentKind) -> String {
15024    match kind {
15025        AgentKind::Cli => "cli".into(),
15026        AgentKind::VsCode => "vscode".into(),
15027        AgentKind::Hybrid => "hybrid".into(),
15028    }
15029}
15030
15031// =============================================================================
15032// Tests (bead yln.4)
15033// =============================================================================
15034
15035#[cfg(test)]
15036mod tests {
15037    use super::*;
15038    use serial_test::serial;
15039    use tempfile::TempDir;
15040
15041    struct EnvGuard {
15042        key: &'static str,
15043        previous: Option<String>,
15044    }
15045
15046    impl Drop for EnvGuard {
15047        fn drop(&mut self) {
15048            if let Some(value) = &self.previous {
15049                // SAFETY: test helper restores prior process env for isolation.
15050                unsafe {
15051                    std::env::set_var(self.key, value);
15052                }
15053            } else {
15054                // SAFETY: test helper restores prior process env for isolation.
15055                unsafe {
15056                    std::env::remove_var(self.key);
15057                }
15058            }
15059        }
15060    }
15061
15062    fn set_env_var(key: &'static str, value: impl AsRef<str>) -> EnvGuard {
15063        let previous = dotenvy::var(key).ok();
15064        // SAFETY: test helper toggles a process-local env var for isolation.
15065        unsafe {
15066            std::env::set_var(key, value.as_ref());
15067        }
15068        EnvGuard { key, previous }
15069    }
15070
15071    #[test]
15072    fn doctor_mutation_open_guard_only_targets_canonical_archive_db() {
15073        let dir = TempDir::new().unwrap();
15074        let canonical = dir.path().join("agent_search.db");
15075        let scratch = dir.path().join("scratch.db");
15076
15077        assert_eq!(
15078            doctor_mutation_lock_path_for_db_open(&canonical),
15079            Some(dir.path().join("doctor/locks/doctor-repair.lock"))
15080        );
15081        assert_eq!(doctor_mutation_lock_path_for_db_open(&scratch), None);
15082    }
15083
15084    #[test]
15085    fn doctor_lock_metadata_pid_detection_is_exact() {
15086        let current = std::process::id();
15087
15088        assert!(doctor_lock_metadata_pid_is_current_process(&format!(
15089            "schema_version=1\npid={current}\nmode=safe_auto_run\n"
15090        )));
15091        assert!(!doctor_lock_metadata_pid_is_current_process(
15092            "schema_version=1\npid=not-a-pid\n"
15093        ));
15094        assert!(!doctor_lock_metadata_pid_is_current_process(&format!(
15095            "pid={}\n",
15096            current.saturating_add(1)
15097        )));
15098    }
15099
15100    #[test]
15101    fn doctor_storage_open_refuses_active_doctor_mutation_lock_from_other_process() {
15102        use std::io::Write as _;
15103
15104        let dir = TempDir::new().unwrap();
15105        let db_path = dir.path().join("agent_search.db");
15106        {
15107            let storage = FrankenStorage::open(&db_path).unwrap();
15108            storage.close().unwrap();
15109        }
15110
15111        let lock_path = doctor_mutation_lock_path_for_db_open(&db_path).unwrap();
15112        let mut lock_file = fs::OpenOptions::new()
15113            .create(true)
15114            .truncate(false)
15115            .read(true)
15116            .write(true)
15117            .open(&lock_path)
15118            .unwrap();
15119        fs2::FileExt::try_lock_exclusive(&lock_file).unwrap();
15120        lock_file.set_len(0).unwrap();
15121        lock_file.write_all(b"schema_version=1\npid=1\n").unwrap();
15122        lock_file.sync_all().unwrap();
15123
15124        let err =
15125            open_franken_raw_readonly_connection_with_timeout(&db_path, Duration::from_millis(25))
15126                .expect_err("active doctor mutation lock must block canonical DB opens");
15127        let message = err.to_string();
15128        assert!(
15129            message.contains("doctor mutation lock") && message.contains("active"),
15130            "error should identify the active doctor mutation lock: {message}"
15131        );
15132
15133        fs2::FileExt::unlock(&lock_file).unwrap();
15134    }
15135
15136    #[test]
15137    fn doctor_storage_open_allows_current_doctor_process_probe() {
15138        use std::io::Write as _;
15139
15140        let dir = TempDir::new().unwrap();
15141        let db_path = dir.path().join("agent_search.db");
15142        {
15143            let storage = FrankenStorage::open(&db_path).unwrap();
15144            storage.close().unwrap();
15145        }
15146
15147        let lock_path = doctor_mutation_lock_path_for_db_open(&db_path).unwrap();
15148        let mut lock_file = fs::OpenOptions::new()
15149            .create(true)
15150            .truncate(false)
15151            .read(true)
15152            .write(true)
15153            .open(&lock_path)
15154            .unwrap();
15155        fs2::FileExt::try_lock_exclusive(&lock_file).unwrap();
15156        lock_file.set_len(0).unwrap();
15157        write!(lock_file, "schema_version=1\npid={}\n", std::process::id()).unwrap();
15158        lock_file.sync_all().unwrap();
15159
15160        let conn =
15161            open_franken_raw_readonly_connection_with_timeout(&db_path, Duration::from_millis(25))
15162                .expect(
15163                    "doctor process must be able to run post-repair read probes under its own lock",
15164                );
15165        drop(conn);
15166
15167        fs2::FileExt::unlock(&lock_file).unwrap();
15168    }
15169
15170    #[test]
15171    fn autocommit_retain_disable_tries_compat_then_canonical_pragma() {
15172        let mut attempts = Vec::new();
15173
15174        let selected = disable_autocommit_retain(|pragma| {
15175            attempts.push(pragma);
15176            if pragma == "PRAGMA fsqlite.autocommit_retain = OFF;" {
15177                Err("compat namespace unavailable")
15178            } else {
15179                Ok(())
15180            }
15181        })
15182        .expect("canonical pragma should disable autocommit retain");
15183
15184        assert_eq!(selected, "PRAGMA autocommit_retain = OFF;");
15185        assert_eq!(attempts, AUTOCOMMIT_RETAIN_OFF_PRAGMAS);
15186    }
15187
15188    #[test]
15189    fn autocommit_retain_disable_fails_closed_when_no_pragma_works() {
15190        let mut attempts = Vec::new();
15191
15192        let err = disable_autocommit_retain(|pragma| {
15193            attempts.push(pragma);
15194            Err("unsupported pragma")
15195        })
15196        .expect_err("unsupported autocommit retain controls should fail closed");
15197
15198        assert_eq!(attempts, AUTOCOMMIT_RETAIN_OFF_PRAGMAS);
15199        let message = err.to_string();
15200        assert!(
15201            message.contains("refusing to keep a long-lived MVCC connection"),
15202            "error should force callers away from unbounded snapshot retention: {message}"
15203        );
15204        assert!(
15205            message.contains("PRAGMA fsqlite.autocommit_retain = OFF;")
15206                && message.contains("PRAGMA autocommit_retain = OFF;"),
15207            "error should preserve attempted PRAGMAs for diagnostics: {message}"
15208        );
15209    }
15210
15211    /// Open a rusqlite connection on `db_path` for the narrow purpose of
15212    /// injecting (or inspecting the raw projection of) sqlite_master
15213    /// corruption patterns in test fixtures. Frankensqlite intentionally does
15214    /// not support `PRAGMA writable_schema` writes or raw inserts to
15215    /// sqlite_master (see AGENTS.md: "PRAGMA writable_schema: Not supported for
15216    /// write operations"), so these fixtures retain rusqlite as the standard-
15217    /// SQLite interop layer. All callers are in this test module and run under
15218    /// #[cfg(test)]; no production code path touches rusqlite here.
15219    fn rusqlite_test_fixture_conn(db_path: &Path) -> rusqlite::Connection {
15220        rusqlite::Connection::open(db_path).expect("open rusqlite test fixture connection")
15221    }
15222
15223    fn seed_historical_db_direct(
15224        db_path: &Path,
15225        conversations: &[crate::model::types::Conversation],
15226    ) {
15227        if let Some(parent) = db_path.parent() {
15228            fs::create_dir_all(parent).unwrap();
15229        }
15230
15231        let conn = FrankenConnection::open(db_path.to_string_lossy().into_owned()).unwrap();
15232        conn.execute_batch(HISTORICAL_RECOVERY_CORE_SCHEMA).unwrap();
15233        conn.execute_compat(
15234            "INSERT INTO agents(id, slug, name, version, kind, created_at, updated_at)
15235             VALUES(?1, ?2, ?3, ?4, ?5, ?6, ?7)",
15236            fparams![1_i64, "codex", "Codex", "0.2.3", "cli", 0_i64, 0_i64],
15237        )
15238        .unwrap();
15239
15240        let mut next_message_id = 1_i64;
15241        for (conv_index, conv) in conversations.iter().enumerate() {
15242            let conversation_id = i64::try_from(conv_index + 1).unwrap();
15243            let workspace_id = conv.workspace.as_ref().map(|workspace| {
15244                let workspace_id = conversation_id;
15245                let workspace_path = workspace.to_string_lossy().into_owned();
15246                conn.execute_compat(
15247                    "INSERT INTO workspaces(id, path, display_name) VALUES(?1, ?2, ?3)",
15248                    fparams![
15249                        workspace_id,
15250                        workspace_path.as_str(),
15251                        workspace_path.as_str()
15252                    ],
15253                )
15254                .unwrap();
15255                workspace_id
15256            });
15257            let source_path = conv.source_path.to_string_lossy().into_owned();
15258            let metadata_json = conv.metadata_json.to_string();
15259            conn.execute_compat(
15260                "INSERT INTO conversations (
15261                    id, agent_id, workspace_id, source_id, external_id, title, source_path,
15262                    started_at, ended_at, approx_tokens, metadata_json, origin_host
15263                 ) VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9, ?10, ?11, ?12)",
15264                fparams![
15265                    conversation_id,
15266                    1_i64,
15267                    workspace_id,
15268                    conv.source_id.as_str(),
15269                    conv.external_id.as_deref(),
15270                    conv.title.as_deref(),
15271                    source_path.as_str(),
15272                    conv.started_at,
15273                    conv.ended_at,
15274                    conv.approx_tokens,
15275                    metadata_json.as_str(),
15276                    conv.origin_host.as_deref()
15277                ],
15278            )
15279            .unwrap();
15280
15281            for msg in &conv.messages {
15282                let extra_json = msg.extra_json.to_string();
15283                let role = role_str(&msg.role);
15284                conn.execute_compat(
15285                    "INSERT INTO messages(
15286                        id, conversation_id, idx, role, author, created_at, content, extra_json
15287                     ) VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8)",
15288                    fparams![
15289                        next_message_id,
15290                        conversation_id,
15291                        msg.idx,
15292                        role.as_str(),
15293                        msg.author.as_deref(),
15294                        msg.created_at,
15295                        msg.content.as_str(),
15296                        extra_json.as_str()
15297                    ],
15298                )
15299                .unwrap();
15300                next_message_id += 1;
15301            }
15302        }
15303    }
15304
15305    // =========================================================================
15306    // User data file protection tests (bead yln.4)
15307    // =========================================================================
15308
15309    #[test]
15310    fn is_user_data_file_detects_bookmarks() {
15311        assert!(is_user_data_file(Path::new("/data/bookmarks.db")));
15312        assert!(is_user_data_file(Path::new("bookmarks.db")));
15313    }
15314
15315    #[test]
15316    fn is_user_data_file_detects_tui_state() {
15317        assert!(is_user_data_file(Path::new("/data/tui_state.json")));
15318    }
15319
15320    #[test]
15321    fn is_user_data_file_detects_sources_toml() {
15322        assert!(is_user_data_file(Path::new("/config/sources.toml")));
15323    }
15324
15325    #[test]
15326    fn is_user_data_file_detects_env() {
15327        assert!(is_user_data_file(Path::new(".env")));
15328    }
15329
15330    #[test]
15331    fn is_user_data_file_rejects_other_files() {
15332        assert!(!is_user_data_file(Path::new("index.db")));
15333        assert!(!is_user_data_file(Path::new("conversations.db")));
15334        assert!(!is_user_data_file(Path::new("random.txt")));
15335    }
15336
15337    // =========================================================================
15338    // Backup creation tests (bead yln.4)
15339    // =========================================================================
15340
15341    #[test]
15342    fn create_backup_returns_none_for_nonexistent() {
15343        let dir = TempDir::new().unwrap();
15344        let db_path = dir.path().join("nonexistent.db");
15345        let result = create_backup(&db_path).unwrap();
15346        assert!(result.is_none());
15347    }
15348
15349    #[test]
15350    fn create_backup_creates_named_file() {
15351        let dir = TempDir::new().unwrap();
15352        let db_path = dir.path().join("test.db");
15353        std::fs::write(&db_path, b"test data").unwrap();
15354
15355        let backup_path = create_backup(&db_path).unwrap();
15356        assert!(backup_path.is_some());
15357        let backup = backup_path.unwrap();
15358        assert!(backup.exists());
15359        assert!(
15360            backup
15361                .file_name()
15362                .unwrap()
15363                .to_str()
15364                .unwrap()
15365                .contains("backup")
15366        );
15367    }
15368
15369    #[test]
15370    fn create_backup_paths_are_unique() {
15371        let dir = TempDir::new().unwrap();
15372        let db_path = dir.path().join("test.db");
15373        std::fs::write(&db_path, b"test data").unwrap();
15374
15375        let first = create_backup(&db_path).unwrap().unwrap();
15376        let second = create_backup(&db_path).unwrap().unwrap();
15377
15378        assert_ne!(first, second);
15379        assert!(first.exists());
15380        assert!(second.exists());
15381    }
15382
15383    #[test]
15384    fn lexical_rebuild_messages_query_uses_conversation_idx_access_path() {
15385        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
15386        use std::path::PathBuf;
15387
15388        let dir = TempDir::new().unwrap();
15389        let db_path = dir.path().join("agent_search.db");
15390        let storage = SqliteStorage::open(&db_path).unwrap();
15391
15392        let agent = Agent {
15393            id: None,
15394            slug: "claude_code".into(),
15395            name: "Claude Code".into(),
15396            version: None,
15397            kind: AgentKind::Cli,
15398        };
15399        let agent_id = storage.ensure_agent(&agent).unwrap();
15400        let conversation = Conversation {
15401            id: None,
15402            agent_slug: "claude_code".into(),
15403            workspace: Some(PathBuf::from("/tmp/workspace")),
15404            external_id: Some("conv-1".into()),
15405            title: Some("Lexical rebuild".into()),
15406            source_path: PathBuf::from("/tmp/conv-1.jsonl"),
15407            started_at: Some(1_700_000_000_000),
15408            ended_at: Some(1_700_000_000_100),
15409            approx_tokens: None,
15410            metadata_json: serde_json::Value::Null,
15411            messages: vec![
15412                Message {
15413                    id: None,
15414                    idx: 0,
15415                    role: MessageRole::User,
15416                    author: Some("user".into()),
15417                    created_at: Some(1_700_000_000_010),
15418                    content: "first".into(),
15419                    extra_json: serde_json::Value::Null,
15420                    snippets: Vec::new(),
15421                },
15422                Message {
15423                    id: None,
15424                    idx: 1,
15425                    role: MessageRole::Agent,
15426                    author: Some("assistant".into()),
15427                    created_at: Some(1_700_000_000_020),
15428                    content: "second".into(),
15429                    extra_json: serde_json::Value::Null,
15430                    snippets: Vec::new(),
15431                },
15432            ],
15433            source_id: LOCAL_SOURCE_ID.into(),
15434            origin_host: None,
15435        };
15436        storage
15437            .insert_conversation_tree(agent_id, None, &conversation)
15438            .unwrap();
15439        let conversation_id = storage
15440            .conn
15441            .query_row_map(
15442                "SELECT id FROM conversations WHERE external_id = ?1",
15443                fparams!["conv-1"],
15444                |row| row.get_typed::<i64>(0),
15445            )
15446            .unwrap();
15447
15448        let opcodes: Vec<String> = storage
15449            .conn
15450            .query_map_collect(
15451                "EXPLAIN \
15452                 SELECT id, idx, role, author, created_at, content \
15453                 FROM messages \
15454                 WHERE conversation_id = ?1 ORDER BY idx",
15455                fparams![conversation_id],
15456                |row| row.get_typed(1),
15457            )
15458            .unwrap();
15459
15460        assert!(
15461            opcodes.iter().any(|opcode| opcode == "SeekGE"),
15462            "expected lexical rebuild message fetch to seek into the conversation_id/idx access path, got {opcodes:?}"
15463        );
15464        assert!(
15465            !opcodes.iter().any(|opcode| opcode == "SorterOpen"),
15466            "expected lexical rebuild message fetch to avoid sorter temp b-trees, got {opcodes:?}"
15467        );
15468    }
15469
15470    #[test]
15471    fn schema_check_rebuild_classification_ignores_transient_errors() {
15472        assert!(!schema_check_error_requires_rebuild(
15473            &frankensqlite::FrankenError::Busy
15474        ));
15475        assert!(!schema_check_error_requires_rebuild(
15476            &frankensqlite::FrankenError::DatabaseLocked {
15477                path: PathBuf::from("/tmp/test.db"),
15478            }
15479        ));
15480        assert!(!schema_check_error_requires_rebuild(
15481            &frankensqlite::FrankenError::CannotOpen {
15482                path: PathBuf::from("/tmp/test.db"),
15483            }
15484        ));
15485        assert!(!schema_check_error_requires_rebuild(
15486            &frankensqlite::FrankenError::Io(std::io::Error::other("disk hiccup"))
15487        ));
15488    }
15489
15490    #[test]
15491    fn schema_check_rebuild_classification_keeps_corruption_errors() {
15492        assert!(schema_check_error_requires_rebuild(
15493            &frankensqlite::FrankenError::DatabaseCorrupt {
15494                detail: "bad header".to_string(),
15495            }
15496        ));
15497        assert!(schema_check_error_requires_rebuild(
15498            &frankensqlite::FrankenError::WalCorrupt {
15499                detail: "bad wal".to_string(),
15500            }
15501        ));
15502        assert!(schema_check_error_requires_rebuild(
15503            &frankensqlite::FrankenError::NotADatabase {
15504                path: PathBuf::from("/tmp/test.db"),
15505            }
15506        ));
15507        assert!(schema_check_error_requires_rebuild(
15508            &frankensqlite::FrankenError::ShortRead {
15509                expected: 4096,
15510                actual: 64,
15511            }
15512        ));
15513    }
15514
15515    #[test]
15516    fn create_backup_refuses_raw_copy_after_retryable_vacuum_errors() {
15517        let retryable_errors = [
15518            frankensqlite::FrankenError::Busy,
15519            frankensqlite::FrankenError::BusyRecovery,
15520            frankensqlite::FrankenError::BusySnapshot {
15521                conflicting_pages: "1,2".to_string(),
15522            },
15523            frankensqlite::FrankenError::DatabaseLocked {
15524                path: PathBuf::from("/tmp/test.db"),
15525            },
15526            frankensqlite::FrankenError::LockFailed {
15527                detail: "fcntl lock still held".to_string(),
15528            },
15529            frankensqlite::FrankenError::WriteConflict { page: 7, holder: 9 },
15530            frankensqlite::FrankenError::SerializationFailure { page: 11 },
15531            frankensqlite::FrankenError::Internal("database is locked".to_string()),
15532        ];
15533
15534        for err in retryable_errors {
15535            assert!(
15536                backup_vacuum_error_requires_consistent_retry(&err),
15537                "retryable VACUUM failure must not fall back to raw bundle copy: {err}"
15538            );
15539        }
15540
15541        assert!(!backup_vacuum_error_requires_consistent_retry(
15542            &frankensqlite::FrankenError::NotADatabase {
15543                path: PathBuf::from("/tmp/test.db")
15544            }
15545        ));
15546        assert!(!backup_vacuum_error_requires_consistent_retry(
15547            &frankensqlite::FrankenError::DatabaseCorrupt {
15548                detail: "bad header".to_string()
15549            }
15550        ));
15551    }
15552
15553    #[test]
15554    fn create_backup_uses_hidden_vacuum_stage_path() {
15555        let backup_path = PathBuf::from("/tmp/test.db.backup.123.456.0");
15556        let stage_path = vacuum_stage_backup_path(&backup_path);
15557        let stage_name = stage_path
15558            .file_name()
15559            .and_then(|name| name.to_str())
15560            .unwrap_or_default();
15561
15562        assert!(stage_name.starts_with('.'));
15563        assert!(stage_name.ends_with(".vacuum-in-progress"));
15564        assert!(
15565            !is_backup_root_name(stage_name, "test.db.backup."),
15566            "incomplete VACUUM output must not be discoverable as a backup root"
15567        );
15568    }
15569
15570    #[test]
15571    fn create_backup_preserves_content() {
15572        let dir = TempDir::new().unwrap();
15573        let db_path = dir.path().join("test.db");
15574        let original_content = b"test database content 12345";
15575        std::fs::write(&db_path, original_content).unwrap();
15576
15577        let backup_path = create_backup(&db_path).unwrap().unwrap();
15578        let backup_content = std::fs::read(&backup_path).unwrap();
15579        assert_eq!(backup_content, original_content);
15580    }
15581
15582    #[test]
15583    fn create_backup_copies_sidecars_when_present() {
15584        let dir = TempDir::new().unwrap();
15585        let db_path = dir.path().join("test.db");
15586        std::fs::write(&db_path, b"db").unwrap();
15587        std::fs::write(database_sidecar_path(&db_path, "-wal"), b"wal").unwrap();
15588        std::fs::write(database_sidecar_path(&db_path, "-shm"), b"shm").unwrap();
15589
15590        let backup_path = create_backup(&db_path).unwrap().unwrap();
15591
15592        assert_eq!(
15593            std::fs::read(database_sidecar_path(&backup_path, "-wal")).unwrap(),
15594            b"wal"
15595        );
15596        assert_eq!(
15597            std::fs::read(database_sidecar_path(&backup_path, "-shm")).unwrap(),
15598            b"shm"
15599        );
15600    }
15601
15602    #[test]
15603    #[cfg(unix)]
15604    fn create_backup_rejects_symlink_root_during_raw_fallback() {
15605        use std::os::unix::fs::symlink;
15606
15607        let dir = TempDir::new().unwrap();
15608        let outside_db = dir.path().join("outside.db");
15609        let db_path = dir.path().join("test.db");
15610        std::fs::write(&outside_db, b"not sqlite").unwrap();
15611        symlink(&outside_db, &db_path).unwrap();
15612
15613        let err = create_backup(&db_path).unwrap_err();
15614
15615        assert!(
15616            err.to_string().contains("bundle symlink"),
15617            "unexpected error: {err:#}"
15618        );
15619        assert_eq!(std::fs::read(&outside_db).unwrap(), b"not sqlite");
15620        let backup_roots: Vec<_> = std::fs::read_dir(dir.path())
15621            .unwrap()
15622            .filter_map(|entry| entry.ok())
15623            .map(|entry| entry.file_name().to_string_lossy().into_owned())
15624            .filter(|name| name.starts_with("test.db.backup."))
15625            .collect();
15626        assert!(
15627            backup_roots.is_empty(),
15628            "symlinked backup source must not publish backup roots: {backup_roots:?}"
15629        );
15630    }
15631
15632    #[test]
15633    #[cfg(unix)]
15634    fn create_backup_rejects_symlink_sidecar_without_partial_backup() {
15635        use std::os::unix::fs::symlink;
15636
15637        let dir = TempDir::new().unwrap();
15638        let db_path = dir.path().join("test.db");
15639        let outside_wal = dir.path().join("outside.wal");
15640        let wal_path = database_sidecar_path(&db_path, "-wal");
15641        std::fs::write(&db_path, b"not sqlite").unwrap();
15642        std::fs::write(&outside_wal, b"outside wal").unwrap();
15643        symlink(&outside_wal, &wal_path).unwrap();
15644
15645        let err = create_backup(&db_path).unwrap_err();
15646
15647        assert!(
15648            err.to_string().contains("bundle symlink"),
15649            "unexpected error: {err:#}"
15650        );
15651        assert_eq!(std::fs::read(&outside_wal).unwrap(), b"outside wal");
15652        let backup_roots: Vec<_> = std::fs::read_dir(dir.path())
15653            .unwrap()
15654            .filter_map(|entry| entry.ok())
15655            .map(|entry| entry.file_name().to_string_lossy().into_owned())
15656            .filter(|name| name.starts_with("test.db.backup."))
15657            .collect();
15658        assert!(
15659            backup_roots.is_empty(),
15660            "sidecar preflight failure must not leave a partial backup root: {backup_roots:?}"
15661        );
15662    }
15663
15664    // =========================================================================
15665    // Backup cleanup tests (bead yln.4)
15666    // =========================================================================
15667
15668    #[test]
15669    fn cleanup_old_backups_keeps_recent() {
15670        let dir = TempDir::new().unwrap();
15671        let db_path = dir.path().join("test.db");
15672
15673        // Create 5 backup files with different timestamps
15674        for i in 0..5 {
15675            let backup_name = format!("test.db.backup.{}", 1000 + i);
15676            std::fs::write(dir.path().join(&backup_name), format!("backup {i}")).unwrap();
15677        }
15678
15679        cleanup_old_backups(&db_path, 3).unwrap();
15680
15681        // Count remaining backup files
15682        let backups: Vec<_> = std::fs::read_dir(dir.path())
15683            .unwrap()
15684            .filter_map(|e| e.ok())
15685            .filter(|e| e.file_name().to_str().unwrap_or("").contains("backup"))
15686            .collect();
15687
15688        assert_eq!(backups.len(), 3);
15689    }
15690
15691    #[test]
15692    fn cleanup_old_backups_ignores_wal_and_shm_sidecars() {
15693        let dir = TempDir::new().unwrap();
15694        let db_path = dir.path().join("test.db");
15695
15696        for i in 0..3 {
15697            let backup_name = format!("test.db.backup.{}", 1000 + i);
15698            let backup_path = dir.path().join(&backup_name);
15699            std::fs::write(&backup_path, format!("backup {i}")).unwrap();
15700            std::fs::write(format!("{}-wal", backup_path.display()), b"wal").unwrap();
15701            std::fs::write(format!("{}-shm", backup_path.display()), b"shm").unwrap();
15702            std::thread::sleep(std::time::Duration::from_millis(20));
15703        }
15704
15705        cleanup_old_backups(&db_path, 2).unwrap();
15706
15707        let mut roots = Vec::new();
15708        let mut wals = Vec::new();
15709        let mut shms = Vec::new();
15710        for entry in std::fs::read_dir(dir.path())
15711            .unwrap()
15712            .filter_map(|e| e.ok())
15713        {
15714            let name = entry.file_name().to_string_lossy().into_owned();
15715            if name.ends_with("-wal") {
15716                wals.push(name);
15717            } else if name.ends_with("-shm") {
15718                shms.push(name);
15719            } else if name.contains("backup") {
15720                roots.push(name);
15721            }
15722        }
15723
15724        assert_eq!(roots.len(), 2, "should keep two backup roots");
15725        assert_eq!(
15726            wals.len(),
15727            2,
15728            "should keep WAL sidecars only for retained backups"
15729        );
15730        assert_eq!(
15731            shms.len(),
15732            2,
15733            "should keep SHM sidecars only for retained backups"
15734        );
15735    }
15736
15737    #[test]
15738    fn move_database_bundle_moves_database_and_sidecars() {
15739        let dir = TempDir::new().unwrap();
15740        let db_path = dir.path().join("test.db");
15741        let backup_path = dir.path().join("test.db.corrupt");
15742
15743        std::fs::write(&db_path, b"db").unwrap();
15744        std::fs::write(database_sidecar_path(&db_path, "-wal"), b"wal").unwrap();
15745        std::fs::write(database_sidecar_path(&db_path, "-shm"), b"shm").unwrap();
15746
15747        let moved = move_database_bundle(&db_path, &backup_path).unwrap();
15748        assert_eq!(
15749            moved,
15750            DatabaseBundleMoveResult {
15751                database: true,
15752                wal: true,
15753                shm: true
15754            }
15755        );
15756        assert!(moved.moved_any());
15757
15758        assert!(!db_path.exists());
15759        assert!(!database_sidecar_path(&db_path, "-wal").exists());
15760        assert!(!database_sidecar_path(&db_path, "-shm").exists());
15761
15762        assert_eq!(std::fs::read(&backup_path).unwrap(), b"db");
15763        assert_eq!(
15764            std::fs::read(database_sidecar_path(&backup_path, "-wal")).unwrap(),
15765            b"wal"
15766        );
15767        assert_eq!(
15768            std::fs::read(database_sidecar_path(&backup_path, "-shm")).unwrap(),
15769            b"shm"
15770        );
15771    }
15772
15773    #[test]
15774    fn move_database_bundle_preserves_orphan_sidecars_without_main_db() {
15775        let dir = TempDir::new().unwrap();
15776        let db_path = dir.path().join("test.db");
15777        let backup_path = dir.path().join("test.db.corrupt");
15778
15779        std::fs::write(database_sidecar_path(&db_path, "-wal"), b"wal").unwrap();
15780        std::fs::write(database_sidecar_path(&db_path, "-shm"), b"shm").unwrap();
15781
15782        let moved = move_database_bundle(&db_path, &backup_path).unwrap();
15783        assert_eq!(
15784            moved,
15785            DatabaseBundleMoveResult {
15786                database: false,
15787                wal: true,
15788                shm: true
15789            }
15790        );
15791        assert!(moved.moved_any());
15792        assert!(!db_path.exists());
15793        assert!(!database_sidecar_path(&db_path, "-wal").exists());
15794        assert!(!database_sidecar_path(&db_path, "-shm").exists());
15795        assert_eq!(
15796            std::fs::read(database_sidecar_path(&backup_path, "-wal")).unwrap(),
15797            b"wal"
15798        );
15799        assert_eq!(
15800            std::fs::read(database_sidecar_path(&backup_path, "-shm")).unwrap(),
15801            b"shm"
15802        );
15803    }
15804
15805    #[test]
15806    #[cfg(unix)]
15807    fn move_database_bundle_moves_dangling_symlink_database_root() {
15808        use std::os::unix::fs::symlink;
15809
15810        let dir = TempDir::new().unwrap();
15811        let db_path = dir.path().join("test.db");
15812        let backup_path = dir.path().join("test.db.corrupt");
15813        let missing_target = dir.path().join("missing-target.db");
15814
15815        symlink(&missing_target, &db_path).unwrap();
15816
15817        let moved = move_database_bundle(&db_path, &backup_path).unwrap();
15818
15819        assert_eq!(
15820            moved,
15821            DatabaseBundleMoveResult {
15822                database: true,
15823                wal: false,
15824                shm: false
15825            }
15826        );
15827        assert!(std::fs::symlink_metadata(&db_path).is_err());
15828        assert!(
15829            std::fs::symlink_metadata(&backup_path)
15830                .unwrap()
15831                .file_type()
15832                .is_symlink()
15833        );
15834        assert!(!missing_target.exists());
15835    }
15836
15837    #[test]
15838    #[cfg(unix)]
15839    fn move_database_bundle_moves_dangling_symlink_sidecars_without_main_db() {
15840        use std::os::unix::fs::symlink;
15841
15842        let dir = TempDir::new().unwrap();
15843        let db_path = dir.path().join("test.db");
15844        let backup_path = dir.path().join("test.db.corrupt");
15845        let missing_wal_target = dir.path().join("missing-wal");
15846        let missing_shm_target = dir.path().join("missing-shm");
15847        let wal_path = database_sidecar_path(&db_path, "-wal");
15848        let shm_path = database_sidecar_path(&db_path, "-shm");
15849
15850        symlink(&missing_wal_target, &wal_path).unwrap();
15851        symlink(&missing_shm_target, &shm_path).unwrap();
15852
15853        let moved = move_database_bundle(&db_path, &backup_path).unwrap();
15854
15855        assert_eq!(
15856            moved,
15857            DatabaseBundleMoveResult {
15858                database: false,
15859                wal: true,
15860                shm: true
15861            }
15862        );
15863        assert!(std::fs::symlink_metadata(&wal_path).is_err());
15864        assert!(std::fs::symlink_metadata(&shm_path).is_err());
15865        assert!(
15866            std::fs::symlink_metadata(database_sidecar_path(&backup_path, "-wal"))
15867                .unwrap()
15868                .file_type()
15869                .is_symlink()
15870        );
15871        assert!(
15872            std::fs::symlink_metadata(database_sidecar_path(&backup_path, "-shm"))
15873                .unwrap()
15874                .file_type()
15875                .is_symlink()
15876        );
15877        assert!(!missing_wal_target.exists());
15878        assert!(!missing_shm_target.exists());
15879    }
15880
15881    #[test]
15882    fn copy_database_bundle_copies_database_and_sidecars() {
15883        let dir = TempDir::new().unwrap();
15884        let db_path = dir.path().join("test.db");
15885        let copied_path = dir.path().join("copy.db");
15886
15887        std::fs::write(&db_path, b"db").unwrap();
15888        std::fs::write(database_sidecar_path(&db_path, "-wal"), b"wal").unwrap();
15889        std::fs::write(database_sidecar_path(&db_path, "-shm"), b"shm").unwrap();
15890
15891        copy_database_bundle(&db_path, &copied_path).unwrap();
15892
15893        assert_eq!(std::fs::read(&copied_path).unwrap(), b"db");
15894        assert_eq!(
15895            std::fs::read(database_sidecar_path(&copied_path, "-wal")).unwrap(),
15896            b"wal"
15897        );
15898        assert_eq!(
15899            std::fs::read(database_sidecar_path(&copied_path, "-shm")).unwrap(),
15900            b"shm"
15901        );
15902        assert_eq!(std::fs::read(&db_path).unwrap(), b"db");
15903    }
15904
15905    #[test]
15906    fn copy_database_bundle_creates_destination_parent() {
15907        let dir = TempDir::new().unwrap();
15908        let db_path = dir.path().join("test.db");
15909        let copied_path = dir.path().join("nested/copies/copy.db");
15910
15911        std::fs::write(&db_path, b"db").unwrap();
15912        std::fs::write(database_sidecar_path(&db_path, "-wal"), b"wal").unwrap();
15913
15914        copy_database_bundle(&db_path, &copied_path).unwrap();
15915
15916        assert!(copied_path.parent().unwrap().is_dir());
15917        assert_eq!(std::fs::read(&copied_path).unwrap(), b"db");
15918        assert_eq!(
15919            std::fs::read(database_sidecar_path(&copied_path, "-wal")).unwrap(),
15920            b"wal"
15921        );
15922    }
15923
15924    #[test]
15925    #[cfg(unix)]
15926    fn copy_database_bundle_rejects_symlink_source_root() {
15927        use std::os::unix::fs::symlink;
15928
15929        let dir = TempDir::new().unwrap();
15930        let outside_db = dir.path().join("outside.db");
15931        let db_path = dir.path().join("test.db");
15932        let copied_path = dir.path().join("copy.db");
15933
15934        std::fs::write(&outside_db, b"outside").unwrap();
15935        symlink(&outside_db, &db_path).unwrap();
15936
15937        let err = copy_database_bundle(&db_path, &copied_path).unwrap_err();
15938
15939        assert!(
15940            err.to_string().contains("bundle symlink"),
15941            "unexpected error: {err:#}"
15942        );
15943        assert!(!copied_path.exists());
15944        assert_eq!(std::fs::read(&outside_db).unwrap(), b"outside");
15945    }
15946
15947    #[test]
15948    #[cfg(unix)]
15949    fn copy_database_bundle_rejects_symlink_sidecar() {
15950        use std::os::unix::fs::symlink;
15951
15952        let dir = TempDir::new().unwrap();
15953        let db_path = dir.path().join("test.db");
15954        let copied_path = dir.path().join("copy.db");
15955        let outside_wal = dir.path().join("outside.wal");
15956        let wal_path = database_sidecar_path(&db_path, "-wal");
15957
15958        std::fs::write(&db_path, b"db").unwrap();
15959        std::fs::write(&outside_wal, b"outside wal").unwrap();
15960        symlink(&outside_wal, &wal_path).unwrap();
15961
15962        let err = copy_database_bundle(&db_path, &copied_path).unwrap_err();
15963
15964        assert!(
15965            err.to_string().contains("bundle symlink"),
15966            "unexpected error: {err:#}"
15967        );
15968        assert_eq!(std::fs::read(&outside_wal).unwrap(), b"outside wal");
15969        assert!(!copied_path.exists());
15970        assert!(!database_sidecar_path(&copied_path, "-wal").exists());
15971    }
15972
15973    #[test]
15974    fn move_database_bundle_creates_destination_parent_and_moves_sidecars() {
15975        let dir = TempDir::new().unwrap();
15976        let db_path = dir.path().join("test.db");
15977        let backup_path = dir.path().join("nested/backups/test.db.corrupt");
15978
15979        std::fs::write(&db_path, b"db").unwrap();
15980        std::fs::write(database_sidecar_path(&db_path, "-wal"), b"wal").unwrap();
15981        std::fs::write(database_sidecar_path(&db_path, "-shm"), b"shm").unwrap();
15982
15983        let moved = move_database_bundle(&db_path, &backup_path).unwrap();
15984        assert_eq!(
15985            moved,
15986            DatabaseBundleMoveResult {
15987                database: true,
15988                wal: true,
15989                shm: true
15990            }
15991        );
15992        assert!(backup_path.parent().unwrap().is_dir());
15993        assert_eq!(std::fs::read(&backup_path).unwrap(), b"db");
15994        assert_eq!(
15995            std::fs::read(database_sidecar_path(&backup_path, "-wal")).unwrap(),
15996            b"wal"
15997        );
15998        assert_eq!(
15999            std::fs::read(database_sidecar_path(&backup_path, "-shm")).unwrap(),
16000            b"shm"
16001        );
16002    }
16003
16004    #[test]
16005    fn remove_database_files_removes_orphan_sidecars_without_main_db() {
16006        let dir = TempDir::new().unwrap();
16007        let db_path = dir.path().join("test.db");
16008
16009        std::fs::write(database_sidecar_path(&db_path, "-wal"), b"wal").unwrap();
16010        std::fs::write(database_sidecar_path(&db_path, "-shm"), b"shm").unwrap();
16011
16012        remove_database_files(&db_path).unwrap();
16013
16014        assert!(!db_path.exists());
16015        assert!(!database_sidecar_path(&db_path, "-wal").exists());
16016        assert!(!database_sidecar_path(&db_path, "-shm").exists());
16017    }
16018
16019    #[test]
16020    fn cleanup_old_backups_ignores_backup_named_directories() {
16021        let dir = TempDir::new().unwrap();
16022        let db_path = dir.path().join("test.db");
16023
16024        for i in 0..3 {
16025            let backup_name = format!("test.db.backup.{}", 1000 + i);
16026            std::fs::write(dir.path().join(&backup_name), format!("backup {i}")).unwrap();
16027        }
16028        std::fs::create_dir(dir.path().join("test.db.backup.directory")).unwrap();
16029
16030        cleanup_old_backups(&db_path, 2).unwrap();
16031
16032        let mut backup_files = Vec::new();
16033        let mut backup_dirs = Vec::new();
16034        for entry in std::fs::read_dir(dir.path())
16035            .unwrap()
16036            .filter_map(|e| e.ok())
16037        {
16038            let name = entry.file_name().to_string_lossy().into_owned();
16039            if !name.starts_with("test.db.backup.") {
16040                continue;
16041            }
16042            if entry.path().is_dir() {
16043                backup_dirs.push(name);
16044            } else {
16045                backup_files.push(name);
16046            }
16047        }
16048
16049        assert_eq!(
16050            backup_files.len(),
16051            2,
16052            "only real backup files count toward retention"
16053        );
16054        assert_eq!(
16055            backup_dirs.len(),
16056            1,
16057            "backup-named directories should be ignored"
16058        );
16059    }
16060
16061    // =========================================================================
16062    // Storage open/create tests (bead yln.4)
16063    // =========================================================================
16064
16065    #[test]
16066    fn open_creates_new_database() {
16067        let dir = TempDir::new().unwrap();
16068        let db_path = dir.path().join("new.db");
16069        assert!(!db_path.exists());
16070
16071        let storage = SqliteStorage::open(&db_path).unwrap();
16072        assert!(db_path.exists());
16073        storage.close().unwrap();
16074    }
16075
16076    #[test]
16077    fn open_readonly_fails_for_nonexistent() {
16078        let dir = TempDir::new().unwrap();
16079        let db_path = dir.path().join("nonexistent.db");
16080        let result = SqliteStorage::open_readonly(&db_path);
16081        assert!(result.is_err());
16082    }
16083
16084    #[test]
16085    fn open_readonly_succeeds_for_existing() {
16086        let dir = TempDir::new().unwrap();
16087        let db_path = dir.path().join("existing.db");
16088
16089        // Create first
16090        let _storage = SqliteStorage::open(&db_path).unwrap();
16091        drop(_storage);
16092
16093        // Now open readonly
16094        let storage = SqliteStorage::open_readonly(&db_path).unwrap();
16095        assert!(storage.schema_version().is_ok());
16096    }
16097
16098    #[test]
16099    fn reopen_existing_current_schema_is_idempotent() {
16100        let dir = TempDir::new().unwrap();
16101        let db_path = dir.path().join("existing.db");
16102
16103        // First open creates and migrates to current schema.
16104        {
16105            let storage = SqliteStorage::open(&db_path).unwrap();
16106            assert_eq!(storage.schema_version().unwrap(), CURRENT_SCHEMA_VERSION);
16107        }
16108
16109        // Re-open should not fail on current schema.
16110        let reopened = SqliteStorage::open(&db_path).unwrap();
16111        assert_eq!(
16112            reopened.schema_version().unwrap(),
16113            CURRENT_SCHEMA_VERSION,
16114            "reopening current schema DB should be idempotent"
16115        );
16116    }
16117
16118    #[test]
16119    fn open_or_rebuild_current_schema_does_not_trigger_rebuild() {
16120        let dir = TempDir::new().unwrap();
16121        let db_path = dir.path().join("existing.db");
16122
16123        // Create DB at current schema.
16124        {
16125            let storage = SqliteStorage::open(&db_path).unwrap();
16126            assert_eq!(storage.schema_version().unwrap(), CURRENT_SCHEMA_VERSION);
16127        }
16128
16129        // Should open normally, not require rebuild.
16130        let reopened = SqliteStorage::open_or_rebuild(&db_path)
16131            .expect("current schema DB should open without rebuild");
16132        assert_eq!(reopened.schema_version().unwrap(), CURRENT_SCHEMA_VERSION);
16133    }
16134
16135    #[test]
16136    fn open_or_rebuild_does_not_treat_non_database_paths_as_corruption() {
16137        let dir = TempDir::new().unwrap();
16138        let db_path = dir.path().join("db_dir");
16139        std::fs::create_dir(&db_path).unwrap();
16140
16141        let result = SqliteStorage::open_or_rebuild(&db_path);
16142
16143        match result {
16144            Err(MigrationError::Database(_)) | Err(MigrationError::Io(_)) => {}
16145            Err(MigrationError::RebuildRequired { reason, .. }) => {
16146                panic!("should not rebuild non-database path: {reason}")
16147            }
16148            Err(MigrationError::Other(msg)) => {
16149                panic!("should preserve underlying open error, got Other: {msg}")
16150            }
16151            Ok(_) => panic!("directory path must not open as a database"),
16152        }
16153
16154        assert!(
16155            db_path.is_dir(),
16156            "non-database directory must be left in place"
16157        );
16158    }
16159
16160    // =========================================================================
16161    // Schema version tests (bead yln.4)
16162    // =========================================================================
16163
16164    #[test]
16165    fn schema_version_returns_current() {
16166        let dir = TempDir::new().unwrap();
16167        let db_path = dir.path().join("test.db");
16168        let storage = SqliteStorage::open(&db_path).unwrap();
16169        let version = storage.schema_version().unwrap();
16170        assert!(version >= 5, "Schema version should be at least 5");
16171    }
16172
16173    // =========================================================================
16174    // Current analytics/schema smoke test (bead z9fse.11)
16175    // =========================================================================
16176
16177    #[test]
16178    fn migration_v13_creates_analytics_tables() {
16179        let dir = TempDir::new().unwrap();
16180        let db_path = dir.path().join("test.db");
16181        let storage = SqliteStorage::open(&db_path).unwrap();
16182
16183        // Schema version should be current.
16184        let version = storage.schema_version().unwrap();
16185        assert_eq!(
16186            version, CURRENT_SCHEMA_VERSION,
16187            "Schema version must match CURRENT_SCHEMA_VERSION after migration"
16188        );
16189
16190        let conn = storage.raw();
16191
16192        // Helper: collect column names from PRAGMA table_info
16193        fn col_names(conn: &FrankenConnection, table: &str) -> Vec<String> {
16194            conn.query_map_collect(
16195                &format!("PRAGMA table_info({})", table),
16196                fparams![],
16197                |row: &FrankenRow| row.get_typed(1),
16198            )
16199            .unwrap()
16200        }
16201
16202        // Helper: collect index names from PRAGMA index_list
16203        fn idx_names(conn: &FrankenConnection, table: &str) -> Vec<String> {
16204            conn.query_map_collect(
16205                &format!("PRAGMA index_list({})", table),
16206                fparams![],
16207                |row: &FrankenRow| row.get_typed(1),
16208            )
16209            .unwrap()
16210        }
16211
16212        // Verify message_metrics table exists with expected columns
16213        let mm_cols = col_names(conn, "message_metrics");
16214        for expected in &[
16215            "message_id",
16216            "hour_id",
16217            "day_id",
16218            "content_tokens_est",
16219            "model_name",
16220            "model_family",
16221            "model_tier",
16222            "provider",
16223            "api_input_tokens",
16224            "has_plan",
16225            "agent_slug",
16226            "role",
16227            "api_data_source",
16228        ] {
16229            assert!(
16230                mm_cols.contains(&expected.to_string()),
16231                "message_metrics missing column: {expected}"
16232            );
16233        }
16234
16235        // Verify usage_hourly table
16236        let uh_cols = col_names(conn, "usage_hourly");
16237        for expected in &[
16238            "hour_id",
16239            "plan_message_count",
16240            "plan_content_tokens_est_total",
16241            "plan_api_tokens_total",
16242            "api_coverage_message_count",
16243            "content_tokens_est_user",
16244            "api_thinking_tokens_total",
16245        ] {
16246            assert!(
16247                uh_cols.contains(&expected.to_string()),
16248                "usage_hourly missing column: {expected}"
16249            );
16250        }
16251
16252        // Verify usage_daily table
16253        let ud_cols = col_names(conn, "usage_daily");
16254        for expected in &[
16255            "day_id",
16256            "plan_content_tokens_est_total",
16257            "plan_api_tokens_total",
16258            "api_thinking_tokens_total",
16259            "content_tokens_est_assistant",
16260            "message_count",
16261        ] {
16262            assert!(
16263                ud_cols.contains(&expected.to_string()),
16264                "usage_daily missing column: {expected}"
16265            );
16266        }
16267
16268        // Verify usage_models_daily table
16269        let umd_cols = col_names(conn, "usage_models_daily");
16270        for expected in &[
16271            "day_id",
16272            "model_family",
16273            "model_tier",
16274            "message_count",
16275            "api_tokens_total",
16276            "api_coverage_message_count",
16277        ] {
16278            assert!(
16279                umd_cols.contains(&expected.to_string()),
16280                "usage_models_daily missing column: {expected}"
16281            );
16282        }
16283
16284        // Verify indexes on message_metrics
16285        let mm_idxs = idx_names(conn, "message_metrics");
16286        assert!(
16287            mm_idxs.iter().any(|n| n.contains("idx_mm_hour")),
16288            "message_metrics must have hour index"
16289        );
16290        assert!(
16291            mm_idxs.iter().any(|n| n.contains("idx_mm_agent_day")),
16292            "message_metrics must have agent+day index"
16293        );
16294        assert!(
16295            mm_idxs
16296                .iter()
16297                .any(|n| n.contains("idx_mm_model_family_day")),
16298            "message_metrics must have model_family+day index"
16299        );
16300
16301        // Verify indexes on usage_hourly
16302        let uh_idxs = idx_names(conn, "usage_hourly");
16303        assert!(
16304            uh_idxs.iter().any(|n| n.contains("idx_uh_agent")),
16305            "usage_hourly must have agent index"
16306        );
16307
16308        // Verify indexes on usage_daily
16309        let ud_idxs = idx_names(conn, "usage_daily");
16310        assert!(
16311            ud_idxs.iter().any(|n| n.contains("idx_ud_agent")),
16312            "usage_daily must have agent index"
16313        );
16314
16315        // Verify indexes on usage_models_daily
16316        let umd_idxs = idx_names(conn, "usage_models_daily");
16317        assert!(
16318            umd_idxs.iter().any(|n| n.contains("idx_umd_model_day")),
16319            "usage_models_daily must have model+day index"
16320        );
16321
16322        let conversation_cols = col_names(conn, "conversations");
16323        assert!(
16324            conversation_cols.contains(&"last_message_idx".to_string())
16325                && conversation_cols.contains(&"last_message_created_at".to_string()),
16326            "fresh schema must include V15 tail columns without ALTER TABLE on conversations"
16327        );
16328        let fts_schema_rows: i64 = conn
16329            .query_row_map(
16330                "SELECT COUNT(*) FROM sqlite_master WHERE name = 'fts_messages'",
16331                fparams![],
16332                |row: &FrankenRow| row.get_typed(0),
16333            )
16334            .unwrap();
16335        assert_eq!(
16336            fts_schema_rows, 0,
16337            "fresh schema should not create and immediately drop derived fts_messages"
16338        );
16339        let integrity: Vec<String> = conn
16340            .query_map_collect("PRAGMA integrity_check;", fparams![], |row: &FrankenRow| {
16341                row.get_typed(0)
16342            })
16343            .unwrap();
16344        assert_eq!(
16345            integrity,
16346            vec!["ok".to_string()],
16347            "fresh schema must pass SQLite integrity_check"
16348        );
16349    }
16350
16351    #[test]
16352    fn hour_id_round_trip() {
16353        // 2026-02-06 12:00:00 UTC
16354        let ts_ms = 1_770_508_800_000_i64;
16355        let hour_id = SqliteStorage::hour_id_from_millis(ts_ms);
16356        let day_id = SqliteStorage::day_id_from_millis(ts_ms);
16357
16358        // hour_id should be 24x day_id (approximately)
16359        assert_eq!(hour_id / 24, day_id, "hour_id/24 should equal day_id");
16360
16361        // Round-trip: millis_from_hour_id should give start of that hour
16362        let back = SqliteStorage::millis_from_hour_id(hour_id);
16363        assert!(
16364            back <= ts_ms && ts_ms - back < 3_600_000,
16365            "Round-trip should land within the same hour"
16366        );
16367    }
16368
16369    #[test]
16370    fn day_and_hour_ids_floor_negative_millis() {
16371        // One millisecond before the Unix epoch should still floor into the
16372        // previous second/hour/day rather than truncating toward zero.
16373        let ts_ms = -1_i64;
16374        let expected_secs = -1_i64;
16375        let epoch_2020_secs = 1_577_836_800_i64;
16376
16377        assert_eq!(
16378            SqliteStorage::day_id_from_millis(ts_ms),
16379            (expected_secs - epoch_2020_secs).div_euclid(86_400)
16380        );
16381        assert_eq!(
16382            SqliteStorage::hour_id_from_millis(ts_ms),
16383            (expected_secs - epoch_2020_secs).div_euclid(3_600)
16384        );
16385    }
16386
16387    #[test]
16388    fn migration_v13_from_v10() {
16389        let dir = TempDir::new().unwrap();
16390        let db_path = dir.path().join("test.db");
16391
16392        // Open at v10 first by faking it
16393        {
16394            let conn = FrankenConnection::open(db_path.to_string_lossy().into_owned()).unwrap();
16395            conn.execute_batch("PRAGMA journal_mode=WAL;").unwrap();
16396            conn.execute_batch(
16397                "CREATE TABLE IF NOT EXISTS meta (key TEXT PRIMARY KEY, value TEXT);",
16398            )
16399            .unwrap();
16400            conn.execute("INSERT OR REPLACE INTO meta(key, value) VALUES('schema_version', '10')")
16401                .unwrap();
16402            // Apply V1-V10 so schema is correct. Keep each historical DDL batch
16403            // in autocommit mode; the fixture is testing cass migration
16404            // transition behavior, not frankensqlite's handling of a giant
16405            // synthetic legacy-DDL transaction.
16406            conn.execute_batch(MIGRATION_V1).unwrap();
16407            conn.execute_batch(MIGRATION_V2).unwrap();
16408            conn.execute_batch(MIGRATION_V4).unwrap();
16409            conn.execute_batch(MIGRATION_V5).unwrap();
16410            conn.execute_batch(MIGRATION_V6).unwrap();
16411            conn.execute_batch(MIGRATION_V7).unwrap();
16412            conn.execute_batch(MIGRATION_V8).unwrap();
16413            conn.execute_batch(MIGRATION_V9).unwrap();
16414            conn.execute_batch(MIGRATION_V10).unwrap();
16415            conn.execute("UPDATE meta SET value = '10' WHERE key = 'schema_version'")
16416                .unwrap();
16417        }
16418        materialize_fresh_fts_schema_via_rusqlite(&db_path).unwrap();
16419
16420        // Now open with SqliteStorage — should auto-migrate to current schema
16421        let storage = SqliteStorage::open(&db_path).unwrap();
16422        let version = storage.schema_version().unwrap();
16423        assert_eq!(
16424            version, CURRENT_SCHEMA_VERSION,
16425            "Should have migrated from v10 to the current schema"
16426        );
16427
16428        // Verify new tables exist
16429        let count: i64 = storage
16430            .raw()
16431            .query_row_map(
16432                "SELECT COUNT(*) FROM sqlite_master WHERE type='table' AND name IN ('message_metrics', 'usage_hourly', 'usage_daily', 'usage_models_daily')",
16433                &[],
16434                |row: &FrankenRow| row.get_typed::<i64>(0),
16435            )
16436            .unwrap();
16437        assert_eq!(count, 4, "All 4 analytics tables should exist");
16438    }
16439
16440    // =========================================================================
16441    // Analytics ingest integration test (bead z9fse.2)
16442    // =========================================================================
16443
    /// End-to-end check of analytics ingest: inserts one three-message
    /// conversation (user, agent with an API usage block, user) and verifies
    /// the resulting `message_metrics` rows plus the `usage_hourly`,
    /// `usage_daily` and `usage_models_daily` rollups.
    #[test]
    fn analytics_ingest_populates_metrics_and_rollups() {
        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
        use std::path::PathBuf;

        let dir = TempDir::new().unwrap();
        let db_path = dir.path().join("test.db");
        let storage = SqliteStorage::open(&db_path).unwrap();

        // Register the agent; no workspace is registered in this test.
        let agent = Agent {
            id: None,
            slug: "claude_code".into(),
            name: "Claude Code".into(),
            version: Some("1.0".into()),
            kind: AgentKind::Cli,
        };
        let agent_id = storage.ensure_agent(&agent).unwrap();

        // Create a synthetic conversation with 3 messages at a known timestamp:
        // 2026-02-08 11:50:00 UTC. The expected ids are derived through the same
        // helpers rather than hard-coded.
        // NOTE(review): with the 2020-01-01 origin used by
        // day_and_hour_ids_floor_negative_millis this would be day_id 2230 /
        // hour_id 53531 — confirm against the helper implementations.
        let ts_ms = 1_770_551_400_000_i64;
        let expected_day = SqliteStorage::day_id_from_millis(ts_ms);
        let expected_hour = SqliteStorage::hour_id_from_millis(ts_ms);

        // Include a JSON usage block on the assistant message (like Claude Code data)
        let usage_json = serde_json::json!({
            "message": {
                "model": "claude-opus-4-6",
                "usage": {
                    "input_tokens": 100,
                    "output_tokens": 50,
                    "cache_read_input_tokens": 200,
                    "cache_creation_input_tokens": 30,
                    "service_tier": "standard"
                }
            }
        });

        // All three messages fall within one minute, so they share an hour and a day.
        let conv = Conversation {
            id: None,
            agent_slug: "claude_code".into(),
            workspace: None,
            external_id: Some("test-conv-1".into()),
            title: Some("Test conversation".into()),
            source_path: PathBuf::from("/tmp/test.jsonl"),
            started_at: Some(ts_ms),
            ended_at: Some(ts_ms + 60_000),
            approx_tokens: None,
            metadata_json: serde_json::Value::Null,
            messages: vec![
                Message {
                    id: None,
                    idx: 0,
                    role: MessageRole::User,
                    author: None,
                    created_at: Some(ts_ms),
                    content: "Hello, can you help me with a plan?".into(),
                    extra_json: serde_json::Value::Null,
                    snippets: vec![],
                },
                Message {
                    id: None,
                    idx: 1,
                    role: MessageRole::Agent,
                    author: None,
                    created_at: Some(ts_ms + 30_000),
                    content: "## Plan\n\n1. First step\n2. Second step\n3. Third step".into(),
                    extra_json: usage_json,
                    snippets: vec![],
                },
                Message {
                    id: None,
                    idx: 2,
                    role: MessageRole::User,
                    author: None,
                    created_at: Some(ts_ms + 60_000),
                    content: "Great, let's proceed!".into(),
                    extra_json: serde_json::Value::Null,
                    snippets: vec![],
                },
            ],
            source_id: "local".into(),
            origin_host: None,
        };

        // One conversation in, one outcome out, with all three messages inserted.
        let outcomes = storage
            .insert_conversations_batched(&[(agent_id, None, &conv)])
            .unwrap();
        assert_eq!(outcomes.len(), 1);
        assert_eq!(outcomes[0].inserted_indices.len(), 3);

        let conn = storage.raw();

        // Verify message_metrics rows
        let mm_count: i64 = conn
            .query_row_map("SELECT COUNT(*) FROM message_metrics", &[], |row| {
                row.get_typed::<i64>(0)
            })
            .unwrap();
        assert_eq!(mm_count, 3, "Should have 3 message_metrics rows");

        // Verify hour_id and day_id are correct.
        // Tuple layout: (hour_id, day_id, role, content_tokens_est, has_plan,
        // api_data_source, model_family, model_tier, provider).
        #[allow(clippy::type_complexity)]
        let rows: Vec<(i64, i64, String, i64, i64, String, String, String, String)> = conn
            .query_map_collect(
                "SELECT hour_id, day_id, role, content_tokens_est, has_plan, api_data_source, model_family, model_tier, provider FROM message_metrics ORDER BY message_id",
                fparams![],
                |row: &FrankenRow| {
                    Ok((
                        row.get_typed(0)?,
                        row.get_typed(1)?,
                        row.get_typed(2)?,
                        row.get_typed(3)?,
                        row.get_typed(4)?,
                        row.get_typed(5)?,
                        row.get_typed(6)?,
                        row.get_typed(7)?,
                        row.get_typed(8)?,
                    ))
                },
            )
            .unwrap();

        assert_eq!(rows.len(), 3);
        // The first row carries the expected hour/day ids
        assert_eq!(rows[0].0, expected_hour);
        assert_eq!(rows[0].1, expected_day);
        // First message is user
        assert_eq!(rows[0].2, "user");
        // Second message (assistant) should have has_plan=1 (contains "## Plan" + numbered steps)
        assert_eq!(
            rows[1].4, 1,
            "Assistant message with plan should have has_plan=1"
        );
        // Second message should have api data source
        assert_eq!(
            rows[1].5, "api",
            "Claude Code assistant message should have api data source"
        );
        // First and third (user) messages should be estimated
        assert_eq!(rows[0].5, "estimated");
        assert_eq!(rows[2].5, "estimated");
        // Model classification of the assistant message vs. the first user message
        assert_eq!(rows[1].6, "claude");
        assert_eq!(rows[1].7, "opus");
        assert_eq!(rows[1].8, "anthropic");
        assert_eq!(rows[0].6, "unknown");
        // content_tokens_est = chars / 4 (`len()` counts bytes, which equals
        // the character count for this ASCII-only message)
        let user_chars = "Hello, can you help me with a plan?".len() as i64;
        assert_eq!(rows[0].3, user_chars / 4);

        // Verify usage_hourly rollup
        let (uh_msg, uh_user, uh_asst, uh_plan, uh_plan_content, uh_plan_api, uh_api_cov): (
            i64,
            i64,
            i64,
            i64,
            i64,
            i64,
            i64,
        ) = conn
            .query_row_map(
                "SELECT message_count, user_message_count, assistant_message_count, plan_message_count,
                        plan_content_tokens_est_total, plan_api_tokens_total, api_coverage_message_count
                 FROM usage_hourly WHERE hour_id = ?",
                fparams![expected_hour],
                |row: &FrankenRow| {
                    Ok((
                        row.get_typed(0)?,
                        row.get_typed(1)?,
                        row.get_typed(2)?,
                        row.get_typed(3)?,
                        row.get_typed(4)?,
                        row.get_typed(5)?,
                        row.get_typed(6)?,
                    ))
                },
            )
            .unwrap();
        assert_eq!(uh_msg, 3, "Hourly rollup should have 3 messages");
        assert_eq!(uh_user, 2, "Hourly rollup should have 2 user messages");
        assert_eq!(uh_asst, 1, "Hourly rollup should have 1 assistant message");
        assert_eq!(uh_plan, 1, "Hourly rollup should have 1 plan message");
        assert!(
            uh_plan_content > 0,
            "Hourly rollup should include plan content tokens"
        );
        assert!(
            uh_plan_api > 0,
            "Hourly rollup should include plan API tokens"
        );
        assert_eq!(
            uh_api_cov, 1,
            "Hourly rollup should have 1 API-covered message"
        );

        // Verify usage_daily rollup matches hourly (same day)
        let (ud_msg, ud_api_cov): (i64, i64) = conn
            .query_row_map(
                "SELECT message_count, api_coverage_message_count FROM usage_daily WHERE day_id = ?",
                fparams![expected_day],
                |row: &FrankenRow| Ok((row.get_typed(0)?, row.get_typed(1)?)),
            )
            .unwrap();
        assert_eq!(ud_msg, 3, "Daily rollup should match hourly");
        assert_eq!(
            ud_api_cov, 1,
            "Daily api_coverage should be 1 (only assistant msg has real API data)"
        );

        // Verify the API input tokens from message_metrics (only API-sourced)
        let api_only_input: i64 = conn
            .query_row_map(
                "SELECT COALESCE(SUM(api_input_tokens), 0) FROM message_metrics WHERE day_id = ? AND api_data_source = 'api'",
                fparams![expected_day],
                |row: &FrankenRow| row.get_typed::<i64>(0),
            )
            .unwrap();
        assert_eq!(
            api_only_input, 100,
            "Only API-sourced input tokens should be 100"
        );

        // Verify rollups match summed message_metrics: compute the sums from the
        // per-message rows first, then read the corresponding usage_daily totals.
        let mm_total_content_est: i64 = conn
            .query_row_map(
                "SELECT SUM(content_tokens_est) FROM message_metrics WHERE day_id = ?",
                fparams![expected_day],
                |row| row.get_typed::<i64>(0),
            )
            .unwrap();
        let mm_plan_content_est: i64 = conn
            .query_row_map(
                "SELECT COALESCE(SUM(content_tokens_est), 0) FROM message_metrics WHERE day_id = ? AND has_plan = 1",
                fparams![expected_day],
                |row: &FrankenRow| row.get_typed::<i64>(0),
            )
            .unwrap();
        let mm_plan_api_total: i64 = conn
            .query_row_map(
                "SELECT COALESCE(SUM(COALESCE(api_input_tokens, 0) + COALESCE(api_output_tokens, 0) + COALESCE(api_cache_read_tokens, 0) + COALESCE(api_cache_creation_tokens, 0) + COALESCE(api_thinking_tokens, 0)), 0)
                 FROM message_metrics WHERE day_id = ? AND has_plan = 1 AND api_data_source = 'api'",
                fparams![expected_day],
                |row: &FrankenRow| row.get_typed::<i64>(0),
            )
            .unwrap();
        let ud_content_est: i64 = conn
            .query_row_map(
                "SELECT content_tokens_est_total FROM usage_daily WHERE day_id = ?",
                fparams![expected_day],
                |row| row.get_typed::<i64>(0),
            )
            .unwrap();
        let (ud_plan_content_est, ud_plan_api_total): (i64, i64) = conn
            .query_row_map(
                "SELECT plan_content_tokens_est_total, plan_api_tokens_total FROM usage_daily WHERE day_id = ?",
                fparams![expected_day],
                |row: &FrankenRow| Ok((row.get_typed(0)?, row.get_typed(1)?)),
            )
            .unwrap();
        assert_eq!(
            mm_total_content_est, ud_content_est,
            "Daily rollup content_tokens_est_total must equal SUM of message_metrics"
        );
        assert_eq!(
            mm_plan_content_est, ud_plan_content_est,
            "Daily rollup plan_content_tokens_est_total must equal planned message_metrics content sum"
        );
        assert_eq!(
            mm_plan_api_total, ud_plan_api_total,
            "Daily rollup plan_api_tokens_total must equal planned message_metrics API token sum"
        );

        // Verify model rollup rows
        let (claude_msg, claude_user, claude_asst, claude_api_total, claude_api_cov): (
            i64,
            i64,
            i64,
            i64,
            i64,
        ) = conn
            .query_row_map(
                "SELECT message_count, user_message_count, assistant_message_count, api_tokens_total, api_coverage_message_count
                 FROM usage_models_daily
                 WHERE day_id = ? AND model_family = 'claude' AND model_tier = 'opus'",
                fparams![expected_day],
                |row: &FrankenRow| Ok((row.get_typed(0)?, row.get_typed(1)?, row.get_typed(2)?, row.get_typed(3)?, row.get_typed(4)?)),
            )
            .unwrap();
        assert_eq!(claude_msg, 1);
        assert_eq!(claude_user, 0);
        assert_eq!(claude_asst, 1);
        // 380 = 100 input + 50 output + 200 cache read + 30 cache creation
        // (the four token counts in the usage block above)
        assert_eq!(claude_api_total, 380);
        assert_eq!(claude_api_cov, 1);

        let unknown_msg: i64 = conn
            .query_row_map(
                "SELECT message_count FROM usage_models_daily
                 WHERE day_id = ? AND model_family = 'unknown' AND model_tier = 'unknown'",
                fparams![expected_day],
                |row| row.get_typed(0),
            )
            .unwrap();
        assert_eq!(
            unknown_msg, 2,
            "user messages should land in unknown model bucket"
        );
    }
16752
16753    #[test]
16754    fn has_plan_heuristic_detects_plans() {
16755        assert!(has_plan_heuristic(
16756            "## Plan\n\n1. First step\n2. Second step"
16757        ));
16758        assert!(has_plan_heuristic(
16759            "# Plan\nHere is what we will do:\n1. Step one\n2. Step two"
16760        ));
16761        assert!(has_plan_heuristic(
16762            "Plan:\n- Gather baseline\n- Implement changes\n- Validate with tests"
16763        ));
16764        assert!(has_plan_heuristic(
16765            "Next steps:\n1. Update schema\n2. Rebuild rollups"
16766        ));
16767        assert!(!has_plan_heuristic("Hello world"));
16768        assert!(!has_plan_heuristic("Short"));
16769        assert!(!has_plan_heuristic(
16770            "This is a regular message without plans"
16771        ));
16772        assert!(!has_plan_heuristic(
16773            "```json\n{\"tool\":\"shell\",\"stdout\":\"1. install\\n2. run\"}\n```"
16774        ));
16775    }
16776
16777    #[test]
16778    fn has_plan_for_role_only_counts_assistant_messages() {
16779        let plan_text = "## Plan\n1. First\n2. Second";
16780        assert!(has_plan_for_role("assistant", plan_text));
16781        assert!(has_plan_for_role("agent", plan_text));
16782        assert!(has_plan_for_role("Assistant", plan_text));
16783        assert!(!has_plan_for_role("user", plan_text));
16784        assert!(!has_plan_for_role("tool", plan_text));
16785    }
16786
16787    #[test]
16788    fn api_rollups_require_api_data_source() {
16789        let mut agg = AnalyticsRollupAggregator::new();
16790
16791        let estimated_plan = MessageMetricsEntry {
16792            message_id: 1,
16793            created_at_ms: 0,
16794            hour_id: 1,
16795            day_id: 1,
16796            agent_slug: "codex".into(),
16797            workspace_id: 0,
16798            source_id: "local".into(),
16799            role: "assistant".into(),
16800            content_chars: 120,
16801            content_tokens_est: 30,
16802            model_name: None,
16803            model_family: "unknown".into(),
16804            model_tier: "unknown".into(),
16805            provider: "unknown".into(),
16806            api_input_tokens: Some(100),
16807            api_output_tokens: Some(50),
16808            api_cache_read_tokens: Some(0),
16809            api_cache_creation_tokens: Some(0),
16810            api_thinking_tokens: Some(0),
16811            api_service_tier: None,
16812            api_data_source: "estimated".into(),
16813            tool_call_count: 0,
16814            has_tool_calls: false,
16815            has_plan: true,
16816        };
16817        agg.record(&estimated_plan);
16818
16819        let api_plan = MessageMetricsEntry {
16820            message_id: 2,
16821            created_at_ms: 0,
16822            hour_id: 1,
16823            day_id: 1,
16824            agent_slug: "codex".into(),
16825            workspace_id: 0,
16826            source_id: "local".into(),
16827            role: "assistant".into(),
16828            content_chars: 80,
16829            content_tokens_est: 20,
16830            model_name: None,
16831            model_family: "unknown".into(),
16832            model_tier: "unknown".into(),
16833            provider: "unknown".into(),
16834            api_input_tokens: Some(40),
16835            api_output_tokens: Some(10),
16836            api_cache_read_tokens: Some(0),
16837            api_cache_creation_tokens: Some(0),
16838            api_thinking_tokens: Some(0),
16839            api_service_tier: None,
16840            api_data_source: "api".into(),
16841            tool_call_count: 0,
16842            has_tool_calls: false,
16843            has_plan: true,
16844        };
16845        agg.record(&api_plan);
16846
16847        let key = (1_i64, "codex".to_string(), 0_i64, "local".to_string());
16848        let hourly = agg.hourly.get(&key).expect("hourly rollup key must exist");
16849        let daily = agg.daily.get(&key).expect("daily rollup key must exist");
16850        let model_key = (
16851            1_i64,
16852            "codex".to_string(),
16853            0_i64,
16854            "local".to_string(),
16855            "unknown".to_string(),
16856            "unknown".to_string(),
16857        );
16858        let models_daily = agg
16859            .models_daily
16860            .get(&model_key)
16861            .expect("model rollup key must exist");
16862
16863        // Content rollup includes both plan messages.
16864        assert_eq!(hourly.plan_message_count, 2);
16865        assert_eq!(hourly.plan_content_tokens_est_total, 50);
16866        // API plan tokens must include only api_data_source='api' rows.
16867        assert_eq!(hourly.plan_api_tokens_total, 50);
16868        assert_eq!(daily.plan_api_tokens_total, 50);
16869        assert_eq!(models_daily.plan_api_tokens_total, 50);
16870        // Overall API totals must also exclude estimated rows.
16871        assert_eq!(hourly.api_tokens_total, 50);
16872        assert_eq!(hourly.api_input_tokens_total, 40);
16873        assert_eq!(hourly.api_output_tokens_total, 10);
16874        assert_eq!(hourly.api_coverage_message_count, 1);
16875        assert_eq!(daily.api_tokens_total, 50);
16876        assert_eq!(models_daily.api_tokens_total, 50);
16877    }
16878
16879    #[test]
16880    fn has_plan_heuristic_curated_corpus_thresholds() {
16881        // Cross-agent-style positives.
16882        let positives = [
16883            "## Plan\n1. Inspect current schema\n2. Add migration\n3. Verify rebuild",
16884            "Plan:\n1) Reproduce\n2) Patch\n3) Add tests",
16885            "Implementation plan:\n- Parse inputs\n- Update rollups\n- Run checks",
16886            "Next steps:\n1. Reserve file\n2. Implement\n3. Report status",
16887            "# Plan\n1. Gather requirements\n2. Ship changes",
16888            "Action plan:\n- Identify root cause\n- Fix it\n- Validate",
16889        ];
16890
16891        // Typical false positives we want to avoid.
16892        let negatives = [
16893            "The plan is to move fast and fix things later.",
16894            "```json\n{\"tool\":\"shell\",\"stdout\":\"1. ls\\n2. cat\"}\n```",
16895            "stdout:\n1. Build started\n2. Build finished\nexit code: 0",
16896            "I can help with that request. Let me know if you want details.",
16897            "Here is a list:\n- apples\n- oranges",
16898            "Status update: completed tasks and blockers below.",
16899        ];
16900
16901        let tp = positives
16902            .iter()
16903            .filter(|msg| has_plan_heuristic(msg))
16904            .count();
16905        let fp = negatives
16906            .iter()
16907            .filter(|msg| has_plan_heuristic(msg))
16908            .count();
16909
16910        let recall = tp as f64 / positives.len() as f64;
16911        let false_positive_rate = fp as f64 / negatives.len() as f64;
16912
16913        assert!(
16914            recall >= 0.80,
16915            "plan heuristic recall too low: got {recall:.2}"
16916        );
16917        assert!(
16918            false_positive_rate <= 0.20,
16919            "plan heuristic false-positive rate too high: got {false_positive_rate:.2}"
16920        );
16921    }
16922
16923    #[test]
16924    fn rebuild_analytics_repopulates_from_messages() {
16925        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
16926        use std::path::PathBuf;
16927
16928        let dir = TempDir::new().unwrap();
16929        let db_path = dir.path().join("test.db");
16930        let storage = SqliteStorage::open(&db_path).unwrap();
16931
16932        // Register agent
16933        let agent = Agent {
16934            id: None,
16935            slug: "claude_code".into(),
16936            name: "Claude Code".into(),
16937            version: Some("1.0".into()),
16938            kind: AgentKind::Cli,
16939        };
16940        let agent_id = storage.ensure_agent(&agent).unwrap();
16941
16942        // 2026-02-06 10:30:00 UTC
16943        let ts_ms = 1_770_551_400_000_i64;
16944        let expected_day = SqliteStorage::day_id_from_millis(ts_ms);
16945        let expected_hour = SqliteStorage::hour_id_from_millis(ts_ms);
16946
16947        let usage_json = serde_json::json!({
16948            "message": {
16949                "model": "claude-opus-4-6",
16950                "usage": {
16951                    "input_tokens": 100,
16952                    "output_tokens": 50,
16953                    "cache_read_input_tokens": 200,
16954                    "cache_creation_input_tokens": 30,
16955                    "service_tier": "standard"
16956                }
16957            }
16958        });
16959
16960        let conv = Conversation {
16961            id: None,
16962            agent_slug: "claude_code".into(),
16963            workspace: None,
16964            external_id: Some("test-rebuild-1".into()),
16965            title: Some("Test conversation".into()),
16966            source_path: PathBuf::from("/tmp/test.jsonl"),
16967            started_at: Some(ts_ms),
16968            ended_at: Some(ts_ms + 60_000),
16969            approx_tokens: None,
16970            metadata_json: serde_json::Value::Null,
16971            messages: vec![
16972                Message {
16973                    id: None,
16974                    idx: 0,
16975                    role: MessageRole::User,
16976                    author: None,
16977                    created_at: Some(ts_ms),
16978                    content: "Hello, can you help me with a plan?".into(),
16979                    extra_json: serde_json::Value::Null,
16980                    snippets: vec![],
16981                },
16982                Message {
16983                    id: None,
16984                    idx: 1,
16985                    role: MessageRole::Agent,
16986                    author: None,
16987                    created_at: Some(ts_ms + 30_000),
16988                    content: "## Plan\n\n1. First step\n2. Second step\n3. Third step".into(),
16989                    extra_json: usage_json,
16990                    snippets: vec![],
16991                },
16992                Message {
16993                    id: None,
16994                    idx: 2,
16995                    role: MessageRole::User,
16996                    author: None,
16997                    created_at: Some(ts_ms + 60_000),
16998                    content: "Great, let's proceed!".into(),
16999                    extra_json: serde_json::Value::Null,
17000                    snippets: vec![],
17001                },
17002            ],
17003            source_id: "local".into(),
17004            origin_host: None,
17005        };
17006
17007        storage
17008            .insert_conversations_batched(&[(agent_id, None, &conv)])
17009            .unwrap();
17010
17011        // Save original analytics state
17012        let conn = storage.raw();
17013        let orig_mm: i64 = conn
17014            .query_row_map("SELECT COUNT(*) FROM message_metrics", &[], |row| {
17015                row.get_typed(0)
17016            })
17017            .unwrap();
17018        let orig_hourly: i64 = conn
17019            .query_row_map("SELECT COUNT(*) FROM usage_hourly", &[], |row| {
17020                row.get_typed(0)
17021            })
17022            .unwrap();
17023        let orig_daily: i64 = conn
17024            .query_row_map("SELECT COUNT(*) FROM usage_daily", &[], |row| {
17025                row.get_typed(0)
17026            })
17027            .unwrap();
17028        let orig_models_daily: i64 = conn
17029            .query_row_map("SELECT COUNT(*) FROM usage_models_daily", &[], |row| {
17030                row.get_typed(0)
17031            })
17032            .unwrap();
17033        let orig_api_input: i64 = conn
17034            .query_row_map(
17035                "SELECT COALESCE(SUM(api_input_tokens), 0) FROM message_metrics WHERE api_data_source = 'api'",
17036                &[],
17037                |row: &FrankenRow| row.get_typed(0),
17038            )
17039            .unwrap();
17040
17041        assert_eq!(orig_mm, 3);
17042        assert!(orig_hourly > 0);
17043        assert!(orig_daily > 0);
17044        assert!(orig_models_daily > 0);
17045
17046        // Destroy analytics tables (simulate corruption)
17047        conn.execute("DELETE FROM message_metrics").unwrap();
17048        conn.execute("DELETE FROM usage_hourly").unwrap();
17049        conn.execute("DELETE FROM usage_daily").unwrap();
17050        conn.execute("DELETE FROM usage_models_daily").unwrap();
17051
17052        // Verify they're empty
17053        let zero: i64 = conn
17054            .query_row_map("SELECT COUNT(*) FROM message_metrics", &[], |row| {
17055                row.get_typed(0)
17056            })
17057            .unwrap();
17058        assert_eq!(zero, 0);
17059
17060        // Rebuild analytics
17061        let result = storage.rebuild_analytics().unwrap();
17062
17063        assert_eq!(result.message_metrics_rows, 3);
17064        assert!(result.usage_hourly_rows > 0);
17065        assert!(result.usage_daily_rows > 0);
17066        assert!(result.usage_models_daily_rows > 0);
17067        assert!(
17068            result.elapsed_ms < 10_000,
17069            "Rebuild should be fast for 3 msgs"
17070        );
17071
17072        // Verify rebuilt data matches
17073        let conn = storage.raw();
17074        let rebuilt_mm: i64 = conn
17075            .query_row_map("SELECT COUNT(*) FROM message_metrics", &[], |row| {
17076                row.get_typed(0)
17077            })
17078            .unwrap();
17079        assert_eq!(
17080            rebuilt_mm, orig_mm,
17081            "Rebuilt message_metrics count should match"
17082        );
17083
17084        let rebuilt_hourly: i64 = conn
17085            .query_row_map("SELECT COUNT(*) FROM usage_hourly", &[], |row| {
17086                row.get_typed(0)
17087            })
17088            .unwrap();
17089        assert_eq!(
17090            rebuilt_hourly, orig_hourly,
17091            "Rebuilt hourly rows should match"
17092        );
17093
17094        let rebuilt_daily: i64 = conn
17095            .query_row_map("SELECT COUNT(*) FROM usage_daily", &[], |row| {
17096                row.get_typed(0)
17097            })
17098            .unwrap();
17099        assert_eq!(rebuilt_daily, orig_daily, "Rebuilt daily rows should match");
17100
17101        let rebuilt_models_daily: i64 = conn
17102            .query_row_map("SELECT COUNT(*) FROM usage_models_daily", &[], |row| {
17103                row.get_typed(0)
17104            })
17105            .unwrap();
17106        assert_eq!(
17107            rebuilt_models_daily, orig_models_daily,
17108            "Rebuilt model rollup rows should match"
17109        );
17110
17111        // Verify API token data preserved through rebuild
17112        let rebuilt_api_input: i64 = conn
17113            .query_row_map(
17114                "SELECT COALESCE(SUM(api_input_tokens), 0) FROM message_metrics WHERE api_data_source = 'api'",
17115                &[],
17116                |row: &FrankenRow| row.get_typed(0),
17117            )
17118            .unwrap();
17119        assert_eq!(
17120            rebuilt_api_input, orig_api_input,
17121            "Rebuilt API input tokens should match original"
17122        );
17123
17124        // Verify rollups have correct data
17125        let (uh_msg, uh_user, uh_asst, uh_plan, uh_plan_content, uh_plan_api): (
17126            i64,
17127            i64,
17128            i64,
17129            i64,
17130            i64,
17131            i64,
17132        ) = conn
17133            .query_row_map(
17134                "SELECT message_count, user_message_count, assistant_message_count, plan_message_count,
17135                        plan_content_tokens_est_total, plan_api_tokens_total
17136                 FROM usage_hourly WHERE hour_id = ?",
17137                fparams![expected_hour],
17138                |row: &FrankenRow| {
17139                    Ok((
17140                        row.get_typed(0)?,
17141                        row.get_typed(1)?,
17142                        row.get_typed(2)?,
17143                        row.get_typed(3)?,
17144                        row.get_typed(4)?,
17145                        row.get_typed(5)?,
17146                    ))
17147                },
17148            )
17149            .unwrap();
17150        assert_eq!(uh_msg, 3);
17151        assert_eq!(uh_user, 2);
17152        assert_eq!(uh_asst, 1);
17153        assert_eq!(uh_plan, 1);
17154        assert!(uh_plan_content > 0);
17155        assert!(uh_plan_api > 0);
17156
17157        let ud_msg: i64 = conn
17158            .query_row_map(
17159                "SELECT message_count FROM usage_daily WHERE day_id = ?",
17160                fparams![expected_day],
17161                |row| row.get_typed(0),
17162            )
17163            .unwrap();
17164        assert_eq!(ud_msg, 3);
17165    }
17166
17167    #[test]
17168    fn insert_conversations_batched_flushes_large_fts_batches() {
17169        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
17170        use std::path::PathBuf;
17171
17172        let dir = TempDir::new().unwrap();
17173        let db_path = dir.path().join("test.db");
17174        let storage = SqliteStorage::open(&db_path).unwrap();
17175        // V14 drops fts_messages during migration; cass normally recreates it
17176        // during startup via `ensure_search_fallback_fts_consistency`. Tests
17177        // that inspect fts_messages directly need to run the same repair pass
17178        // to exercise the "insert flushes FTS" contract.
17179        storage
17180            .ensure_search_fallback_fts_consistency()
17181            .expect("ensure FTS consistency before insert");
17182
17183        let agent = Agent {
17184            id: None,
17185            slug: "codex".into(),
17186            name: "Codex".into(),
17187            version: Some("0.2.3".into()),
17188            kind: AgentKind::Cli,
17189        };
17190        let agent_id = storage.ensure_agent(&agent).unwrap();
17191
17192        let content = "y".repeat((FTS_ENTRY_BATCH_MAX_CHARS / 2) + 1);
17193        let messages: Vec<_> = (0_i64..2)
17194            .map(|i| Message {
17195                id: None,
17196                idx: i,
17197                role: MessageRole::Agent,
17198                author: None,
17199                created_at: Some(1_700_000_000_000 + i),
17200                content: format!("{i}-{content}"),
17201                extra_json: serde_json::Value::Null,
17202                snippets: Vec::new(),
17203            })
17204            .collect();
17205        let conv = Conversation {
17206            id: None,
17207            agent_slug: "codex".into(),
17208            workspace: Some(PathBuf::from("/tmp/workspace")),
17209            external_id: Some("fts-large-batch".into()),
17210            title: Some("FTS Large Batch".into()),
17211            source_path: PathBuf::from("/tmp/rollout.jsonl"),
17212            started_at: Some(1_700_000_000_000),
17213            ended_at: Some(1_700_000_000_999),
17214            approx_tokens: None,
17215            metadata_json: serde_json::Value::Null,
17216            messages,
17217            source_id: "local".into(),
17218            origin_host: None,
17219        };
17220
17221        let outcomes = storage
17222            .insert_conversations_batched(&[(agent_id, None, &conv)])
17223            .unwrap();
17224        assert_eq!(outcomes.len(), 1);
17225        assert_eq!(outcomes[0].inserted_indices.len(), conv.messages.len());
17226
17227        let message_count: i64 = storage
17228            .conn
17229            .query_row_map("SELECT COUNT(*) FROM messages", fparams![], |row| {
17230                row.get_typed(0)
17231            })
17232            .unwrap();
17233        let fts_count: i64 = storage
17234            .conn
17235            .query_row_map("SELECT COUNT(*) FROM fts_messages", fparams![], |row| {
17236                row.get_typed(0)
17237            })
17238            .unwrap();
17239
17240        assert_eq!(message_count, conv.messages.len() as i64);
17241        assert_eq!(fts_count, conv.messages.len() as i64);
17242    }
17243
17244    fn make_profiled_storage_remote_conversation(
17245        external_id: i64,
17246        msg_count: usize,
17247    ) -> Conversation {
17248        Conversation {
17249            id: None,
17250            agent_slug: "codex".into(),
17251            workspace: Some(PathBuf::from("/ws/profiled-storage-remote")),
17252            external_id: Some(format!("profiled-storage-remote-{external_id}")),
17253            title: Some(format!(
17254                "Profiled storage remote conversation {external_id}"
17255            )),
17256            source_path: PathBuf::from(format!("/log/profiled-storage-remote-{external_id}.jsonl")),
17257            started_at: Some(10_000 + external_id * 100),
17258            ended_at: Some(10_000 + external_id * 100 + msg_count as i64),
17259            approx_tokens: Some(msg_count as i64 * 32),
17260            metadata_json: serde_json::json!({ "bench": true }),
17261            messages: (0..msg_count)
17262                .map(|idx| Message {
17263                    id: None,
17264                    idx: idx as i64,
17265                    role: if idx % 2 == 0 {
17266                        MessageRole::User
17267                    } else {
17268                        MessageRole::Agent
17269                    },
17270                    author: Some("tester".into()),
17271                    created_at: Some(20_000 + external_id * 100 + idx as i64),
17272                    content: format!(
17273                        "profiled storage remote content ext={external_id} idx={idx} {}",
17274                        "x".repeat(64)
17275                    ),
17276                    extra_json: serde_json::json!({ "idx": idx }),
17277                    snippets: Vec::new(),
17278                })
17279                .collect(),
17280            source_id: "profiled-storage-remote-source".into(),
17281            origin_host: Some("builder-profile".into()),
17282        }
17283    }
17284
17285    fn make_profiled_append_remote_merge_conversation(
17286        external_id: i64,
17287        msg_count: usize,
17288    ) -> Conversation {
17289        let base_ts = 100_000 + external_id * 1_000;
17290        Conversation {
17291            id: None,
17292            agent_slug: "codex".into(),
17293            workspace: Some(PathBuf::from("/ws/profiled-append-remote")),
17294            external_id: Some(format!("profiled-append-remote-{external_id}")),
17295            title: Some(format!("Profiled append remote conversation {external_id}")),
17296            source_path: PathBuf::from(format!("/log/profiled-append-remote-{external_id}.jsonl")),
17297            started_at: Some(base_ts),
17298            ended_at: Some(base_ts + msg_count as i64),
17299            approx_tokens: Some(msg_count as i64 * 50),
17300            metadata_json: serde_json::json!({ "bench": true }),
17301            messages: (0..msg_count)
17302                .map(|idx| Message {
17303                    id: None,
17304                    idx: idx as i64,
17305                    role: if idx % 2 == 0 {
17306                        MessageRole::User
17307                    } else {
17308                        MessageRole::Agent
17309                    },
17310                    author: Some(format!("model-{}", external_id % 5)),
17311                    created_at: Some(base_ts + idx as i64),
17312                    content: format!(
17313                        "Profiled append remote conversation {} message {}: Lorem ipsum dolor sit amet, consectetur adipiscing elit.",
17314                        external_id, idx
17315                    ),
17316                    extra_json: serde_json::json!({ "bench": true }),
17317                    snippets: Vec::new(),
17318                })
17319                .collect(),
17320            source_id: "profiled-append-remote-source".into(),
17321            origin_host: Some("builder-profile".into()),
17322        }
17323    }
17324
17325    #[test]
17326    fn insert_conversation_tree_batched_new_message_ids_match_snippet_rows() {
17327        let dir = TempDir::new().unwrap();
17328        let db_path = dir.path().join("batched-message-ids.db");
17329        let storage = SqliteStorage::open(&db_path).unwrap();
17330        let agent_id = storage
17331            .ensure_agent(&Agent {
17332                id: None,
17333                slug: "codex".into(),
17334                name: "Codex".into(),
17335                version: None,
17336                kind: AgentKind::Cli,
17337            })
17338            .unwrap();
17339        let workspace_id = storage
17340            .ensure_workspace(&PathBuf::from("/ws/profiled-storage-remote"), None)
17341            .unwrap();
17342        let mut conv = make_profiled_storage_remote_conversation(42, 5);
17343        for (idx, msg) in conv.messages.iter_mut().enumerate() {
17344            msg.snippets.push(Snippet {
17345                id: None,
17346                file_path: Some(PathBuf::from(format!("src/file_{idx}.rs"))),
17347                start_line: Some((idx + 1) as i64),
17348                end_line: Some((idx + 2) as i64),
17349                language: Some("rust".into()),
17350                snippet_text: Some(format!("fn snippet_{idx}() {{}}")),
17351            });
17352        }
17353        let outcome = storage
17354            .insert_conversation_tree(agent_id, Some(workspace_id), &conv)
17355            .unwrap();
17356
17357        let message_count: i64 = storage
17358            .conn
17359            .query_row_map(
17360                "SELECT COUNT(*) FROM messages WHERE conversation_id = ?1",
17361                fparams![outcome.conversation_id],
17362                |row| row.get_typed(0),
17363            )
17364            .unwrap();
17365        let joined_snippet_count: i64 = storage
17366            .conn
17367            .query_row_map(
17368                "SELECT COUNT(*)
17369                 FROM snippets s
17370                 JOIN messages m ON s.message_id = m.id
17371                 WHERE m.conversation_id = ?1",
17372                fparams![outcome.conversation_id],
17373                |row| row.get_typed(0),
17374            )
17375            .unwrap();
17376
17377        assert_eq!(message_count, conv.messages.len() as i64);
17378        assert_eq!(joined_snippet_count, conv.messages.len() as i64);
17379    }
17380
17381    #[test]
17382    fn insert_conversation_tree_batched_appended_message_ids_match_snippet_rows() {
17383        let dir = TempDir::new().unwrap();
17384        let db_path = dir.path().join("batched-append-message-ids.db");
17385        let storage = SqliteStorage::open(&db_path).unwrap();
17386        let agent_id = storage
17387            .ensure_agent(&Agent {
17388                id: None,
17389                slug: "codex".into(),
17390                name: "Codex".into(),
17391                version: None,
17392                kind: AgentKind::Cli,
17393            })
17394            .unwrap();
17395        let workspace_id = storage
17396            .ensure_workspace(&PathBuf::from("/ws/profiled-storage-remote"), None)
17397            .unwrap();
17398
17399        let mut initial = make_profiled_storage_remote_conversation(77, 2);
17400        for (idx, msg) in initial.messages.iter_mut().enumerate() {
17401            msg.snippets.push(Snippet {
17402                id: None,
17403                file_path: Some(PathBuf::from(format!("src/append_initial_{idx}.rs"))),
17404                start_line: Some((idx + 1) as i64),
17405                end_line: Some((idx + 2) as i64),
17406                language: Some("rust".into()),
17407                snippet_text: Some(format!("fn append_initial_{idx}() {{}}")),
17408            });
17409        }
17410        let first = storage
17411            .insert_conversation_tree(agent_id, Some(workspace_id), &initial)
17412            .unwrap();
17413        assert_eq!(first.inserted_indices, vec![0, 1]);
17414
17415        let mut appended = make_profiled_storage_remote_conversation(77, 5);
17416        for (idx, msg) in appended.messages.iter_mut().enumerate() {
17417            msg.snippets.push(Snippet {
17418                id: None,
17419                file_path: Some(PathBuf::from(format!("src/append_full_{idx}.rs"))),
17420                start_line: Some((idx + 10) as i64),
17421                end_line: Some((idx + 11) as i64),
17422                language: Some("rust".into()),
17423                snippet_text: Some(format!("fn append_full_{idx}() {{}}")),
17424            });
17425        }
17426        let second = storage
17427            .insert_conversation_tree(agent_id, Some(workspace_id), &appended)
17428            .unwrap();
17429        assert_eq!(second.conversation_id, first.conversation_id);
17430        assert_eq!(second.inserted_indices, vec![2, 3, 4]);
17431
17432        let message_count: i64 = storage
17433            .conn
17434            .query_row_map(
17435                "SELECT COUNT(*) FROM messages WHERE conversation_id = ?1",
17436                fparams![first.conversation_id],
17437                |row| row.get_typed(0),
17438            )
17439            .unwrap();
17440        let joined_snippets: Vec<(i64, String)> = storage
17441            .conn
17442            .query_map_collect(
17443                "SELECT m.idx, s.file_path
17444                 FROM snippets s
17445                 JOIN messages m ON s.message_id = m.id
17446                 WHERE m.conversation_id = ?1
17447                 ORDER BY m.idx, s.id",
17448                fparams![first.conversation_id],
17449                |row| Ok((row.get_typed(0)?, row.get_typed(1)?)),
17450            )
17451            .unwrap();
17452
17453        assert_eq!(message_count, 5);
17454        assert_eq!(
17455            joined_snippets,
17456            vec![
17457                (0, "src/append_initial_0.rs".to_string()),
17458                (1, "src/append_initial_1.rs".to_string()),
17459                (2, "src/append_full_2.rs".to_string()),
17460                (3, "src/append_full_3.rs".to_string()),
17461                (4, "src/append_full_4.rs".to_string()),
17462            ]
17463        );
17464    }
17465
17466    #[test]
17467    fn insert_conversation_tree_rehydrates_external_lookup_after_manual_clear() {
17468        let dir = TempDir::new().unwrap();
17469        let db_path = dir.path().join("external-lookup-rehydrate.db");
17470        let storage = SqliteStorage::open(&db_path).unwrap();
17471        let agent_id = storage
17472            .ensure_agent(&Agent {
17473                id: None,
17474                slug: "codex".into(),
17475                name: "Codex".into(),
17476                version: None,
17477                kind: AgentKind::Cli,
17478            })
17479            .unwrap();
17480        let workspace_id = storage
17481            .ensure_workspace(&PathBuf::from("/ws/profiled-storage-remote"), None)
17482            .unwrap();
17483
17484        let initial = make_profiled_storage_remote_conversation(88, 2);
17485        let first = storage
17486            .insert_conversation_tree(agent_id, Some(workspace_id), &initial)
17487            .unwrap();
17488        let external_id = initial.external_id.as_deref().unwrap();
17489        let lookup_key =
17490            conversation_external_lookup_key(&initial.source_id, agent_id, external_id);
17491        let lookup_id: i64 = storage
17492            .conn
17493            .query_row_map(
17494                "SELECT conversation_id
17495                 FROM conversation_external_tail_lookup
17496                 WHERE lookup_key = ?1",
17497                fparams![lookup_key.as_str()],
17498                |row| row.get_typed(0),
17499            )
17500            .unwrap();
17501        assert_eq!(lookup_id, first.conversation_id);
17502
17503        storage
17504            .conn
17505            .execute_compat(
17506                "DELETE FROM conversation_external_tail_lookup WHERE lookup_key = ?1",
17507                fparams![lookup_key.as_str()],
17508            )
17509            .unwrap();
17510
17511        let appended = make_profiled_storage_remote_conversation(88, 4);
17512        let second = storage
17513            .insert_conversation_tree(agent_id, Some(workspace_id), &appended)
17514            .unwrap();
17515        assert_eq!(second.conversation_id, first.conversation_id);
17516        assert_eq!(second.inserted_indices, vec![2, 3]);
17517
17518        let conversation_count: i64 = storage
17519            .conn
17520            .query_row_map(
17521                "SELECT COUNT(*)
17522                 FROM conversations
17523                 WHERE source_id = ?1 AND agent_id = ?2 AND external_id = ?3",
17524                fparams![initial.source_id.as_str(), agent_id, external_id],
17525                |row| row.get_typed(0),
17526            )
17527            .unwrap();
17528        let restored_lookup: (i64, Option<i64>, Option<i64>, Option<i64>) = storage
17529            .conn
17530            .query_row_map(
17531                "SELECT conversation_id, ended_at, last_message_idx, last_message_created_at
17532                 FROM conversation_external_tail_lookup
17533                 WHERE lookup_key = ?1",
17534                fparams![lookup_key.as_str()],
17535                |row| {
17536                    Ok((
17537                        row.get_typed(0)?,
17538                        row.get_typed(1)?,
17539                        row.get_typed(2)?,
17540                        row.get_typed(3)?,
17541                    ))
17542                },
17543            )
17544            .unwrap();
17545        let tail_state: (Option<i64>, Option<i64>, Option<i64>) = storage
17546            .conn
17547            .query_row_map(
17548                "SELECT ended_at, last_message_idx, last_message_created_at
17549                 FROM conversation_tail_state
17550                 WHERE conversation_id = ?1",
17551                fparams![first.conversation_id],
17552                |row| Ok((row.get_typed(0)?, row.get_typed(1)?, row.get_typed(2)?)),
17553            )
17554            .unwrap();
17555        assert_eq!(conversation_count, 1);
17556        assert_eq!(
17557            restored_lookup,
17558            (
17559                first.conversation_id,
17560                tail_state.0,
17561                tail_state.1,
17562                tail_state.2
17563            )
17564        );
17565        assert_eq!(
17566            tail_state,
17567            (
17568                appended.messages[3].created_at,
17569                Some(3),
17570                appended.messages[3].created_at
17571            )
17572        );
17573    }
17574
17575    #[test]
17576    fn insert_conversation_tree_recreates_daily_stats_after_manual_clear() {
17577        let dir = TempDir::new().unwrap();
17578        let db_path = dir.path().join("test.db");
17579        let storage = SqliteStorage::open(&db_path).unwrap();
17580        let agent_id = storage
17581            .ensure_agent(&Agent {
17582                id: None,
17583                slug: "codex".into(),
17584                name: "Codex".into(),
17585                version: None,
17586                kind: AgentKind::Cli,
17587            })
17588            .unwrap();
17589        let workspace = PathBuf::from("/ws/profiled-storage-remote");
17590        let workspace_id = storage.ensure_workspace(&workspace, None).unwrap();
17591
17592        storage
17593            .insert_conversation_tree(
17594                agent_id,
17595                Some(workspace_id),
17596                &make_profiled_storage_remote_conversation(0, 3),
17597            )
17598            .unwrap();
17599        storage.conn.execute("DELETE FROM daily_stats").unwrap();
17600
17601        storage
17602            .insert_conversation_tree(
17603                agent_id,
17604                Some(workspace_id),
17605                &make_profiled_storage_remote_conversation(1, 2),
17606            )
17607            .unwrap();
17608
17609        let row_count: i64 = storage
17610            .conn
17611            .query_row_map("SELECT COUNT(*) FROM daily_stats", fparams![], |row| {
17612                row.get_typed(0)
17613            })
17614            .unwrap();
17615        let (session_count, message_count): (i64, i64) = storage
17616            .conn
17617            .query_row_map(
17618                "SELECT session_count, message_count
17619                 FROM daily_stats
17620                 WHERE agent_slug = 'all' AND source_id = 'all'",
17621                fparams![],
17622                |row| Ok((row.get_typed(0)?, row.get_typed(1)?)),
17623            )
17624            .unwrap();
17625
17626        assert_eq!(row_count, 4);
17627        assert_eq!(session_count, 1);
17628        assert_eq!(message_count, 2);
17629    }
17630
17631    #[test]
17632    #[serial]
17633    fn insert_conversation_tree_stage_profile_tracks_steady_state_remote_reuse() {
17634        let _defer_guard = set_env_var("CASS_DEFER_LEXICAL_UPDATES", "0");
17635
17636        for &(msg_count, iterations) in &[(5usize, 80usize), (20, 50), (50, 24)] {
17637            let dir = TempDir::new().unwrap();
17638            let db_path = dir.path().join(format!("profile-{msg_count}.db"));
17639            let storage = SqliteStorage::open(&db_path).unwrap();
17640            let agent_id = storage
17641                .ensure_agent(&Agent {
17642                    id: None,
17643                    slug: "codex".into(),
17644                    name: "Codex".into(),
17645                    version: None,
17646                    kind: AgentKind::Cli,
17647                })
17648                .unwrap();
17649            let workspace = PathBuf::from("/ws/profiled-storage-remote");
17650            let workspace_id = storage.ensure_workspace(&workspace, None).unwrap();
17651
17652            storage
17653                .insert_conversation_tree(
17654                    agent_id,
17655                    Some(workspace_id),
17656                    &make_profiled_storage_remote_conversation(0, msg_count),
17657                )
17658                .unwrap();
17659
17660            let mut profile = InsertConversationTreePerfProfile::default();
17661            for external_id in 1..=iterations {
17662                storage
17663                    .insert_conversation_tree_with_profile(
17664                        agent_id,
17665                        Some(workspace_id),
17666                        &make_profiled_storage_remote_conversation(external_id as i64, msg_count),
17667                        &mut profile,
17668                    )
17669                    .unwrap();
17670            }
17671
17672            let accounted_duration = profile.source_duration
17673                + profile.tx_open_duration
17674                + profile.existing_lookup_duration
17675                + profile.conversation_row_duration
17676                + profile.message_insert_duration
17677                + profile.snippet_insert_duration
17678                + profile.fts_entry_duration
17679                + profile.fts_flush_duration
17680                + profile.analytics_duration
17681                + profile.commit_duration;
17682            assert_eq!(profile.invocations, iterations);
17683            assert_eq!(profile.messages, iterations * msg_count);
17684            assert_eq!(profile.inserted_messages, iterations * msg_count);
17685            assert!(
17686                profile.total_duration >= accounted_duration,
17687                "accounted stage durations cannot exceed total duration"
17688            );
17689
17690            profile.log_summary(&format!("remote_reuse_{msg_count}_msgs"));
17691        }
17692    }
17693
17694    #[test]
17695    #[serial]
17696    fn insert_conversation_tree_stage_profile_tracks_append_remote_source_merge() {
17697        let _defer_guard = set_env_var("CASS_DEFER_LEXICAL_UPDATES", "0");
17698
17699        for &(msg_count, iterations) in &[(5usize, 80usize), (20, 50), (50, 24)] {
17700            let dir = TempDir::new().unwrap();
17701            let db_path = dir.path().join(format!("append-profile-{msg_count}.db"));
17702            let storage = SqliteStorage::open(&db_path).unwrap();
17703            let agent_id = storage
17704                .ensure_agent(&Agent {
17705                    id: None,
17706                    slug: "codex".into(),
17707                    name: "Codex".into(),
17708                    version: None,
17709                    kind: AgentKind::Cli,
17710                })
17711                .unwrap();
17712            let workspace = PathBuf::from("/ws/profiled-append-remote");
17713            let workspace_id = storage.ensure_workspace(&workspace, None).unwrap();
17714
17715            for external_id in 0..iterations {
17716                storage
17717                    .insert_conversation_tree(
17718                        agent_id,
17719                        Some(workspace_id),
17720                        &make_profiled_append_remote_merge_conversation(
17721                            external_id as i64,
17722                            msg_count,
17723                        ),
17724                    )
17725                    .unwrap();
17726            }
17727
17728            let mut profile = InsertConversationTreePerfProfile::default();
17729            for external_id in 0..iterations {
17730                storage
17731                    .append_existing_conversation_with_profile(
17732                        agent_id,
17733                        Some(workspace_id),
17734                        &make_profiled_append_remote_merge_conversation(
17735                            external_id as i64,
17736                            msg_count * 2,
17737                        ),
17738                        &mut profile,
17739                    )
17740                    .unwrap();
17741            }
17742
17743            let accounted_duration = profile.source_duration
17744                + profile.tx_open_duration
17745                + profile.existing_lookup_duration
17746                + profile.existing_idx_lookup_duration
17747                + profile.existing_replay_lookup_duration
17748                + profile.dedupe_filter_duration
17749                + profile.conversation_row_duration
17750                + profile.message_insert_duration
17751                + profile.snippet_insert_duration
17752                + profile.fts_entry_duration
17753                + profile.fts_flush_duration
17754                + profile.analytics_duration
17755                + profile.commit_duration;
17756            assert_eq!(profile.invocations, iterations);
17757            assert_eq!(profile.messages, iterations * msg_count * 2);
17758            assert_eq!(profile.inserted_messages, iterations * msg_count);
17759            assert!(
17760                profile.total_duration >= accounted_duration,
17761                "accounted append stage durations cannot exceed total duration"
17762            );
17763
17764            profile.log_summary(&format!("append_remote_merge_{msg_count}_msgs"));
17765        }
17766    }
17767
17768    #[test]
17769    fn rebuild_daily_stats_recomputes_materialized_totals_without_monolithic_group_by() {
17770        let dir = TempDir::new().unwrap();
17771        let db_path = dir.path().join("test.db");
17772        let storage = SqliteStorage::open(&db_path).unwrap();
17773        let started_at = 1_700_000_000_000_i64;
17774        let day_id = FrankenStorage::day_id_from_millis(started_at);
17775        let hour_id = FrankenStorage::hour_id_from_millis(started_at);
17776
17777        storage
17778            .conn
17779            .execute_compat(
17780                "INSERT INTO agents (id, slug, name, version, kind, created_at, updated_at)
17781                 VALUES (?1, ?2, ?3, NULL, ?4, 0, 0)",
17782                fparams![1_i64, "codex", "Codex", "cli"],
17783            )
17784            .unwrap();
17785        storage
17786            .conn
17787            .execute_compat(
17788                "INSERT INTO agents (id, slug, name, version, kind, created_at, updated_at)
17789                 VALUES (?1, ?2, ?3, NULL, ?4, 0, 0)",
17790                fparams![2_i64, "claude", "Claude", "cli"],
17791            )
17792            .unwrap();
17793
17794        storage
17795            .conn
17796            .execute_compat(
17797                "INSERT INTO conversations (
17798                    id, agent_id, workspace_id, source_id, external_id, title, source_path,
17799                    started_at, ended_at, approx_tokens, metadata_json, origin_host, metadata_bin
17800                 ) VALUES (?1, ?2, NULL, ?3, ?4, ?5, ?6, ?7, ?8, NULL, ?9, NULL, NULL)",
17801                fparams![
17802                    1_i64,
17803                    1_i64,
17804                    LOCAL_SOURCE_ID,
17805                    "daily-a",
17806                    "Daily A",
17807                    "/tmp/daily-a.jsonl",
17808                    started_at,
17809                    started_at + 200,
17810                    "{}"
17811                ],
17812            )
17813            .unwrap();
17814        storage
17815            .conn
17816            .execute_compat(
17817                "INSERT INTO conversations (
17818                    id, agent_id, workspace_id, source_id, external_id, title, source_path,
17819                    started_at, ended_at, approx_tokens, metadata_json, origin_host, metadata_bin
17820                 ) VALUES (?1, ?2, NULL, ?3, ?4, ?5, ?6, ?7, ?8, NULL, ?9, NULL, NULL)",
17821                fparams![
17822                    2_i64,
17823                    2_i64,
17824                    LOCAL_SOURCE_ID,
17825                    "daily-b",
17826                    "Daily B",
17827                    "/tmp/daily-b.jsonl",
17828                    started_at,
17829                    started_at + 300,
17830                    "{}"
17831                ],
17832            )
17833            .unwrap();
17834
17835        storage
17836            .conn
17837            .execute_compat(
17838                "INSERT INTO messages (
17839                    id, conversation_id, idx, role, author, created_at, content, extra_json, extra_bin
17840                 ) VALUES (?1, ?2, ?3, ?4, NULL, ?5, ?6, NULL, NULL)",
17841                fparams![1_i64, 1_i64, 0_i64, "user", started_at, "hello"],
17842            )
17843            .unwrap();
17844        storage
17845            .conn
17846            .execute_compat(
17847                "INSERT INTO messages (
17848                    id, conversation_id, idx, role, author, created_at, content, extra_json, extra_bin
17849                 ) VALUES (?1, ?2, ?3, ?4, NULL, ?5, ?6, NULL, NULL)",
17850                fparams![2_i64, 1_i64, 1_i64, "assistant", started_at + 100, "response"],
17851            )
17852            .unwrap();
17853        storage
17854            .conn
17855            .execute_compat(
17856                "INSERT INTO messages (
17857                    id, conversation_id, idx, role, author, created_at, content, extra_json, extra_bin
17858                 ) VALUES (?1, ?2, ?3, ?4, NULL, ?5, ?6, NULL, NULL)",
17859                fparams![3_i64, 2_i64, 0_i64, "user", started_at + 50, "abc"],
17860            )
17861            .unwrap();
17862
17863        for (message_id, agent_slug, role, content_len) in [
17864            (1_i64, "codex", "user", 5_i64),
17865            (2_i64, "codex", "assistant", 8_i64),
17866            (3_i64, "claude", "user", 3_i64),
17867        ] {
17868            storage
17869                .conn
17870                .execute_compat(
17871                    "INSERT INTO message_metrics (
17872                        message_id, created_at_ms, hour_id, day_id, agent_slug, workspace_id, source_id,
17873                        role, content_chars, content_tokens_est, api_input_tokens, api_output_tokens,
17874                        api_cache_read_tokens, api_cache_creation_tokens, api_thinking_tokens,
17875                        api_service_tier, api_data_source, tool_call_count, has_tool_calls, has_plan,
17876                        model_name, model_family, model_tier, provider
17877                     ) VALUES (
17878                        ?1, ?2, ?3, ?4, ?5, ?6, ?7,
17879                        ?8, ?9, ?10, ?11, ?12,
17880                        ?13, ?14, ?15,
17881                        ?16, ?17, ?18, ?19, ?20,
17882                        ?21, ?22, ?23, ?24
17883                     )",
17884                    fparams![
17885                        message_id,
17886                        started_at,
17887                        hour_id,
17888                        day_id,
17889                        agent_slug,
17890                        0_i64,
17891                        LOCAL_SOURCE_ID,
17892                        role,
17893                        content_len,
17894                        content_len / 4,
17895                        0_i64,
17896                        0_i64,
17897                        0_i64,
17898                        0_i64,
17899                        0_i64,
17900                        "",
17901                        "estimated",
17902                        0_i64,
17903                        0_i64,
17904                        0_i64,
17905                        "",
17906                        "unknown",
17907                        "unknown",
17908                        "unknown"
17909                    ],
17910                )
17911                .unwrap();
17912        }
17913
17914        storage.conn.execute("DELETE FROM daily_stats").unwrap();
17915
17916        let rebuilt = storage.rebuild_daily_stats().unwrap();
17917        assert_eq!(rebuilt.total_sessions, 2);
17918
17919        let health = storage.daily_stats_health().unwrap();
17920        assert_eq!(health.conversation_count, 2);
17921        assert_eq!(health.materialized_total, 2);
17922        assert_eq!(health.drift, 0);
17923
17924        let total_messages: i64 = storage
17925            .conn
17926            .query_row_map(
17927                "SELECT message_count FROM daily_stats WHERE agent_slug = 'all' AND source_id = 'all'",
17928                fparams![],
17929                |row| row.get_typed(0),
17930            )
17931            .unwrap();
17932        assert_eq!(total_messages, 3);
17933    }
17934
17935    #[test]
17936    fn rebuild_daily_stats_preserves_byte_counts_with_message_metrics() {
17937        let dir = TempDir::new().unwrap();
17938        let db_path = dir.path().join("test.db");
17939        let storage = SqliteStorage::open(&db_path).unwrap();
17940
17941        let content = "ASCII🙂é漢字";
17942        let expected_bytes = content.len() as i64;
17943        let started_at = 1_704_067_200_000_i64;
17944        let day_id = FrankenStorage::day_id_from_millis(started_at);
17945        let hour_id = FrankenStorage::hour_id_from_millis(started_at);
17946
17947        storage
17948            .conn
17949            .execute_compat(
17950                "INSERT INTO agents (id, slug, name, version, kind, created_at, updated_at)
17951                 VALUES (?1, ?2, ?3, NULL, ?4, 0, 0)",
17952                fparams![1_i64, "tester", "Tester", "cli"],
17953            )
17954            .unwrap();
17955        storage
17956            .conn
17957            .execute_compat(
17958                "INSERT INTO conversations (
17959                    id, agent_id, workspace_id, source_id, external_id, title, source_path,
17960                    started_at, ended_at, approx_tokens, metadata_json, origin_host, metadata_bin
17961                 ) VALUES (?1, ?2, NULL, ?3, ?4, ?5, ?6, ?7, NULL, NULL, ?8, NULL, NULL)",
17962                fparams![
17963                    1_i64,
17964                    1_i64,
17965                    LOCAL_SOURCE_ID,
17966                    "unicode-metrics",
17967                    "Unicode Metrics",
17968                    "/tmp/unicode-metrics.jsonl",
17969                    started_at,
17970                    "{}"
17971                ],
17972            )
17973            .unwrap();
17974        storage
17975            .conn
17976            .execute_compat(
17977                "INSERT INTO messages (
17978                    id, conversation_id, idx, role, author, created_at, content, extra_json, extra_bin
17979                 ) VALUES (?1, ?2, ?3, ?4, NULL, ?5, ?6, NULL, NULL)",
17980                fparams![1_i64, 1_i64, 0_i64, "user", started_at, content],
17981            )
17982            .unwrap();
17983        storage
17984            .conn
17985            .execute_compat(
17986                "INSERT INTO message_metrics (
17987                    message_id, created_at_ms, hour_id, day_id, agent_slug, workspace_id, source_id,
17988                    role, content_chars, content_tokens_est, api_input_tokens, api_output_tokens,
17989                    api_cache_read_tokens, api_cache_creation_tokens, api_thinking_tokens,
17990                    api_service_tier, api_data_source, tool_call_count, has_tool_calls, has_plan,
17991                    model_name, model_family, model_tier, provider
17992                 ) VALUES (
17993                    ?1, ?2, ?3, ?4, ?5, ?6, ?7,
17994                    ?8, ?9, ?10, ?11, ?12,
17995                    ?13, ?14, ?15,
17996                    ?16, ?17, ?18, ?19, ?20,
17997                    ?21, ?22, ?23, ?24
17998                 )",
17999                fparams![
18000                    1_i64,
18001                    started_at,
18002                    hour_id,
18003                    day_id,
18004                    "tester",
18005                    0_i64,
18006                    LOCAL_SOURCE_ID,
18007                    "user",
18008                    expected_bytes,
18009                    expected_bytes / 4,
18010                    0_i64,
18011                    0_i64,
18012                    0_i64,
18013                    0_i64,
18014                    0_i64,
18015                    "",
18016                    "estimated",
18017                    0_i64,
18018                    0_i64,
18019                    0_i64,
18020                    "",
18021                    "unknown",
18022                    "unknown",
18023                    "unknown"
18024                ],
18025            )
18026            .unwrap();
18027
18028        let mut tx = storage.conn.transaction().unwrap();
18029        franken_update_daily_stats_in_tx(
18030            &storage,
18031            &tx,
18032            "tester",
18033            LOCAL_SOURCE_ID,
18034            Some(started_at),
18035            StatsDelta {
18036                session_count_delta: 1,
18037                message_count_delta: 1,
18038                total_chars_delta: expected_bytes,
18039            },
18040        )
18041        .unwrap();
18042        tx.commit().unwrap();
18043
18044        let inline_total: i64 = storage
18045            .conn
18046            .query_row_map(
18047                "SELECT total_chars FROM daily_stats WHERE agent_slug = 'all' AND source_id = 'all'",
18048                fparams![],
18049                |row| row.get_typed(0),
18050            )
18051            .unwrap();
18052        assert_eq!(inline_total, expected_bytes);
18053
18054        storage.conn.execute("DELETE FROM daily_stats").unwrap();
18055
18056        let rebuilt = storage.rebuild_daily_stats().unwrap();
18057        assert_eq!(rebuilt.total_sessions, 1);
18058
18059        let rebuilt_total: i64 = storage
18060            .conn
18061            .query_row_map(
18062                "SELECT total_chars FROM daily_stats WHERE agent_slug = 'all' AND source_id = 'all'",
18063                fparams![],
18064                |row| row.get_typed(0),
18065            )
18066            .unwrap();
18067        assert_eq!(rebuilt_total, expected_bytes);
18068    }
18069
18070    #[test]
18071    fn rebuild_daily_stats_raw_fallback_preserves_byte_counts() {
18072        let dir = TempDir::new().unwrap();
18073        let db_path = dir.path().join("test.db");
18074        let storage = SqliteStorage::open(&db_path).unwrap();
18075
18076        let content = "fallback🙂é漢字";
18077        let expected_bytes = content.len() as i64;
18078        let started_at = 1_704_067_200_000_i64;
18079        storage
18080            .conn
18081            .execute_compat(
18082                "INSERT INTO agents (id, slug, name, version, kind, created_at, updated_at)
18083                 VALUES (?1, ?2, ?3, NULL, ?4, 0, 0)",
18084                fparams![1_i64, "tester", "Tester", "cli"],
18085            )
18086            .unwrap();
18087        storage
18088            .conn
18089            .execute_compat(
18090                "INSERT INTO conversations (
18091                    id, agent_id, workspace_id, source_id, external_id, title, source_path,
18092                    started_at, ended_at, approx_tokens, metadata_json, origin_host, metadata_bin
18093                 ) VALUES (?1, ?2, NULL, ?3, ?4, ?5, ?6, ?7, NULL, NULL, ?8, NULL, NULL)",
18094                fparams![
18095                    1_i64,
18096                    1_i64,
18097                    LOCAL_SOURCE_ID,
18098                    "unicode-fallback",
18099                    "Unicode Fallback",
18100                    "/tmp/unicode-fallback.jsonl",
18101                    started_at,
18102                    "{}"
18103                ],
18104            )
18105            .unwrap();
18106        storage
18107            .conn
18108            .execute_compat(
18109                "INSERT INTO messages (
18110                    id, conversation_id, idx, role, author, created_at, content, extra_json, extra_bin
18111                 ) VALUES (?1, ?2, ?3, ?4, NULL, ?5, ?6, NULL, NULL)",
18112                fparams![1_i64, 1_i64, 0_i64, "assistant", started_at, content],
18113            )
18114            .unwrap();
18115
18116        let mut tx = storage.conn.transaction().unwrap();
18117        franken_update_daily_stats_in_tx(
18118            &storage,
18119            &tx,
18120            "tester",
18121            LOCAL_SOURCE_ID,
18122            Some(started_at),
18123            StatsDelta {
18124                session_count_delta: 1,
18125                message_count_delta: 1,
18126                total_chars_delta: expected_bytes,
18127            },
18128        )
18129        .unwrap();
18130        tx.commit().unwrap();
18131
18132        storage.conn.execute("DELETE FROM daily_stats").unwrap();
18133
18134        let rebuilt = storage.rebuild_daily_stats().unwrap();
18135        assert_eq!(rebuilt.total_sessions, 1);
18136
18137        let rebuilt_total: i64 = storage
18138            .conn
18139            .query_row_map(
18140                "SELECT total_chars FROM daily_stats WHERE agent_slug = 'all' AND source_id = 'all'",
18141                fparams![],
18142                |row| row.get_typed(0),
18143            )
18144            .unwrap();
18145        assert_eq!(rebuilt_total, expected_bytes);
18146    }
18147
18148    #[test]
18149    fn insert_conversations_batched_appends_duplicate_external_id() {
18150        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
18151        use std::path::PathBuf;
18152
18153        let dir = TempDir::new().unwrap();
18154        let db_path = dir.path().join("test.db");
18155        let storage = SqliteStorage::open(&db_path).unwrap();
18156
18157        let agent = Agent {
18158            id: None,
18159            slug: "codex".into(),
18160            name: "Codex".into(),
18161            version: Some("0.2.3".into()),
18162            kind: AgentKind::Cli,
18163        };
18164        let agent_id = storage.ensure_agent(&agent).unwrap();
18165
18166        let base_conv = |messages: Vec<Message>| Conversation {
18167            id: None,
18168            agent_slug: "codex".into(),
18169            workspace: Some(PathBuf::from("/tmp/workspace")),
18170            external_id: Some("shared-session".into()),
18171            title: Some("Shared Session".into()),
18172            source_path: PathBuf::from("/tmp/rollout.jsonl"),
18173            started_at: Some(1_700_000_000_000),
18174            ended_at: Some(1_700_000_000_999),
18175            approx_tokens: None,
18176            metadata_json: serde_json::Value::Null,
18177            messages,
18178            source_id: "local".into(),
18179            origin_host: None,
18180        };
18181
18182        let conv_a = base_conv(vec![
18183            Message {
18184                id: None,
18185                idx: 0,
18186                role: MessageRole::User,
18187                author: None,
18188                created_at: Some(1_700_000_000_000),
18189                content: "first".into(),
18190                extra_json: serde_json::Value::Null,
18191                snippets: Vec::new(),
18192            },
18193            Message {
18194                id: None,
18195                idx: 1,
18196                role: MessageRole::Agent,
18197                author: None,
18198                created_at: Some(1_700_000_000_100),
18199                content: "second".into(),
18200                extra_json: serde_json::Value::Null,
18201                snippets: Vec::new(),
18202            },
18203        ]);
18204        let conv_b = base_conv(vec![
18205            Message {
18206                id: None,
18207                idx: 0,
18208                role: MessageRole::User,
18209                author: None,
18210                created_at: Some(1_700_000_000_000),
18211                content: "first".into(),
18212                extra_json: serde_json::Value::Null,
18213                snippets: Vec::new(),
18214            },
18215            Message {
18216                id: None,
18217                idx: 1,
18218                role: MessageRole::Agent,
18219                author: None,
18220                created_at: Some(1_700_000_000_100),
18221                content: "second".into(),
18222                extra_json: serde_json::Value::Null,
18223                snippets: Vec::new(),
18224            },
18225            Message {
18226                id: None,
18227                idx: 2,
18228                role: MessageRole::User,
18229                author: None,
18230                created_at: Some(1_700_000_000_200),
18231                content: "third".into(),
18232                extra_json: serde_json::Value::Null,
18233                snippets: Vec::new(),
18234            },
18235            Message {
18236                id: None,
18237                idx: 3,
18238                role: MessageRole::Agent,
18239                author: None,
18240                created_at: Some(1_700_000_000_300),
18241                content: "fourth".into(),
18242                extra_json: serde_json::Value::Null,
18243                snippets: Vec::new(),
18244            },
18245        ]);
18246
18247        let outcomes = storage
18248            .insert_conversations_batched(&[(agent_id, None, &conv_a), (agent_id, None, &conv_b)])
18249            .unwrap();
18250        assert_eq!(outcomes.len(), 2);
18251        assert_eq!(outcomes[0].inserted_indices, vec![0, 1]);
18252        assert_eq!(outcomes[1].inserted_indices, vec![2, 3]);
18253        assert_eq!(outcomes[0].conversation_id, outcomes[1].conversation_id);
18254
18255        let conversation_count: i64 = storage
18256            .conn
18257            .query_row_map("SELECT COUNT(*) FROM conversations", fparams![], |row| {
18258                row.get_typed(0)
18259            })
18260            .unwrap();
18261        let conversation_count_not_indexed: i64 = storage
18262            .conn
18263            .query_row_map(
18264                "SELECT COUNT(*) FROM conversations NOT INDEXED",
18265                fparams![],
18266                |row| row.get_typed(0),
18267            )
18268            .unwrap();
18269        let conversation_count_source_index: i64 = storage
18270            .conn
18271            .query_row_map(
18272                "SELECT COUNT(*) FROM conversations INDEXED BY idx_conversations_source_id",
18273                fparams![],
18274                |row| row.get_typed(0),
18275            )
18276            .unwrap();
18277        let message_count: i64 = storage
18278            .conn
18279            .query_row_map("SELECT COUNT(*) FROM messages", fparams![], |row| {
18280                row.get_typed(0)
18281            })
18282            .unwrap();
18283        let reopened_storage = SqliteStorage::open(&db_path).unwrap();
18284        let reopened_conversation_count: i64 = reopened_storage
18285            .conn
18286            .query_row_map("SELECT COUNT(*) FROM conversations", fparams![], |row| {
18287                row.get_typed(0)
18288            })
18289            .unwrap();
18290        let reopened_conversation_count_not_indexed: i64 = reopened_storage
18291            .conn
18292            .query_row_map(
18293                "SELECT COUNT(*) FROM conversations NOT INDEXED",
18294                fparams![],
18295                |row| row.get_typed(0),
18296            )
18297            .unwrap();
18298        let reopened_conversation_ids: Vec<i64> = reopened_storage
18299            .conn
18300            .query_map_collect(
18301                "SELECT id FROM conversations ORDER BY id",
18302                fparams![],
18303                |row| row.get_typed(0),
18304            )
18305            .unwrap();
18306        let reopened_conversation_ids_not_indexed: Vec<i64> = reopened_storage
18307            .conn
18308            .query_map_collect(
18309                "SELECT id FROM conversations NOT INDEXED ORDER BY id",
18310                fparams![],
18311                |row| row.get_typed(0),
18312            )
18313            .unwrap();
18314        let reopened_conversation_ids_source_index: Vec<i64> = reopened_storage
18315            .conn
18316            .query_map_collect(
18317                "SELECT id FROM conversations INDEXED BY idx_conversations_source_id ORDER BY id",
18318                fparams![],
18319                |row| row.get_typed(0),
18320            )
18321            .unwrap();
18322
18323        assert_eq!(reopened_conversation_ids, vec![outcomes[0].conversation_id]);
18324        assert_eq!(
18325            reopened_conversation_ids_not_indexed,
18326            vec![outcomes[0].conversation_id]
18327        );
18328        assert_eq!(
18329            reopened_conversation_ids_source_index,
18330            vec![outcomes[0].conversation_id]
18331        );
18332        assert_eq!(reopened_conversation_count, 1);
18333        assert_eq!(reopened_conversation_count_not_indexed, 1);
18334        assert_eq!(conversation_count_not_indexed, 1);
18335        assert_eq!(conversation_count_source_index, 1);
18336        assert_eq!(conversation_count, 1);
18337        assert_eq!(message_count, 4);
18338    }
18339
/// Inserts a conversation inside a transaction, then asks
/// `franken_insert_conversation_or_get_existing_after_miss` to insert the same
/// conversation again in that transaction. The second call must report the
/// already-inserted row id and the table must still hold exactly one row.
#[test]
fn franken_insert_conversation_or_get_existing_recovers_unique_conflict() {
    use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
    use std::path::PathBuf;

    // Scratch database that lives only for the duration of this test.
    let scratch = TempDir::new().unwrap();
    let database_file = scratch.path().join("test.db");
    let storage = SqliteStorage::open(&database_file).unwrap();

    let agent_id = storage
        .ensure_agent(&Agent {
            id: None,
            slug: "codex".into(),
            name: "Codex".into(),
            version: Some("0.2.3".into()),
            kind: AgentKind::Cli,
        })
        .unwrap();

    // A single-message conversation is enough to exercise the conflict path.
    let greeting = Message {
        id: None,
        idx: 0,
        role: MessageRole::User,
        author: None,
        created_at: Some(1_700_000_000_000),
        content: "hello".into(),
        extra_json: serde_json::Value::Null,
        snippets: Vec::new(),
    };
    let conv = Conversation {
        id: None,
        agent_slug: "codex".into(),
        workspace: Some(PathBuf::from("/tmp/workspace")),
        external_id: Some("recover-duplicate".into()),
        title: Some("Recover Duplicate".into()),
        source_path: PathBuf::from("/tmp/rollout.jsonl"),
        started_at: Some(1_700_000_000_000),
        ended_at: Some(1_700_000_000_100),
        approx_tokens: None,
        metadata_json: serde_json::Value::Null,
        messages: vec![greeting],
        source_id: "local".into(),
        origin_host: None,
    };

    let tx = storage.conn.transaction().unwrap();

    // First insert establishes the row the second attempt must find.
    let inserted_id = franken_insert_conversation(&tx, agent_id, None, &conv)
        .unwrap()
        .expect("first insert should succeed");

    // Second attempt with the same merge key, still inside the same transaction.
    let conversation_key = conversation_merge_key(agent_id, &conv);
    let resolved = franken_insert_conversation_or_get_existing_after_miss(
        &tx,
        agent_id,
        None,
        &conv,
        &conversation_key,
    )
    .unwrap();

    let existing_id = match resolved {
        ConversationInsertStatus::Inserted(new_id) => {
            panic!("expected existing conversation id, got freshly inserted {new_id}");
        }
        ConversationInsertStatus::Existing(id) => id,
    };
    assert_eq!(existing_id, inserted_id);

    let conversation_count: i64 = tx
        .query_row_map("SELECT COUNT(*) FROM conversations", fparams![], |row| {
            row.get_typed(0)
        })
        .unwrap();
    assert_eq!(conversation_count, 1);
}
18414
/// Submits two batch entries that share one external id. The first carries
/// message indices 2 and 3; the second carries 0, 1 and 3. Both entries must
/// resolve to the same conversation id, the second must only insert the
/// indices that were still missing, and the table must end up with 0..=3.
#[test]
fn insert_conversations_batched_merges_duplicate_external_id_with_gaps() {
    use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
    use std::path::PathBuf;

    let scratch = TempDir::new().unwrap();
    let database_file = scratch.path().join("test.db");
    let storage = SqliteStorage::open(&database_file).unwrap();

    let agent_id = storage
        .ensure_agent(&Agent {
            id: None,
            slug: "codex".into(),
            name: "Codex".into(),
            version: Some("0.2.3".into()),
            kind: AgentKind::Cli,
        })
        .unwrap();

    // Fixture message builder: `created_at` is the session start plus 100 per index.
    let message = |idx: i64, role: MessageRole, content: &str| Message {
        id: None,
        idx,
        role,
        author: None,
        created_at: Some(1_700_000_000_000 + idx * 100),
        content: content.into(),
        extra_json: serde_json::Value::Null,
        snippets: Vec::new(),
    };

    // Both payloads describe the same session; only the message list differs.
    let session_with = |messages: Vec<Message>| Conversation {
        id: None,
        agent_slug: "codex".into(),
        workspace: Some(PathBuf::from("/tmp/workspace")),
        external_id: Some("shared-session-gap".into()),
        title: Some("Shared Session Gap".into()),
        source_path: PathBuf::from("/tmp/rollout.jsonl"),
        started_at: Some(1_700_000_000_000),
        ended_at: Some(1_700_000_000_999),
        approx_tokens: None,
        metadata_json: serde_json::Value::Null,
        messages,
        source_id: "local".into(),
        origin_host: None,
    };

    // The tail of the session arrives first ...
    let conv_a = session_with(vec![
        message(2, MessageRole::User, "third"),
        message(3, MessageRole::Agent, "fourth"),
    ]);
    // ... followed by the head plus one index that overlaps with the tail.
    let conv_b = session_with(vec![
        message(0, MessageRole::User, "first"),
        message(1, MessageRole::Agent, "second"),
        message(3, MessageRole::Agent, "fourth"),
    ]);

    let batch = [(agent_id, None, &conv_a), (agent_id, None, &conv_b)];
    let outcomes = storage.insert_conversations_batched(&batch).unwrap();

    assert_eq!(outcomes.len(), 2);
    assert_eq!(outcomes[0].inserted_indices, vec![2, 3]);
    assert_eq!(outcomes[1].inserted_indices, vec![0, 1]);
    assert_eq!(outcomes[0].conversation_id, outcomes[1].conversation_id);

    let stored_indices: Vec<i64> = storage
        .conn
        .query_map_collect("SELECT idx FROM messages ORDER BY idx", fparams![], |row| {
            row.get_typed(0)
        })
        .unwrap();
    assert_eq!(stored_indices, vec![0, 1, 2, 3]);
}
18520
/// Stores a canonical conversation with messages at idx 0 and idx 20, then
/// submits one batch holding an exact copy of idx 0 followed by a message at
/// idx 20 with different content. Neither batch entry may insert anything and
/// the stored rows must still carry the canonical content.
#[test]
fn insert_conversations_batched_refreshes_partial_pending_message_lookup() {
    use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
    use std::path::PathBuf;

    let scratch = TempDir::new().unwrap();
    let database_file = scratch.path().join("test.db");
    let storage = SqliteStorage::open(&database_file).unwrap();

    let agent_id = storage
        .ensure_agent(&Agent {
            id: None,
            slug: "codex".into(),
            name: "Codex".into(),
            version: Some("0.2.3".into()),
            kind: AgentKind::Cli,
        })
        .unwrap();

    // Index 0 is the user turn; every other index is an agent turn.
    let message_at = |idx: i64, content: &str| {
        let role = match idx {
            0 => MessageRole::User,
            _ => MessageRole::Agent,
        };
        Message {
            id: None,
            idx,
            role,
            author: None,
            created_at: Some(1_700_000_000_000 + idx),
            content: content.into(),
            extra_json: serde_json::Value::Null,
            snippets: Vec::new(),
        }
    };

    // All three payloads below describe the same session.
    let session_with = |messages: Vec<Message>| Conversation {
        id: None,
        agent_slug: "codex".into(),
        workspace: Some(PathBuf::from("/tmp/workspace")),
        external_id: Some("partial-cache-session".into()),
        title: Some("Partial cache session".into()),
        source_path: PathBuf::from("/tmp/partial-cache.jsonl"),
        started_at: Some(1_700_000_000_000),
        ended_at: Some(1_700_000_000_100),
        approx_tokens: None,
        metadata_json: serde_json::Value::Null,
        messages,
        source_id: "local".into(),
        origin_host: None,
    };

    // Seed the database with the canonical rows before running the batch.
    let canonical = session_with(vec![
        message_at(0, "canonical zero"),
        message_at(20, "canonical twenty"),
    ]);
    storage
        .insert_conversation_tree(agent_id, None, &canonical)
        .unwrap();

    let exact_prefix = session_with(vec![message_at(0, "canonical zero")]);
    let conflicting_tail = session_with(vec![message_at(20, "conflicting twenty")]);

    let batch = [
        (agent_id, None, &exact_prefix),
        (agent_id, None, &conflicting_tail),
    ];
    let outcomes = storage.insert_conversations_batched(&batch).unwrap();

    assert_eq!(outcomes.len(), 2);
    assert!(outcomes[0].inserted_indices.is_empty());
    assert!(
        outcomes[1].inserted_indices.is_empty(),
        "the second batch item must refresh the partial pending lookup and retain the canonical idx=20 row"
    );

    let stored_messages: Vec<(i64, String)> = storage
        .conn
        .query_map_collect(
            "SELECT idx, content FROM messages ORDER BY idx",
            fparams![],
            |row| Ok((row.get_typed(0)?, row.get_typed(1)?)),
        )
        .unwrap();
    let expected_messages = vec![
        (0, String::from("canonical zero")),
        (20, String::from("canonical twenty")),
    ];
    assert_eq!(stored_messages, expected_messages);
}
18611
/// Runs the same 64-message conversation through
/// `insert_conversations_batched` twice. The first pass must insert every
/// message, the second pass must insert none, and both passes must report the
/// same conversation id with unchanged row counts.
#[test]
fn insert_conversations_batched_reprocessing_conversation_is_idempotent() {
    use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
    use std::path::PathBuf;

    const MESSAGE_COUNT: i64 = 64;

    let scratch = TempDir::new().unwrap();
    let database_file = scratch.path().join("test.db");
    let storage = SqliteStorage::open(&database_file).unwrap();

    let agent_id = storage
        .ensure_agent(&Agent {
            id: None,
            slug: "codex".into(),
            name: "Codex".into(),
            version: Some("0.2.3".into()),
            kind: AgentKind::Cli,
        })
        .unwrap();

    // Alternate user/agent turns, starting with the user at index 0.
    let mut messages: Vec<Message> = Vec::with_capacity(MESSAGE_COUNT as usize);
    for idx in 0..MESSAGE_COUNT {
        let role = if idx % 2 == 0 {
            MessageRole::User
        } else {
            MessageRole::Agent
        };
        messages.push(Message {
            id: None,
            idx,
            role,
            author: None,
            created_at: Some(1_700_000_000_000 + idx),
            content: format!("message {idx}"),
            extra_json: serde_json::Value::Null,
            snippets: Vec::new(),
        });
    }

    let conversation = Conversation {
        id: None,
        agent_slug: "codex".into(),
        workspace: Some(PathBuf::from("/tmp/workspace")),
        external_id: Some("large-reprocess-session".into()),
        title: Some("Large Reprocess Session".into()),
        source_path: PathBuf::from("/tmp/large-reprocess-session.jsonl"),
        started_at: Some(1_700_000_000_000),
        ended_at: Some(1_700_000_000_000 + MESSAGE_COUNT - 1),
        approx_tokens: None,
        metadata_json: serde_json::Value::Null,
        messages,
        source_id: "local".into(),
        origin_host: None,
    };

    // Same single-entry batch, submitted twice in a row.
    let batch = [(agent_id, None, &conversation)];
    let first = storage.insert_conversations_batched(&batch).unwrap();
    let second = storage.insert_conversations_batched(&batch).unwrap();

    assert_eq!(first.len(), 1);
    assert_eq!(second.len(), 1);
    assert_eq!(first[0].inserted_indices.len(), MESSAGE_COUNT as usize);
    assert!(
        second[0].inserted_indices.is_empty(),
        "full reprocessing of a large conversation must not attempt duplicate idx inserts"
    );
    assert_eq!(first[0].conversation_id, second[0].conversation_id);

    // Runs a single-value COUNT query against the test connection.
    let count_rows = |sql: &str| -> i64 {
        storage
            .conn
            .query_row_map(sql, fparams![], |row| row.get_typed(0))
            .unwrap()
    };
    let conversation_count = count_rows("SELECT COUNT(*) FROM conversations");
    let message_count = count_rows("SELECT COUNT(*) FROM messages");

    assert_eq!(conversation_count, 1);
    assert_eq!(message_count, MESSAGE_COUNT);
}
18697
/// Inserts ten conversations with distinct external ids through
/// `insert_conversation_tree`, in rayon chunks of three that each open their
/// own writer connection. Every conversation must insert all three of its
/// messages, receive its own conversation id, and be visible from a connection
/// opened afterwards.
#[test]
fn parallel_insert_conversation_tree_keeps_unique_external_ids_distinct() {
    use crate::connectors::{NormalizedConversation, NormalizedMessage};
    use crate::indexer::persist::map_to_internal;
    use crate::model::types::{Agent, AgentKind};
    use frankensqlite::compat::{ConnectionExt, RowExt};
    use rand::RngExt;
    use rayon::prelude::*;

    /// Reports whether `err` is, or has as its root cause, one of the
    /// busy/conflict `FrankenError` variants that this test retries.
    fn retryable_franken_error(err: &anyhow::Error) -> bool {
        let franken = match err.downcast_ref::<frankensqlite::FrankenError>() {
            Some(direct) => Some(direct),
            None => err
                .root_cause()
                .downcast_ref::<frankensqlite::FrankenError>(),
        };
        let Some(inner) = franken else {
            return false;
        };
        matches!(
            inner,
            frankensqlite::FrankenError::Busy
                | frankensqlite::FrankenError::BusyRecovery
                | frankensqlite::FrankenError::BusySnapshot { .. }
                | frankensqlite::FrankenError::WriteConflict { .. }
                | frankensqlite::FrankenError::SerializationFailure { .. }
        )
    }

    /// Calls `f` until it succeeds, fails with a non-retryable error, or the
    /// final attempt fails. Between attempts it sleeps for the current backoff
    /// plus a random jitter of up to the same amount, doubling the backoff up
    /// to a ceiling.
    fn with_retry<F, T>(mut f: F) -> anyhow::Result<T>
    where
        F: FnMut() -> anyhow::Result<T>,
    {
        const LAST_ATTEMPT: u32 = 24;
        const BACKOFF_CEILING_MS: u64 = 512;

        let mut rng = rand::rng();
        let mut backoff_ms = 4_u64;
        for attempt in 0..=LAST_ATTEMPT {
            let err = match f() {
                Ok(value) => return Ok(value),
                Err(err) => err,
            };
            // Give up on the last attempt or on anything that is not a retryable conflict.
            if attempt == LAST_ATTEMPT || !retryable_franken_error(&err) {
                return Err(err);
            }
            let jitter_ms = rng.random_range(0..=backoff_ms);
            std::thread::sleep(Duration::from_millis(backoff_ms + jitter_ms));
            backoff_ms = (backoff_ms * 2).min(BACKOFF_CEILING_MS);
        }
        unreachable!("retry loop must return on success or final failure")
    }

    let scratch = TempDir::new().unwrap();
    let db_path = scratch.path().join("parallel_insert_conversation_tree.db");
    // Open and immediately close one connection before the parallel writers start.
    drop(FrankenStorage::open(&db_path).unwrap());

    // Ten conversations spread over three agent slugs, three messages each.
    let conversations: Vec<NormalizedConversation> = (0..10)
        .map(|i| {
            let messages = (0..3)
                .map(|j| NormalizedMessage {
                    idx: j,
                    role: if j % 2 == 0 { "user" } else { "assistant" }.to_string(),
                    author: Some("tester".into()),
                    created_at: Some(1_000 + i * 100 + j * 10),
                    content: format!("parallel-distinct-test conv={i} msg={j}"),
                    extra: serde_json::json!({}),
                    snippets: vec![],
                    invocations: Vec::new(),
                })
                .collect();
            NormalizedConversation {
                agent_slug: format!("agent-{}", i % 3),
                external_id: Some(format!("conv-{i}")),
                title: Some(format!("Conversation {i}")),
                workspace: Some(PathBuf::from(format!("/ws/{i}"))),
                source_path: PathBuf::from(format!("/log/{i}.jsonl")),
                started_at: Some(1_000 + i * 100),
                ended_at: Some(1_000 + i * 100 + 50),
                metadata: serde_json::json!({}),
                messages,
            }
        })
        .collect();

    let mut outcomes: Vec<(String, i64, Vec<i64>)> = conversations
        .par_chunks(3)
        .map(|chunk| {
            // Each chunk gets its own writer connection and its own id caches.
            let storage = FrankenStorage::open_writer(&db_path).unwrap();
            let mut agent_cache: HashMap<String, i64> = HashMap::new();
            let mut workspace_cache: HashMap<PathBuf, i64> = HashMap::new();
            let mut chunk_outcomes = Vec::with_capacity(chunk.len());

            for conv in chunk {
                let agent_slug = conv.agent_slug.clone();
                let workspace = conv.workspace.clone();
                let external_id = conv.external_id.clone().expect("external id");
                let internal = map_to_internal(conv);

                let outcome = with_retry(|| {
                    let agent_id = match agent_cache.get(&agent_slug).copied() {
                        Some(cached) => cached,
                        None => {
                            let agent = Agent {
                                id: None,
                                slug: agent_slug.clone(),
                                name: agent_slug.clone(),
                                version: None,
                                kind: AgentKind::Cli,
                            };
                            let fresh = storage.ensure_agent(&agent)?;
                            agent_cache.insert(agent_slug.clone(), fresh);
                            fresh
                        }
                    };
                    let workspace_id = match workspace.as_ref() {
                        None => None,
                        Some(path) => match workspace_cache.get(path).copied() {
                            Some(cached) => Some(cached),
                            None => {
                                let fresh = storage.ensure_workspace(path, None)?;
                                workspace_cache.insert(path.clone(), fresh);
                                Some(fresh)
                            }
                        },
                    };
                    storage.insert_conversation_tree(agent_id, workspace_id, &internal)
                })
                .unwrap();

                chunk_outcomes.push((
                    external_id,
                    outcome.conversation_id,
                    outcome.inserted_indices,
                ));
            }

            storage.close().unwrap();
            chunk_outcomes
        })
        .flatten()
        .collect();
    // Order by external id so failure output does not depend on scheduling.
    outcomes.sort_by(|lhs, rhs| lhs.0.cmp(&rhs.0));

    let all_indices: Vec<i64> = vec![0, 1, 2];
    assert!(
        outcomes
            .iter()
            .all(|(_, _, inserted_indices)| inserted_indices == &all_indices),
        "unique external ids must not be routed through the existing-conversation merge path: {outcomes:?}"
    );

    let distinct_ids: HashSet<i64> = outcomes.iter().map(|entry| entry.1).collect();
    assert_eq!(
        distinct_ids.len(),
        conversations.len(),
        "unique external ids must produce distinct conversation ids: {outcomes:?}"
    );

    // Verify through a connection opened after all writers have closed.
    let reader = FrankenStorage::open(&db_path).unwrap();
    let stored_rows: Vec<(i64, String)> = reader
        .raw()
        .query_map_collect(
            "SELECT id, external_id FROM conversations ORDER BY id",
            &[],
            |row| Ok((row.get_typed(0)?, row.get_typed(1)?)),
        )
        .unwrap();
    let stored_count: i64 = reader
        .raw()
        .query_row_map("SELECT COUNT(*) FROM conversations", &[], |row| {
            row.get_typed(0)
        })
        .unwrap();

    assert_eq!(
        stored_count as usize,
        conversations.len(),
        "parallel distinct inserts must persist one row per external id; rows={stored_rows:?}; outcomes={outcomes:?}"
    );
    assert_eq!(
        stored_rows.len(),
        conversations.len(),
        "parallel distinct inserts must remain visible after reopening; rows={stored_rows:?}; outcomes={outcomes:?}"
    );
}
18875
/// Calls `insert_conversation_tree` twice for the same external id. The first
/// call carries message indices 2 and 3; the second carries 0, 1 and 3. Both
/// calls must resolve to the same conversation id, the second must only insert
/// the indices that were still missing, and the table must end up with 0..=3.
#[test]
fn insert_conversation_tree_merges_duplicate_external_id_with_gaps() {
    use crate::connectors::{NormalizedConversation, NormalizedMessage};
    use crate::indexer::persist::map_to_internal;
    use crate::model::types::{Agent, AgentKind};
    use std::path::PathBuf;

    let scratch = TempDir::new().unwrap();
    let database_file = scratch.path().join("test.db");
    let storage = SqliteStorage::open(&database_file).unwrap();

    let agent_id = storage
        .ensure_agent(&Agent {
            id: None,
            slug: "codex".into(),
            name: "Codex".into(),
            version: Some("0.2.3".into()),
            kind: AgentKind::Cli,
        })
        .unwrap();

    // Fixture message builder: `created_at` is the session start plus 100 per index.
    let turn = |idx: i64, role: &str, content: &str| NormalizedMessage {
        idx,
        role: role.to_owned(),
        author: None,
        created_at: Some(1_700_000_000_000 + idx * 100),
        content: content.to_owned(),
        extra: serde_json::Value::Null,
        snippets: Vec::new(),
        invocations: Vec::new(),
    };

    // Both payloads describe the same session; only the message list differs.
    let session_with = |messages: Vec<NormalizedMessage>| NormalizedConversation {
        agent_slug: "codex".into(),
        workspace: Some(PathBuf::from("/tmp/workspace")),
        external_id: Some("tree-gap-session".into()),
        title: Some("Tree Gap Session".into()),
        source_path: PathBuf::from("/tmp/tree.jsonl"),
        started_at: Some(1_700_000_000_000),
        ended_at: Some(1_700_000_000_999),
        metadata: serde_json::Value::Null,
        messages,
    };

    // The tail of the session is stored first ...
    let tail_first = session_with(vec![turn(2, "user", "third"), turn(3, "assistant", "fourth")]);
    let conv_a = map_to_internal(&tail_first);
    // ... then the head arrives together with one index that is already stored.
    let head_with_overlap = session_with(vec![
        turn(0, "user", "first"),
        turn(1, "assistant", "second"),
        turn(3, "assistant", "fourth"),
    ]);
    let conv_b = map_to_internal(&head_with_overlap);

    let first = storage
        .insert_conversation_tree(agent_id, None, &conv_a)
        .unwrap();
    let second = storage
        .insert_conversation_tree(agent_id, None, &conv_b)
        .unwrap();

    assert_eq!(first.inserted_indices, vec![2, 3]);
    assert_eq!(second.inserted_indices, vec![0, 1]);
    assert_eq!(first.conversation_id, second.conversation_id);

    let stored_indices: Vec<i64> = storage
        .conn
        .query_map_collect("SELECT idx FROM messages ORDER BY idx", fparams![], |row| {
            row.get_typed(0)
        })
        .unwrap();
    assert_eq!(stored_indices, vec![0, 1, 2, 3]);
}
18982
18983    #[test]
18984    fn insert_conversation_tree_skips_duplicate_message_indices_for_new_conversation() {
18985        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
18986        use std::path::PathBuf;
18987
18988        let dir = TempDir::new().unwrap();
18989        let db_path = dir.path().join("test.db");
18990        let storage = SqliteStorage::open(&db_path).unwrap();
18991
18992        let agent = Agent {
18993            id: None,
18994            slug: "codex".into(),
18995            name: "Codex".into(),
18996            version: Some("0.2.3".into()),
18997            kind: AgentKind::Cli,
18998        };
18999        let agent_id = storage.ensure_agent(&agent).unwrap();
19000
19001        let conversation = Conversation {
19002            id: None,
19003            agent_slug: "codex".into(),
19004            workspace: Some(PathBuf::from("/tmp/workspace")),
19005            external_id: Some("duplicate-new-session".into()),
19006            title: Some("Duplicate New Session".into()),
19007            source_path: PathBuf::from("/tmp/duplicate-new-session.jsonl"),
19008            started_at: Some(1_700_000_000_000),
19009            ended_at: Some(1_700_000_000_999),
19010            approx_tokens: None,
19011            metadata_json: serde_json::Value::Null,
19012            messages: vec![
19013                Message {
19014                    id: None,
19015                    idx: 0,
19016                    role: MessageRole::User,
19017                    author: None,
19018                    created_at: Some(1_700_000_000_000),
19019                    content: "first canonical".into(),
19020                    extra_json: serde_json::Value::Null,
19021                    snippets: Vec::new(),
19022                },
19023                Message {
19024                    id: None,
19025                    idx: 0,
19026                    role: MessageRole::User,
19027                    author: None,
19028                    created_at: Some(1_700_000_000_001),
19029                    content: "duplicate idx should be skipped".into(),
19030                    extra_json: serde_json::Value::Null,
19031                    snippets: Vec::new(),
19032                },
19033                Message {
19034                    id: None,
19035                    idx: 1,
19036                    role: MessageRole::Agent,
19037                    author: None,
19038                    created_at: Some(1_700_000_000_100),
19039                    content: "second".into(),
19040                    extra_json: serde_json::Value::Null,
19041                    snippets: Vec::new(),
19042                },
19043            ],
19044            source_id: "local".into(),
19045            origin_host: None,
19046        };
19047
19048        let outcome = storage
19049            .insert_conversation_tree(agent_id, None, &conversation)
19050            .unwrap();
19051
19052        assert_eq!(outcome.inserted_indices, vec![0, 1]);
19053
19054        let stored_messages: Vec<(i64, String)> = storage
19055            .conn
19056            .query_map_collect(
19057                "SELECT idx, content FROM messages ORDER BY idx",
19058                fparams![],
19059                |row| Ok((row.get_typed(0)?, row.get_typed(1)?)),
19060            )
19061            .unwrap();
19062        assert_eq!(
19063            stored_messages,
19064            vec![
19065                (0, "first canonical".to_string()),
19066                (1, "second".to_string())
19067            ]
19068        );
19069    }
19070
19071    #[test]
19072    fn insert_conversation_tree_merges_duplicate_source_path_without_external_id() {
19073        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
19074        use std::path::PathBuf;
19075
19076        let dir = TempDir::new().unwrap();
19077        let db_path = dir.path().join("test.db");
19078        let storage = SqliteStorage::open(&db_path).unwrap();
19079
19080        let agent = Agent {
19081            id: None,
19082            slug: "codex".into(),
19083            name: "Codex".into(),
19084            version: Some("0.2.3".into()),
19085            kind: AgentKind::Cli,
19086        };
19087        let agent_id = storage.ensure_agent(&agent).unwrap();
19088
19089        let base_conv = |messages: Vec<Message>| Conversation {
19090            id: None,
19091            agent_slug: "codex".into(),
19092            workspace: Some(PathBuf::from("/tmp/workspace")),
19093            external_id: None,
19094            title: Some("Source Path Merge".into()),
19095            source_path: PathBuf::from("/tmp/shared-session.jsonl"),
19096            started_at: Some(1_700_000_000_000),
19097            ended_at: Some(1_700_000_000_999),
19098            approx_tokens: None,
19099            metadata_json: serde_json::Value::Null,
19100            messages,
19101            source_id: "local".into(),
19102            origin_host: None,
19103        };
19104
19105        let first = storage
19106            .insert_conversation_tree(
19107                agent_id,
19108                None,
19109                &base_conv(vec![
19110                    Message {
19111                        id: None,
19112                        idx: 0,
19113                        role: MessageRole::User,
19114                        author: None,
19115                        created_at: Some(1_700_000_000_000),
19116                        content: "first".into(),
19117                        extra_json: serde_json::Value::Null,
19118                        snippets: Vec::new(),
19119                    },
19120                    Message {
19121                        id: None,
19122                        idx: 1,
19123                        role: MessageRole::Agent,
19124                        author: None,
19125                        created_at: Some(1_700_000_000_100),
19126                        content: "second".into(),
19127                        extra_json: serde_json::Value::Null,
19128                        snippets: Vec::new(),
19129                    },
19130                ]),
19131            )
19132            .unwrap();
19133
19134        let second = storage
19135            .insert_conversation_tree(
19136                agent_id,
19137                None,
19138                &base_conv(vec![
19139                    Message {
19140                        id: None,
19141                        idx: 1,
19142                        role: MessageRole::Agent,
19143                        author: None,
19144                        created_at: Some(1_700_000_000_100),
19145                        content: "second".into(),
19146                        extra_json: serde_json::Value::Null,
19147                        snippets: Vec::new(),
19148                    },
19149                    Message {
19150                        id: None,
19151                        idx: 2,
19152                        role: MessageRole::User,
19153                        author: None,
19154                        created_at: Some(1_700_000_000_200),
19155                        content: "third".into(),
19156                        extra_json: serde_json::Value::Null,
19157                        snippets: Vec::new(),
19158                    },
19159                ]),
19160            )
19161            .unwrap();
19162
19163        assert_eq!(first.conversation_id, second.conversation_id);
19164        assert_eq!(first.inserted_indices, vec![0, 1]);
19165        assert_eq!(second.inserted_indices, vec![2]);
19166
19167        let stored_indices: Vec<i64> = storage
19168            .conn
19169            .query_map_collect("SELECT idx FROM messages ORDER BY idx", fparams![], |row| {
19170                row.get_typed(0)
19171            })
19172            .unwrap();
19173        assert_eq!(stored_indices, vec![0, 1, 2]);
19174    }
19175
19176    #[test]
19177    fn insert_conversation_tree_merges_source_path_duplicates_with_start_drift() {
19178        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
19179        use std::path::PathBuf;
19180
19181        let dir = TempDir::new().unwrap();
19182        let db_path = dir.path().join("test.db");
19183        let storage = SqliteStorage::open(&db_path).unwrap();
19184
19185        let agent = Agent {
19186            id: None,
19187            slug: "codex".into(),
19188            name: "Codex".into(),
19189            version: Some("0.2.3".into()),
19190            kind: AgentKind::Cli,
19191        };
19192        let agent_id = storage.ensure_agent(&agent).unwrap();
19193
19194        let base_conv = |started_at: Option<i64>, messages: Vec<Message>| Conversation {
19195            id: None,
19196            agent_slug: "codex".into(),
19197            workspace: Some(PathBuf::from("/tmp/workspace")),
19198            external_id: None,
19199            title: Some("Drift Merge".into()),
19200            source_path: PathBuf::from("/tmp/drift-session.jsonl"),
19201            started_at,
19202            ended_at: Some(1_700_000_000_999),
19203            approx_tokens: None,
19204            metadata_json: serde_json::Value::Null,
19205            messages,
19206            source_id: "local".into(),
19207            origin_host: None,
19208        };
19209
19210        let first = storage
19211            .insert_conversation_tree(
19212                agent_id,
19213                None,
19214                &base_conv(
19215                    Some(1_700_000_000_000),
19216                    vec![
19217                        Message {
19218                            id: None,
19219                            idx: 0,
19220                            role: MessageRole::User,
19221                            author: None,
19222                            created_at: Some(1_700_000_000_000),
19223                            content: "first".into(),
19224                            extra_json: serde_json::Value::Null,
19225                            snippets: Vec::new(),
19226                        },
19227                        Message {
19228                            id: None,
19229                            idx: 1,
19230                            role: MessageRole::Agent,
19231                            author: None,
19232                            created_at: Some(1_700_000_000_100),
19233                            content: "second".into(),
19234                            extra_json: serde_json::Value::Null,
19235                            snippets: Vec::new(),
19236                        },
19237                    ],
19238                ),
19239            )
19240            .unwrap();
19241
19242        let second = storage
19243            .insert_conversation_tree(
19244                agent_id,
19245                None,
19246                &base_conv(
19247                    Some(1_700_000_004_000),
19248                    vec![
19249                        Message {
19250                            id: None,
19251                            idx: 1,
19252                            role: MessageRole::Agent,
19253                            author: None,
19254                            created_at: Some(1_700_000_000_100),
19255                            content: "second".into(),
19256                            extra_json: serde_json::Value::Null,
19257                            snippets: Vec::new(),
19258                        },
19259                        Message {
19260                            id: None,
19261                            idx: 2,
19262                            role: MessageRole::User,
19263                            author: None,
19264                            created_at: Some(1_700_000_004_200),
19265                            content: "third".into(),
19266                            extra_json: serde_json::Value::Null,
19267                            snippets: Vec::new(),
19268                        },
19269                    ],
19270                ),
19271            )
19272            .unwrap();
19273
19274        assert_eq!(first.conversation_id, second.conversation_id);
19275        assert_eq!(second.inserted_indices, vec![2]);
19276    }
19277
19278    #[test]
19279    fn insert_conversation_tree_keeps_single_message_overlap_sessions_separate() {
19280        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
19281        use std::path::PathBuf;
19282
19283        let dir = TempDir::new().unwrap();
19284        let db_path = dir.path().join("test.db");
19285        let storage = SqliteStorage::open(&db_path).unwrap();
19286
19287        let agent = Agent {
19288            id: None,
19289            slug: "codex".into(),
19290            name: "Codex".into(),
19291            version: Some("0.2.3".into()),
19292            kind: AgentKind::Cli,
19293        };
19294        let agent_id = storage.ensure_agent(&agent).unwrap();
19295
19296        let make_conv = |started_at: i64, idx: i64, content: &str| Conversation {
19297            id: None,
19298            agent_slug: "codex".into(),
19299            workspace: Some(PathBuf::from("/tmp/workspace")),
19300            external_id: None,
19301            title: Some("Partial overlap".into()),
19302            source_path: PathBuf::from("/tmp/reused-session.jsonl"),
19303            started_at: Some(started_at),
19304            ended_at: Some(started_at + 500),
19305            approx_tokens: None,
19306            metadata_json: serde_json::Value::Null,
19307            messages: vec![Message {
19308                id: None,
19309                idx,
19310                role: MessageRole::User,
19311                author: None,
19312                created_at: Some(started_at),
19313                content: content.into(),
19314                extra_json: serde_json::Value::Null,
19315                snippets: Vec::new(),
19316            }],
19317            source_id: "local".into(),
19318            origin_host: None,
19319        };
19320
19321        storage
19322            .insert_conversation_tree(
19323                agent_id,
19324                None,
19325                &Conversation {
19326                    messages: vec![
19327                        Message {
19328                            id: None,
19329                            idx: 0,
19330                            role: MessageRole::User,
19331                            author: None,
19332                            created_at: Some(1_700_000_000_000),
19333                            content: "shared opener".into(),
19334                            extra_json: serde_json::Value::Null,
19335                            snippets: Vec::new(),
19336                        },
19337                        Message {
19338                            id: None,
19339                            idx: 1,
19340                            role: MessageRole::Agent,
19341                            author: None,
19342                            created_at: Some(1_700_000_000_100),
19343                            content: "first session unique".into(),
19344                            extra_json: serde_json::Value::Null,
19345                            snippets: Vec::new(),
19346                        },
19347                    ],
19348                    ..make_conv(1_700_000_000_000, 0, "unused")
19349                },
19350            )
19351            .unwrap();
19352        storage
19353            .insert_conversation_tree(
19354                agent_id,
19355                None,
19356                &make_conv(1_700_000_900_000, 0, "shared opener"),
19357            )
19358            .unwrap();
19359
19360        let conversation_count: i64 = storage
19361            .conn
19362            .query_row_map("SELECT COUNT(*) FROM conversations", fparams![], |row| {
19363                row.get_typed(0)
19364            })
19365            .unwrap();
19366        assert_eq!(conversation_count, 2);
19367    }
19368
19369    #[test]
19370    fn insert_conversation_tree_keeps_distinct_source_path_sessions_separate() {
19371        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
19372        use std::path::PathBuf;
19373
19374        let dir = TempDir::new().unwrap();
19375        let db_path = dir.path().join("test.db");
19376        let storage = SqliteStorage::open(&db_path).unwrap();
19377
19378        let agent = Agent {
19379            id: None,
19380            slug: "codex".into(),
19381            name: "Codex".into(),
19382            version: Some("0.2.3".into()),
19383            kind: AgentKind::Cli,
19384        };
19385        let agent_id = storage.ensure_agent(&agent).unwrap();
19386
19387        let make_conv = |started_at: i64, created_at: i64, content: &str| Conversation {
19388            id: None,
19389            agent_slug: "codex".into(),
19390            workspace: Some(PathBuf::from("/tmp/workspace")),
19391            external_id: None,
19392            title: Some("Same Path Different Session".into()),
19393            source_path: PathBuf::from("/tmp/reused-session.jsonl"),
19394            started_at: Some(started_at),
19395            ended_at: Some(started_at + 500),
19396            approx_tokens: None,
19397            metadata_json: serde_json::Value::Null,
19398            messages: vec![Message {
19399                id: None,
19400                idx: 0,
19401                role: MessageRole::User,
19402                author: None,
19403                created_at: Some(created_at),
19404                content: content.into(),
19405                extra_json: serde_json::Value::Null,
19406                snippets: Vec::new(),
19407            }],
19408            source_id: "local".into(),
19409            origin_host: None,
19410        };
19411
19412        storage
19413            .insert_conversation_tree(
19414                agent_id,
19415                None,
19416                &make_conv(1_700_000_000_000, 1_700_000_000_000, "first session"),
19417            )
19418            .unwrap();
19419        storage
19420            .insert_conversation_tree(
19421                agent_id,
19422                None,
19423                &make_conv(1_700_000_900_000, 1_700_000_900_000, "second session"),
19424            )
19425            .unwrap();
19426
19427        let conversation_count: i64 = storage
19428            .conn
19429            .query_row_map("SELECT COUNT(*) FROM conversations", fparams![], |row| {
19430                row.get_typed(0)
19431            })
19432            .unwrap();
19433        assert_eq!(conversation_count, 2);
19434    }
19435
19436    #[test]
19437    fn insert_conversation_tree_merges_replay_equivalent_messages_with_shifted_idx() {
19438        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
19439        use std::path::PathBuf;
19440
19441        let dir = TempDir::new().unwrap();
19442        let db_path = dir.path().join("test.db");
19443        let storage = SqliteStorage::open(&db_path).unwrap();
19444
19445        let agent = Agent {
19446            id: None,
19447            slug: "codex".into(),
19448            name: "Codex".into(),
19449            version: Some("0.2.3".into()),
19450            kind: AgentKind::Cli,
19451        };
19452        let agent_id = storage.ensure_agent(&agent).unwrap();
19453
19454        let make_conv = |started_at: i64, messages: Vec<Message>| Conversation {
19455            id: None,
19456            agent_slug: "codex".into(),
19457            workspace: Some(PathBuf::from("/tmp/workspace")),
19458            external_id: None,
19459            title: Some("Shifted replay".into()),
19460            source_path: PathBuf::from("/tmp/replay-session.jsonl"),
19461            started_at: Some(started_at),
19462            ended_at: Some(started_at + 500),
19463            approx_tokens: None,
19464            metadata_json: serde_json::Value::Null,
19465            messages,
19466            source_id: "local".into(),
19467            origin_host: None,
19468        };
19469
19470        let first = storage
19471            .insert_conversation_tree(
19472                agent_id,
19473                None,
19474                &make_conv(
19475                    1_700_000_000_000,
19476                    vec![
19477                        Message {
19478                            id: None,
19479                            idx: 0,
19480                            role: MessageRole::User,
19481                            author: None,
19482                            created_at: Some(1_700_000_000_000),
19483                            content: "first".into(),
19484                            extra_json: serde_json::Value::Null,
19485                            snippets: Vec::new(),
19486                        },
19487                        Message {
19488                            id: None,
19489                            idx: 1,
19490                            role: MessageRole::Agent,
19491                            author: None,
19492                            created_at: Some(1_700_000_000_100),
19493                            content: "second".into(),
19494                            extra_json: serde_json::Value::Null,
19495                            snippets: Vec::new(),
19496                        },
19497                    ],
19498                ),
19499            )
19500            .unwrap();
19501
19502        let second = storage
19503            .insert_conversation_tree(
19504                agent_id,
19505                None,
19506                &make_conv(
19507                    1_700_000_900_000,
19508                    vec![
19509                        Message {
19510                            id: None,
19511                            idx: 10,
19512                            role: MessageRole::User,
19513                            author: None,
19514                            created_at: Some(1_700_000_000_000),
19515                            content: "first".into(),
19516                            extra_json: serde_json::Value::Null,
19517                            snippets: Vec::new(),
19518                        },
19519                        Message {
19520                            id: None,
19521                            idx: 11,
19522                            role: MessageRole::Agent,
19523                            author: None,
19524                            created_at: Some(1_700_000_000_100),
19525                            content: "second".into(),
19526                            extra_json: serde_json::Value::Null,
19527                            snippets: Vec::new(),
19528                        },
19529                        Message {
19530                            id: None,
19531                            idx: 12,
19532                            role: MessageRole::User,
19533                            author: None,
19534                            created_at: Some(1_700_000_000_200),
19535                            content: "third".into(),
19536                            extra_json: serde_json::Value::Null,
19537                            snippets: Vec::new(),
19538                        },
19539                    ],
19540                ),
19541            )
19542            .unwrap();
19543
19544        assert_eq!(first.conversation_id, second.conversation_id);
19545        assert_eq!(second.inserted_indices, vec![12]);
19546
19547        let stored_indices: Vec<i64> = storage
19548            .conn
19549            .query_map_collect(
19550                "SELECT idx FROM messages WHERE conversation_id = ?1 ORDER BY idx",
19551                fparams![first.conversation_id],
19552                |row| row.get_typed(0),
19553            )
19554            .unwrap();
19555        assert_eq!(stored_indices, vec![0, 1, 12]);
19556    }
19557
19558    #[test]
19559    fn salvage_historical_databases_imports_backups_once_and_merges_overlap() {
19560        use crate::model::types::{Conversation, Message, MessageRole};
19561        use std::path::PathBuf;
19562
19563        fn base_conv(source_path: &str, messages: Vec<Message>) -> Conversation {
19564            Conversation {
19565                id: None,
19566                agent_slug: "codex".into(),
19567                workspace: Some(PathBuf::from("/tmp/workspace")),
19568                external_id: None,
19569                title: Some("Recovered".into()),
19570                source_path: PathBuf::from(source_path),
19571                started_at: Some(1_700_000_000_000),
19572                ended_at: Some(1_700_000_000_999),
19573                approx_tokens: None,
19574                metadata_json: serde_json::Value::Null,
19575                messages,
19576                source_id: "local".into(),
19577                origin_host: None,
19578            }
19579        }
19580
19581        let dir = TempDir::new().unwrap();
19582        let canonical_db = dir.path().join("agent_search.db");
19583        let storage = SqliteStorage::open(&canonical_db).unwrap();
19584
19585        let overlapping_a = base_conv(
19586            "/tmp/shared-history.jsonl",
19587            vec![
19588                Message {
19589                    id: None,
19590                    idx: 0,
19591                    role: MessageRole::User,
19592                    author: None,
19593                    created_at: Some(1_700_000_000_000),
19594                    content: "first".into(),
19595                    extra_json: serde_json::Value::Null,
19596                    snippets: Vec::new(),
19597                },
19598                Message {
19599                    id: None,
19600                    idx: 1,
19601                    role: MessageRole::Agent,
19602                    author: None,
19603                    created_at: Some(1_700_000_000_100),
19604                    content: "second".into(),
19605                    extra_json: serde_json::Value::Null,
19606                    snippets: Vec::new(),
19607                },
19608            ],
19609        );
19610        let overlapping_b = base_conv(
19611            "/tmp/shared-history.jsonl",
19612            vec![
19613                Message {
19614                    id: None,
19615                    idx: 1,
19616                    role: MessageRole::Agent,
19617                    author: None,
19618                    created_at: Some(1_700_000_000_100),
19619                    content: "second".into(),
19620                    extra_json: serde_json::Value::Null,
19621                    snippets: Vec::new(),
19622                },
19623                Message {
19624                    id: None,
19625                    idx: 2,
19626                    role: MessageRole::User,
19627                    author: None,
19628                    created_at: Some(1_700_000_000_200),
19629                    content: "third".into(),
19630                    extra_json: serde_json::Value::Null,
19631                    snippets: Vec::new(),
19632                },
19633            ],
19634        );
19635        let unique = Conversation {
19636            source_path: PathBuf::from("/tmp/unique-history.jsonl"),
19637            messages: vec![Message {
19638                id: None,
19639                idx: 0,
19640                role: MessageRole::User,
19641                author: None,
19642                created_at: Some(1_700_000_001_000),
19643                content: "unique".into(),
19644                extra_json: serde_json::Value::Null,
19645                snippets: Vec::new(),
19646            }],
19647            started_at: Some(1_700_000_001_000),
19648            ended_at: Some(1_700_000_001_100),
19649            ..base_conv("/tmp/unique-history.jsonl", Vec::new())
19650        };
19651
19652        seed_historical_db_direct(
19653            &dir.path()
19654                .join("backups/agent_search.db.20260322T020200.bak"),
19655            std::slice::from_ref(&overlapping_a),
19656        );
19657        seed_historical_db_direct(
19658            &dir.path().join("agent_search.corrupt.20260324_212907"),
19659            &[overlapping_b, unique],
19660        );
19661
19662        let first = storage.salvage_historical_databases(&canonical_db).unwrap();
19663        assert_eq!(first.bundles_considered, 2);
19664        assert_eq!(first.bundles_imported, 2);
19665        assert_eq!(first.messages_imported, 4);
19666
19667        let conversations = storage.list_conversations(10, 0).unwrap();
19668        assert_eq!(conversations.len(), 2);
19669
19670        let shared_id = conversations
19671            .iter()
19672            .find(|conv| conv.source_path == std::path::Path::new("/tmp/shared-history.jsonl"))
19673            .and_then(|conv| conv.id)
19674            .unwrap();
19675        let shared_indices: Vec<i64> = storage
19676            .fetch_messages(shared_id)
19677            .unwrap()
19678            .into_iter()
19679            .map(|msg| msg.idx)
19680            .collect();
19681        assert_eq!(shared_indices, vec![0, 1, 2]);
19682
19683        let second = storage.salvage_historical_databases(&canonical_db).unwrap();
19684        assert_eq!(second.bundles_imported, 0);
19685        assert_eq!(second.messages_imported, 0);
19686    }
19687
19688    #[test]
19689    fn salvage_historical_databases_normalizes_host_only_remote_provenance() {
19690        use crate::model::types::{Conversation, Message, MessageRole};
19691        use std::path::PathBuf;
19692
19693        let dir = TempDir::new().unwrap();
19694        let canonical_db = dir.path().join("agent_search.db");
19695        let storage = SqliteStorage::open(&canonical_db).unwrap();
19696
19697        let host_only_remote = Conversation {
19698            id: None,
19699            agent_slug: "codex".into(),
19700            workspace: Some(PathBuf::from("/tmp/workspace")),
19701            external_id: None,
19702            title: Some("Recovered Host Only Remote".into()),
19703            source_path: PathBuf::from("/tmp/host-only-history.jsonl"),
19704            started_at: Some(1_700_000_000_000),
19705            ended_at: Some(1_700_000_000_999),
19706            approx_tokens: None,
19707            metadata_json: serde_json::Value::Null,
19708            messages: vec![Message {
19709                id: None,
19710                idx: 0,
19711                role: MessageRole::User,
19712                author: None,
19713                created_at: Some(1_700_000_000_000),
19714                content: "host-only remote".into(),
19715                extra_json: serde_json::Value::Null,
19716                snippets: Vec::new(),
19717            }],
19718            source_id: "   ".into(),
19719            origin_host: Some("builder-5".into()),
19720        };
19721
19722        let historical_db = dir
19723            .path()
19724            .join("backups/agent_search.db.20260322T020200.bak");
19725        seed_historical_db_direct(&historical_db, std::slice::from_ref(&host_only_remote));
19726
19727        let historical_conn =
19728            FrankenConnection::open(historical_db.to_string_lossy().into_owned()).unwrap();
19729        historical_conn
19730            .execute_compat(
19731                "INSERT INTO sources(id, kind, host_label, created_at, updated_at) VALUES(?1, ?2, ?3, ?4, ?5)",
19732                fparams!["   ", "ssh", "builder-5", 0_i64, 0_i64],
19733            )
19734            .unwrap();
19735        historical_conn
19736            .execute_compat(
19737                "UPDATE conversations SET source_id = ?1, origin_host = ?2 WHERE source_path = ?3",
19738                fparams!["   ", "builder-5", "/tmp/host-only-history.jsonl"],
19739            )
19740            .unwrap();
19741        historical_conn
19742            .execute_compat("DELETE FROM sources WHERE id = ?1", fparams!["builder-5"])
19743            .unwrap();
19744        drop(historical_conn);
19745
19746        let first = storage.salvage_historical_databases(&canonical_db).unwrap();
19747        assert_eq!(first.bundles_imported, 1);
19748        assert_eq!(first.messages_imported, 1);
19749
19750        let source_ids = storage.get_source_ids().unwrap();
19751        assert_eq!(source_ids, vec!["builder-5".to_string()]);
19752
19753        let conversations = storage.list_conversations(10, 0).unwrap();
19754        assert_eq!(conversations.len(), 1);
19755        assert_eq!(conversations[0].source_id, "builder-5");
19756        assert_eq!(conversations[0].origin_host.as_deref(), Some("builder-5"));
19757    }
19758
19759    #[test]
19760    fn historical_salvage_retry_splits_single_conversation_until_it_fits() {
19761        use crate::model::types::{Conversation, Message, MessageRole};
19762        use std::path::PathBuf;
19763
19764        let mut attempts: Vec<Vec<usize>> = Vec::new();
19765        let entry = HistoricalBatchEntry {
19766            source_row_id: 77,
19767            agent_id: 1,
19768            workspace_id: None,
19769            conversation: Conversation {
19770                id: None,
19771                agent_slug: "gemini".into(),
19772                workspace: Some(PathBuf::from("/tmp/workspace")),
19773                external_id: Some("conv-77".into()),
19774                title: Some("Large recovered conversation".into()),
19775                source_path: PathBuf::from("/tmp/history.jsonl"),
19776                started_at: Some(1_700_000_000_000),
19777                ended_at: Some(1_700_000_000_999),
19778                approx_tokens: None,
19779                metadata_json: serde_json::Value::Null,
19780                messages: (0..4)
19781                    .map(|idx| Message {
19782                        id: None,
19783                        idx,
19784                        role: MessageRole::User,
19785                        author: None,
19786                        created_at: Some(1_700_000_000_000 + idx),
19787                        content: format!("message-{idx}"),
19788                        extra_json: serde_json::Value::Null,
19789                        snippets: Vec::new(),
19790                    })
19791                    .collect(),
19792                source_id: LOCAL_SOURCE_ID.into(),
19793                origin_host: None,
19794            },
19795        };
19796
19797        let totals = SqliteStorage::import_historical_batch_with_retry(
19798            std::slice::from_ref(&entry),
19799            &mut |batch| {
19800                attempts.push(
19801                    batch
19802                        .iter()
19803                        .map(|entry| entry.conversation.messages.len())
19804                        .collect(),
19805                );
19806                let total_messages: usize = batch
19807                    .iter()
19808                    .map(|entry| entry.conversation.messages.len())
19809                    .sum();
19810                if total_messages > 1 {
19811                    Err(anyhow!("out of memory"))
19812                } else {
19813                    Ok(HistoricalBatchImportTotals {
19814                        inserted_source_rows: batch.len(),
19815                        inserted_messages: total_messages,
19816                    })
19817                }
19818            },
19819        )
19820        .unwrap();
19821
19822        assert_eq!(
19823            totals,
19824            HistoricalBatchImportTotals {
19825                inserted_source_rows: 1,
19826                inserted_messages: 4,
19827            }
19828        );
19829        assert_eq!(attempts.first().cloned(), Some(vec![4]));
19830        assert!(
19831            attempts.iter().filter(|sizes| sizes == &&vec![1]).count() >= 4,
19832            "expected recursive fallback to reach one-message slices"
19833        );
19834    }
19835
19836    #[test]
19837    fn salvage_historical_databases_resumes_from_progress_checkpoint() {
19838        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
19839        use std::path::PathBuf;
19840
19841        fn make_conv(source_path: &str, idx_seed: i64) -> Conversation {
19842            Conversation {
19843                id: None,
19844                agent_slug: "codex".into(),
19845                workspace: Some(PathBuf::from("/tmp/workspace")),
19846                external_id: Some(format!("conv-{idx_seed}")),
19847                title: Some(format!("Recovered {idx_seed}")),
19848                source_path: PathBuf::from(source_path),
19849                started_at: Some(1_700_000_000_000 + idx_seed),
19850                ended_at: Some(1_700_000_000_100 + idx_seed),
19851                approx_tokens: None,
19852                metadata_json: serde_json::Value::Null,
19853                messages: vec![Message {
19854                    id: None,
19855                    idx: 0,
19856                    role: MessageRole::User,
19857                    author: None,
19858                    created_at: Some(1_700_000_000_000 + idx_seed),
19859                    content: format!("message-{idx_seed}"),
19860                    extra_json: serde_json::Value::Null,
19861                    snippets: Vec::new(),
19862                }],
19863                source_id: LOCAL_SOURCE_ID.into(),
19864                origin_host: None,
19865            }
19866        }
19867
19868        let dir = TempDir::new().unwrap();
19869        let canonical_db = dir.path().join("agent_search.db");
19870        let backup_db = dir
19871            .path()
19872            .join("backups/agent_search.db.20260322T020200.bak");
19873        let storage = SqliteStorage::open(&canonical_db).unwrap();
19874        let conv_a = make_conv("/tmp/one.jsonl", 1);
19875        let conv_b = make_conv("/tmp/two.jsonl", 2);
19876        let conv_c = make_conv("/tmp/three.jsonl", 3);
19877        seed_historical_db_direct(
19878            &backup_db,
19879            &[conv_a.clone(), conv_b.clone(), conv_c.clone()],
19880        );
19881
19882        let agent = Agent {
19883            id: None,
19884            slug: "codex".into(),
19885            name: "Codex".into(),
19886            version: Some("0.2.3".into()),
19887            kind: AgentKind::Cli,
19888        };
19889        let agent_id = storage.ensure_agent(&agent).unwrap();
19890        storage
19891            .insert_conversation_tree(agent_id, None, &conv_a)
19892            .unwrap();
19893
19894        let bundle = discover_historical_database_bundles(&canonical_db)
19895            .into_iter()
19896            .find(|bundle| bundle.root_path == backup_db)
19897            .unwrap();
19898        let first_row_id: i64 = FrankenConnection::open(backup_db.to_string_lossy().into_owned())
19899            .unwrap()
19900            .query_row_map(
19901                "SELECT id FROM conversations WHERE source_path = ?1",
19902                fparams!["/tmp/one.jsonl"],
19903                |row| row.get_typed(0),
19904            )
19905            .unwrap();
19906        storage
19907            .record_historical_bundle_progress(&bundle, "direct-readonly", first_row_id, 50, 99)
19908            .unwrap();
19909
19910        let outcome = storage.salvage_historical_databases(&canonical_db).unwrap();
19911        assert_eq!(outcome.bundles_imported, 1);
19912        assert_eq!(outcome.conversations_imported, 52);
19913        assert_eq!(outcome.messages_imported, 101);
19914        assert_eq!(storage.list_conversations(10, 0).unwrap().len(), 3);
19915
19916        let progress_key = SqliteStorage::historical_bundle_progress_key(&bundle);
19917        let progress_left: Option<String> = storage
19918            .conn
19919            .query_row_map(
19920                "SELECT value FROM meta WHERE key = ?1",
19921                fparams![progress_key.as_str()],
19922                |row| row.get_typed(0),
19923            )
19924            .optional()
19925            .unwrap();
19926        assert!(
19927            progress_left.is_none(),
19928            "completed salvage should clear bundle progress"
19929        );
19930
19931        let second = storage.salvage_historical_databases(&canonical_db).unwrap();
19932        assert_eq!(second.bundles_imported, 0);
19933        assert_eq!(second.messages_imported, 0);
19934    }
19935
19936    #[test]
19937    fn salvage_historical_databases_skips_bundle_when_checkpoint_covers_backup() {
19938        // Regression for issue #247 (coding_agent_session_search-r8pcy): a bundle
19939        // whose progress checkpoint already covers the backup's entire conversation
19940        // row-id space (daemon OOM-killed after the last batch committed but before
19941        // the completion ledger marker landed) must be ledgered + skipped, not
19942        // re-scanned O(n) with imported=0 every batch.
19943        use crate::model::types::{Conversation, Message, MessageRole};
19944        use std::path::PathBuf;
19945
19946        fn make_conv(source_path: &str, idx_seed: i64) -> Conversation {
19947            Conversation {
19948                id: None,
19949                agent_slug: "codex".into(),
19950                workspace: Some(PathBuf::from("/tmp/workspace")),
19951                external_id: Some(format!("conv-{idx_seed}")),
19952                title: Some(format!("Recovered {idx_seed}")),
19953                source_path: PathBuf::from(source_path),
19954                started_at: Some(1_700_000_000_000 + idx_seed),
19955                ended_at: Some(1_700_000_000_100 + idx_seed),
19956                approx_tokens: None,
19957                metadata_json: serde_json::Value::Null,
19958                messages: vec![Message {
19959                    id: None,
19960                    idx: 0,
19961                    role: MessageRole::User,
19962                    author: None,
19963                    created_at: Some(1_700_000_000_000 + idx_seed),
19964                    content: format!("message-{idx_seed}"),
19965                    extra_json: serde_json::Value::Null,
19966                    snippets: Vec::new(),
19967                }],
19968                source_id: LOCAL_SOURCE_ID.into(),
19969                origin_host: None,
19970            }
19971        }
19972
19973        let dir = TempDir::new().unwrap();
19974        let canonical_db = dir.path().join("agent_search.db");
19975        let backup_db = dir
19976            .path()
19977            .join("backups/agent_search.db.20260322T020200.bak");
19978        let storage = SqliteStorage::open(&canonical_db).unwrap();
19979        seed_historical_db_direct(
19980            &backup_db,
19981            &[
19982                make_conv("/tmp/one.jsonl", 1),
19983                make_conv("/tmp/two.jsonl", 2),
19984                make_conv("/tmp/three.jsonl", 3),
19985            ],
19986        );
19987
19988        let bundle = discover_historical_database_bundles(&canonical_db)
19989            .into_iter()
19990            .find(|bundle| bundle.root_path == backup_db)
19991            .unwrap();
19992
19993        // Checkpoint high-water mark == backup's max conversation id.
19994        let backup_max_id: i64 = FrankenConnection::open(backup_db.to_string_lossy().into_owned())
19995            .unwrap()
19996            .query_row_map(
19997                "SELECT COALESCE(MAX(id), 0) FROM conversations",
19998                fparams![],
19999                |row| row.get_typed(0),
20000            )
20001            .unwrap();
20002        assert!(backup_max_id > 0, "seeded backup should have conversations");
20003        storage
20004            .record_historical_bundle_progress(&bundle, "direct-readonly", backup_max_id, 3, 3)
20005            .unwrap();
20006
20007        let outcome = storage.salvage_historical_databases(&canonical_db).unwrap();
20008        assert_eq!(
20009            outcome.bundles_imported, 0,
20010            "fully-checkpointed bundle must not be re-scanned"
20011        );
20012        assert_eq!(outcome.conversations_imported, 0);
20013        assert_eq!(outcome.messages_imported, 0);
20014        assert_eq!(
20015            storage.list_conversations(10, 0).unwrap().len(),
20016            0,
20017            "skip path must not import anything"
20018        );
20019        assert!(
20020            storage.historical_bundle_already_imported(&bundle).unwrap(),
20021            "skipped bundle must be ledgered as salvaged so future runs short-circuit"
20022        );
20023
20024        let progress_key = SqliteStorage::historical_bundle_progress_key(&bundle);
20025        let progress_left: Option<String> = storage
20026            .conn
20027            .query_row_map(
20028                "SELECT value FROM meta WHERE key = ?1",
20029                fparams![progress_key.as_str()],
20030                |row| row.get_typed(0),
20031            )
20032            .optional()
20033            .unwrap();
20034        assert!(
20035            progress_left.is_none(),
20036            "skip path must clear the bundle progress checkpoint"
20037        );
20038    }
20039
20040    #[test]
20041    fn list_conversations_for_lexical_rebuild_uses_stable_id_order() {
20042        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
20043        use std::path::PathBuf;
20044
20045        let dir = TempDir::new().unwrap();
20046        let db_path = dir.path().join("agent_search.db");
20047        let storage = SqliteStorage::open(&db_path).unwrap();
20048        let agent = Agent {
20049            id: None,
20050            slug: "codex".into(),
20051            name: "Codex".into(),
20052            version: Some("0.2.3".into()),
20053            kind: AgentKind::Cli,
20054        };
20055        let agent_id = storage.ensure_agent(&agent).unwrap();
20056
20057        let make_conv = |source_path: &str, started_at: i64| Conversation {
20058            id: None,
20059            agent_slug: "codex".into(),
20060            workspace: Some(PathBuf::from("/tmp/workspace")),
20061            external_id: Some(source_path.to_string()),
20062            title: Some(source_path.to_string()),
20063            source_path: PathBuf::from(source_path),
20064            started_at: Some(started_at),
20065            ended_at: Some(started_at + 1),
20066            approx_tokens: None,
20067            metadata_json: serde_json::Value::Null,
20068            messages: vec![Message {
20069                id: None,
20070                idx: 0,
20071                role: MessageRole::User,
20072                author: None,
20073                created_at: Some(started_at),
20074                content: format!("message for {source_path}"),
20075                extra_json: serde_json::Value::Null,
20076                snippets: Vec::new(),
20077            }],
20078            source_id: LOCAL_SOURCE_ID.into(),
20079            origin_host: None,
20080        };
20081
20082        let conv_a = make_conv("/tmp/a.jsonl", 3_000);
20083        let conv_b = make_conv("/tmp/b.jsonl", 1_000);
20084        let conv_c = make_conv("/tmp/c.jsonl", 2_000);
20085
20086        storage
20087            .insert_conversation_tree(agent_id, None, &conv_a)
20088            .unwrap();
20089        storage
20090            .insert_conversation_tree(agent_id, None, &conv_b)
20091            .unwrap();
20092        storage
20093            .insert_conversation_tree(agent_id, None, &conv_c)
20094            .unwrap();
20095
20096        let user_order: Vec<PathBuf> = storage
20097            .list_conversations(10, 0)
20098            .unwrap()
20099            .into_iter()
20100            .map(|conv| conv.source_path)
20101            .collect();
20102        assert_eq!(
20103            user_order,
20104            vec![
20105                PathBuf::from("/tmp/a.jsonl"),
20106                PathBuf::from("/tmp/c.jsonl"),
20107                PathBuf::from("/tmp/b.jsonl"),
20108            ]
20109        );
20110
20111        let (agent_slugs, workspace_paths) = storage.build_lexical_rebuild_lookups().unwrap();
20112        let rebuild_order: Vec<PathBuf> = storage
20113            .list_conversations_for_lexical_rebuild_after_id(10, 0, &agent_slugs, &workspace_paths)
20114            .unwrap()
20115            .into_iter()
20116            .map(|conv| conv.source_path)
20117            .collect();
20118        assert_eq!(
20119            rebuild_order,
20120            vec![
20121                PathBuf::from("/tmp/a.jsonl"),
20122                PathBuf::from("/tmp/b.jsonl"),
20123                PathBuf::from("/tmp/c.jsonl"),
20124            ]
20125        );
20126
20127        let first_page = storage
20128            .list_conversations_for_lexical_rebuild_after_id(2, 0, &agent_slugs, &workspace_paths)
20129            .unwrap();
20130        let first_page_paths: Vec<PathBuf> = first_page
20131            .iter()
20132            .map(|conv| conv.source_path.clone())
20133            .collect();
20134        assert_eq!(
20135            first_page_paths,
20136            vec![PathBuf::from("/tmp/a.jsonl"), PathBuf::from("/tmp/b.jsonl")]
20137        );
20138
20139        let second_page = storage
20140            .list_conversations_for_lexical_rebuild_after_id(
20141                2,
20142                first_page
20143                    .last()
20144                    .and_then(|conv| conv.id)
20145                    .expect("first page should include an id"),
20146                &agent_slugs,
20147                &workspace_paths,
20148            )
20149            .unwrap();
20150        let second_page_paths: Vec<PathBuf> = second_page
20151            .iter()
20152            .map(|conv| conv.source_path.clone())
20153            .collect();
20154        assert_eq!(second_page_paths, vec![PathBuf::from("/tmp/c.jsonl")]);
20155
20156        let bounded_page = storage
20157            .list_conversations_for_lexical_rebuild_after_id_through_id(
20158                10,
20159                0,
20160                first_page
20161                    .last()
20162                    .and_then(|conv| conv.id)
20163                    .expect("first page should include an id"),
20164                &agent_slugs,
20165                &workspace_paths,
20166            )
20167            .unwrap();
20168        let bounded_paths: Vec<PathBuf> = bounded_page
20169            .iter()
20170            .map(|conv| conv.source_path.clone())
20171            .collect();
20172        assert_eq!(
20173            bounded_paths,
20174            vec![PathBuf::from("/tmp/a.jsonl"), PathBuf::from("/tmp/b.jsonl")]
20175        );
20176    }
20177
20178    #[test]
20179    fn keyset_traversal_handles_sparse_holey_conversation_ids() {
20180        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
20181        use std::path::PathBuf;
20182
20183        let dir = TempDir::new().unwrap();
20184        let db_path = dir.path().join("agent_search.db");
20185        let storage = SqliteStorage::open(&db_path).unwrap();
20186        let agent = Agent {
20187            id: None,
20188            slug: "codex".into(),
20189            name: "Codex".into(),
20190            version: Some("0.2.3".into()),
20191            kind: AgentKind::Cli,
20192        };
20193        let agent_id = storage.ensure_agent(&agent).unwrap();
20194
20195        let make_conv = |label: &str, ts: i64| Conversation {
20196            id: None,
20197            agent_slug: "codex".into(),
20198            workspace: Some(PathBuf::from("/tmp/workspace")),
20199            external_id: Some(label.to_string()),
20200            title: Some(label.to_string()),
20201            source_path: PathBuf::from(format!("/tmp/{label}.jsonl")),
20202            started_at: Some(ts),
20203            ended_at: Some(ts + 1),
20204            approx_tokens: None,
20205            metadata_json: serde_json::Value::Null,
20206            messages: vec![Message {
20207                id: None,
20208                idx: 0,
20209                role: MessageRole::User,
20210                author: None,
20211                created_at: Some(ts),
20212                content: format!("msg for {label}"),
20213                extra_json: serde_json::Value::Null,
20214                snippets: Vec::new(),
20215            }],
20216            source_id: LOCAL_SOURCE_ID.into(),
20217            origin_host: None,
20218        };
20219
20220        for i in 0..6 {
20221            storage
20222                .insert_conversation_tree(
20223                    agent_id,
20224                    None,
20225                    &make_conv(&format!("conv-{i}"), 1000 + i),
20226                )
20227                .unwrap();
20228        }
20229
20230        storage.conn.execute("PRAGMA foreign_keys = OFF").unwrap();
20231        storage
20232            .conn
20233            .execute_compat("DELETE FROM conversations WHERE id IN (2, 4)", fparams![])
20234            .unwrap();
20235        storage
20236            .conn
20237            .execute_compat(
20238                "DELETE FROM messages WHERE conversation_id IN (2, 4)",
20239                fparams![],
20240            )
20241            .unwrap();
20242        storage.conn.execute("PRAGMA foreign_keys = ON").unwrap();
20243
20244        let (agent_slugs, workspace_paths) = storage.build_lexical_rebuild_lookups().unwrap();
20245
20246        let page1 = storage
20247            .list_conversations_for_lexical_rebuild_after_id(2, 0, &agent_slugs, &workspace_paths)
20248            .unwrap();
20249        assert_eq!(page1.len(), 2);
20250        let page1_ids: Vec<i64> = page1.iter().map(|c| c.id.unwrap()).collect();
20251        assert_eq!(page1_ids, vec![1, 3]);
20252
20253        let page2 = storage
20254            .list_conversations_for_lexical_rebuild_after_id(
20255                2,
20256                *page1_ids.last().unwrap(),
20257                &agent_slugs,
20258                &workspace_paths,
20259            )
20260            .unwrap();
20261        assert_eq!(page2.len(), 2);
20262        let page2_ids: Vec<i64> = page2.iter().map(|c| c.id.unwrap()).collect();
20263        assert_eq!(page2_ids, vec![5, 6]);
20264
20265        let page3 = storage
20266            .list_conversations_for_lexical_rebuild_after_id(
20267                2,
20268                *page2_ids.last().unwrap(),
20269                &agent_slugs,
20270                &workspace_paths,
20271            )
20272            .unwrap();
20273        assert!(page3.is_empty());
20274
20275        let all_ids: Vec<i64> = page1_ids.iter().chain(page2_ids.iter()).copied().collect();
20276        assert_eq!(all_ids, vec![1, 3, 5, 6]);
20277    }
20278
20279    #[test]
20280    fn keyset_traversal_through_id_with_sparse_ranges() {
20281        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
20282        use std::path::PathBuf;
20283
20284        let dir = TempDir::new().unwrap();
20285        let db_path = dir.path().join("agent_search.db");
20286        let storage = SqliteStorage::open(&db_path).unwrap();
20287        let agent = Agent {
20288            id: None,
20289            slug: "codex".into(),
20290            name: "Codex".into(),
20291            version: Some("0.2.3".into()),
20292            kind: AgentKind::Cli,
20293        };
20294        let agent_id = storage.ensure_agent(&agent).unwrap();
20295
20296        let make_conv = |label: &str, ts: i64| Conversation {
20297            id: None,
20298            agent_slug: "codex".into(),
20299            workspace: Some(PathBuf::from("/tmp/workspace")),
20300            external_id: Some(label.to_string()),
20301            title: Some(label.to_string()),
20302            source_path: PathBuf::from(format!("/tmp/{label}.jsonl")),
20303            started_at: Some(ts),
20304            ended_at: Some(ts + 1),
20305            approx_tokens: None,
20306            metadata_json: serde_json::Value::Null,
20307            messages: vec![Message {
20308                id: None,
20309                idx: 0,
20310                role: MessageRole::User,
20311                author: None,
20312                created_at: Some(ts),
20313                content: format!("msg for {label}"),
20314                extra_json: serde_json::Value::Null,
20315                snippets: Vec::new(),
20316            }],
20317            source_id: LOCAL_SOURCE_ID.into(),
20318            origin_host: None,
20319        };
20320
20321        for i in 0..10 {
20322            storage
20323                .insert_conversation_tree(
20324                    agent_id,
20325                    None,
20326                    &make_conv(&format!("conv-{i}"), 1000 + i),
20327                )
20328                .unwrap();
20329        }
20330
20331        storage.conn.execute("PRAGMA foreign_keys = OFF").unwrap();
20332        storage
20333            .conn
20334            .execute_compat(
20335                "DELETE FROM conversations WHERE id IN (3, 5, 7, 8)",
20336                fparams![],
20337            )
20338            .unwrap();
20339        storage
20340            .conn
20341            .execute_compat(
20342                "DELETE FROM messages WHERE conversation_id IN (3, 5, 7, 8)",
20343                fparams![],
20344            )
20345            .unwrap();
20346        storage.conn.execute("PRAGMA foreign_keys = ON").unwrap();
20347
20348        let (agent_slugs, workspace_paths) = storage.build_lexical_rebuild_lookups().unwrap();
20349
20350        let through_5 = storage
20351            .list_conversations_for_lexical_rebuild_after_id_through_id(
20352                100,
20353                0,
20354                5,
20355                &agent_slugs,
20356                &workspace_paths,
20357            )
20358            .unwrap();
20359        let through_5_ids: Vec<i64> = through_5.iter().map(|c| c.id.unwrap()).collect();
20360        assert_eq!(through_5_ids, vec![1, 2, 4]);
20361
20362        let after_4_through_10 = storage
20363            .list_conversations_for_lexical_rebuild_after_id_through_id(
20364                100,
20365                4,
20366                10,
20367                &agent_slugs,
20368                &workspace_paths,
20369            )
20370            .unwrap();
20371        let ids: Vec<i64> = after_4_through_10.iter().map(|c| c.id.unwrap()).collect();
20372        assert_eq!(ids, vec![6, 9, 10]);
20373
20374        let after_10 = storage
20375            .list_conversations_for_lexical_rebuild_after_id_through_id(
20376                100,
20377                10,
20378                20,
20379                &agent_slugs,
20380                &workspace_paths,
20381            )
20382            .unwrap();
20383        assert!(after_10.is_empty());
20384    }
20385
20386    #[test]
20387    fn list_conversation_footprints_for_lexical_rebuild_estimates_bytes_and_keeps_empty_conversations()
20388     {
20389        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
20390        use std::path::PathBuf;
20391
20392        let dir = TempDir::new().unwrap();
20393        let db_path = dir.path().join("agent_search.db");
20394        let storage = SqliteStorage::open(&db_path).unwrap();
20395        let agent = Agent {
20396            id: None,
20397            slug: "codex".into(),
20398            name: "Codex".into(),
20399            version: Some("0.2.3".into()),
20400            kind: AgentKind::Cli,
20401        };
20402        let agent_id = storage.ensure_agent(&agent).unwrap();
20403
20404        let insert = |external_id: &str, base_ts: i64, messages: Vec<Message>| {
20405            storage
20406                .insert_conversation_tree(
20407                    agent_id,
20408                    None,
20409                    &Conversation {
20410                        id: None,
20411                        agent_slug: "codex".into(),
20412                        workspace: Some(PathBuf::from("/tmp/workspace")),
20413                        external_id: Some(external_id.to_string()),
20414                        title: Some(external_id.to_string()),
20415                        source_path: PathBuf::from(format!("/tmp/{external_id}.jsonl")),
20416                        started_at: Some(base_ts),
20417                        ended_at: Some(base_ts + 100),
20418                        approx_tokens: None,
20419                        metadata_json: serde_json::Value::Null,
20420                        messages,
20421                        source_id: LOCAL_SOURCE_ID.into(),
20422                        origin_host: None,
20423                    },
20424                )
20425                .unwrap()
20426                .conversation_id
20427        };
20428
20429        let ascii_id = insert(
20430            "footprint-ascii",
20431            1_700_000_000_000,
20432            vec![
20433                Message {
20434                    id: None,
20435                    idx: 0,
20436                    role: MessageRole::User,
20437                    author: None,
20438                    created_at: Some(1_700_000_000_001),
20439                    content: "abc".into(),
20440                    extra_json: serde_json::Value::Null,
20441                    snippets: Vec::new(),
20442                },
20443                Message {
20444                    id: None,
20445                    idx: 1,
20446                    role: MessageRole::Agent,
20447                    author: None,
20448                    created_at: Some(1_700_000_000_002),
20449                    content: "defg".into(),
20450                    extra_json: serde_json::Value::Null,
20451                    snippets: Vec::new(),
20452                },
20453            ],
20454        );
20455        let empty_id = insert("footprint-empty", 1_700_000_001_000, Vec::new());
20456        let utf8_id = insert(
20457            "footprint-utf8",
20458            1_700_000_002_000,
20459            vec![Message {
20460                id: None,
20461                idx: 0,
20462                role: MessageRole::Tool,
20463                author: None,
20464                created_at: Some(1_700_000_002_001),
20465                content: "hé🙂".into(),
20466                extra_json: serde_json::Value::Null,
20467                snippets: Vec::new(),
20468            }],
20469        );
20470        let sparse_id = insert(
20471            "footprint-sparse",
20472            1_700_000_003_000,
20473            vec![Message {
20474                id: None,
20475                idx: 10,
20476                role: MessageRole::User,
20477                author: None,
20478                created_at: Some(1_700_000_003_010),
20479                content: "sparse".into(),
20480                extra_json: serde_json::Value::Null,
20481                snippets: Vec::new(),
20482            }],
20483        );
20484        storage
20485            .conn
20486            .execute_compat(
20487                "DELETE FROM conversation_tail_state WHERE conversation_id = ?1",
20488                fparams![utf8_id],
20489            )
20490            .unwrap();
20491
20492        let footprints = storage
20493            .list_conversation_footprints_for_lexical_rebuild()
20494            .unwrap();
20495        assert_eq!(
20496            footprints,
20497            vec![
20498                LexicalRebuildConversationFootprintRow {
20499                    conversation_id: ascii_id,
20500                    message_count: 2,
20501                    message_bytes: 2 * LEXICAL_REBUILD_PLANNER_ESTIMATED_BYTES_PER_MESSAGE,
20502                },
20503                LexicalRebuildConversationFootprintRow {
20504                    conversation_id: empty_id,
20505                    message_count: 0,
20506                    message_bytes: 0,
20507                },
20508                LexicalRebuildConversationFootprintRow {
20509                    conversation_id: utf8_id,
20510                    message_count: 1,
20511                    message_bytes: LEXICAL_REBUILD_PLANNER_ESTIMATED_BYTES_PER_MESSAGE,
20512                },
20513                LexicalRebuildConversationFootprintRow {
20514                    conversation_id: sparse_id,
20515                    message_count: 11,
20516                    message_bytes: 11 * LEXICAL_REBUILD_PLANNER_ESTIMATED_BYTES_PER_MESSAGE,
20517                },
20518            ]
20519        );
20520    }
20521
20522    #[test]
20523    fn list_conversation_footprints_for_lexical_rebuild_falls_back_for_missing_tail_cache() {
20524        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
20525        use std::path::PathBuf;
20526
20527        let dir = TempDir::new().unwrap();
20528        let db_path = dir.path().join("agent_search.db");
20529        let storage = SqliteStorage::open(&db_path).unwrap();
20530        let agent = Agent {
20531            id: None,
20532            slug: "codex".into(),
20533            name: "Codex".into(),
20534            version: Some("0.2.3".into()),
20535            kind: AgentKind::Cli,
20536        };
20537        let agent_id = storage.ensure_agent(&agent).unwrap();
20538        let conversation_id = storage
20539            .insert_conversation_tree(
20540                agent_id,
20541                None,
20542                &Conversation {
20543                    id: None,
20544                    agent_slug: "codex".into(),
20545                    workspace: Some(PathBuf::from("/tmp/workspace")),
20546                    external_id: Some("footprint-missing-tail".to_string()),
20547                    title: Some("footprint-missing-tail".to_string()),
20548                    source_path: PathBuf::from("/tmp/footprint-missing-tail.jsonl"),
20549                    started_at: Some(1_700_000_000_000),
20550                    ended_at: Some(1_700_000_000_100),
20551                    approx_tokens: None,
20552                    metadata_json: serde_json::Value::Null,
20553                    messages: vec![Message {
20554                        id: None,
20555                        idx: 10,
20556                        role: MessageRole::User,
20557                        author: None,
20558                        created_at: Some(1_700_000_000_010),
20559                        content: "legacy sparse tail".into(),
20560                        extra_json: serde_json::Value::Null,
20561                        snippets: Vec::new(),
20562                    }],
20563                    source_id: LOCAL_SOURCE_ID.into(),
20564                    origin_host: None,
20565                },
20566            )
20567            .unwrap()
20568            .conversation_id;
20569
20570        storage
20571            .conn
20572            .execute_compat(
20573                "UPDATE conversations
20574                 SET last_message_idx = NULL, last_message_created_at = NULL
20575                 WHERE id = ?1",
20576                fparams![conversation_id],
20577            )
20578            .unwrap();
20579        storage
20580            .conn
20581            .execute_compat(
20582                "DELETE FROM conversation_tail_state WHERE conversation_id = ?1",
20583                fparams![conversation_id],
20584            )
20585            .unwrap();
20586
20587        let footprints = storage
20588            .list_conversation_footprints_for_lexical_rebuild()
20589            .unwrap();
20590
20591        assert_eq!(
20592            footprints,
20593            vec![LexicalRebuildConversationFootprintRow {
20594                conversation_id,
20595                message_count: 11,
20596                message_bytes: 11 * LEXICAL_REBUILD_PLANNER_ESTIMATED_BYTES_PER_MESSAGE,
20597            }],
20598            "missing tail-cache metadata should fall back to messages MAX(idx) instead of treating legacy conversations as empty"
20599        );
20600    }
20601
20602    #[test]
20603    fn list_conversation_footprints_for_lexical_rebuild_raises_stale_low_tail_cache() {
20604        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
20605        use std::path::PathBuf;
20606
20607        let dir = TempDir::new().unwrap();
20608        let db_path = dir.path().join("agent_search.db");
20609        let storage = SqliteStorage::open(&db_path).unwrap();
20610        let agent = Agent {
20611            id: None,
20612            slug: "codex".into(),
20613            name: "Codex".into(),
20614            version: Some("0.2.3".into()),
20615            kind: AgentKind::Cli,
20616        };
20617        let agent_id = storage.ensure_agent(&agent).unwrap();
20618        let conversation_id = storage
20619            .insert_conversation_tree(
20620                agent_id,
20621                None,
20622                &Conversation {
20623                    id: None,
20624                    agent_slug: "codex".into(),
20625                    workspace: Some(PathBuf::from("/tmp/workspace")),
20626                    external_id: Some("footprint-stale-tail".to_string()),
20627                    title: Some("footprint-stale-tail".to_string()),
20628                    source_path: PathBuf::from("/tmp/footprint-stale-tail.jsonl"),
20629                    started_at: Some(1_700_000_000_000),
20630                    ended_at: Some(1_700_000_000_100),
20631                    approx_tokens: None,
20632                    metadata_json: serde_json::Value::Null,
20633                    messages: (0..3)
20634                        .map(|idx| Message {
20635                            id: None,
20636                            idx,
20637                            role: MessageRole::User,
20638                            author: None,
20639                            created_at: Some(1_700_000_000_010 + idx),
20640                            content: format!("message {idx}"),
20641                            extra_json: serde_json::Value::Null,
20642                            snippets: Vec::new(),
20643                        })
20644                        .collect(),
20645                    source_id: LOCAL_SOURCE_ID.into(),
20646                    origin_host: None,
20647                },
20648            )
20649            .unwrap()
20650            .conversation_id;
20651
20652        storage
20653            .conn
20654            .execute_compat(
20655                "UPDATE conversations
20656                 SET last_message_idx = 0, last_message_created_at = 1700000000010
20657                 WHERE id = ?1",
20658                fparams![conversation_id],
20659            )
20660            .unwrap();
20661        storage
20662            .conn
20663            .execute_compat(
20664                "UPDATE conversation_tail_state
20665                 SET last_message_idx = 0, last_message_created_at = 1700000000010
20666                 WHERE conversation_id = ?1",
20667                fparams![conversation_id],
20668            )
20669            .unwrap();
20670
20671        let footprints = storage
20672            .list_conversation_footprints_for_lexical_rebuild()
20673            .unwrap();
20674
20675        assert_eq!(
20676            footprints,
20677            vec![LexicalRebuildConversationFootprintRow {
20678                conversation_id,
20679                message_count: 3,
20680                message_bytes: 3 * LEXICAL_REBUILD_PLANNER_ESTIMATED_BYTES_PER_MESSAGE,
20681            }],
20682            "stale-low tail caches must not under-plan lexical shards and trip doc>plan invariants"
20683        );
20684    }
20685
20686    #[test]
20687    fn list_conversation_footprints_for_lexical_rebuild_tolerates_missing_tail_state_table() {
20688        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
20689        use std::path::PathBuf;
20690
20691        let dir = TempDir::new().unwrap();
20692        let db_path = dir.path().join("agent_search.db");
20693        let storage = SqliteStorage::open(&db_path).unwrap();
20694        let agent = Agent {
20695            id: None,
20696            slug: "codex".into(),
20697            name: "Codex".into(),
20698            version: Some("0.2.3".into()),
20699            kind: AgentKind::Cli,
20700        };
20701        let agent_id = storage.ensure_agent(&agent).unwrap();
20702        let conversation_id = storage
20703            .insert_conversation_tree(
20704                agent_id,
20705                None,
20706                &Conversation {
20707                    id: None,
20708                    agent_slug: "codex".into(),
20709                    workspace: Some(PathBuf::from("/tmp/workspace")),
20710                    external_id: Some("footprint-missing-tail-table".to_string()),
20711                    title: Some("footprint-missing-tail-table".to_string()),
20712                    source_path: PathBuf::from("/tmp/footprint-missing-tail-table.jsonl"),
20713                    started_at: Some(1_700_000_000_000),
20714                    ended_at: Some(1_700_000_000_100),
20715                    approx_tokens: None,
20716                    metadata_json: serde_json::Value::Null,
20717                    messages: vec![Message {
20718                        id: None,
20719                        idx: 10,
20720                        role: MessageRole::User,
20721                        author: None,
20722                        created_at: Some(1_700_000_000_010),
20723                        content: "legacy sparse tail without hot table".into(),
20724                        extra_json: serde_json::Value::Null,
20725                        snippets: Vec::new(),
20726                    }],
20727                    source_id: LOCAL_SOURCE_ID.into(),
20728                    origin_host: None,
20729                },
20730            )
20731            .unwrap()
20732            .conversation_id;
20733
20734        storage
20735            .conn
20736            .execute_compat(
20737                "UPDATE conversations
20738                 SET last_message_idx = NULL, last_message_created_at = NULL
20739                 WHERE id = ?1",
20740                fparams![conversation_id],
20741            )
20742            .unwrap();
20743        storage
20744            .conn
20745            .execute_compat("DROP TABLE conversation_tail_state", fparams![])
20746            .unwrap();
20747
20748        let footprints = storage
20749            .list_conversation_footprints_for_lexical_rebuild()
20750            .unwrap();
20751
20752        assert_eq!(
20753            footprints,
20754            vec![LexicalRebuildConversationFootprintRow {
20755                conversation_id,
20756                message_count: 11,
20757                message_bytes: 11 * LEXICAL_REBUILD_PLANNER_ESTIMATED_BYTES_PER_MESSAGE,
20758            }],
20759            "read-only lexical self-heal must tolerate pre-tail-cache databases and use messages MAX(idx)"
20760        );
20761    }
20762
20763    #[test]
20764    fn list_conversation_footprints_for_lexical_rebuild_tolerates_legacy_search_demo_fixture() {
20765        let fixture_db = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
20766            .join("tests")
20767            .join("fixtures")
20768            .join("search_demo_data")
20769            .join("agent_search.db");
20770        let storage = FrankenStorage::open_readonly(&fixture_db).unwrap();
20771
20772        let footprints = storage
20773            .list_conversation_footprints_for_lexical_rebuild()
20774            .unwrap();
20775
20776        assert!(
20777            !footprints.is_empty(),
20778            "search self-heal should be able to plan a lexical rebuild from the legacy search demo fixture"
20779        );
20780        assert!(
20781            footprints
20782                .iter()
20783                .all(|footprint| footprint.message_count > 0),
20784            "legacy fixture conversations should derive message counts from messages when tail caches are absent"
20785        );
20786    }
20787
20788    #[test]
20789    fn lexical_rebuild_listing_normalizes_host_only_remote_source_from_blank_source_id() {
20790        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
20791        use std::path::PathBuf;
20792
20793        let dir = TempDir::new().unwrap();
20794        let db_path = dir.path().join("agent_search.db");
20795        let storage = SqliteStorage::open(&db_path).unwrap();
20796        let agent = Agent {
20797            id: None,
20798            slug: "codex".into(),
20799            name: "Codex".into(),
20800            version: Some("0.2.3".into()),
20801            kind: AgentKind::Cli,
20802        };
20803        let agent_id = storage.ensure_agent(&agent).unwrap();
20804        let conversation = Conversation {
20805            id: None,
20806            agent_slug: "codex".into(),
20807            workspace: Some(PathBuf::from("/tmp/workspace")),
20808            external_id: Some("legacy-blank-source".into()),
20809            title: Some("Legacy blank source".into()),
20810            source_path: PathBuf::from("/tmp/legacy-blank-source.jsonl"),
20811            started_at: Some(1_700_000_000_000),
20812            ended_at: Some(1_700_000_000_100),
20813            approx_tokens: None,
20814            metadata_json: serde_json::Value::Null,
20815            messages: vec![Message {
20816                id: None,
20817                idx: 0,
20818                role: MessageRole::User,
20819                author: None,
20820                created_at: Some(1_700_000_000_000),
20821                content: "hello".into(),
20822                extra_json: serde_json::Value::Null,
20823                snippets: Vec::new(),
20824            }],
20825            source_id: LOCAL_SOURCE_ID.into(),
20826            origin_host: None,
20827        };
20828
20829        let conversation_id = storage
20830            .insert_conversation_tree(agent_id, None, &conversation)
20831            .unwrap()
20832            .conversation_id;
20833        storage.conn.execute("PRAGMA foreign_keys = OFF").unwrap();
20834        storage
20835            .conn
20836            .execute_compat(
20837                "UPDATE conversations SET source_id = ?1, origin_host = ?2 WHERE id = ?3",
20838                fparams!["   ", "dev@laptop", conversation_id],
20839            )
20840            .unwrap();
20841        storage.conn.execute("PRAGMA foreign_keys = ON").unwrap();
20842
20843        let listed = storage.list_conversations(10, 0).unwrap();
20844        assert_eq!(listed.len(), 1);
20845        assert_eq!(listed[0].source_id, "dev@laptop");
20846        assert_eq!(listed[0].origin_host.as_deref(), Some("dev@laptop"));
20847
20848        let (agent_slugs, workspace_paths) = storage.build_lexical_rebuild_lookups().unwrap();
20849        let rebuild_listed = storage
20850            .list_conversations_for_lexical_rebuild_after_id(10, 0, &agent_slugs, &workspace_paths)
20851            .unwrap();
20852        assert_eq!(rebuild_listed.len(), 1);
20853        assert_eq!(rebuild_listed[0].source_id, "dev@laptop");
20854        assert_eq!(rebuild_listed[0].origin_host.as_deref(), Some("dev@laptop"));
20855    }
20856
20857    #[test]
20858    fn seed_canonical_from_best_historical_bundle_copies_data_and_resets_runtime_meta() {
20859        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
20860        use std::path::PathBuf;
20861
20862        let dir = TempDir::new().unwrap();
20863        let canonical_db = dir.path().join("agent_search.db");
20864        let source_db = dir
20865            .path()
20866            .join("backups/agent_search.db.20260322T020200.bak");
20867
20868        fs::create_dir_all(source_db.parent().unwrap()).unwrap();
20869
20870        let source = SqliteStorage::open(&source_db).unwrap();
20871        let agent = Agent {
20872            id: None,
20873            slug: "codex".into(),
20874            name: "Codex".into(),
20875            version: Some("0.2.3".into()),
20876            kind: AgentKind::Cli,
20877        };
20878        let agent_id = source.ensure_agent(&agent).unwrap();
20879        let conversation = Conversation {
20880            id: None,
20881            agent_slug: "codex".into(),
20882            workspace: Some(PathBuf::from("/tmp/workspace")),
20883            external_id: Some("seed-conv".into()),
20884            title: Some("Historical seed".into()),
20885            source_path: PathBuf::from("/tmp/historical-seed.jsonl"),
20886            started_at: Some(1_700_000_000_000),
20887            ended_at: Some(1_700_000_000_100),
20888            approx_tokens: Some(42),
20889            metadata_json: serde_json::json!({"seed": true}),
20890            messages: vec![Message {
20891                id: None,
20892                idx: 0,
20893                role: MessageRole::Agent,
20894                author: Some("assistant".into()),
20895                created_at: Some(1_700_000_000_050),
20896                content: "seeded message".into(),
20897                extra_json: serde_json::json!({"usage": {"total_tokens": 12}}),
20898                snippets: Vec::new(),
20899            }],
20900            source_id: LOCAL_SOURCE_ID.into(),
20901            origin_host: None,
20902        };
20903        source
20904            .insert_conversation_tree(agent_id, None, &conversation)
20905            .unwrap();
20906        source.set_last_scan_ts(123).unwrap();
20907        source.set_last_indexed_at(456).unwrap();
20908        source.set_last_embedded_message_id(789).unwrap();
20909        source
20910            .conn
20911            .execute_compat(
20912                "INSERT OR REPLACE INTO meta(key, value) VALUES(?1, ?2)",
20913                fparams!["historical_bundle_salvaged:stale", "{\"stale\":true}"],
20914            )
20915            .unwrap();
20916        drop(source);
20917
20918        // Legacy "duplicate FTS" fixture reconstruction.
20919        //
20920        // Post-V14 migration cass drops the V13-era fts_messages virtual table
20921        // and recreates it lazily, so a freshly-opened canonical DB has zero
20922        // fts_messages entries in sqlite_master. To reproduce the historical
20923        // failure mode this test exercises — a legacy v13 bundle with a
20924        // duplicated CREATE VIRTUAL TABLE row — we have to inject *both*
20925        // entries: the original V13-era contentless row and the buggy duplicate
20926        // row. Before V14 existed the original was already present after
20927        // migration and only the duplicate needed manual injection.
20928        let legacy_v13_fts_sql = "CREATE VIRTUAL TABLE fts_messages USING fts5(content, title, agent, workspace, source_path, created_at UNINDEXED, content='', tokenize='porter')";
20929        let duplicate_legacy_fts_sql = "CREATE VIRTUAL TABLE fts_messages USING fts5(content, title, agent, workspace, source_path, created_at UNINDEXED, message_id UNINDEXED, tokenize='porter')";
20930        let legacy = rusqlite_test_fixture_conn(&source_db);
20931        legacy
20932            .execute_batch(
20933                "UPDATE meta SET value = '13' WHERE key = 'schema_version';
20934                 DELETE FROM _schema_migrations WHERE version = 14;
20935                 PRAGMA writable_schema = ON;",
20936            )
20937            .unwrap();
20938        legacy
20939            .execute(
20940                "DELETE FROM meta WHERE key = ?1",
20941                [FTS_FRANKEN_REBUILD_META_KEY],
20942            )
20943            .unwrap();
20944        // Inject the V13 original first.
20945        legacy
20946            .execute(
20947                "INSERT INTO sqlite_master(type, name, tbl_name, rootpage, sql)
20948                 VALUES('table', 'fts_messages', 'fts_messages', 0, ?1)",
20949                [legacy_v13_fts_sql],
20950            )
20951            .unwrap();
20952        // Then the duplicate that's the real subject of the fixup logic.
20953        legacy
20954            .execute(
20955                "INSERT INTO sqlite_master(type, name, tbl_name, rootpage, sql)
20956                 VALUES('table', 'fts_messages', 'fts_messages', 0, ?1)",
20957                [duplicate_legacy_fts_sql],
20958            )
20959            .unwrap();
20960        legacy
20961            .execute_batch("PRAGMA writable_schema = OFF;")
20962            .unwrap();
20963        drop(legacy);
20964
20965        // Verify fixture with rusqlite+writable_schema to see raw
20966        // sqlite_master rows (frankensqlite deduplicates schema entries).
20967        {
20968            let verify = rusqlite_test_fixture_conn(&source_db);
20969            verify
20970                .execute_batch("PRAGMA writable_schema = ON;")
20971                .unwrap();
20972            let fts_entries: i64 = verify
20973                .query_row(
20974                    "SELECT COUNT(*) FROM sqlite_master WHERE name = 'fts_messages'",
20975                    [],
20976                    |row| row.get(0),
20977                )
20978                .unwrap();
20979            assert_eq!(
20980                fts_entries, 2,
20981                "test fixture should reproduce the duplicate legacy fts_messages rows"
20982            );
20983            let msg_count: i64 = verify
20984                .query_row("SELECT COUNT(*) FROM messages", [], |row| row.get(0))
20985                .unwrap();
20986            assert_eq!(msg_count, 1);
20987        }
20988
20989        let fresh = SqliteStorage::open(&canonical_db).unwrap();
20990        drop(fresh);
20991
20992        let outcome = seed_canonical_from_best_historical_bundle(&canonical_db)
20993            .unwrap()
20994            .unwrap();
20995        assert_eq!(outcome.bundles_imported, 1);
20996        assert_eq!(outcome.conversations_imported, 1);
20997        assert_eq!(outcome.messages_imported, 1);
20998
20999        let readonly = open_franken_with_flags(
21000            &canonical_db.to_string_lossy(),
21001            FrankenOpenFlags::SQLITE_OPEN_READ_ONLY,
21002        )
21003        .unwrap();
21004        let readonly_message_count: i64 = readonly
21005            .query_row_map("SELECT COUNT(*) FROM messages", fparams![], |row| {
21006                row.get_typed(0)
21007            })
21008            .unwrap();
21009        assert_eq!(readonly_message_count, 1);
21010
21011        let seeded = SqliteStorage::open(&canonical_db).unwrap();
21012        assert_eq!(
21013            seeded
21014                .count_sessions_in_range(None, None, None, None)
21015                .unwrap()
21016                .0,
21017            1
21018        );
21019        let message_count: i64 = seeded
21020            .conn
21021            .query_row_map("SELECT COUNT(*) FROM messages", fparams![], |row| {
21022                row.get_typed(0)
21023            })
21024            .unwrap();
21025        assert_eq!(message_count, 1);
21026        assert_eq!(seeded.get_last_scan_ts().unwrap(), None);
21027        assert_eq!(seeded.get_last_embedded_message_id().unwrap(), None);
21028
21029        let last_indexed: Option<String> = seeded
21030            .conn
21031            .query_row_map(
21032                "SELECT value FROM meta WHERE key = 'last_indexed_at'",
21033                fparams![],
21034                |row| row.get_typed(0),
21035            )
21036            .optional()
21037            .unwrap();
21038        assert!(last_indexed.is_none());
21039
21040        let salvage_keys: Vec<String> = seeded
21041            .conn
21042            .query_map_collect(
21043                "SELECT key FROM meta WHERE key LIKE 'historical_bundle_salvaged:%' ORDER BY key",
21044                fparams![],
21045                |row| row.get_typed(0),
21046            )
21047            .unwrap();
21048        assert_eq!(salvage_keys.len(), 1);
21049
21050        let reopened_readonly = open_franken_with_flags(
21051            &canonical_db.to_string_lossy(),
21052            FrankenOpenFlags::SQLITE_OPEN_READ_ONLY,
21053        )
21054        .unwrap();
21055        let reopened_fts_entries: i64 = reopened_readonly
21056            .query_row_map(
21057                "SELECT COUNT(*) FROM sqlite_master WHERE name = 'fts_messages'",
21058                fparams![],
21059                |row| row.get_typed(0),
21060            )
21061            .unwrap();
21062        assert_eq!(
21063            reopened_fts_entries, 1,
21064            "seeded canonical db should keep a single stock-SQLite fts_messages schema row"
21065        );
21066        let reopened_message_count: i64 = reopened_readonly
21067            .query_row_map("SELECT COUNT(*) FROM messages", fparams![], |row| {
21068                row.get_typed(0)
21069            })
21070            .unwrap();
21071        assert_eq!(reopened_message_count, 1);
21072
21073        let franken_seeded = FrankenStorage::open(&canonical_db).unwrap();
21074        assert_eq!(
21075            franken_seeded.schema_version().unwrap(),
21076            CURRENT_SCHEMA_VERSION
21077        );
21078        // Post-V14 fts_messages is recreated lazily. `FrankenStorage::open`
21079        // alone doesn't re-register the virtual table for the frankensqlite
21080        // query engine — the consistency pass does, and this is exactly what
21081        // normal cass startup runs before the first search. Invoke it
21082        // explicitly so the query below exercises the expected post-repair
21083        // state rather than the between-steps state.
21084        franken_seeded
21085            .ensure_search_fallback_fts_consistency()
21086            .expect("ensure FTS consistency after seed");
21087        let post_franken_schema_rows: i64 = franken_seeded
21088            .raw()
21089            .query_row_map(
21090                "SELECT COUNT(*) FROM sqlite_master WHERE name = 'fts_messages'",
21091                fparams![],
21092                |row| row.get_typed(0),
21093            )
21094            .unwrap();
21095        assert_eq!(post_franken_schema_rows, 1);
21096        let fts_probe = franken_seeded
21097            .raw()
21098            .query("SELECT COUNT(*) FROM fts_messages");
21099        assert!(
21100            fts_probe.is_ok(),
21101            "expected post-seed FTS to be queryable, got {fts_probe:?}"
21102        );
21103    }
21104
21105    #[test]
21106    fn failed_baseline_seed_preserves_existing_canonical_bundle() {
21107        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
21108        use std::path::PathBuf;
21109
21110        let dir = TempDir::new().unwrap();
21111        let canonical_db = dir.path().join("agent_search.db");
21112        let source_db = dir
21113            .path()
21114            .join("backups/agent_search.db.20260325T120000Z.bad-seed.bak");
21115
21116        fs::create_dir_all(source_db.parent().unwrap()).unwrap();
21117
21118        let canonical = SqliteStorage::open(&canonical_db).unwrap();
21119        canonical
21120            .conn
21121            .execute_compat(
21122                "INSERT OR REPLACE INTO meta(key, value) VALUES(?1, ?2)",
21123                fparams!["sentinel", "keep-me"],
21124            )
21125            .unwrap();
21126        drop(canonical);
21127
21128        let source = SqliteStorage::open(&source_db).unwrap();
21129        let agent = Agent {
21130            id: None,
21131            slug: "codex".into(),
21132            name: "Codex".into(),
21133            version: Some("0.2.3".into()),
21134            kind: AgentKind::Cli,
21135        };
21136        let agent_id = source.ensure_agent(&agent).unwrap();
21137        let conversation = Conversation {
21138            id: None,
21139            agent_slug: "codex".into(),
21140            workspace: Some(PathBuf::from("/tmp/workspace")),
21141            external_id: Some("bad-seed-conv".into()),
21142            title: Some("Bad seed".into()),
21143            source_path: PathBuf::from("/tmp/bad-seed.jsonl"),
21144            started_at: Some(1_700_000_000_000),
21145            ended_at: Some(1_700_000_000_100),
21146            approx_tokens: Some(42),
21147            metadata_json: serde_json::json!({"seed": "bad"}),
21148            messages: vec![Message {
21149                id: None,
21150                idx: 0,
21151                role: MessageRole::Agent,
21152                author: Some("assistant".into()),
21153                created_at: Some(1_700_000_000_050),
21154                content: "this seed should fail".into(),
21155                extra_json: serde_json::Value::Null,
21156                snippets: Vec::new(),
21157            }],
21158            source_id: LOCAL_SOURCE_ID.into(),
21159            origin_host: None,
21160        };
21161        source
21162            .insert_conversation_tree(agent_id, None, &conversation)
21163            .unwrap();
21164        drop(source);
21165
21166        let legacy = FrankenConnection::open(source_db.to_string_lossy().into_owned()).unwrap();
21167        legacy
21168            .execute("UPDATE meta SET value = '12' WHERE key = 'schema_version'")
21169            .unwrap();
21170        drop(legacy);
21171
21172        let err = seed_canonical_from_best_historical_bundle(&canonical_db).unwrap_err();
21173        assert!(
21174            err.to_string()
21175                .contains("schema_version 12 is too old for baseline import"),
21176            "unexpected seed error: {err:#}"
21177        );
21178
21179        let reopened = SqliteStorage::open(&canonical_db).unwrap();
21180        let sentinel: Option<String> = reopened
21181            .conn
21182            .query_row_map(
21183                "SELECT value FROM meta WHERE key = 'sentinel'",
21184                fparams![],
21185                |row| row.get_typed(0),
21186            )
21187            .optional()
21188            .unwrap();
21189        assert_eq!(sentinel.as_deref(), Some("keep-me"));
21190
21191        let conversation_count: i64 = reopened
21192            .conn
21193            .query_row_map("SELECT COUNT(*) FROM conversations", fparams![], |row| {
21194                row.get_typed(0)
21195            })
21196            .unwrap();
21197        assert_eq!(conversation_count, 0);
21198
21199        let readonly = open_franken_with_flags(
21200            &canonical_db.to_string_lossy(),
21201            FrankenOpenFlags::SQLITE_OPEN_READ_ONLY,
21202        )
21203        .unwrap();
21204        let readonly_conversation_count: i64 = readonly
21205            .query_row_map("SELECT COUNT(*) FROM conversations", fparams![], |row| {
21206                row.get_typed(0)
21207            })
21208            .unwrap();
21209        assert_eq!(readonly_conversation_count, 0);
21210    }
21211
21212    #[test]
21213    fn fetch_messages_for_lexical_rebuild_skips_extra_json() {
21214        let dir = TempDir::new().unwrap();
21215        let db_path = dir.path().join("test.db");
21216        let storage = SqliteStorage::open(&db_path).unwrap();
21217
21218        let agent = Agent {
21219            id: None,
21220            slug: "codex".into(),
21221            name: "Codex".into(),
21222            version: Some("0.2.3".into()),
21223            kind: AgentKind::Cli,
21224        };
21225        let agent_id = storage.ensure_agent(&agent).unwrap();
21226
21227        let conversation = Conversation {
21228            id: None,
21229            agent_slug: "codex".into(),
21230            workspace: Some(PathBuf::from("/tmp/workspace")),
21231            external_id: Some("lexical-rebuild-test".into()),
21232            title: Some("Lexical rebuild".into()),
21233            source_path: PathBuf::from("/tmp/lexical-rebuild.jsonl"),
21234            started_at: Some(1_700_000_000_000),
21235            ended_at: Some(1_700_000_000_100),
21236            approx_tokens: Some(42),
21237            metadata_json: serde_json::Value::Null,
21238            messages: vec![Message {
21239                id: None,
21240                idx: 0,
21241                role: MessageRole::Agent,
21242                author: Some("assistant".into()),
21243                created_at: Some(1_700_000_000_050),
21244                content: "indexed text".into(),
21245                extra_json: serde_json::json!({
21246                    "usage": { "total_tokens": 1234 },
21247                    "irrelevant_blob": "still preserved in canonical storage"
21248                }),
21249                snippets: Vec::new(),
21250            }],
21251            source_id: LOCAL_SOURCE_ID.into(),
21252            origin_host: None,
21253        };
21254
21255        let inserted = storage
21256            .insert_conversation_tree(agent_id, None, &conversation)
21257            .unwrap();
21258        let conversation_id = inserted.conversation_id;
21259
21260        let stored = storage.fetch_messages(conversation_id).unwrap();
21261        assert_eq!(stored.len(), 1);
21262        assert!(!stored[0].extra_json.is_null());
21263
21264        let lexical = storage
21265            .fetch_messages_for_lexical_rebuild(conversation_id)
21266            .unwrap();
21267        assert_eq!(lexical.len(), 1);
21268        assert_eq!(lexical[0].content, "indexed text");
21269        assert_eq!(lexical[0].author.as_deref(), Some("assistant"));
21270        assert!(lexical[0].extra_json.is_null());
21271    }
21272
21273    #[test]
21274    fn fetch_messages_for_lexical_rebuild_batch_groups_and_orders_messages() {
21275        let dir = TempDir::new().unwrap();
21276        let db_path = dir.path().join("test.db");
21277        let storage = SqliteStorage::open(&db_path).unwrap();
21278
21279        let agent = Agent {
21280            id: None,
21281            slug: "codex".into(),
21282            name: "Codex".into(),
21283            version: Some("0.2.3".into()),
21284            kind: AgentKind::Cli,
21285        };
21286        let agent_id = storage.ensure_agent(&agent).unwrap();
21287
21288        let first = Conversation {
21289            id: None,
21290            agent_slug: "codex".into(),
21291            workspace: Some(PathBuf::from("/tmp/workspace")),
21292            external_id: Some("lexical-batch-1".into()),
21293            title: Some("Lexical batch 1".into()),
21294            source_path: PathBuf::from("/tmp/lexical-batch-1.jsonl"),
21295            started_at: Some(1_700_000_000_000),
21296            ended_at: Some(1_700_000_000_100),
21297            approx_tokens: Some(42),
21298            metadata_json: serde_json::Value::Null,
21299            messages: vec![
21300                Message {
21301                    id: None,
21302                    idx: 0,
21303                    role: MessageRole::User,
21304                    author: Some("user".into()),
21305                    created_at: Some(1_700_000_000_010),
21306                    content: "first-a".into(),
21307                    extra_json: serde_json::json!({"opaque": true}),
21308                    snippets: Vec::new(),
21309                },
21310                Message {
21311                    id: None,
21312                    idx: 1,
21313                    role: MessageRole::Agent,
21314                    author: Some("assistant".into()),
21315                    created_at: Some(1_700_000_000_020),
21316                    content: "first-b".into(),
21317                    extra_json: serde_json::json!({"opaque": true}),
21318                    snippets: Vec::new(),
21319                },
21320            ],
21321            source_id: LOCAL_SOURCE_ID.into(),
21322            origin_host: None,
21323        };
21324
21325        let second = Conversation {
21326            id: None,
21327            agent_slug: "codex".into(),
21328            workspace: Some(PathBuf::from("/tmp/workspace")),
21329            external_id: Some("lexical-batch-2".into()),
21330            title: Some("Lexical batch 2".into()),
21331            source_path: PathBuf::from("/tmp/lexical-batch-2.jsonl"),
21332            started_at: Some(1_700_000_000_200),
21333            ended_at: Some(1_700_000_000_300),
21334            approx_tokens: Some(84),
21335            metadata_json: serde_json::Value::Null,
21336            messages: vec![Message {
21337                id: None,
21338                idx: 0,
21339                role: MessageRole::Tool,
21340                author: Some("tool".into()),
21341                created_at: Some(1_700_000_000_210),
21342                content: "second-a".into(),
21343                extra_json: serde_json::json!({"opaque": true}),
21344                snippets: Vec::new(),
21345            }],
21346            source_id: LOCAL_SOURCE_ID.into(),
21347            origin_host: None,
21348        };
21349        let third = Conversation {
21350            external_id: Some("lexical-batch-3".into()),
21351            title: Some("Lexical batch 3".into()),
21352            source_path: PathBuf::from("/tmp/lexical-batch-3.jsonl"),
21353            messages: vec![Message {
21354                id: None,
21355                idx: 0,
21356                role: MessageRole::System,
21357                author: Some("system".into()),
21358                created_at: Some(1_700_000_000_410),
21359                content: "third-a".into(),
21360                extra_json: serde_json::json!({"opaque": true}),
21361                snippets: Vec::new(),
21362            }],
21363            ..second.clone()
21364        };
21365
21366        let first_id = storage
21367            .insert_conversation_tree(agent_id, None, &first)
21368            .unwrap()
21369            .conversation_id;
21370        let second_id = storage
21371            .insert_conversation_tree(agent_id, None, &second)
21372            .unwrap()
21373            .conversation_id;
21374        let third_id = storage
21375            .insert_conversation_tree(agent_id, None, &third)
21376            .unwrap()
21377            .conversation_id;
21378
21379        let lexical = storage
21380            .fetch_messages_for_lexical_rebuild_batch(&[third_id, first_id], None, None)
21381            .unwrap();
21382
21383        let first_messages = lexical.get(&first_id).expect("first conversation");
21384        assert_eq!(first_messages.len(), 2);
21385        assert_eq!(first_messages[0].content, "first-a");
21386        assert_eq!(first_messages[1].content, "first-b");
21387        assert!(
21388            first_messages
21389                .iter()
21390                .all(|message| message.extra_json.is_null())
21391        );
21392
21393        assert!(
21394            !lexical.contains_key(&second_id),
21395            "batch fetch must exclude conversations not requested by the caller"
21396        );
21397
21398        let third_messages = lexical.get(&third_id).expect("third conversation");
21399        assert_eq!(third_messages.len(), 1);
21400        assert_eq!(third_messages[0].content, "third-a");
21401        assert!(third_messages[0].extra_json.is_null());
21402    }
21403
21404    #[test]
21405    fn fetch_messages_for_lexical_rebuild_batch_enforces_content_byte_guardrail() {
21406        let dir = TempDir::new().unwrap();
21407        let db_path = dir.path().join("test.db");
21408        let storage = SqliteStorage::open(&db_path).unwrap();
21409
21410        let agent = Agent {
21411            id: None,
21412            slug: "codex".into(),
21413            name: "Codex".into(),
21414            version: Some("0.2.3".into()),
21415            kind: AgentKind::Cli,
21416        };
21417        let agent_id = storage.ensure_agent(&agent).unwrap();
21418
21419        let conversation = Conversation {
21420            id: None,
21421            agent_slug: "codex".into(),
21422            workspace: Some(PathBuf::from("/tmp/workspace")),
21423            external_id: Some("lexical-batch-guard".into()),
21424            title: Some("Lexical batch guard".into()),
21425            source_path: PathBuf::from("/tmp/lexical-batch-guard.jsonl"),
21426            started_at: Some(1_700_000_000_000),
21427            ended_at: Some(1_700_000_000_100),
21428            approx_tokens: Some(42),
21429            metadata_json: serde_json::Value::Null,
21430            messages: vec![
21431                Message {
21432                    id: None,
21433                    idx: 0,
21434                    role: MessageRole::User,
21435                    author: Some("user".into()),
21436                    created_at: Some(1_700_000_000_010),
21437                    content: "123456".into(),
21438                    extra_json: serde_json::Value::Null,
21439                    snippets: Vec::new(),
21440                },
21441                Message {
21442                    id: None,
21443                    idx: 1,
21444                    role: MessageRole::Agent,
21445                    author: Some("assistant".into()),
21446                    created_at: Some(1_700_000_000_020),
21447                    content: "abcdef".into(),
21448                    extra_json: serde_json::Value::Null,
21449                    snippets: Vec::new(),
21450                },
21451            ],
21452            source_id: LOCAL_SOURCE_ID.into(),
21453            origin_host: None,
21454        };
21455
21456        let conversation_id = storage
21457            .insert_conversation_tree(agent_id, None, &conversation)
21458            .unwrap()
21459            .conversation_id;
21460
21461        let error = storage
21462            .fetch_messages_for_lexical_rebuild_batch(&[conversation_id], Some(10), Some(8))
21463            .expect_err("guardrail should reject oversized batch content");
21464
21465        let message = format!("{error:#}");
21466        assert!(
21467            message.contains("content-byte guardrail"),
21468            "expected guardrail reason in error, got {message}"
21469        );
21470    }
21471
21472    #[test]
21473    fn fetch_messages_handles_manual_rows_inserted_via_raw_connection() {
21474        let dir = TempDir::new().unwrap();
21475        let db_path = dir.path().join("manual-rows.db");
21476        let storage = FrankenStorage::open(&db_path).unwrap();
21477        let conn = storage.raw();
21478
21479        conn.execute(
21480            "INSERT INTO agents (id, slug, name, kind, created_at, updated_at)
21481             VALUES (1, 'claude_code', 'Claude Code', 'local', 0, 0)",
21482        )
21483        .unwrap();
21484        conn.execute(
21485            "INSERT INTO conversations
21486             (id, agent_id, external_id, title, source_path, source_id, started_at)
21487             VALUES (1, 1, 'manual-ext', 'Manual Session', '/tmp/manual.jsonl', 'local', 200)",
21488        )
21489        .unwrap();
21490        conn.execute(
21491            "INSERT INTO messages
21492             (id, conversation_id, idx, role, author, created_at, content, extra_json, extra_bin)
21493             VALUES (1, 1, 0, 'user', 'tester', 1700000000000, 'manual body', '{\"k\":1}', NULL)",
21494        )
21495        .unwrap();
21496
21497        let lexical = storage.fetch_messages_for_lexical_rebuild(1).unwrap();
21498        assert_eq!(lexical.len(), 1);
21499        assert_eq!(lexical[0].content, "manual body");
21500
21501        let full = storage.fetch_messages(1).unwrap();
21502        assert_eq!(full.len(), 1);
21503        assert_eq!(full[0].content, "manual body");
21504        assert_eq!(full[0].author.as_deref(), Some("tester"));
21505        assert_eq!(full[0].extra_json, serde_json::json!({ "k": 1 }));
21506    }
21507
21508    #[test]
21509    fn lexical_rebuild_batch_messages_query_avoids_sorter_temp_btrees() {
21510        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
21511        use std::path::PathBuf;
21512
21513        let dir = TempDir::new().unwrap();
21514        let db_path = dir.path().join("agent_search.db");
21515        let storage = SqliteStorage::open(&db_path).unwrap();
21516
21517        let agent = Agent {
21518            id: None,
21519            slug: "claude_code".into(),
21520            name: "Claude Code".into(),
21521            version: None,
21522            kind: AgentKind::Cli,
21523        };
21524        let agent_id = storage.ensure_agent(&agent).unwrap();
21525
21526        for (external_id, base_ts) in [
21527            ("conv-1", 1_700_000_000_000_i64),
21528            ("conv-2", 1_700_000_001_000_i64),
21529        ] {
21530            let conversation = Conversation {
21531                id: None,
21532                agent_slug: "claude_code".into(),
21533                workspace: Some(PathBuf::from("/tmp/workspace")),
21534                external_id: Some(external_id.to_string()),
21535                title: Some("Lexical rebuild".into()),
21536                source_path: PathBuf::from(format!("/tmp/{external_id}.jsonl")),
21537                started_at: Some(base_ts),
21538                ended_at: Some(base_ts + 100),
21539                approx_tokens: None,
21540                metadata_json: serde_json::Value::Null,
21541                messages: vec![
21542                    Message {
21543                        id: None,
21544                        idx: 0,
21545                        role: MessageRole::User,
21546                        author: Some("user".into()),
21547                        created_at: Some(base_ts + 10),
21548                        content: format!("{external_id}-first"),
21549                        extra_json: serde_json::Value::Null,
21550                        snippets: Vec::new(),
21551                    },
21552                    Message {
21553                        id: None,
21554                        idx: 1,
21555                        role: MessageRole::Agent,
21556                        author: Some("assistant".into()),
21557                        created_at: Some(base_ts + 20),
21558                        content: format!("{external_id}-second"),
21559                        extra_json: serde_json::Value::Null,
21560                        snippets: Vec::new(),
21561                    },
21562                ],
21563                source_id: LOCAL_SOURCE_ID.into(),
21564                origin_host: None,
21565            };
21566            storage
21567                .insert_conversation_tree(agent_id, None, &conversation)
21568                .unwrap();
21569        }
21570
21571        let conversation_ids: Vec<i64> = storage
21572            .conn
21573            .query_map_collect(
21574                "SELECT id FROM conversations ORDER BY id",
21575                fparams![],
21576                |row| row.get_typed(0),
21577            )
21578            .unwrap();
21579        assert_eq!(conversation_ids.len(), 2);
21580
21581        let plan_details: Vec<String> = storage
21582            .conn
21583            .query_map_collect(
21584                "EXPLAIN QUERY PLAN \
21585                 SELECT conversation_id, id, idx, role, author, created_at, content \
21586                 FROM messages \
21587                 WHERE conversation_id IN (?1, ?2) \
21588                 ORDER BY conversation_id ASC, idx ASC",
21589                fparams![conversation_ids[0], conversation_ids[1]],
21590                |row| row.get_typed(3),
21591            )
21592            .unwrap();
21593
21594        assert!(
21595            plan_details
21596                .iter()
21597                .any(|detail| detail.contains("sqlite_autoindex_messages_1")),
21598            "expected batched lexical rebuild fetch to use the conversation_id/idx composite index, got {plan_details:?}"
21599        );
21600        assert!(
21601            !plan_details
21602                .iter()
21603                .any(|detail| detail.contains("TEMP B-TREE")),
21604            "expected batched lexical rebuild fetch to avoid sorter temp b-trees, got {plan_details:?}"
21605        );
21606    }
21607
21608    #[test]
21609    fn stream_messages_for_lexical_rebuild_groups_and_orders_messages() {
21610        let dir = TempDir::new().unwrap();
21611        let db_path = dir.path().join("test.db");
21612        let storage = SqliteStorage::open(&db_path).unwrap();
21613
21614        let agent = Agent {
21615            id: None,
21616            slug: "codex".into(),
21617            name: "Codex".into(),
21618            version: Some("0.2.3".into()),
21619            kind: AgentKind::Cli,
21620        };
21621        let agent_id = storage.ensure_agent(&agent).unwrap();
21622
21623        let first = Conversation {
21624            id: None,
21625            agent_slug: "codex".into(),
21626            workspace: Some(PathBuf::from("/tmp/workspace")),
21627            external_id: Some("lexical-stream-1".into()),
21628            title: Some("Lexical stream 1".into()),
21629            source_path: PathBuf::from("/tmp/lexical-stream-1.jsonl"),
21630            started_at: Some(1_700_000_000_000),
21631            ended_at: Some(1_700_000_000_100),
21632            approx_tokens: Some(42),
21633            metadata_json: serde_json::Value::Null,
21634            messages: vec![
21635                Message {
21636                    id: None,
21637                    idx: 0,
21638                    role: MessageRole::User,
21639                    author: Some("user".into()),
21640                    created_at: Some(1_700_000_000_010),
21641                    content: "first-a".into(),
21642                    extra_json: serde_json::json!({"opaque": true}),
21643                    snippets: Vec::new(),
21644                },
21645                Message {
21646                    id: None,
21647                    idx: 1,
21648                    role: MessageRole::Agent,
21649                    author: Some("assistant".into()),
21650                    created_at: Some(1_700_000_000_020),
21651                    content: "first-b".into(),
21652                    extra_json: serde_json::json!({"opaque": true}),
21653                    snippets: Vec::new(),
21654                },
21655            ],
21656            source_id: LOCAL_SOURCE_ID.into(),
21657            origin_host: None,
21658        };
21659
21660        let second = Conversation {
21661            id: None,
21662            agent_slug: "codex".into(),
21663            workspace: Some(PathBuf::from("/tmp/workspace")),
21664            external_id: Some("lexical-stream-2".into()),
21665            title: Some("Lexical stream 2".into()),
21666            source_path: PathBuf::from("/tmp/lexical-stream-2.jsonl"),
21667            started_at: Some(1_700_000_000_200),
21668            ended_at: Some(1_700_000_000_300),
21669            approx_tokens: Some(84),
21670            metadata_json: serde_json::Value::Null,
21671            messages: vec![Message {
21672                id: None,
21673                idx: 0,
21674                role: MessageRole::Tool,
21675                author: Some("tool".into()),
21676                created_at: Some(1_700_000_000_210),
21677                content: "second-a".into(),
21678                extra_json: serde_json::json!({"opaque": true}),
21679                snippets: Vec::new(),
21680            }],
21681            source_id: LOCAL_SOURCE_ID.into(),
21682            origin_host: None,
21683        };
21684
21685        let first_id = storage
21686            .insert_conversation_tree(agent_id, None, &first)
21687            .unwrap()
21688            .conversation_id;
21689        let second_id = storage
21690            .insert_conversation_tree(agent_id, None, &second)
21691            .unwrap()
21692            .conversation_id;
21693
21694        let mut streamed = Vec::new();
21695        storage
21696            .stream_messages_for_lexical_rebuild_from_conversation_id(first_id, |row| {
21697                streamed.push((
21698                    row.conversation_id,
21699                    row.idx,
21700                    row.role,
21701                    row.author,
21702                    row.content,
21703                ));
21704                Ok(())
21705            })
21706            .unwrap();
21707
21708        assert_eq!(
21709            streamed,
21710            vec![
21711                (
21712                    first_id,
21713                    0,
21714                    "user".to_string(),
21715                    Some("user".to_string()),
21716                    "first-a".to_string(),
21717                ),
21718                (
21719                    first_id,
21720                    1,
21721                    "agent".to_string(),
21722                    Some("assistant".to_string()),
21723                    "first-b".to_string(),
21724                ),
21725                (
21726                    second_id,
21727                    0,
21728                    "tool".to_string(),
21729                    Some("tool".to_string()),
21730                    "second-a".to_string(),
21731                ),
21732            ]
21733        );
21734    }
21735
21736    #[test]
21737    fn stream_messages_for_lexical_rebuild_between_conversation_ids_respects_upper_bound() {
21738        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
21739        use std::path::PathBuf;
21740
21741        let dir = TempDir::new().unwrap();
21742        let db_path = dir.path().join("agent_search.db");
21743        let storage = SqliteStorage::open(&db_path).unwrap();
21744
21745        let agent = Agent {
21746            id: None,
21747            slug: "claude_code".into(),
21748            name: "Claude Code".into(),
21749            version: Some("1.2.3".into()),
21750            kind: AgentKind::Cli,
21751        };
21752        let agent_id = storage.ensure_agent(&agent).unwrap();
21753
21754        let first = Conversation {
21755            id: None,
21756            agent_slug: "claude_code".into(),
21757            workspace: Some(PathBuf::from("/tmp/workspace")),
21758            external_id: Some("lexical-range-1".into()),
21759            title: Some("Lexical range 1".into()),
21760            source_path: PathBuf::from("/tmp/lexical-range-1.jsonl"),
21761            started_at: Some(1_700_000_000_000),
21762            ended_at: Some(1_700_000_000_100),
21763            approx_tokens: Some(42),
21764            metadata_json: serde_json::Value::Null,
21765            messages: vec![Message {
21766                id: None,
21767                idx: 0,
21768                role: MessageRole::User,
21769                author: Some("user".into()),
21770                created_at: Some(1_700_000_000_010),
21771                content: "first-only".into(),
21772                extra_json: serde_json::json!({"opaque": true}),
21773                snippets: Vec::new(),
21774            }],
21775            source_id: LOCAL_SOURCE_ID.into(),
21776            origin_host: None,
21777        };
21778
21779        let second = Conversation {
21780            id: None,
21781            agent_slug: "claude_code".into(),
21782            workspace: Some(PathBuf::from("/tmp/workspace")),
21783            external_id: Some("lexical-range-2".into()),
21784            title: Some("Lexical range 2".into()),
21785            source_path: PathBuf::from("/tmp/lexical-range-2.jsonl"),
21786            started_at: Some(1_700_000_000_200),
21787            ended_at: Some(1_700_000_000_300),
21788            approx_tokens: Some(84),
21789            metadata_json: serde_json::Value::Null,
21790            messages: vec![Message {
21791                id: None,
21792                idx: 0,
21793                role: MessageRole::Tool,
21794                author: Some("tool".into()),
21795                created_at: Some(1_700_000_000_210),
21796                content: "second-should-not-appear".into(),
21797                extra_json: serde_json::json!({"opaque": true}),
21798                snippets: Vec::new(),
21799            }],
21800            source_id: LOCAL_SOURCE_ID.into(),
21801            origin_host: None,
21802        };
21803
21804        let first_id = storage
21805            .insert_conversation_tree(agent_id, None, &first)
21806            .unwrap()
21807            .conversation_id;
21808        let second_id = storage
21809            .insert_conversation_tree(agent_id, None, &second)
21810            .unwrap()
21811            .conversation_id;
21812
21813        let mut streamed = Vec::new();
21814        storage
21815            .stream_messages_for_lexical_rebuild_between_conversation_ids(
21816                first_id,
21817                first_id,
21818                |row| {
21819                    streamed.push((row.conversation_id, row.idx, row.content));
21820                    Ok(())
21821                },
21822            )
21823            .unwrap();
21824
21825        assert_eq!(streamed, vec![(first_id, 0, "first-only".to_string())]);
21826        assert!(
21827            streamed
21828                .iter()
21829                .all(|(conversation_id, _, _)| *conversation_id != second_id),
21830            "upper bound should exclude later conversation ids"
21831        );
21832    }
21833
21834    #[test]
21835    fn stream_messages_for_lexical_rebuild_between_conversation_ids_handles_mixed_ranges() {
21836        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
21837        use std::path::PathBuf;
21838
21839        let dir = TempDir::new().unwrap();
21840        let db_path = dir.path().join("agent_search.db");
21841        let storage = SqliteStorage::open(&db_path).unwrap();
21842
21843        let claude_agent_id = storage
21844            .ensure_agent(&Agent {
21845                id: None,
21846                slug: "claude_code".into(),
21847                name: "Claude Code".into(),
21848                version: None,
21849                kind: AgentKind::Cli,
21850            })
21851            .unwrap();
21852        let aider_agent_id = storage
21853            .ensure_agent(&Agent {
21854                id: None,
21855                slug: "aider".into(),
21856                name: "Aider".into(),
21857                version: None,
21858                kind: AgentKind::Cli,
21859            })
21860            .unwrap();
21861
21862        type MessageSpec = (i64, MessageRole, Option<String>, Option<i64>, String);
21863
21864        let mut expected = Vec::new();
21865        let mut first_conversation_id = None;
21866        let mut last_conversation_id = None;
21867        let mut insert_conversation =
21868            |agent_id: i64,
21869             external_id: &str,
21870             title: &str,
21871             source_path: &str,
21872             started_at: i64,
21873             message_specs: Vec<MessageSpec>| {
21874                let conversation = Conversation {
21875                    id: None,
21876                    agent_slug: if agent_id == aider_agent_id {
21877                        "aider".into()
21878                    } else {
21879                        "claude_code".into()
21880                    },
21881                    workspace: Some(PathBuf::from("/tmp/workspace")),
21882                    external_id: Some(external_id.to_string()),
21883                    title: Some(title.to_string()),
21884                    source_path: PathBuf::from(source_path),
21885                    started_at: Some(started_at),
21886                    ended_at: Some(started_at + 100),
21887                    approx_tokens: None,
21888                    metadata_json: serde_json::Value::Null,
21889                    messages: message_specs
21890                        .iter()
21891                        .map(|(idx, role, author, created_at, content)| Message {
21892                            id: None,
21893                            idx: *idx,
21894                            role: role.clone(),
21895                            author: author.clone(),
21896                            created_at: *created_at,
21897                            content: content.clone(),
21898                            extra_json: serde_json::Value::Null,
21899                            snippets: Vec::new(),
21900                        })
21901                        .collect(),
21902                    source_id: LOCAL_SOURCE_ID.into(),
21903                    origin_host: None,
21904                };
21905                let conversation_id = storage
21906                    .insert_conversation_tree(agent_id, None, &conversation)
21907                    .unwrap()
21908                    .conversation_id;
21909                if first_conversation_id.is_none() {
21910                    first_conversation_id = Some(conversation_id);
21911                }
21912                last_conversation_id = Some(conversation_id);
21913                expected.extend(message_specs.into_iter().map(
21914                    |(idx, role, author, created_at, content)| {
21915                        (
21916                            conversation_id,
21917                            idx,
21918                            match role {
21919                                MessageRole::User => "user".to_string(),
21920                                MessageRole::Agent => "agent".to_string(),
21921                                MessageRole::Tool => "tool".to_string(),
21922                                MessageRole::System => "system".to_string(),
21923                                MessageRole::Other(other) => other,
21924                            },
21925                            author,
21926                            created_at,
21927                            content,
21928                        )
21929                    },
21930                ));
21931            };
21932
21933        for (label, base_ts) in [
21934            ("alpha", 1_700_000_000_000_i64),
21935            ("beta", 1_700_000_001_000_i64),
21936            ("gamma", 1_700_000_002_000_i64),
21937            ("delta", 1_700_000_003_000_i64),
21938            ("epsilon", 1_700_000_004_000_i64),
21939        ] {
21940            insert_conversation(
21941                claude_agent_id,
21942                &format!("lexical-{label}"),
21943                &format!("Lexical {label}"),
21944                &format!("/tmp/{label}.jsonl"),
21945                base_ts,
21946                vec![
21947                    (
21948                        0,
21949                        MessageRole::User,
21950                        None,
21951                        Some(base_ts + 10),
21952                        format!("{label}_content"),
21953                    ),
21954                    (
21955                        1,
21956                        MessageRole::Agent,
21957                        None,
21958                        Some(base_ts + 20),
21959                        format!("{label}_content_response"),
21960                    ),
21961                ],
21962            );
21963        }
21964
21965        insert_conversation(
21966            aider_agent_id,
21967            "lexical-aider-history",
21968            "Aider Chat: coding_agent_session_search",
21969            "/tmp/.aider.chat.history.md",
21970            1_764_619_673_394,
21971            vec![
21972                (
21973                    0,
21974                    MessageRole::System,
21975                    Some("system".to_string()),
21976                    None,
21977                    "# aider chat started at 2025-12-01 20:07:47".to_string(),
21978                ),
21979                (
21980                    1,
21981                    MessageRole::User,
21982                    Some("user".to_string()),
21983                    None,
21984                    "/tmp/workspace/.venv/bin/aider --no-git --message hello world".to_string(),
21985                ),
21986            ],
21987        );
21988        insert_conversation(
21989            aider_agent_id,
21990            "lexical-aider-fixture",
21991            "Aider Chat: aider",
21992            "/tmp/tests/fixtures/aider/.aider.chat.history.md",
21993            1_764_621_401_399,
21994            vec![
21995                (
21996                    0,
21997                    MessageRole::User,
21998                    Some("user".to_string()),
21999                    None,
22000                    "/add src/main.rs".to_string(),
22001                ),
22002                (
22003                    1,
22004                    MessageRole::Agent,
22005                    Some("assistant".to_string()),
22006                    None,
22007                    "Added src/main.rs to the chat.
22008
22009#### /add src/main.rs"
22010                        .to_string(),
22011                ),
22012                (
22013                    2,
22014                    MessageRole::User,
22015                    Some("user".to_string()),
22016                    None,
22017                    "Please refactor.".to_string(),
22018                ),
22019                (
22020                    3,
22021                    MessageRole::Agent,
22022                    Some("assistant".to_string()),
22023                    None,
22024                    "Sure, here is the code.".to_string(),
22025                ),
22026            ],
22027        );
22028
22029        let mut streamed = Vec::new();
22030        storage
22031            .stream_messages_for_lexical_rebuild_between_conversation_ids(
22032                first_conversation_id.unwrap(),
22033                last_conversation_id.unwrap(),
22034                |row| {
22035                    streamed.push((
22036                        row.conversation_id,
22037                        row.idx,
22038                        row.role,
22039                        row.author,
22040                        row.created_at,
22041                        row.content,
22042                    ));
22043                    Ok(())
22044                },
22045            )
22046            .unwrap();
22047
22048        assert_eq!(streamed, expected);
22049    }
22050
22051    #[test]
22052    fn lexical_rebuild_stream_queries_use_rowid_and_per_conversation_probes() {
22053        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
22054        use std::path::PathBuf;
22055
22056        let dir = TempDir::new().unwrap();
22057        let db_path = dir.path().join("agent_search.db");
22058        let storage = SqliteStorage::open(&db_path).unwrap();
22059
22060        let agent = Agent {
22061            id: None,
22062            slug: "claude_code".into(),
22063            name: "Claude Code".into(),
22064            version: None,
22065            kind: AgentKind::Cli,
22066        };
22067        let agent_id = storage.ensure_agent(&agent).unwrap();
22068
22069        for (external_id, base_ts) in [
22070            ("conv-1", 1_700_000_000_000_i64),
22071            ("conv-2", 1_700_000_001_000_i64),
22072        ] {
22073            let conversation = Conversation {
22074                id: None,
22075                agent_slug: "claude_code".into(),
22076                workspace: Some(PathBuf::from("/tmp/workspace")),
22077                external_id: Some(external_id.to_string()),
22078                title: Some("Lexical rebuild".into()),
22079                source_path: PathBuf::from(format!("/tmp/{external_id}.jsonl")),
22080                started_at: Some(base_ts),
22081                ended_at: Some(base_ts + 100),
22082                approx_tokens: None,
22083                metadata_json: serde_json::Value::Null,
22084                messages: vec![
22085                    Message {
22086                        id: None,
22087                        idx: 0,
22088                        role: MessageRole::User,
22089                        author: Some("user".into()),
22090                        created_at: Some(base_ts + 10),
22091                        content: format!("{external_id}-first"),
22092                        extra_json: serde_json::Value::Null,
22093                        snippets: Vec::new(),
22094                    },
22095                    Message {
22096                        id: None,
22097                        idx: 1,
22098                        role: MessageRole::Agent,
22099                        author: Some("assistant".into()),
22100                        created_at: Some(base_ts + 20),
22101                        content: format!("{external_id}-second"),
22102                        extra_json: serde_json::Value::Null,
22103                        snippets: Vec::new(),
22104                    },
22105                ],
22106                source_id: LOCAL_SOURCE_ID.into(),
22107                origin_host: None,
22108            };
22109            storage
22110                .insert_conversation_tree(agent_id, None, &conversation)
22111                .unwrap();
22112        }
22113
22114        let first_id: i64 = storage
22115            .conn
22116            .query_row_map(
22117                "SELECT id FROM conversations ORDER BY id LIMIT 1",
22118                fparams![],
22119                |row| row.get_typed(0),
22120            )
22121            .unwrap();
22122        let last_id: i64 = storage
22123            .conn
22124            .query_row_map(
22125                "SELECT id FROM conversations ORDER BY id DESC LIMIT 1",
22126                fparams![],
22127                |row| row.get_typed(0),
22128            )
22129            .unwrap();
22130
22131        let conversation_plan_details: Vec<String> = storage
22132            .conn
22133            .query_map_collect(
22134                "EXPLAIN QUERY PLAN                  SELECT id FROM conversations                  WHERE id >= ?1 AND id <= ?2                  ORDER BY id ASC",
22135                fparams![first_id, last_id],
22136                |row| row.get_typed(3),
22137            )
22138            .unwrap();
22139        assert!(
22140            !conversation_plan_details
22141                .iter()
22142                .any(|detail| detail.contains("TEMP B-TREE")),
22143            "expected streamed lexical rebuild conversation listing to avoid sorter temp b-trees, got {conversation_plan_details:?}"
22144        );
22145
22146        let message_plan_details: Vec<String> = storage
22147            .conn
22148            .query_map_collect(
22149                "EXPLAIN QUERY PLAN                  SELECT id, idx, role, author, created_at, content                  FROM messages INDEXED BY sqlite_autoindex_messages_1                  WHERE conversation_id = ?1                  ORDER BY idx",
22150                fparams![first_id],
22151                |row| row.get_typed(3),
22152            )
22153            .unwrap();
22154        assert!(
22155            message_plan_details
22156                .iter()
22157                .any(|detail| detail.contains("sqlite_autoindex_messages_1")
22158                    || detail.contains("idx_messages_conv_idx")),
22159            "expected per-conversation lexical rebuild fetch to use the conversation_id/idx index, got {message_plan_details:?}"
22160        );
22161        assert!(
22162            !message_plan_details
22163                .iter()
22164                .any(|detail| detail.contains("TEMP B-TREE")),
22165            "expected per-conversation lexical rebuild fetch to avoid sorter temp b-trees, got {message_plan_details:?}"
22166        );
22167    }
22168
22169    #[test]
22170    fn discover_historical_database_bundles_prefers_larger_archives_first() {
22171        let dir = TempDir::new().unwrap();
22172        let canonical_db = dir.path().join("agent_search.db");
22173        fs::write(&canonical_db, b"canonical").unwrap();
22174
22175        let smaller = dir.path().join("agent_search.corrupt.small");
22176        fs::write(&smaller, vec![0_u8; 32]).unwrap();
22177
22178        let backups_dir = dir.path().join("backups");
22179        fs::create_dir_all(&backups_dir).unwrap();
22180        let larger = backups_dir.join("agent_search.db.20260322T020200.bak");
22181        fs::write(&larger, vec![0_u8; 128]).unwrap();
22182
22183        let bundles = discover_historical_database_bundles(&canonical_db);
22184        let ordered_paths: Vec<PathBuf> =
22185            bundles.into_iter().map(|bundle| bundle.root_path).collect();
22186
22187        assert_eq!(ordered_paths, vec![larger, smaller]);
22188    }
22189
22190    #[test]
22191    fn discover_historical_database_bundles_prefers_queryable_direct_bundles_first() {
22192        let dir = TempDir::new().unwrap();
22193        let canonical_db = dir.path().join("agent_search.db");
22194        fs::write(&canonical_db, b"canonical").unwrap();
22195
22196        let larger_corrupt = dir.path().join("agent_search.corrupt.20260324_212907");
22197        fs::write(&larger_corrupt, vec![0_u8; 4096]).unwrap();
22198
22199        let backups_dir = dir.path().join("backups");
22200        fs::create_dir_all(&backups_dir).unwrap();
22201        let smaller_healthy = backups_dir.join("agent_search.db.20260322T020200.bak");
22202        let conn = FrankenConnection::open(smaller_healthy.to_string_lossy().into_owned()).unwrap();
22203        conn.execute_batch(
22204            "CREATE TABLE conversations (id INTEGER PRIMARY KEY, source_path TEXT);
22205             CREATE TABLE messages (
22206                 id INTEGER PRIMARY KEY,
22207                 conversation_id INTEGER NOT NULL,
22208                 idx INTEGER NOT NULL,
22209                 content TEXT
22210             );
22211             INSERT INTO conversations(id, source_path) VALUES (1, '/tmp/history.jsonl');
22212             INSERT INTO messages(id, conversation_id, idx, content)
22213             VALUES (1, 1, 0, 'seed');",
22214        )
22215        .unwrap();
22216        drop(conn);
22217
22218        let bundles = discover_historical_database_bundles(&canonical_db);
22219        let ordered_paths: Vec<PathBuf> = bundles
22220            .iter()
22221            .map(|bundle| bundle.root_path.clone())
22222            .collect();
22223
22224        assert_eq!(ordered_paths, vec![smaller_healthy, larger_corrupt]);
22225        assert!(bundles[0].supports_direct_readonly);
22226        assert!(!bundles[1].supports_direct_readonly);
22227    }
22228
22229    #[test]
22230    fn salvage_historical_databases_skips_unreadable_quarantined_bundles() {
22231        let dir = TempDir::new().unwrap();
22232        let canonical_db = dir.path().join("agent_search.db");
22233        let storage = SqliteStorage::open(&canonical_db).unwrap();
22234
22235        let quarantined = dir.path().join("agent_search.corrupt.20260324_212907");
22236        fs::write(&quarantined, b"not a sqlite database").unwrap();
22237
22238        let discovered: Vec<PathBuf> = discover_historical_database_bundles(&canonical_db)
22239            .into_iter()
22240            .map(|bundle| bundle.root_path)
22241            .collect();
22242        assert_eq!(discovered, vec![quarantined]);
22243
22244        let outcome = storage.salvage_historical_databases(&canonical_db).unwrap();
22245        assert_eq!(outcome.bundles_considered, 1);
22246        assert_eq!(outcome.bundles_imported, 0);
22247        assert_eq!(outcome.conversations_imported, 0);
22248        assert_eq!(outcome.messages_imported, 0);
22249        assert!(storage.list_conversations(10, 0).unwrap().is_empty());
22250    }
22251
22252    #[test]
22253    fn discover_historical_database_bundles_includes_repair_lab_and_snapshots_named_roots() {
22254        let dir = TempDir::new().unwrap();
22255        let canonical_db = dir.path().join("agent_search.db");
22256        fs::write(&canonical_db, b"canonical").unwrap();
22257
22258        let repair_lab_dir = dir.path().join("repair-lab").join("live-copy");
22259        fs::create_dir_all(&repair_lab_dir).unwrap();
22260        let repair_lab_db = repair_lab_dir.join("agent_search.db");
22261        fs::write(&repair_lab_db, vec![0_u8; 96]).unwrap();
22262        fs::write(
22263            repair_lab_dir.join("agent_search.rebuild-test.db"),
22264            vec![0_u8; 192],
22265        )
22266        .unwrap();
22267
22268        let snapshots_dir = dir.path().join("snapshots").join("20260324T013201Z");
22269        fs::create_dir_all(&snapshots_dir).unwrap();
22270        let snapshot_db = snapshots_dir.join("agent_search.db");
22271        fs::write(&snapshot_db, vec![0_u8; 64]).unwrap();
22272
22273        let bundles = discover_historical_database_bundles(&canonical_db);
22274        let ordered_paths: Vec<PathBuf> =
22275            bundles.into_iter().map(|bundle| bundle.root_path).collect();
22276
22277        assert!(ordered_paths.contains(&repair_lab_db));
22278        assert!(ordered_paths.contains(&snapshot_db));
22279        assert!(
22280            !ordered_paths
22281                .iter()
22282                .any(|path| path.file_name().and_then(|name| name.to_str())
22283                    == Some("agent_search.rebuild-test.db"))
22284        );
22285    }
22286
22287    #[test]
22288    fn discover_historical_database_bundles_prefers_healthy_backup_over_replay_priority() {
22289        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
22290
22291        let dir = TempDir::new().unwrap();
22292        let canonical_db = dir.path().join("agent_search.db");
22293        fs::write(&canonical_db, b"canonical").unwrap();
22294
22295        let replay_dir = dir
22296            .path()
22297            .join("repair-lab")
22298            .join("replay-20260324T070101Z");
22299        fs::create_dir_all(&replay_dir).unwrap();
22300        let replay_db = replay_dir.join("agent_search.db");
22301        let replay_storage = SqliteStorage::open(&replay_db).unwrap();
22302        let agent = Agent {
22303            id: None,
22304            slug: "codex".into(),
22305            name: "Codex".into(),
22306            version: Some("0.2.3".into()),
22307            kind: AgentKind::Cli,
22308        };
22309        let agent_id = replay_storage.ensure_agent(&agent).unwrap();
22310        let conversation = Conversation {
22311            id: None,
22312            agent_slug: "codex".into(),
22313            workspace: Some(PathBuf::from("/tmp/workspace")),
22314            external_id: Some("replay-conv".into()),
22315            title: Some("Replay bundle".into()),
22316            source_path: PathBuf::from("/tmp/replay.jsonl"),
22317            started_at: Some(1_700_000_000_000),
22318            ended_at: Some(1_700_000_000_100),
22319            approx_tokens: Some(42),
22320            metadata_json: serde_json::Value::Null,
22321            messages: vec![Message {
22322                id: None,
22323                idx: 0,
22324                role: MessageRole::Agent,
22325                author: Some("assistant".into()),
22326                created_at: Some(1_700_000_000_050),
22327                content: "replay message".into(),
22328                extra_json: serde_json::Value::Null,
22329                snippets: Vec::new(),
22330            }],
22331            source_id: LOCAL_SOURCE_ID.into(),
22332            origin_host: None,
22333        };
22334        replay_storage
22335            .insert_conversation_tree(agent_id, None, &conversation)
22336            .unwrap();
22337        drop(replay_storage);
22338
22339        let duplicate_legacy_fts_sql = "CREATE VIRTUAL TABLE fts_messages USING fts5(content, title, agent, workspace, source_path, created_at UNINDEXED, message_id UNINDEXED, tokenize='porter')";
22340        let replay_legacy = rusqlite_test_fixture_conn(&replay_db);
22341        replay_legacy
22342            .execute_batch(
22343                "UPDATE meta SET value = '13' WHERE key = 'schema_version';
22344                 DELETE FROM _schema_migrations WHERE version = 14;
22345                 PRAGMA writable_schema = ON;",
22346            )
22347            .unwrap();
22348        replay_legacy
22349            .execute(
22350                "DELETE FROM meta WHERE key = ?1",
22351                [FTS_FRANKEN_REBUILD_META_KEY],
22352            )
22353            .unwrap();
22354        replay_legacy
22355            .execute(
22356                "INSERT INTO sqlite_master(type, name, tbl_name, rootpage, sql)
22357                 VALUES('table', 'fts_messages', 'fts_messages', 0, ?1)",
22358                [duplicate_legacy_fts_sql],
22359            )
22360            .unwrap();
22361        replay_legacy
22362            .execute_batch("PRAGMA writable_schema = OFF;")
22363            .unwrap();
22364        drop(replay_legacy);
22365
22366        let backups_dir = dir.path().join("backups");
22367        fs::create_dir_all(&backups_dir).unwrap();
22368        let clean_backup = backups_dir.join("agent_search.db.20260322T020200.bak");
22369        let clean_storage = SqliteStorage::open(&clean_backup).unwrap();
22370        let clean_agent_id = clean_storage.ensure_agent(&agent).unwrap();
22371        clean_storage
22372            .insert_conversation_tree(clean_agent_id, None, &conversation)
22373            .unwrap();
22374        drop(clean_storage);
22375
22376        let bundles = discover_historical_database_bundles(&canonical_db);
22377        let ordered_paths: Vec<PathBuf> = bundles
22378            .iter()
22379            .map(|bundle| bundle.root_path.clone())
22380            .collect();
22381
22382        assert_eq!(ordered_paths[0], clean_backup);
22383        assert_eq!(ordered_paths[1], replay_db);
22384        assert_eq!(
22385            bundles[0].probe.schema_version,
22386            Some(CURRENT_SCHEMA_VERSION)
22387        );
22388        // Post-V14 cass drops the fts_messages virtual table during migration
22389        // and recreates it lazily on first open, so a freshly-migrated "clean"
22390        // backup has zero fts_messages rows in sqlite_master. The bundle is
22391        // still ranked as healthy by `bundle_health_rank` because 0 rows is a
22392        // legitimate lazy-FTS state (see comment there).
22393        assert_eq!(bundles[0].probe.fts_schema_rows, Some(0));
22394        // `fts_queryable` mirrors a direct rusqlite probe; with 0 sqlite_master
22395        // rows the table isn't queryable until lazy repair runs.
22396        assert!(!bundles[0].probe.fts_queryable);
22397        assert_eq!(bundles[1].probe.schema_version, Some(13));
22398        // The replay bundle had V14 run (dropping fts_messages → 0 rows), then
22399        // the test rolls meta.schema_version back to 13, deletes the V14
22400        // marker, and manually injects a duplicate sqlite_master row. Net
22401        // result: one synthetic (malformed) fts_messages entry.
22402        assert_eq!(bundles[1].probe.fts_schema_rows, Some(1));
22403    }
22404
22405    #[test]
22406    fn ensure_fts_consistency_via_rusqlite_catches_up_missing_rows() {
22407        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
22408
22409        let dir = TempDir::new().unwrap();
22410        let db_path = dir.path().join("fts-catchup.db");
22411        let storage = SqliteStorage::open(&db_path).unwrap();
22412        let agent = Agent {
22413            id: None,
22414            slug: "codex".into(),
22415            name: "Codex".into(),
22416            version: Some("0.2.3".into()),
22417            kind: AgentKind::Cli,
22418        };
22419        let agent_id = storage.ensure_agent(&agent).unwrap();
22420        let conversation = Conversation {
22421            id: None,
22422            agent_slug: "codex".into(),
22423            workspace: Some(PathBuf::from("/tmp/workspace")),
22424            external_id: Some("fts-catchup".into()),
22425            title: Some("FTS catchup".into()),
22426            source_path: PathBuf::from("/tmp/fts-catchup.jsonl"),
22427            started_at: Some(1_700_000_000_000),
22428            ended_at: Some(1_700_000_000_100),
22429            approx_tokens: Some(42),
22430            metadata_json: serde_json::Value::Null,
22431            messages: vec![Message {
22432                id: None,
22433                idx: 0,
22434                role: MessageRole::User,
22435                author: Some("user".into()),
22436                created_at: Some(1_700_000_000_050),
22437                content: "initial message".into(),
22438                extra_json: serde_json::Value::Null,
22439                snippets: Vec::new(),
22440            }],
22441            source_id: LOCAL_SOURCE_ID.into(),
22442            origin_host: None,
22443        };
22444        storage
22445            .insert_conversation_tree(agent_id, None, &conversation)
22446            .unwrap();
22447        drop(storage);
22448
22449        rebuild_fts_via_rusqlite(&db_path).unwrap();
22450
22451        let conn = FrankenConnection::open(db_path.to_string_lossy().into_owned()).unwrap();
22452        let conversation_id: i64 = conn
22453            .query_row_map("SELECT id FROM conversations LIMIT 1", fparams![], |row| {
22454                row.get_typed(0)
22455            })
22456            .unwrap();
22457        conn.execute_compat(
22458            "INSERT INTO messages(id, conversation_id, idx, role, author, created_at, content, extra_json, extra_bin)
22459             VALUES(2, ?1, 1, 'assistant', 'assistant', 1700000000060, 'authentication catchup', NULL, NULL)",
22460            fparams![conversation_id],
22461        )
22462        .unwrap();
22463        drop(conn);
22464
22465        let repair = ensure_fts_consistency_via_rusqlite(&db_path).unwrap();
22466        assert_eq!(
22467            repair,
22468            FtsConsistencyRepair::IncrementalCatchUp {
22469                inserted_rows: 1,
22470                total_rows: 2
22471            }
22472        );
22473
22474        let conn = FrankenConnection::open(db_path.to_string_lossy().into_owned()).unwrap();
22475        let auth_rows: i64 = conn
22476            .query_row_map(
22477                "SELECT COUNT(*) FROM fts_messages WHERE rowid = 2",
22478                fparams![],
22479                |row| row.get_typed(0),
22480            )
22481            .unwrap();
22482        assert_eq!(auth_rows, 1);
22483    }
22484
22485    #[test]
22486    fn rebuild_fts_via_rusqlite_cleans_duplicate_legacy_schema_rows() {
22487        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
22488
22489        let dir = TempDir::new().unwrap();
22490        let db_path = dir.path().join("fts-duplicate-rebuild.db");
22491
22492        let storage = SqliteStorage::open(&db_path).unwrap();
22493        let agent = Agent {
22494            id: None,
22495            slug: "codex".into(),
22496            name: "Codex".into(),
22497            version: Some("0.2.3".into()),
22498            kind: AgentKind::Cli,
22499        };
22500        let agent_id = storage.ensure_agent(&agent).unwrap();
22501        let conversation = Conversation {
22502            id: None,
22503            agent_slug: "codex".into(),
22504            workspace: Some(PathBuf::from("/ws")),
22505            external_id: Some("retro".into()),
22506            title: Some("retro".into()),
22507            source_path: PathBuf::from("/tmp/retro.jsonl"),
22508            started_at: Some(42),
22509            ended_at: Some(42),
22510            approx_tokens: None,
22511            metadata_json: serde_json::Value::Null,
22512            messages: vec![Message {
22513                id: None,
22514                idx: 0,
22515                role: MessageRole::User,
22516                author: None,
22517                created_at: Some(42),
22518                content: "retro investigation".into(),
22519                extra_json: serde_json::Value::Null,
22520                snippets: Vec::new(),
22521            }],
22522            source_id: LOCAL_SOURCE_ID.into(),
22523            origin_host: None,
22524        };
22525        storage
22526            .insert_conversation_tree(agent_id, None, &conversation)
22527            .unwrap();
22528        drop(storage);
22529        materialize_fresh_fts_schema_via_rusqlite(&db_path).unwrap();
22530
22531        let conn = rusqlite_test_fixture_conn(&db_path);
22532        conn.execute_batch("PRAGMA writable_schema = ON;").unwrap();
22533        conn.execute(
22534            "INSERT INTO sqlite_master(type, name, tbl_name, rootpage, sql)
22535             VALUES('table', 'fts_messages', 'fts_messages', 0, ?1)",
22536            ["CREATE VIRTUAL TABLE fts_messages USING fts5(content, title, agent, workspace, source_path, created_at UNINDEXED, message_id UNINDEXED, tokenize='porter')"],
22537        )
22538        .unwrap();
22539        conn.execute_batch("PRAGMA writable_schema = OFF;").unwrap();
22540        let duplicate_rows: i64 = conn
22541            .query_row(
22542                "SELECT COUNT(*) FROM sqlite_master WHERE name = 'fts_messages'",
22543                [],
22544                |row| row.get(0),
22545            )
22546            .unwrap();
22547        assert_eq!(duplicate_rows, 2);
22548        drop(conn);
22549
22550        let inserted = rebuild_fts_via_rusqlite(&db_path).unwrap();
22551        assert_eq!(inserted, 1);
22552
22553        let conn = FrankenConnection::open(db_path.to_string_lossy().into_owned()).unwrap();
22554        let schema_rows = franken_fts_schema_rows(&conn).unwrap();
22555        assert_eq!(
22556            schema_rows, 1,
22557            "DROP TABLE should leave one clean FTS schema"
22558        );
22559        let match_count: i64 = conn
22560            .query_row_map("SELECT COUNT(*) FROM fts_messages", fparams![], |row| {
22561                row.get_typed(0)
22562            })
22563            .unwrap();
22564        assert_eq!(match_count, 1);
22565    }
22566
22567    // =========================================================================
22568    // Agent storage tests (bead yln.4)
22569    // =========================================================================
22570
22571    #[test]
22572    fn ensure_agent_creates_new() {
22573        let dir = TempDir::new().unwrap();
22574        let db_path = dir.path().join("test.db");
22575        let storage = SqliteStorage::open(&db_path).unwrap();
22576
22577        let agent = Agent {
22578            id: None,
22579            slug: "test_agent".into(),
22580            name: "Test Agent".into(),
22581            version: Some("1.0".into()),
22582            kind: AgentKind::Cli,
22583        };
22584
22585        let id = storage.ensure_agent(&agent).unwrap();
22586        assert!(id > 0);
22587    }
22588
22589    #[test]
22590    fn ensure_agent_returns_existing_id() {
22591        let dir = TempDir::new().unwrap();
22592        let db_path = dir.path().join("test.db");
22593        let storage = SqliteStorage::open(&db_path).unwrap();
22594
22595        let agent = Agent {
22596            id: None,
22597            slug: "codex".into(),
22598            name: "Codex".into(),
22599            version: None,
22600            kind: AgentKind::Cli,
22601        };
22602
22603        let id1 = storage.ensure_agent(&agent).unwrap();
22604        let id2 = storage.ensure_agent(&agent).unwrap();
22605        assert_eq!(id1, id2);
22606    }
22607
22608    #[test]
22609    fn ensure_agent_unchanged_preserves_updated_at() {
22610        let dir = TempDir::new().unwrap();
22611        let db_path = dir.path().join("test.db");
22612        let storage = SqliteStorage::open(&db_path).unwrap();
22613
22614        let agent = Agent {
22615            id: None,
22616            slug: "codex".into(),
22617            name: "Codex".into(),
22618            version: Some("1.0".into()),
22619            kind: AgentKind::Cli,
22620        };
22621
22622        storage.ensure_agent(&agent).unwrap();
22623        let initial_updated_at: i64 = storage
22624            .conn
22625            .query_row_map(
22626                "SELECT updated_at FROM agents WHERE slug = ?1",
22627                fparams![agent.slug.as_str()],
22628                |row| row.get_typed(0),
22629            )
22630            .unwrap();
22631        std::thread::sleep(std::time::Duration::from_millis(5));
22632
22633        storage.ensure_agent(&agent).unwrap();
22634        let fetched_updated_at: i64 = storage
22635            .conn
22636            .query_row_map(
22637                "SELECT updated_at FROM agents WHERE slug = ?1",
22638                fparams![agent.slug.as_str()],
22639                |row| row.get_typed(0),
22640            )
22641            .unwrap();
22642
22643        assert_eq!(fetched_updated_at, initial_updated_at);
22644    }
22645
22646    #[test]
22647    fn ensure_agent_changed_metadata_updates_cached_slug() {
22648        let dir = TempDir::new().unwrap();
22649        let db_path = dir.path().join("test.db");
22650        let storage = SqliteStorage::open(&db_path).unwrap();
22651
22652        let mut agent = Agent {
22653            id: None,
22654            slug: "codex".into(),
22655            name: "Codex".into(),
22656            version: Some("1.0".into()),
22657            kind: AgentKind::Cli,
22658        };
22659
22660        let id1 = storage.ensure_agent(&agent).unwrap();
22661        agent.name = "Codex CLI".into();
22662        agent.version = Some("1.1".into());
22663        let id2 = storage.ensure_agent(&agent).unwrap();
22664
22665        let fetched: (String, Option<String>) = storage
22666            .conn
22667            .query_row_map(
22668                "SELECT name, version FROM agents WHERE slug = ?1",
22669                fparams![agent.slug.as_str()],
22670                |row| Ok((row.get_typed(0)?, row.get_typed(1)?)),
22671            )
22672            .unwrap();
22673
22674        assert_eq!(id1, id2);
22675        assert_eq!(fetched, ("Codex CLI".into(), Some("1.1".into())));
22676    }
22677
22678    #[test]
22679    fn list_agents_returns_inserted() {
22680        let dir = TempDir::new().unwrap();
22681        let db_path = dir.path().join("test.db");
22682        let storage = SqliteStorage::open(&db_path).unwrap();
22683
22684        let agent = Agent {
22685            id: None,
22686            slug: "new_agent".into(),
22687            name: "New Agent".into(),
22688            version: None,
22689            kind: AgentKind::VsCode,
22690        };
22691        storage.ensure_agent(&agent).unwrap();
22692
22693        let agents = storage.list_agents().unwrap();
22694        assert!(agents.iter().any(|a| a.slug == "new_agent"));
22695    }
22696
22697    // =========================================================================
22698    // Workspace storage tests (bead yln.4)
22699    // =========================================================================
22700
22701    #[test]
22702    fn ensure_workspace_creates_new() {
22703        let dir = TempDir::new().unwrap();
22704        let db_path = dir.path().join("test.db");
22705        let storage = SqliteStorage::open(&db_path).unwrap();
22706
22707        let id = storage
22708            .ensure_workspace(Path::new("/home/user/project"), Some("My Project"))
22709            .unwrap();
22710        assert!(id > 0);
22711    }
22712
22713    #[test]
22714    fn ensure_workspace_returns_existing() {
22715        let dir = TempDir::new().unwrap();
22716        let db_path = dir.path().join("test.db");
22717        let storage = SqliteStorage::open(&db_path).unwrap();
22718
22719        let path = Path::new("/home/user/myproject");
22720        let id1 = storage.ensure_workspace(path, None).unwrap();
22721        let id2 = storage.ensure_workspace(path, None).unwrap();
22722        assert_eq!(id1, id2);
22723    }
22724
22725    #[test]
22726    fn ensure_workspace_changed_display_name_updates_cached_path() {
22727        let dir = TempDir::new().unwrap();
22728        let db_path = dir.path().join("test.db");
22729        let storage = SqliteStorage::open(&db_path).unwrap();
22730
22731        let path = Path::new("/home/user/myproject");
22732        let id1 = storage.ensure_workspace(path, Some("Before")).unwrap();
22733        let id2 = storage.ensure_workspace(path, Some("After")).unwrap();
22734
22735        let display_name: Option<String> = storage
22736            .conn
22737            .query_row_map(
22738                "SELECT display_name FROM workspaces WHERE path = ?1",
22739                fparams![path.to_string_lossy().as_ref()],
22740                |row| row.get_typed(0),
22741            )
22742            .unwrap();
22743
22744        assert_eq!(id1, id2);
22745        assert_eq!(display_name.as_deref(), Some("After"));
22746    }
22747
22748    #[test]
22749    fn list_workspaces_returns_inserted() {
22750        let dir = TempDir::new().unwrap();
22751        let db_path = dir.path().join("test.db");
22752        let storage = SqliteStorage::open(&db_path).unwrap();
22753
22754        storage
22755            .ensure_workspace(Path::new("/test/workspace"), Some("Test WS"))
22756            .unwrap();
22757
22758        let workspaces = storage.list_workspaces().unwrap();
22759        assert!(
22760            workspaces
22761                .iter()
22762                .any(|w| w.path.to_str() == Some("/test/workspace"))
22763        );
22764    }
22765
22766    // =========================================================================
22767    // Source storage tests (bead yln.4)
22768    // =========================================================================
22769
22770    #[test]
22771    fn upsert_source_creates_new() {
22772        let dir = TempDir::new().unwrap();
22773        let db_path = dir.path().join("test.db");
22774        let storage = SqliteStorage::open(&db_path).unwrap();
22775
22776        let source = Source {
22777            id: "test-laptop".into(),
22778            kind: SourceKind::Ssh,
22779            host_label: Some("test.local".into()),
22780            machine_id: Some("test-machine-id".into()),
22781            platform: None,
22782            config_json: None,
22783            created_at: Some(SqliteStorage::now_millis()),
22784            updated_at: None,
22785        };
22786
22787        storage.upsert_source(&source).unwrap();
22788        let fetched = storage.get_source("test-laptop").unwrap();
22789        assert!(fetched.is_some());
22790        assert_eq!(fetched.unwrap().host_label, Some("test.local".into()));
22791    }
22792
22793    #[test]
22794    fn upsert_source_updates_existing() {
22795        let dir = TempDir::new().unwrap();
22796        let db_path = dir.path().join("test.db");
22797        let storage = SqliteStorage::open(&db_path).unwrap();
22798
22799        let source1 = Source {
22800            id: "my-source".into(),
22801            kind: SourceKind::Ssh,
22802            host_label: Some("Original Label".into()),
22803            machine_id: None,
22804            platform: None,
22805            config_json: None,
22806            created_at: Some(SqliteStorage::now_millis()),
22807            updated_at: None,
22808        };
22809        storage.upsert_source(&source1).unwrap();
22810
22811        let source2 = Source {
22812            id: "my-source".into(),
22813            kind: SourceKind::Ssh,
22814            host_label: Some("Updated Label".into()),
22815            machine_id: None,
22816            platform: Some("linux".into()),
22817            config_json: None,
22818            created_at: Some(SqliteStorage::now_millis()),
22819            updated_at: Some(SqliteStorage::now_millis()),
22820        };
22821        storage.upsert_source(&source2).unwrap();
22822
22823        let fetched = storage.get_source("my-source").unwrap().unwrap();
22824        assert_eq!(fetched.host_label, Some("Updated Label".into()));
22825        assert!(fetched.platform.is_some());
22826    }
22827
22828    #[test]
22829    fn upsert_source_unchanged_preserves_updated_at() {
22830        let dir = TempDir::new().unwrap();
22831        let db_path = dir.path().join("test.db");
22832        let storage = SqliteStorage::open(&db_path).unwrap();
22833
22834        let source = Source {
22835            id: "stable-source".into(),
22836            kind: SourceKind::Ssh,
22837            host_label: Some("builder.local".into()),
22838            machine_id: None,
22839            platform: Some("linux".into()),
22840            config_json: Some(serde_json::json!({"role": "bench"})),
22841            created_at: None,
22842            updated_at: None,
22843        };
22844
22845        storage.upsert_source(&source).unwrap();
22846        let initial = storage.get_source("stable-source").unwrap().unwrap();
22847        std::thread::sleep(std::time::Duration::from_millis(5));
22848
22849        storage.upsert_source(&source).unwrap();
22850        let fetched = storage.get_source("stable-source").unwrap().unwrap();
22851
22852        assert_eq!(fetched.created_at, initial.created_at);
22853        assert_eq!(fetched.updated_at, initial.updated_at);
22854        assert_eq!(fetched.host_label, initial.host_label);
22855        assert_eq!(fetched.platform, initial.platform);
22856        assert_eq!(fetched.config_json, initial.config_json);
22857    }
22858
22859    #[test]
22860    fn ensure_source_for_conversation_recreates_remote_source_after_delete() {
22861        let dir = TempDir::new().unwrap();
22862        let db_path = dir.path().join("test.db");
22863        let storage = SqliteStorage::open(&db_path).unwrap();
22864
22865        let conversation = Conversation {
22866            id: None,
22867            agent_slug: "codex".into(),
22868            workspace: Some(PathBuf::from("/ws/cache-recreate")),
22869            external_id: Some("cache-recreate".into()),
22870            title: Some("Cache Recreate".into()),
22871            source_path: PathBuf::from("/log/cache-recreate.jsonl"),
22872            started_at: Some(1_700_000_000_000),
22873            ended_at: Some(1_700_000_000_001),
22874            approx_tokens: Some(16),
22875            metadata_json: serde_json::json!({}),
22876            messages: vec![Message {
22877                id: None,
22878                idx: 0,
22879                role: MessageRole::User,
22880                author: Some("tester".into()),
22881                created_at: Some(1_700_000_000_000),
22882                content: "cache recreate".into(),
22883                extra_json: serde_json::json!({}),
22884                snippets: Vec::new(),
22885            }],
22886            source_id: "cache-remote-source".into(),
22887            origin_host: Some("builder-cache".into()),
22888        };
22889
22890        storage
22891            .ensure_source_for_conversation(&conversation)
22892            .unwrap();
22893        assert!(storage.get_source("cache-remote-source").unwrap().is_some());
22894
22895        let deleted = storage.delete_source("cache-remote-source", false).unwrap();
22896        assert!(deleted);
22897        assert!(storage.get_source("cache-remote-source").unwrap().is_none());
22898
22899        storage
22900            .ensure_source_for_conversation(&conversation)
22901            .unwrap();
22902        let recreated = storage.get_source("cache-remote-source").unwrap();
22903        assert!(recreated.is_some());
22904        assert_eq!(
22905            recreated.unwrap().host_label.as_deref(),
22906            Some("builder-cache")
22907        );
22908    }
22909
22910    #[test]
22911    fn delete_source_removes_entry() {
22912        let dir = TempDir::new().unwrap();
22913        let db_path = dir.path().join("test.db");
22914        let storage = SqliteStorage::open(&db_path).unwrap();
22915
22916        let source = Source {
22917            id: "to-delete".into(),
22918            kind: SourceKind::Local,
22919            host_label: None,
22920            machine_id: None,
22921            platform: None,
22922            config_json: None,
22923            created_at: Some(SqliteStorage::now_millis()),
22924            updated_at: None,
22925        };
22926        storage.upsert_source(&source).unwrap();
22927
22928        let deleted = storage.delete_source("to-delete", false).unwrap();
22929        assert!(deleted);
22930
22931        let fetched = storage.get_source("to-delete").unwrap();
22932        assert!(fetched.is_none());
22933    }
22934
22935    #[test]
22936    fn delete_source_cannot_delete_local() {
22937        let dir = TempDir::new().unwrap();
22938        let db_path = dir.path().join("test.db");
22939        let storage = SqliteStorage::open(&db_path).unwrap();
22940
22941        let result = storage.delete_source(LOCAL_SOURCE_ID, false);
22942        assert!(result.is_err());
22943    }
22944
22945    #[test]
22946    fn list_sources_includes_local() {
22947        let dir = TempDir::new().unwrap();
22948        let db_path = dir.path().join("test.db");
22949        let storage = SqliteStorage::open(&db_path).unwrap();
22950
22951        let sources = storage.list_sources().unwrap();
22952        assert!(sources.iter().any(|s| s.id == LOCAL_SOURCE_ID));
22953    }
22954
22955    #[test]
22956    fn insert_conversation_tree_blank_local_source_normalizes_to_local_id() {
22957        let dir = TempDir::new().unwrap();
22958        let db_path = dir.path().join("test.db");
22959        let storage = SqliteStorage::open(&db_path).unwrap();
22960
22961        let agent_id = storage
22962            .ensure_agent(&Agent {
22963                id: None,
22964                slug: "codex".into(),
22965                name: "Codex".into(),
22966                version: None,
22967                kind: AgentKind::Cli,
22968            })
22969            .unwrap();
22970
22971        let conversation = Conversation {
22972            id: None,
22973            agent_slug: "codex".into(),
22974            workspace: None,
22975            external_id: Some("blank-local-source".into()),
22976            title: Some("Blank local source".into()),
22977            source_path: dir.path().join("blank-local.jsonl"),
22978            started_at: Some(1_700_000_000_000),
22979            ended_at: Some(1_700_000_000_001),
22980            approx_tokens: None,
22981            metadata_json: serde_json::Value::Null,
22982            messages: vec![Message {
22983                id: None,
22984                idx: 0,
22985                role: MessageRole::User,
22986                author: None,
22987                created_at: Some(1_700_000_000_000),
22988                content: "hello".into(),
22989                extra_json: serde_json::Value::Null,
22990                snippets: Vec::new(),
22991            }],
22992            source_id: "   ".into(),
22993            origin_host: None,
22994        };
22995
22996        storage
22997            .insert_conversation_tree(agent_id, None, &conversation)
22998            .unwrap();
22999
23000        assert!(storage.get_source("   ").unwrap().is_none());
23001        let source = storage
23002            .get_source(LOCAL_SOURCE_ID)
23003            .unwrap()
23004            .expect("local source row should exist");
23005        assert_eq!(source.kind, SourceKind::Local);
23006        assert_eq!(source.host_label, None);
23007
23008        let conversations = storage.list_conversations(10, 0).unwrap();
23009        assert_eq!(conversations.len(), 1);
23010        assert_eq!(conversations[0].source_id, LOCAL_SOURCE_ID);
23011        assert_eq!(conversations[0].origin_host, None);
23012    }
23013
23014    #[test]
23015    fn repeated_local_inserts_do_not_touch_bootstrap_source_row() {
23016        let dir = TempDir::new().unwrap();
23017        let db_path = dir.path().join("test.db");
23018        let storage = SqliteStorage::open(&db_path).unwrap();
23019
23020        let agent_id = storage
23021            .ensure_agent(&Agent {
23022                id: None,
23023                slug: "codex".into(),
23024                name: "Codex".into(),
23025                version: None,
23026                kind: AgentKind::Cli,
23027            })
23028            .unwrap();
23029
23030        let bootstrap_updated_at: i64 = storage
23031            .conn
23032            .query_row_map(
23033                "SELECT updated_at FROM sources WHERE id = ?1",
23034                fparams![LOCAL_SOURCE_ID],
23035                |row| row.get_typed(0),
23036            )
23037            .unwrap();
23038
23039        let make_conversation = |external_id: &str, suffix: &str| Conversation {
23040            id: None,
23041            agent_slug: "codex".into(),
23042            workspace: None,
23043            external_id: Some(external_id.into()),
23044            title: Some(format!("Local source {suffix}")),
23045            source_path: dir.path().join(format!("local-{suffix}.jsonl")),
23046            started_at: Some(1_700_000_000_000),
23047            ended_at: Some(1_700_000_000_001),
23048            approx_tokens: None,
23049            metadata_json: serde_json::Value::Null,
23050            messages: vec![Message {
23051                id: None,
23052                idx: 0,
23053                role: MessageRole::User,
23054                author: None,
23055                created_at: Some(1_700_000_000_000),
23056                content: format!("hello-{suffix}"),
23057                extra_json: serde_json::Value::Null,
23058                snippets: Vec::new(),
23059            }],
23060            source_id: LOCAL_SOURCE_ID.into(),
23061            origin_host: None,
23062        };
23063
23064        std::thread::sleep(std::time::Duration::from_millis(5));
23065        storage
23066            .insert_conversation_tree(agent_id, None, &make_conversation("local-source-1", "one"))
23067            .unwrap();
23068        let after_first_insert: i64 = storage
23069            .conn
23070            .query_row_map(
23071                "SELECT updated_at FROM sources WHERE id = ?1",
23072                fparams![LOCAL_SOURCE_ID],
23073                |row| row.get_typed(0),
23074            )
23075            .unwrap();
23076
23077        std::thread::sleep(std::time::Duration::from_millis(5));
23078        storage
23079            .insert_conversation_tree(agent_id, None, &make_conversation("local-source-2", "two"))
23080            .unwrap();
23081        let after_second_insert: i64 = storage
23082            .conn
23083            .query_row_map(
23084                "SELECT updated_at FROM sources WHERE id = ?1",
23085                fparams![LOCAL_SOURCE_ID],
23086                |row| row.get_typed(0),
23087            )
23088            .unwrap();
23089
23090        assert_eq!(after_first_insert, bootstrap_updated_at);
23091        assert_eq!(after_second_insert, bootstrap_updated_at);
23092    }
23093
23094    #[test]
23095    fn insert_conversation_tree_blank_remote_source_normalizes_to_origin_host() {
23096        let dir = TempDir::new().unwrap();
23097        let db_path = dir.path().join("test.db");
23098        let storage = SqliteStorage::open(&db_path).unwrap();
23099
23100        let agent_id = storage
23101            .ensure_agent(&Agent {
23102                id: None,
23103                slug: "codex".into(),
23104                name: "Codex".into(),
23105                version: None,
23106                kind: AgentKind::Cli,
23107            })
23108            .unwrap();
23109
23110        let conversation = Conversation {
23111            id: None,
23112            agent_slug: "codex".into(),
23113            workspace: None,
23114            external_id: Some("blank-remote-source".into()),
23115            title: Some("Blank remote source".into()),
23116            source_path: dir.path().join("blank-remote.jsonl"),
23117            started_at: Some(1_700_000_000_000),
23118            ended_at: Some(1_700_000_000_001),
23119            approx_tokens: None,
23120            metadata_json: serde_json::Value::Null,
23121            messages: vec![Message {
23122                id: None,
23123                idx: 0,
23124                role: MessageRole::User,
23125                author: None,
23126                created_at: Some(1_700_000_000_000),
23127                content: "hello".into(),
23128                extra_json: serde_json::Value::Null,
23129                snippets: Vec::new(),
23130            }],
23131            source_id: "   ".into(),
23132            origin_host: Some("user@work-laptop".into()),
23133        };
23134
23135        storage
23136            .insert_conversation_tree(agent_id, None, &conversation)
23137            .unwrap();
23138
23139        assert!(storage.get_source("   ").unwrap().is_none());
23140        let source = storage
23141            .get_source("user@work-laptop")
23142            .unwrap()
23143            .expect("normalized remote source row should exist");
23144        assert_eq!(source.kind, SourceKind::Ssh);
23145        assert_eq!(source.host_label.as_deref(), Some("user@work-laptop"));
23146
23147        let conversations = storage.list_conversations(10, 0).unwrap();
23148        assert_eq!(conversations.len(), 1);
23149        assert_eq!(conversations[0].source_id, "user@work-laptop");
23150        assert_eq!(
23151            conversations[0].origin_host.as_deref(),
23152            Some("user@work-laptop")
23153        );
23154    }
23155
23156    #[test]
23157    fn insert_conversations_batched_normalizes_host_only_remote_source_id() {
23158        let dir = TempDir::new().unwrap();
23159        let db_path = dir.path().join("test.db");
23160        let storage = SqliteStorage::open(&db_path).unwrap();
23161
23162        let agent_id = storage
23163            .ensure_agent(&Agent {
23164                id: None,
23165                slug: "codex".into(),
23166                name: "Codex".into(),
23167                version: None,
23168                kind: AgentKind::Cli,
23169            })
23170            .unwrap();
23171
23172        let conversation = Conversation {
23173            id: None,
23174            agent_slug: "codex".into(),
23175            workspace: None,
23176            external_id: Some("batched-blank-remote-source".into()),
23177            title: Some("Batched blank remote source".into()),
23178            source_path: dir.path().join("batched-blank-remote.jsonl"),
23179            started_at: Some(1_700_000_000_000),
23180            ended_at: Some(1_700_000_000_001),
23181            approx_tokens: None,
23182            metadata_json: serde_json::Value::Null,
23183            messages: vec![Message {
23184                id: None,
23185                idx: 0,
23186                role: MessageRole::User,
23187                author: None,
23188                created_at: Some(1_700_000_000_000),
23189                content: "hello".into(),
23190                extra_json: serde_json::Value::Null,
23191                snippets: Vec::new(),
23192            }],
23193            source_id: "   ".into(),
23194            origin_host: Some("user@batch-host".into()),
23195        };
23196
23197        storage
23198            .insert_conversations_batched(&[(agent_id, None, &conversation)])
23199            .unwrap();
23200
23201        assert!(storage.get_source("   ").unwrap().is_none());
23202        let source = storage
23203            .get_source("user@batch-host")
23204            .unwrap()
23205            .expect("normalized batched remote source row should exist");
23206        assert_eq!(source.kind, SourceKind::Ssh);
23207        assert_eq!(source.host_label.as_deref(), Some("user@batch-host"));
23208
23209        let conversations = storage.list_conversations(10, 0).unwrap();
23210        assert_eq!(conversations.len(), 1);
23211        assert_eq!(conversations[0].source_id, "user@batch-host");
23212        assert_eq!(
23213            conversations[0].origin_host.as_deref(),
23214            Some("user@batch-host")
23215        );
23216    }
23217
23218    #[test]
23219    fn get_source_ids_excludes_local() {
23220        let dir = TempDir::new().unwrap();
23221        let db_path = dir.path().join("test.db");
23222        let storage = SqliteStorage::open(&db_path).unwrap();
23223
23224        // Add a non-local source
23225        let source = Source {
23226            id: "remote-1".into(),
23227            kind: SourceKind::Ssh,
23228            host_label: Some("server".into()),
23229            machine_id: None,
23230            platform: None,
23231            config_json: None,
23232            created_at: Some(SqliteStorage::now_millis()),
23233            updated_at: None,
23234        };
23235        storage.upsert_source(&source).unwrap();
23236
23237        let ids = storage.get_source_ids().unwrap();
23238        assert!(!ids.contains(&LOCAL_SOURCE_ID.to_string()));
23239        assert!(ids.contains(&"remote-1".to_string()));
23240    }
23241
23242    // =========================================================================
23243    // Scan timestamp tests (bead yln.4)
23244    // =========================================================================
23245
23246    #[test]
23247    fn get_last_scan_ts_returns_none_initially() {
23248        let dir = TempDir::new().unwrap();
23249        let db_path = dir.path().join("test.db");
23250        let storage = SqliteStorage::open(&db_path).unwrap();
23251
23252        let ts = storage.get_last_scan_ts().unwrap();
23253        assert!(ts.is_none());
23254    }
23255
23256    #[test]
23257    fn set_and_get_last_scan_ts() {
23258        let dir = TempDir::new().unwrap();
23259        let db_path = dir.path().join("test.db");
23260        let storage = SqliteStorage::open(&db_path).unwrap();
23261
23262        let expected_ts = 1700000000000_i64;
23263        storage.set_last_scan_ts(expected_ts).unwrap();
23264
23265        let actual_ts = storage.get_last_scan_ts().unwrap();
23266        assert_eq!(actual_ts, Some(expected_ts));
23267    }
23268
23269    // =========================================================================
23270    // now_millis utility test (bead yln.4)
23271    // =========================================================================
23272
23273    #[test]
23274    fn now_millis_returns_reasonable_value() {
23275        let ts = SqliteStorage::now_millis();
23276        // Should be after Jan 1, 2020 (approx 1577836800000)
23277        assert!(ts > 1577836800000);
23278        // Should be before Jan 1, 2100 (approx 4102444800000)
23279        assert!(ts < 4102444800000);
23280    }
23281
23282    // =========================================================================
23283    // Binary Metadata Serialization Tests (Opt 3.1)
23284    // =========================================================================
23285
23286    #[test]
23287    fn msgpack_roundtrip_basic_object() {
23288        let value = serde_json::json!({
23289            "key": "value",
23290            "number": 42,
23291            "nested": { "inner": true }
23292        });
23293
23294        let bytes = serialize_json_to_msgpack(&value).expect("should serialize");
23295        let recovered = deserialize_msgpack_to_json(&bytes);
23296
23297        assert_eq!(value, recovered);
23298    }
23299
23300    #[test]
23301    fn msgpack_returns_none_for_null() {
23302        let value = serde_json::Value::Null;
23303        assert!(serialize_json_to_msgpack(&value).is_none());
23304    }
23305
23306    #[test]
23307    fn message_insert_stores_null_extra_json_as_sql_null() {
23308        let dir = TempDir::new().unwrap();
23309        let db_path = dir.path().join("test.db");
23310        let storage = SqliteStorage::open(&db_path).unwrap();
23311        let agent_id = storage
23312            .ensure_agent(&Agent {
23313                id: None,
23314                slug: "codex".into(),
23315                name: "Codex".into(),
23316                version: None,
23317                kind: AgentKind::Cli,
23318            })
23319            .unwrap();
23320        let conversation = Conversation {
23321            id: None,
23322            agent_slug: "codex".into(),
23323            workspace: None,
23324            external_id: Some("null-extra-json".into()),
23325            title: Some("Null extra_json".into()),
23326            source_path: PathBuf::from("/tmp/null-extra-json.jsonl"),
23327            started_at: Some(1_700_000_000_000),
23328            ended_at: Some(1_700_000_000_001),
23329            approx_tokens: None,
23330            metadata_json: serde_json::Value::Null,
23331            messages: vec![Message {
23332                id: None,
23333                idx: 0,
23334                role: MessageRole::User,
23335                author: None,
23336                created_at: Some(1_700_000_000_000),
23337                content: "null metadata message".into(),
23338                extra_json: serde_json::Value::Null,
23339                snippets: Vec::new(),
23340            }],
23341            source_id: LOCAL_SOURCE_ID.into(),
23342            origin_host: None,
23343        };
23344
23345        let conversation_id = storage
23346            .insert_conversation_tree(agent_id, None, &conversation)
23347            .unwrap()
23348            .conversation_id;
23349
23350        let (extra_json, extra_bin): (Option<String>, Option<Vec<u8>>) = storage
23351            .conn
23352            .query_row_map(
23353                "SELECT extra_json, extra_bin FROM messages WHERE conversation_id = ?1",
23354                fparams![conversation_id],
23355                |row| Ok((row.get_typed(0)?, row.get_typed(1)?)),
23356            )
23357            .unwrap();
23358        assert!(extra_json.is_none());
23359        assert!(extra_bin.is_none());
23360
23361        let stored = storage.fetch_messages(conversation_id).unwrap();
23362        assert!(stored[0].extra_json.is_null());
23363    }
23364
23365    #[test]
23366    fn message_insert_stores_nonempty_extra_json_as_msgpack_only() {
23367        let dir = TempDir::new().unwrap();
23368        let db_path = dir.path().join("test.db");
23369        let storage = SqliteStorage::open(&db_path).unwrap();
23370        let agent_id = storage
23371            .ensure_agent(&Agent {
23372                id: None,
23373                slug: "codex".into(),
23374                name: "Codex".into(),
23375                version: None,
23376                kind: AgentKind::Cli,
23377            })
23378            .unwrap();
23379        let extra_json = serde_json::json!({ "idx": 7, "kind": "profile" });
23380        let conversation = Conversation {
23381            id: None,
23382            agent_slug: "codex".into(),
23383            workspace: None,
23384            external_id: Some("msgpack-extra-json".into()),
23385            title: Some("MessagePack extra_json".into()),
23386            source_path: PathBuf::from("/tmp/msgpack-extra-json.jsonl"),
23387            started_at: Some(1_700_000_000_000),
23388            ended_at: Some(1_700_000_000_001),
23389            approx_tokens: None,
23390            metadata_json: serde_json::Value::Null,
23391            messages: vec![Message {
23392                id: None,
23393                idx: 0,
23394                role: MessageRole::User,
23395                author: None,
23396                created_at: Some(1_700_000_000_000),
23397                content: "msgpack metadata message".into(),
23398                extra_json: extra_json.clone(),
23399                snippets: Vec::new(),
23400            }],
23401            source_id: LOCAL_SOURCE_ID.into(),
23402            origin_host: None,
23403        };
23404
23405        let conversation_id = storage
23406            .insert_conversation_tree(agent_id, None, &conversation)
23407            .unwrap()
23408            .conversation_id;
23409
23410        let (extra_json_text, extra_bin): (Option<String>, Option<Vec<u8>>) = storage
23411            .conn
23412            .query_row_map(
23413                "SELECT extra_json, extra_bin FROM messages WHERE conversation_id = ?1",
23414                fparams![conversation_id],
23415                |row| Ok((row.get_typed(0)?, row.get_typed(1)?)),
23416            )
23417            .unwrap();
23418        assert!(extra_json_text.is_none());
23419        assert!(extra_bin.is_some());
23420
23421        let stored = storage.fetch_messages(conversation_id).unwrap();
23422        assert_eq!(stored[0].extra_json, extra_json);
23423    }
23424
23425    #[test]
23426    fn conversation_insert_preserves_null_metadata_json_as_json_null() {
23427        let dir = TempDir::new().unwrap();
23428        let db_path = dir.path().join("test.db");
23429        let storage = SqliteStorage::open(&db_path).unwrap();
23430        let agent_id = storage
23431            .ensure_agent(&Agent {
23432                id: None,
23433                slug: "codex".into(),
23434                name: "Codex".into(),
23435                version: None,
23436                kind: AgentKind::Cli,
23437            })
23438            .unwrap();
23439        let conversation = Conversation {
23440            id: None,
23441            agent_slug: "codex".into(),
23442            workspace: None,
23443            external_id: Some("null-conversation-metadata".into()),
23444            title: Some("Null conversation metadata".into()),
23445            source_path: PathBuf::from("/tmp/null-conversation-metadata.jsonl"),
23446            started_at: Some(1_700_000_000_000),
23447            ended_at: Some(1_700_000_000_001),
23448            approx_tokens: None,
23449            metadata_json: serde_json::Value::Null,
23450            messages: vec![Message {
23451                id: None,
23452                idx: 0,
23453                role: MessageRole::User,
23454                author: None,
23455                created_at: Some(1_700_000_000_000),
23456                content: "null conversation metadata message".into(),
23457                extra_json: serde_json::Value::Null,
23458                snippets: Vec::new(),
23459            }],
23460            source_id: LOCAL_SOURCE_ID.into(),
23461            origin_host: None,
23462        };
23463
23464        storage
23465            .insert_conversation_tree(agent_id, None, &conversation)
23466            .unwrap();
23467
23468        let (metadata_json, metadata_bin): (Option<String>, Option<Vec<u8>>) = storage
23469            .conn
23470            .query_row_map(
23471                "SELECT metadata_json, metadata_bin FROM conversations WHERE external_id = ?1",
23472                fparams!["null-conversation-metadata"],
23473                |row| Ok((row.get_typed(0)?, row.get_typed(1)?)),
23474            )
23475            .unwrap();
23476        assert_eq!(metadata_json.as_deref(), Some("null"));
23477        assert!(metadata_bin.is_none());
23478
23479        let listed = storage.list_conversations(10, 0).unwrap();
23480        assert!(listed[0].metadata_json.is_null());
23481    }
23482
23483    #[test]
23484    fn conversation_insert_stores_nonempty_metadata_as_msgpack_only() {
23485        let dir = TempDir::new().unwrap();
23486        let db_path = dir.path().join("test.db");
23487        let storage = SqliteStorage::open(&db_path).unwrap();
23488        let agent_id = storage
23489            .ensure_agent(&Agent {
23490                id: None,
23491                slug: "codex".into(),
23492                name: "Codex".into(),
23493                version: None,
23494                kind: AgentKind::Cli,
23495            })
23496            .unwrap();
23497        let metadata_json = serde_json::json!({ "bench": true, "source": "profile" });
23498        let conversation = Conversation {
23499            id: None,
23500            agent_slug: "codex".into(),
23501            workspace: None,
23502            external_id: Some("msgpack-conversation-metadata".into()),
23503            title: Some("MessagePack conversation metadata".into()),
23504            source_path: PathBuf::from("/tmp/msgpack-conversation-metadata.jsonl"),
23505            started_at: Some(1_700_000_000_000),
23506            ended_at: Some(1_700_000_000_001),
23507            approx_tokens: None,
23508            metadata_json: metadata_json.clone(),
23509            messages: vec![Message {
23510                id: None,
23511                idx: 0,
23512                role: MessageRole::User,
23513                author: None,
23514                created_at: Some(1_700_000_000_000),
23515                content: "msgpack conversation metadata message".into(),
23516                extra_json: serde_json::Value::Null,
23517                snippets: Vec::new(),
23518            }],
23519            source_id: LOCAL_SOURCE_ID.into(),
23520            origin_host: None,
23521        };
23522
23523        storage
23524            .insert_conversation_tree(agent_id, None, &conversation)
23525            .unwrap();
23526
23527        let (metadata_text, metadata_bin): (Option<String>, Option<Vec<u8>>) = storage
23528            .conn
23529            .query_row_map(
23530                "SELECT metadata_json, metadata_bin FROM conversations WHERE external_id = ?1",
23531                fparams!["msgpack-conversation-metadata"],
23532                |row| Ok((row.get_typed(0)?, row.get_typed(1)?)),
23533            )
23534            .unwrap();
23535        assert!(metadata_text.is_none());
23536        assert!(metadata_bin.is_some());
23537
23538        let listed = storage.list_conversations(10, 0).unwrap();
23539        assert_eq!(listed[0].metadata_json, metadata_json);
23540    }
23541
23542    #[test]
23543    fn msgpack_returns_none_for_empty_object() {
23544        let value = serde_json::json!({});
23545        assert!(serialize_json_to_msgpack(&value).is_none());
23546    }
23547
23548    #[test]
23549    fn parse_historical_json_column_preserves_large_payloads_as_raw_json() {
23550        let raw = format!("{{\"blob\":\"{}\"}}", "x".repeat(1_000_000));
23551
23552        let value = parse_historical_json_column(Some(raw.clone()));
23553
23554        assert_eq!(historical_raw_json(&value), Some(raw.as_str()));
23555        assert_eq!(json_value_size_hint(&value), raw.len());
23556    }
23557
23558    #[test]
23559    fn parse_historical_json_column_preserves_small_payloads_as_raw_json() {
23560        let raw = String::from("{\"ok\":true,\"n\":1}");
23561
23562        let value = parse_historical_json_column(Some(raw.clone()));
23563
23564        assert_eq!(historical_raw_json(&value), Some(raw.as_str()));
23565    }
23566
23567    #[test]
23568    fn msgpack_serializes_non_empty_array() {
23569        let value = serde_json::json!([1, 2, 3]);
23570        let bytes = serialize_json_to_msgpack(&value).expect("should serialize array");
23571        let recovered = deserialize_msgpack_to_json(&bytes);
23572        assert_eq!(value, recovered);
23573    }
23574
23575    #[test]
23576    fn msgpack_smaller_than_json() {
23577        let value = serde_json::json!({
23578            "field_name_one": "some_value",
23579            "field_name_two": 123456,
23580            "field_name_three": [1, 2, 3, 4, 5],
23581            "field_name_four": { "nested": true }
23582        });
23583
23584        let json_bytes = serde_json::to_vec(&value).unwrap();
23585        let msgpack_bytes = serialize_json_to_msgpack(&value).unwrap();
23586
23587        // MessagePack should be smaller due to more compact encoding
23588        assert!(
23589            msgpack_bytes.len() < json_bytes.len(),
23590            "MessagePack ({} bytes) should be smaller than JSON ({} bytes)",
23591            msgpack_bytes.len(),
23592            json_bytes.len()
23593        );
23594    }
23595
23596    #[test]
23597    fn migration_v7_adds_binary_columns() {
23598        let dir = TempDir::new().unwrap();
23599        let db_path = dir.path().join("test.db");
23600        let storage = SqliteStorage::open(&db_path).unwrap();
23601
23602        // Verify metadata_bin column exists
23603        let has_metadata_bin = storage
23604            .raw()
23605            .query("PRAGMA table_info(conversations)")
23606            .unwrap()
23607            .iter()
23608            .any(|row| row.get_typed::<String>(1).unwrap() == "metadata_bin");
23609        assert!(
23610            has_metadata_bin,
23611            "conversations should have metadata_bin column"
23612        );
23613
23614        // Verify extra_bin column exists
23615        let has_extra_bin = storage
23616            .raw()
23617            .query("PRAGMA table_info(messages)")
23618            .unwrap()
23619            .iter()
23620            .any(|row| row.get_typed::<String>(1).unwrap() == "extra_bin");
23621        assert!(has_extra_bin, "messages should have extra_bin column");
23622    }
23623
    /// Verifies that `insert_conversation_tree` rebuilds a conversation's
    /// `conversation_tail_state` row when that row has been deleted behind its
    /// back, and that the subsequent append still inserts only the new messages.
    #[test]
    fn insert_conversation_tree_rehydrates_append_tail_state_cache_after_manual_clear() {
        let dir = TempDir::new().unwrap();
        let db_path = dir.path().join("append-tail-state-cache.db");
        let storage = SqliteStorage::open(&db_path).unwrap();
        let agent_id = storage
            .ensure_agent(&Agent {
                id: None,
                slug: "codex".into(),
                name: "Codex".into(),
                version: None,
                kind: AgentKind::Cli,
            })
            .unwrap();
        let workspace = PathBuf::from("/ws/profiled-append-remote");
        let workspace_id = storage.ensure_workspace(&workspace, None).unwrap();

        // NOTE(review): the fixture helper is defined elsewhere; judging by the
        // tail assertion below, (11, 5) presumably yields 5 messages (idx 0..=4)
        // with timestamps in the 111_00x range — confirm against the helper.
        let initial = make_profiled_append_remote_merge_conversation(11, 5);
        let insert_outcome = storage
            .insert_conversation_tree(agent_id, Some(workspace_id), &initial)
            .unwrap();
        let conversation_id = insert_outcome.conversation_id;

        // The first insert must have written a tail-state row for the conversation.
        let initial_tail: (Option<i64>, Option<i64>, Option<i64>) = storage
            .raw()
            .query_row_map(
                "SELECT ended_at, last_message_idx, last_message_created_at
                 FROM conversation_tail_state
                 WHERE conversation_id = ?1",
                fparams![conversation_id],
                |row| Ok((row.get_typed(0)?, row.get_typed(1)?, row.get_typed(2)?)),
            )
            .unwrap();
        assert_eq!(initial_tail, (Some(111_005), Some(4), Some(111_004)));

        // Simulate an out-of-band change: bump `ended_at` on the conversation row
        // directly, then remove the cached tail-state row entirely.
        storage
            .raw()
            .execute_compat(
                "UPDATE conversations SET ended_at = ?1 WHERE id = ?2",
                fparams![111_999_i64, conversation_id],
            )
            .unwrap();
        storage
            .raw()
            .execute_compat(
                "DELETE FROM conversation_tail_state WHERE conversation_id = ?1",
                fparams![conversation_id],
            )
            .unwrap();

        // Re-insert the same conversation with a longer message list; only the
        // messages beyond the already-stored ones should be reported as inserted.
        let appended = make_profiled_append_remote_merge_conversation(11, 10);
        let append_outcome = storage
            .insert_conversation_tree(agent_id, Some(workspace_id), &appended)
            .unwrap();
        assert_eq!(append_outcome.inserted_indices, vec![5, 6, 7, 8, 9]);

        // The tail-state row must exist again, reflect the new last message, and
        // carry the manually bumped `ended_at` value.
        let final_tail: (Option<i64>, Option<i64>, Option<i64>) = storage
            .raw()
            .query_row_map(
                "SELECT ended_at, last_message_idx, last_message_created_at
                 FROM conversation_tail_state
                 WHERE conversation_id = ?1",
                fparams![conversation_id],
                |row| Ok((row.get_typed(0)?, row.get_typed(1)?, row.get_typed(2)?)),
            )
            .unwrap();
        assert_eq!(final_tail, (Some(111_999), Some(9), Some(111_009)));
    }
23692
23693    #[test]
23694    fn msgpack_deserialize_empty_returns_default() {
23695        let recovered = deserialize_msgpack_to_json(&[]);
23696        assert_eq!(recovered, serde_json::Value::Object(serde_json::Map::new()));
23697    }
23698
23699    #[test]
23700    fn msgpack_deserialize_garbage_returns_default() {
23701        // Use truncated msgpack data that will fail to parse
23702        // 0x85 indicates a fixmap with 5 elements, but we don't provide them
23703        let recovered = deserialize_msgpack_to_json(&[0x85]);
23704        assert_eq!(recovered, serde_json::Value::Object(serde_json::Map::new()));
23705    }
23706
    /// Records three raw stats entries and checks that `expand()` produces the
    /// expected number of (day, agent, source) rows, including the "all"
    /// roll-ups, with correctly summed deltas for the day-100 all/all row.
    #[test]
    fn stats_aggregator_collects_and_expands() {
        let mut agg = StatsAggregator::new();
        assert!(agg.is_empty());

        // Record three raw entries (one session each):
        // day 100, agent "claude", source "local": 5 msgs, 500 chars
        agg.record("claude", "local", 100, 5, 500);
        // day 100, agent "codex", source "local": 3 msgs, 300 chars
        agg.record("codex", "local", 100, 3, 300);
        // day 101, agent "claude", source "local": 2 msgs, 200 chars
        agg.record("claude", "local", 101, 2, 200);

        assert!(!agg.is_empty());
        assert_eq!(agg.raw_entry_count(), 3);

        let entries = agg.expand();
        // Each raw entry fans out into 4 keys for its day:
        //   (agent, source), (all, source), (agent, all), (all, all)
        // Keys shared by several raw entries are merged by summing their deltas.
        //
        // Day 100 (two raw entries: claude/local and codex/local):
        // - (100, claude, local): 1 sess, 5 msgs, 500 chars
        // - (100, claude, all):   1 sess, 5 msgs, 500 chars
        // - (100, codex, local):  1 sess, 3 msgs, 300 chars
        // - (100, codex, all):    1 sess, 3 msgs, 300 chars
        // - (100, all, local):    claude + codex = 2 sess, 8 msgs, 800 chars
        // - (100, all, all):      claude + codex = 2 sess, 8 msgs, 800 chars
        //
        // Day 101 (one raw entry: claude/local):
        // - (101, claude, local): 1 sess, 2 msgs, 200 chars
        // - (101, claude, all):   1 sess, 2 msgs, 200 chars
        // - (101, all, local):    1 sess, 2 msgs, 200 chars
        // - (101, all, all):      1 sess, 2 msgs, 200 chars
        //
        // Unique keys after merging: 6 for day 100 + 4 for day 101 = 10 entries.

        assert_eq!(entries.len(), 10);

        // Spot-check the merged totals on the day-100 all/all roll-up row.
        let day100_all = entries
            .iter()
            .find(|(d, a, s, _)| *d == 100 && a == "all" && s == "all")
            .unwrap();
        assert_eq!(day100_all.3.session_count_delta, 2);
        assert_eq!(day100_all.3.message_count_delta, 8);
        assert_eq!(day100_all.3.total_chars_delta, 800);
    }
23762
23763    // =========================================================================
23764    // LazyFrankenDb tests (bd-1ueu)
23765    // =========================================================================
23766
23767    #[test]
23768    fn lazy_franken_db_not_open_before_get() {
23769        let dir = TempDir::new().unwrap();
23770        let db_path = dir.path().join("lazy_test.db");
23771
23772        // Create a real DB so the path exists
23773        let _storage = SqliteStorage::open(&db_path).unwrap();
23774
23775        let lazy = LazyFrankenDb::new(db_path);
23776        assert!(
23777            !lazy.is_open(),
23778            "LazyFrankenDb must not open on construction"
23779        );
23780    }
23781
23782    #[test]
23783    fn lazy_franken_db_opens_on_first_get() {
23784        let dir = TempDir::new().unwrap();
23785        let db_path = dir.path().join("lazy_test.db");
23786
23787        // Create a real DB so the path exists
23788        let _storage = SqliteStorage::open(&db_path).unwrap();
23789        drop(_storage);
23790
23791        let lazy = LazyFrankenDb::new(db_path);
23792        assert!(!lazy.is_open());
23793
23794        let conn = lazy.get("test").expect("should open successfully");
23795        let count: i64 = conn
23796            .query_row_map("SELECT COUNT(*) FROM conversations", fparams![], |r| {
23797                r.get_typed(0)
23798            })
23799            .unwrap();
23800        assert_eq!(count, 0);
23801        drop(conn);
23802
23803        assert!(lazy.is_open(), "LazyFrankenDb must be open after get()");
23804    }
23805
23806    #[test]
23807    fn lazy_franken_db_reuses_connection() {
23808        let dir = TempDir::new().unwrap();
23809        let db_path = dir.path().join("lazy_test.db");
23810        let _storage = SqliteStorage::open(&db_path).unwrap();
23811        drop(_storage);
23812
23813        let lazy = LazyFrankenDb::new(db_path);
23814
23815        // First access opens
23816        {
23817            let conn = lazy.get("first").unwrap();
23818            conn.execute_batch("CREATE TABLE IF NOT EXISTS test_tbl (id INTEGER)")
23819                .unwrap();
23820        }
23821
23822        // Second access reuses (table still exists)
23823        {
23824            let conn = lazy.get("second").unwrap();
23825            let count: i64 = conn
23826                .query_row_map("SELECT COUNT(*) FROM test_tbl", fparams![], |r| {
23827                    r.get_typed(0)
23828                })
23829                .unwrap();
23830            assert_eq!(count, 0);
23831        }
23832    }
23833
23834    #[test]
23835    fn lazy_franken_db_not_found_error() {
23836        let dir = TempDir::new().unwrap();
23837        let db_path = dir.path().join("nonexistent.db");
23838
23839        let lazy = LazyFrankenDb::new(db_path);
23840        let result = lazy.get("test");
23841        assert!(result.is_err());
23842        assert!(
23843            matches!(result.unwrap_err(), LazyDbError::NotFound(_)),
23844            "should return NotFound for missing DB"
23845        );
23846    }
23847
23848    #[test]
23849    fn lazy_franken_db_path_accessor() {
23850        let path = PathBuf::from("/tmp/test_lazy.db");
23851        let lazy = LazyFrankenDb::new(path.clone());
23852        assert_eq!(lazy.path(), path.as_path());
23853    }
23854
23855    // =========================================================================
23856    // Pricing / cost estimation tests (bead z9fse.10)
23857    // =========================================================================
23858
23859    #[test]
23860    fn sql_like_match_basic_patterns() {
23861        assert!(sql_like_match("claude-opus-4-20250101", "claude-opus-4%"));
23862        assert!(sql_like_match("claude-opus-4", "claude-opus-4%"));
23863        assert!(!sql_like_match("claude-sonnet-4", "claude-opus-4%"));
23864
23865        // Middle wildcard (gemini pattern)
23866        assert!(sql_like_match("gemini-2.0-flash-001", "gemini-2%flash%"));
23867        assert!(sql_like_match("gemini-2-flash", "gemini-2%flash%"));
23868        assert!(!sql_like_match("gemini-2-pro", "gemini-2%flash%"));
23869
23870        // Exact match
23871        assert!(sql_like_match("hello", "hello"));
23872        assert!(!sql_like_match("hello!", "hello"));
23873
23874        // Underscore wildcard
23875        assert!(sql_like_match("gpt-4o", "gpt-4_"));
23876        assert!(!sql_like_match("gpt-4oo", "gpt-4_"));
23877
23878        // Case insensitive
23879        assert!(sql_like_match("Claude-Opus-4", "claude-opus-4%"));
23880    }
23881
23882    #[test]
23883    fn date_str_to_day_id_converts_correctly() {
23884        // 2025-10-01 is 2100 days after 2020-01-01
23885        assert_eq!(date_str_to_day_id("2025-10-01").unwrap(), 2100);
23886        // 2024-04-01 is 1552 days after 2020-01-01
23887        assert_eq!(date_str_to_day_id("2024-04-01").unwrap(), 1552);
23888        assert!(date_str_to_day_id("invalid").is_err());
23889    }
23890
23891    #[test]
23892    fn pricing_table_lookup_selects_matching_entry() {
23893        let effective_day = date_str_to_day_id("2025-10-01").unwrap();
23894        let lookup_day = date_str_to_day_id("2026-02-06").unwrap();
23895        let table = PricingTable {
23896            entries: vec![
23897                PricingEntry {
23898                    model_pattern: "claude-opus-4%".into(),
23899                    provider: "anthropic".into(),
23900                    input_cost_per_mtok: 15.0,
23901                    output_cost_per_mtok: 75.0,
23902                    cache_read_cost_per_mtok: Some(1.5),
23903                    cache_creation_cost_per_mtok: Some(18.75),
23904                    effective_day_id: effective_day,
23905                },
23906                PricingEntry {
23907                    model_pattern: "claude-sonnet-4%".into(),
23908                    provider: "anthropic".into(),
23909                    input_cost_per_mtok: 3.0,
23910                    output_cost_per_mtok: 15.0,
23911                    cache_read_cost_per_mtok: Some(0.3),
23912                    cache_creation_cost_per_mtok: Some(3.75),
23913                    effective_day_id: effective_day,
23914                },
23915            ],
23916        };
23917
23918        let result = table.lookup("claude-opus-4-20260101", lookup_day);
23919        assert!(result.is_some());
23920        assert_eq!(result.unwrap().input_cost_per_mtok, 15.0);
23921
23922        let result = table.lookup("claude-sonnet-4-latest", lookup_day);
23923        assert!(result.is_some());
23924        assert_eq!(result.unwrap().input_cost_per_mtok, 3.0);
23925
23926        assert!(table.lookup("unknown-model", lookup_day).is_none());
23927    }
23928
23929    #[test]
23930    fn pricing_table_lookup_respects_effective_date() {
23931        let effective_day_1 = date_str_to_day_id("2025-10-01").unwrap();
23932        let effective_day_2 = date_str_to_day_id("2026-01-01").unwrap();
23933        let table = PricingTable {
23934            entries: vec![
23935                PricingEntry {
23936                    model_pattern: "claude-opus-4%".into(),
23937                    provider: "anthropic".into(),
23938                    input_cost_per_mtok: 15.0,
23939                    output_cost_per_mtok: 75.0,
23940                    cache_read_cost_per_mtok: None,
23941                    cache_creation_cost_per_mtok: None,
23942                    effective_day_id: effective_day_1,
23943                },
23944                PricingEntry {
23945                    model_pattern: "claude-opus-4%".into(),
23946                    provider: "anthropic".into(),
23947                    input_cost_per_mtok: 12.0,
23948                    output_cost_per_mtok: 60.0,
23949                    cache_read_cost_per_mtok: None,
23950                    cache_creation_cost_per_mtok: None,
23951                    effective_day_id: effective_day_2,
23952                },
23953            ],
23954        };
23955
23956        // Before price drop
23957        let result = table.lookup("claude-opus-4", date_str_to_day_id("2025-11-01").unwrap());
23958        assert!(result.is_some());
23959        assert_eq!(result.unwrap().input_cost_per_mtok, 15.0);
23960
23961        // After price drop
23962        let result = table.lookup("claude-opus-4", date_str_to_day_id("2026-02-01").unwrap());
23963        assert!(result.is_some());
23964        assert_eq!(result.unwrap().input_cost_per_mtok, 12.0);
23965
23966        // Before all pricing
23967        assert!(
23968            table
23969                .lookup("claude-opus-4", date_str_to_day_id("2024-01-01").unwrap())
23970                .is_none()
23971        );
23972    }
23973
23974    #[test]
23975    fn pricing_table_lookup_specificity_tiebreak() {
23976        let effective_day = date_str_to_day_id("2025-01-01").unwrap();
23977        let lookup_day = date_str_to_day_id("2026-01-01").unwrap();
23978        let table = PricingTable {
23979            entries: vec![
23980                PricingEntry {
23981                    model_pattern: "gpt-4%".into(),
23982                    provider: "openai".into(),
23983                    input_cost_per_mtok: 10.0,
23984                    output_cost_per_mtok: 30.0,
23985                    cache_read_cost_per_mtok: None,
23986                    cache_creation_cost_per_mtok: None,
23987                    effective_day_id: effective_day,
23988                },
23989                PricingEntry {
23990                    model_pattern: "gpt-4-turbo%".into(),
23991                    provider: "openai".into(),
23992                    input_cost_per_mtok: 5.0,
23993                    output_cost_per_mtok: 15.0,
23994                    cache_read_cost_per_mtok: None,
23995                    cache_creation_cost_per_mtok: None,
23996                    effective_day_id: effective_day,
23997                },
23998            ],
23999        };
24000
24001        // Longer pattern wins for specific model
24002        let result = table.lookup("gpt-4-turbo-2025", lookup_day);
24003        assert!(result.is_some());
24004        assert_eq!(result.unwrap().input_cost_per_mtok, 5.0);
24005
24006        // Shorter pattern matches broader model
24007        let result = table.lookup("gpt-4o", lookup_day);
24008        assert!(result.is_some());
24009        assert_eq!(result.unwrap().input_cost_per_mtok, 10.0);
24010    }
24011
24012    #[test]
24013    fn pricing_table_compute_cost_basic() {
24014        let effective_day = date_str_to_day_id("2025-10-01").unwrap();
24015        let table = PricingTable {
24016            entries: vec![PricingEntry {
24017                model_pattern: "claude-opus-4%".into(),
24018                provider: "anthropic".into(),
24019                input_cost_per_mtok: 15.0,
24020                output_cost_per_mtok: 75.0,
24021                cache_read_cost_per_mtok: Some(1.5),
24022                cache_creation_cost_per_mtok: Some(18.75),
24023                effective_day_id: effective_day,
24024            }],
24025        };
24026
24027        let cost = table.compute_cost(
24028            Some("claude-opus-4-latest"),
24029            date_str_to_day_id("2026-02-06").unwrap(),
24030            Some(1000),
24031            Some(500),
24032            None,
24033            None,
24034        );
24035        assert!(cost.is_some());
24036        // 1000 * 15.0 / 1M + 500 * 75.0 / 1M = 0.015 + 0.0375 = 0.0525
24037        assert!((cost.unwrap() - 0.0525).abs() < 1e-10);
24038    }
24039
24040    #[test]
24041    fn pricing_table_compute_cost_with_cache() {
24042        let effective_day = date_str_to_day_id("2025-10-01").unwrap();
24043        let table = PricingTable {
24044            entries: vec![PricingEntry {
24045                model_pattern: "claude-opus-4%".into(),
24046                provider: "anthropic".into(),
24047                input_cost_per_mtok: 15.0,
24048                output_cost_per_mtok: 75.0,
24049                cache_read_cost_per_mtok: Some(1.5),
24050                cache_creation_cost_per_mtok: Some(18.75),
24051                effective_day_id: effective_day,
24052            }],
24053        };
24054
24055        let cost = table.compute_cost(
24056            Some("claude-opus-4-latest"),
24057            date_str_to_day_id("2026-02-06").unwrap(),
24058            Some(1_000_000),
24059            Some(100_000),
24060            Some(500_000),
24061            Some(200_000),
24062        );
24063        assert!(cost.is_some());
24064        // input excludes cache tokens to avoid double-charging them at both the
24065        // full input rate and the cache-specific rates.
24066        // non-cache input: 300K * 15/1M = 4.5, output: 100K * 75/1M = 7.5
24067        // cache_read: 500K * 1.5/1M = 0.75, cache_creation: 200K * 18.75/1M = 3.75
24068        // total = 16.5
24069        assert!((cost.unwrap() - 16.5).abs() < 1e-10);
24070    }
24071
24072    #[test]
24073    fn pricing_table_compute_cost_returns_none_for_unknown_model() {
24074        let effective_day = date_str_to_day_id("2025-10-01").unwrap();
24075        let lookup_day = date_str_to_day_id("2026-02-06").unwrap();
24076        let table = PricingTable {
24077            entries: vec![PricingEntry {
24078                model_pattern: "claude-opus-4%".into(),
24079                provider: "anthropic".into(),
24080                input_cost_per_mtok: 15.0,
24081                output_cost_per_mtok: 75.0,
24082                cache_read_cost_per_mtok: None,
24083                cache_creation_cost_per_mtok: None,
24084                effective_day_id: effective_day,
24085            }],
24086        };
24087
24088        assert!(
24089            table
24090                .compute_cost(
24091                    Some("unknown-model"),
24092                    lookup_day,
24093                    Some(1000),
24094                    Some(500),
24095                    None,
24096                    None
24097                )
24098                .is_none()
24099        );
24100        assert!(
24101            table
24102                .compute_cost(None, lookup_day, Some(1000), Some(500), None, None)
24103                .is_none()
24104        );
24105        assert!(
24106            table
24107                .compute_cost(Some("claude-opus-4"), lookup_day, None, None, None, None)
24108                .is_none()
24109        );
24110    }
24111
24112    #[test]
24113    fn pricing_table_load_from_db() {
24114        let dir = TempDir::new().unwrap();
24115        let db_path = dir.path().join("test.db");
24116        let storage = SqliteStorage::open(&db_path).unwrap();
24117
24118        let table = PricingTable::load(&storage.conn).unwrap();
24119        assert!(!table.is_empty());
24120
24121        let lookup_day = date_str_to_day_id("2026-02-06").unwrap();
24122
24123        let opus = table.lookup("claude-opus-4-latest", lookup_day);
24124        assert!(opus.is_some());
24125        assert_eq!(opus.unwrap().input_cost_per_mtok, 15.0);
24126
24127        let flash = table.lookup("gemini-2.0-flash-001", lookup_day);
24128        assert!(flash.is_some());
24129        assert_eq!(flash.unwrap().input_cost_per_mtok, 0.075);
24130    }
24131
24132    #[test]
24133    fn pricing_table_load_rejects_invalid_effective_date() {
24134        let dir = TempDir::new().unwrap();
24135        let db_path = dir.path().join("test.db");
24136        let storage = SqliteStorage::open(&db_path).unwrap();
24137
24138        storage
24139            .conn
24140            .execute_compat(
24141                "INSERT INTO model_pricing (
24142                    model_pattern, provider, input_cost_per_mtok, output_cost_per_mtok,
24143                    cache_read_cost_per_mtok, cache_creation_cost_per_mtok, effective_date
24144                 ) VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7)",
24145                fparams![
24146                    "broken-model%",
24147                    "test",
24148                    1.0_f64,
24149                    2.0_f64,
24150                    Option::<f64>::None,
24151                    Option::<f64>::None,
24152                    "not-a-date"
24153                ],
24154            )
24155            .unwrap();
24156
24157        let err = PricingTable::load(&storage.conn).unwrap_err();
24158        assert!(err.to_string().contains("invalid effective_date"));
24159    }
24160
24161    #[test]
24162    fn pricing_diagnostics_tracks_coverage() {
24163        let mut diag = PricingDiagnostics::default();
24164        diag.record_priced();
24165        diag.record_priced();
24166        diag.record_unpriced(Some("custom-model-v1"));
24167        diag.record_unpriced(Some("custom-model-v1"));
24168        diag.record_unpriced(None);
24169
24170        assert_eq!(diag.priced_count, 2);
24171        assert_eq!(diag.unpriced_count, 3);
24172        assert_eq!(diag.unknown_models.len(), 2);
24173        assert_eq!(diag.unknown_models["custom-model-v1"], 2);
24174        assert_eq!(diag.unknown_models["(none)"], 1);
24175    }
24176
24177    // =========================================================================
24178    // FrankenStorage migration tests (bead 2j6p6)
24179    // =========================================================================
24180
24181    /// Helper: create a FrankenStorage wrapping an in-memory connection and
24182    /// run migrations. This exercises the same code path as `open()` but avoids
24183    /// frankensqlite's file-based autoindex renaming limitation (V5 uses
24184    /// ALTER TABLE RENAME which triggers sqlite_autoindex lookup issues on
24185    /// file-based pagers).
24186    fn franken_storage_in_memory() -> FrankenStorage {
24187        let conn = FrankenConnection::open(":memory:").unwrap();
24188        let storage = FrankenStorage::new(conn, PathBuf::from(":memory:"));
24189        storage.run_migrations().unwrap();
24190        storage.apply_config().unwrap();
24191        storage
24192    }
24193
24194    #[test]
24195    fn franken_migrations_create_all_tables() {
24196        let storage = franken_storage_in_memory();
24197
24198        // Should be at CURRENT_SCHEMA_VERSION.
24199        let version = storage.schema_version().unwrap();
24200        assert_eq!(
24201            version, CURRENT_SCHEMA_VERSION,
24202            "fresh FrankenStorage should be at current schema version"
24203        );
24204
24205        // Core tables from V1 should exist.
24206        let rows = storage
24207            .raw()
24208            .query("SELECT name FROM sqlite_master WHERE type='table' ORDER BY name;")
24209            .unwrap();
24210        let table_names: Vec<String> = rows
24211            .iter()
24212            .filter_map(|r| r.get_typed::<String>(0).ok())
24213            .collect();
24214
24215        for required in [
24216            "meta",
24217            "agents",
24218            "workspaces",
24219            "conversations",
24220            "messages",
24221            "snippets",
24222            "tags",
24223            "conversation_tags",
24224        ] {
24225            assert!(
24226                table_names.contains(&required.to_string()),
24227                "missing table: {required}"
24228            );
24229        }
24230
24231        // V4 sources table.
24232        assert!(
24233            table_names.contains(&"sources".to_string()),
24234            "missing sources table"
24235        );
24236
24237        // V8 daily_stats table.
24238        assert!(
24239            table_names.contains(&"daily_stats".to_string()),
24240            "missing daily_stats table"
24241        );
24242
24243        // V9 embedding_jobs table.
24244        assert!(
24245            table_names.contains(&"embedding_jobs".to_string()),
24246            "missing embedding_jobs table"
24247        );
24248
24249        // V11 message_metrics, usage_hourly, usage_daily tables.
24250        for analytics_table in ["message_metrics", "usage_hourly", "usage_daily"] {
24251            assert!(
24252                table_names.contains(&analytics_table.to_string()),
24253                "missing table: {analytics_table}"
24254            );
24255        }
24256        assert!(
24257            table_names.contains(&"conversation_tail_state".to_string()),
24258            "missing conversation_tail_state table"
24259        );
24260        assert!(
24261            table_names.contains(&"conversation_external_lookup".to_string()),
24262            "missing conversation_external_lookup table"
24263        );
24264        assert!(
24265            table_names.contains(&"conversation_external_tail_lookup".to_string()),
24266            "missing conversation_external_tail_lookup table"
24267        );
24268
24269        // Fresh frankensqlite databases should record the combined V13 base
24270        // schema plus every additive post-V13 migration.
24271        let rows = storage
24272            .raw()
24273            .query("SELECT COUNT(*) FROM _schema_migrations;")
24274            .unwrap();
24275        let count: i64 = rows.first().unwrap().get_typed(0).unwrap();
24276        assert_eq!(
24277            count,
24278            (13..=CURRENT_SCHEMA_VERSION).count() as i64,
24279            "_schema_migrations should record the V13 base schema and post-V13 migrations"
24280        );
24281
24282        // The latest applied migration should be the current schema version.
24283        let rows = storage
24284            .raw()
24285            .query("SELECT version FROM _schema_migrations ORDER BY version;")
24286            .unwrap();
24287        let versions: Vec<i64> = rows
24288            .iter()
24289            .map(|row| row.get_typed(0))
24290            .collect::<std::result::Result<_, _>>()
24291            .unwrap();
24292        assert_eq!(
24293            versions,
24294            (13..=CURRENT_SCHEMA_VERSION).collect::<Vec<i64>>(),
24295            "_schema_migrations should contain v13 through current"
24296        );
24297    }
24298
24299    #[test]
24300    fn franken_migrations_idempotent() {
24301        let storage = franken_storage_in_memory();
24302        assert_eq!(storage.schema_version().unwrap(), CURRENT_SCHEMA_VERSION);
24303
24304        // Re-running migrations on the same connection is a no-op.
24305        storage.run_migrations().unwrap();
24306        assert_eq!(storage.schema_version().unwrap(), CURRENT_SCHEMA_VERSION);
24307    }
24308
24309    #[test]
24310    fn migration_v20_backfills_conversation_external_tail_lookup() {
24311        let storage = franken_storage_in_memory();
24312        let agent_id = storage
24313            .ensure_agent(&Agent {
24314                id: None,
24315                slug: "codex".into(),
24316                name: "Codex".into(),
24317                version: None,
24318                kind: AgentKind::Cli,
24319            })
24320            .unwrap();
24321        let workspace_id = storage
24322            .ensure_workspace(&PathBuf::from("/ws/profiled-storage-remote"), None)
24323            .unwrap();
24324        let mut conv = make_profiled_storage_remote_conversation(1919, 2);
24325        conv.source_id = "profiled-storage-remote-source-東京".into();
24326        conv.external_id = Some("profiled-storage-remote-☃-1919".into());
24327        let outcome = storage
24328            .insert_conversation_tree(agent_id, Some(workspace_id), &conv)
24329            .unwrap();
24330        let external_id = conv.external_id.as_deref().unwrap();
24331        let lookup_key = conversation_external_lookup_key(&conv.source_id, agent_id, external_id);
24332
24333        storage
24334            .raw()
24335            .execute("DELETE FROM conversation_external_tail_lookup")
24336            .unwrap();
24337        storage
24338            .raw()
24339            .execute("DELETE FROM _schema_migrations WHERE version = 20")
24340            .unwrap();
24341        storage
24342            .raw()
24343            .execute_compat(
24344                "UPDATE meta SET value = ?1 WHERE key = 'schema_version'",
24345                fparams!["19"],
24346            )
24347            .unwrap();
24348
24349        storage.run_migrations().unwrap();
24350
24351        let backfilled: (i64, Option<i64>, Option<i64>, Option<i64>) = storage
24352            .raw()
24353            .query_row_map(
24354                "SELECT conversation_id, ended_at, last_message_idx, last_message_created_at
24355                 FROM conversation_external_tail_lookup
24356                 WHERE lookup_key = ?1",
24357                fparams![lookup_key.as_str()],
24358                |row| {
24359                    Ok((
24360                        row.get_typed(0)?,
24361                        row.get_typed(1)?,
24362                        row.get_typed(2)?,
24363                        row.get_typed(3)?,
24364                    ))
24365                },
24366            )
24367            .unwrap();
24368        assert_eq!(
24369            backfilled,
24370            (
24371                outcome.conversation_id,
24372                conv.ended_at,
24373                Some(1),
24374                conv.messages[1].created_at
24375            )
24376        );
24377        assert_eq!(storage.schema_version().unwrap(), CURRENT_SCHEMA_VERSION);
24378    }
24379
24380    #[test]
24381    fn migration_v15_creates_lazy_tail_state_cache() {
24382        let conn = FrankenConnection::open(":memory:").unwrap();
24383        conn.execute_batch(
24384            "CREATE TABLE conversations (
24385                 id INTEGER PRIMARY KEY,
24386                 ended_at INTEGER
24387             );
24388             CREATE TABLE messages (
24389                 id INTEGER PRIMARY KEY,
24390                 conversation_id INTEGER NOT NULL,
24391                 idx INTEGER NOT NULL,
24392                 created_at INTEGER
24393             );
24394             INSERT INTO conversations(id, ended_at) VALUES
24395                 (1, 1710000000300),
24396                 (2, NULL);
24397             INSERT INTO messages(id, conversation_id, idx, created_at) VALUES
24398                 (10, 1, 0, 1710000000100),
24399                 (11, 1, 1, 1710000000200),
24400                 (12, 2, 0, 1710000000400);",
24401        )
24402        .unwrap();
24403
24404        conn.execute(
24405            "CREATE TABLE _schema_migrations (
24406                version INTEGER PRIMARY KEY,
24407                name TEXT NOT NULL,
24408                applied_at TEXT NOT NULL DEFAULT (strftime('%Y-%m-%dT%H:%M:%SZ', 'now'))
24409             );",
24410        )
24411        .unwrap();
24412
24413        assert!(
24414            apply_conversation_tail_state_cache_migration(&conn).unwrap(),
24415            "v15 migration should apply once"
24416        );
24417        assert!(
24418            !apply_conversation_tail_state_cache_migration(&conn).unwrap(),
24419            "v15 migration should be idempotent once recorded"
24420        );
24421
24422        let columns = conn.query("PRAGMA table_info(conversations);").unwrap();
24423        let column_names: HashSet<String> = columns
24424            .iter()
24425            .map(|row| row.get_typed(1))
24426            .collect::<std::result::Result<_, frankensqlite::FrankenError>>()
24427            .unwrap();
24428        assert!(column_names.contains("last_message_idx"));
24429        assert!(column_names.contains("last_message_created_at"));
24430
24431        let tail_rows: i64 = conn
24432            .query("SELECT COUNT(*) FROM conversation_tail_state;")
24433            .unwrap()
24434            .first()
24435            .unwrap()
24436            .get_typed(0)
24437            .unwrap();
24438        assert_eq!(
24439            tail_rows, 0,
24440            "v15 should create the cache without an open-time message scan"
24441        );
24442
24443        let applied: i64 = conn
24444            .query("SELECT COUNT(*) FROM _schema_migrations WHERE version = 15;")
24445            .unwrap()
24446            .first()
24447            .unwrap()
24448            .get_typed(0)
24449            .unwrap();
24450        assert_eq!(applied, 1);
24451    }
24452
24453    #[test]
24454    fn schema_repair_adds_missing_conversations_token_columns() {
24455        let conn = FrankenConnection::open(":memory:").unwrap();
24456        conn.execute_batch(
24457            "CREATE TABLE conversations (
24458                 id INTEGER PRIMARY KEY,
24459                 agent_id INTEGER NOT NULL,
24460                 source_path TEXT NOT NULL
24461             );",
24462        )
24463        .unwrap();
24464        let storage = FrankenStorage::new(conn, std::path::PathBuf::from(":memory:"));
24465
24466        storage.repair_missing_conversation_token_columns().unwrap();
24467        storage.repair_missing_conversation_token_columns().unwrap();
24468
24469        let columns = franken_table_column_names(&storage.conn, "conversations").unwrap();
24470        for &(column_name, _) in REQUIRED_CONVERSATION_TOKEN_COLUMNS {
24471            assert!(
24472                columns.contains(column_name),
24473                "schema repair should add conversations.{column_name}"
24474            );
24475        }
24476    }
24477
24478    #[test]
24479    fn franken_meta_schema_version_in_sync() {
24480        let storage = franken_storage_in_memory();
24481
24482        // meta.schema_version should be kept in sync.
24483        let rows = storage
24484            .raw()
24485            .query("SELECT value FROM meta WHERE key = 'schema_version';")
24486            .unwrap();
24487        let meta_version: String = rows.first().unwrap().get_typed(0).unwrap();
24488        assert_eq!(
24489            meta_version,
24490            CURRENT_SCHEMA_VERSION.to_string(),
24491            "meta.schema_version should match CURRENT_SCHEMA_VERSION"
24492        );
24493    }
24494
24495    #[test]
24496    fn franken_transition_from_meta_version() {
24497        let dir = TempDir::new().unwrap();
24498        let db_path = dir.path().join("test_transition.db");
24499
24500        // Simulate an existing database created by SqliteStorage at version 10.
24501        // We create just enough schema to test the transition.
24502        let conn = FrankenConnection::open(db_path.to_string_lossy().to_string()).unwrap();
24503        conn.execute("CREATE TABLE meta (key TEXT PRIMARY KEY, value TEXT NOT NULL);")
24504            .unwrap();
24505        conn.execute("INSERT INTO meta(key, value) VALUES('schema_version', '10');")
24506            .unwrap();
24507        // Create a dummy conversations table so transition doesn't think it's corrupted.
24508        conn.execute("CREATE TABLE conversations (id INTEGER PRIMARY KEY);")
24509            .unwrap();
24510        drop(conn);
24511
24512        // Now run the transition function.
24513        let conn = FrankenConnection::open(db_path.to_string_lossy().to_string()).unwrap();
24514        transition_from_meta_version(&conn).unwrap();
24515
24516        // The frankensqlite path uses a combined V13 base migration, so a
24517        // legacy V10 marker is bridged to V13 and later idempotent repair fills
24518        // in any missing V11-V13 objects.
24519        let rows = conn
24520            .query("SELECT version FROM _schema_migrations ORDER BY version;")
24521            .unwrap();
24522        let versions: Vec<i64> = rows.iter().filter_map(|r| r.get_typed(0).ok()).collect();
24523        assert_eq!(
24524            versions,
24525            (1..=13).collect::<Vec<i64>>(),
24526            "transition should bridge legacy V10 databases through the combined V13 base marker"
24527        );
24528    }
24529
24530    #[test]
24531    fn franken_transition_from_current_meta_backfills_current_schema_marker() {
24532        let dir = TempDir::new().unwrap();
24533        let db_path = dir.path().join("test_current_transition.db");
24534
24535        let conn = FrankenConnection::open(db_path.to_string_lossy().to_string()).unwrap();
24536        conn.execute("CREATE TABLE meta (key TEXT PRIMARY KEY, value TEXT NOT NULL);")
24537            .unwrap();
24538        conn.execute_compat(
24539            "INSERT INTO meta(key, value) VALUES('schema_version', ?1);",
24540            &[ParamValue::from(CURRENT_SCHEMA_VERSION.to_string())],
24541        )
24542        .unwrap();
24543        conn.execute("CREATE TABLE conversations (id INTEGER PRIMARY KEY);")
24544            .unwrap();
24545        drop(conn);
24546
24547        let conn = FrankenConnection::open(db_path.to_string_lossy().to_string()).unwrap();
24548        transition_from_meta_version(&conn).unwrap();
24549
24550        let rows = conn
24551            .query("SELECT version FROM _schema_migrations ORDER BY version;")
24552            .unwrap();
24553        let versions: Vec<i64> = rows.iter().filter_map(|r| r.get_typed(0).ok()).collect();
24554        assert_eq!(
24555            versions,
24556            (1..=CURRENT_SCHEMA_VERSION).collect::<Vec<i64>>(),
24557            "current meta schema marker should backfill every known migration"
24558        );
24559    }
24560
24561    #[test]
24562    fn franken_transition_skips_when_already_done() {
24563        let dir = TempDir::new().unwrap();
24564        let db_path = dir.path().join("test_transition_skip.db");
24565
24566        // Create a DB that already has _schema_migrations.
24567        let conn = FrankenConnection::open(db_path.to_string_lossy().to_string()).unwrap();
24568        conn.execute(
24569            "CREATE TABLE _schema_migrations (version INTEGER PRIMARY KEY, name TEXT NOT NULL, applied_at TEXT NOT NULL DEFAULT 'now');",
24570        ).unwrap();
24571        conn.execute("INSERT INTO _schema_migrations (version, name) VALUES (1, 'test');")
24572            .unwrap();
24573
24574        // Transition should be a no-op.
24575        transition_from_meta_version(&conn).unwrap();
24576
24577        // Should still have exactly 1 entry.
24578        let rows = conn
24579            .query("SELECT COUNT(*) FROM _schema_migrations;")
24580            .unwrap();
24581        let count: i64 = rows.first().unwrap().get_typed(0).unwrap();
24582        assert_eq!(
24583            count, 1,
24584            "transition should not re-run on already-transitioned DB"
24585        );
24586    }
24587
24588    #[test]
24589    fn franken_transition_fresh_db_is_noop() {
24590        let dir = TempDir::new().unwrap();
24591        let db_path = dir.path().join("test_fresh_noop.db");
24592
24593        // Empty database — no meta table, no tables at all.
24594        let conn = FrankenConnection::open(db_path.to_string_lossy().to_string()).unwrap();
24595        transition_from_meta_version(&conn).unwrap();
24596
24597        // _schema_migrations should NOT have been created.
24598        let res = conn.query("SELECT * FROM \"_schema_migrations\";");
24599        assert!(
24600            res.is_err(),
24601            "transition should not create _schema_migrations on fresh DB"
24602        );
24603    }
24604
24605    #[test]
24606    fn franken_transition_with_fts_virtual_table_succeeds() {
24607        let dir = TempDir::new().unwrap();
24608        let db_path = dir.path().join("test_transition_with_fts.db");
24609
24610        let conn = FrankenConnection::open(db_path.to_string_lossy().into_owned()).unwrap();
24611        conn.execute_batch(
24612            "CREATE TABLE meta (key TEXT PRIMARY KEY, value TEXT NOT NULL);
24613             INSERT INTO meta(key, value) VALUES('schema_version', '13');
24614             CREATE TABLE conversations (id INTEGER PRIMARY KEY);
24615             CREATE VIRTUAL TABLE fts_messages USING fts5(
24616                 content,
24617                 title,
24618                 agent,
24619                 workspace,
24620                 source_path,
24621                 created_at,
24622                 content='',
24623                 tokenize='porter unicode61'
24624             );",
24625        )
24626        .unwrap();
24627        drop(conn);
24628
24629        let conn = FrankenConnection::open(db_path.to_string_lossy().to_string()).unwrap();
24630        transition_from_meta_version(&conn).unwrap();
24631
24632        let rows = conn
24633            .query("SELECT version FROM _schema_migrations ORDER BY version;")
24634            .unwrap();
24635        let versions: Vec<i64> = rows.iter().filter_map(|r| r.get_typed(0).ok()).collect();
24636        assert_eq!(versions, (1..=13).collect::<Vec<i64>>());
24637    }
24638
24639    #[test]
24640    fn franken_storage_open_legacy_v13_with_fts_virtual_table_succeeds() {
24641        let dir = TempDir::new().unwrap();
24642        let db_path = dir.path().join("test_open_legacy_v13_with_fts.db");
24643
24644        let conn = FrankenConnection::open(db_path.to_string_lossy().into_owned()).unwrap();
24645        conn.execute_batch(
24646            "CREATE TABLE meta (key TEXT PRIMARY KEY, value TEXT NOT NULL);
24647             INSERT INTO meta(key, value) VALUES('schema_version', '13');
24648             CREATE TABLE agents (
24649                 id INTEGER PRIMARY KEY,
24650                 slug TEXT NOT NULL
24651             );
24652             CREATE TABLE workspaces (
24653                 id INTEGER PRIMARY KEY,
24654                 path TEXT NOT NULL
24655             );
24656             CREATE TABLE sources (
24657                 id TEXT PRIMARY KEY,
24658                 kind TEXT NOT NULL,
24659                 host_label TEXT,
24660                 machine_id TEXT,
24661                 platform TEXT,
24662                 config_json TEXT,
24663                 created_at INTEGER NOT NULL,
24664                 updated_at INTEGER NOT NULL
24665             );
24666             CREATE TABLE conversations (
24667                 id INTEGER PRIMARY KEY,
24668                 agent_id INTEGER NOT NULL,
24669                 workspace_id INTEGER,
24670                 source_id TEXT NOT NULL DEFAULT 'local',
24671                 external_id TEXT,
24672                 title TEXT,
24673                 source_path TEXT NOT NULL,
24674                 started_at INTEGER,
24675                 ended_at INTEGER
24676             );
24677             CREATE TABLE messages (
24678                 id INTEGER PRIMARY KEY,
24679                 conversation_id INTEGER NOT NULL,
24680                 idx INTEGER NOT NULL,
24681                 role TEXT NOT NULL,
24682                 author TEXT,
24683                 created_at INTEGER,
24684                 content TEXT NOT NULL,
24685                 extra_json TEXT,
24686                 extra_bin BLOB
24687             );
24688             INSERT INTO agents(id, slug) VALUES (1, 'codex');
24689             INSERT INTO workspaces(id, path) VALUES (1, '/data/projects/coding_agent_session_search');
24690             INSERT INTO sources(id, kind, host_label, created_at, updated_at)
24691             VALUES ('local', 'local', NULL, 1710000000000, 1710000000000);
24692             INSERT INTO conversations(
24693                 id,
24694                 agent_id,
24695                 workspace_id,
24696                 source_id,
24697                 external_id,
24698                 title,
24699                 source_path,
24700                 started_at
24701             )
24702             VALUES (
24703                 1,
24704                 1,
24705                 1,
24706                 'local',
24707                 'legacy-session',
24708                 'legacy session',
24709                 '/tmp/legacy.jsonl',
24710                 1710000000000
24711             );
24712             INSERT INTO messages(id, conversation_id, idx, role, author, created_at, content)
24713             VALUES (1, 1, 0, 'user', 'tester', 1710000000000, 'legacy content');
24714             CREATE VIRTUAL TABLE fts_messages USING fts5(
24715                 content,
24716                 title,
24717                 agent,
24718                 workspace,
24719                 source_path,
24720                 created_at,
24721                 message_id,
24722                 content='',
24723                 tokenize='porter unicode61'
24724             );",
24725        )
24726        .unwrap();
24727        drop(conn);
24728
24729        let storage = FrankenStorage::open(&db_path).unwrap();
24730        assert_eq!(storage.schema_version().unwrap(), CURRENT_SCHEMA_VERSION);
24731
24732        let rows = storage
24733            .raw()
24734            .query("SELECT version FROM _schema_migrations ORDER BY version;")
24735            .unwrap();
24736        let versions: Vec<i64> = rows.iter().filter_map(|r| r.get_typed(0).ok()).collect();
24737        assert_eq!(versions, (1..=CURRENT_SCHEMA_VERSION).collect::<Vec<i64>>());
24738    }
24739
24740    #[test]
24741    fn franken_storage_open_repairs_duplicate_fts_messages_schema_rows() {
24742        let dir = TempDir::new().unwrap();
24743        let db_path = dir.path().join("test_open_repairs_duplicate_fts_schema.db");
24744
24745        let storage = FrankenStorage::open(&db_path).unwrap();
24746        let agent = Agent {
24747            id: None,
24748            slug: "codex".into(),
24749            name: "Codex".into(),
24750            version: None,
24751            kind: AgentKind::Cli,
24752        };
24753        let agent_id = storage.ensure_agent(&agent).unwrap();
24754        let conversation = Conversation {
24755            id: None,
24756            agent_slug: "codex".into(),
24757            workspace: Some(PathBuf::from("/tmp/workspace")),
24758            external_id: Some("dup-fts-schema".into()),
24759            title: Some("Duplicate FTS schema".into()),
24760            source_path: PathBuf::from("/tmp/dup-fts-schema.jsonl"),
24761            started_at: Some(1_700_000_000_000),
24762            ended_at: Some(1_700_000_000_100),
24763            approx_tokens: Some(42),
24764            metadata_json: serde_json::Value::Null,
24765            messages: vec![Message {
24766                id: None,
24767                idx: 0,
24768                role: MessageRole::User,
24769                author: Some("user".into()),
24770                created_at: Some(1_700_000_000_050),
24771                content: "message that should remain queryable".into(),
24772                extra_json: serde_json::Value::Null,
24773                snippets: Vec::new(),
24774            }],
24775            source_id: LOCAL_SOURCE_ID.into(),
24776            origin_host: None,
24777        };
24778        storage
24779            .insert_conversation_tree(agent_id, None, &conversation)
24780            .unwrap();
24781        drop(storage);
24782        materialize_fresh_fts_schema_via_rusqlite(&db_path).unwrap();
24783
24784        let duplicate_legacy_fts_sql = "CREATE VIRTUAL TABLE fts_messages USING fts5(content, title, agent, workspace, source_path, created_at UNINDEXED, message_id UNINDEXED, tokenize='porter')";
24785        let conn = rusqlite_test_fixture_conn(&db_path);
24786        conn.execute_batch("PRAGMA writable_schema = ON;").unwrap();
24787        conn.execute(
24788            "INSERT INTO sqlite_master(type, name, tbl_name, rootpage, sql)
24789             VALUES('table', 'fts_messages', 'fts_messages', 0, ?1)",
24790            [duplicate_legacy_fts_sql],
24791        )
24792        .unwrap();
24793        conn.execute(
24794            "DELETE FROM meta WHERE key = ?1",
24795            [FTS_FRANKEN_REBUILD_META_KEY],
24796        )
24797        .unwrap();
24798        // Simulate a pre-fix upgraded database that has never gone through the
24799        // authoritative frankensqlite FTS rebuild generation yet.
24800        conn.execute_batch("PRAGMA writable_schema = OFF;").unwrap();
24801
24802        let duplicate_rows: i64 = conn
24803            .query_row(
24804                "SELECT COUNT(*) FROM sqlite_master WHERE name = 'fts_messages'",
24805                [],
24806                |row| row.get(0),
24807            )
24808            .unwrap();
24809        assert_eq!(duplicate_rows, 2);
24810        drop(conn);
24811
24812        let reopened = FrankenStorage::open(&db_path).unwrap();
24813        assert_eq!(reopened.schema_version().unwrap(), CURRENT_SCHEMA_VERSION);
24814        let generation_rows: Vec<String> = reopened
24815            .raw()
24816            .query_map_collect(
24817                "SELECT value FROM meta WHERE key = ?1",
24818                fparams![FTS_FRANKEN_REBUILD_META_KEY],
24819                |row| row.get_typed(0),
24820            )
24821            .unwrap();
24822        assert_eq!(
24823            generation_rows.len(),
24824            0,
24825            "canonical open should not eagerly rewrite FTS repair metadata"
24826        );
24827        reopened.ensure_search_fallback_fts_consistency().unwrap();
24828        let repaired = FrankenConnection::open(db_path.to_string_lossy().into_owned()).unwrap();
24829        assert_eq!(franken_fts_schema_rows(&repaired).unwrap(), 1);
24830
24831        let total_messages: i64 = reopened
24832            .raw()
24833            .query_row_map("SELECT COUNT(*) FROM messages", fparams![], |row| {
24834                row.get_typed(0)
24835            })
24836            .unwrap();
24837        let total_fts_rows: i64 = reopened
24838            .raw()
24839            .query_row_map("SELECT COUNT(*) FROM fts_messages", fparams![], |row| {
24840                row.get_typed(0)
24841            })
24842            .unwrap();
24843        assert_eq!(total_fts_rows, total_messages);
24844    }
24845
24846    #[test]
24847    fn fts_messages_integrity_reports_missing_shadow_tables() {
24848        let dir = TempDir::new().unwrap();
24849        let healthy_db_path = dir.path().join("healthy_fts.db");
24850
24851        {
24852            let storage = FrankenStorage::open(&healthy_db_path).unwrap();
24853            storage.ensure_search_fallback_fts_consistency().unwrap();
24854            storage
24855                .validate_fts_messages_integrity()
24856                .expect("freshly materialized fts_messages should pass integrity validation");
24857        }
24858
24859        let corrupt_db_path = dir.path().join("test_corrupt_fts_missing_shadows.db");
24860        {
24861            let conn = rusqlite_test_fixture_conn(&corrupt_db_path);
24862            conn.execute("CREATE TABLE schema_anchor(id INTEGER PRIMARY KEY)", [])
24863                .unwrap();
24864            let orphaned_fts_sql = "CREATE VIRTUAL TABLE fts_messages USING fts5(content, title, agent, workspace, source_path, created_at UNINDEXED, message_id UNINDEXED, tokenize='porter')";
24865            conn.execute_batch("PRAGMA writable_schema = ON;").unwrap();
24866            conn.execute(
24867                "INSERT INTO sqlite_master(type, name, tbl_name, rootpage, sql)
24868                 VALUES('table', 'fts_messages', 'fts_messages', 0, ?1)",
24869                [orphaned_fts_sql],
24870            )
24871            .unwrap();
24872            conn.execute_batch("PRAGMA writable_schema = OFF;").unwrap();
24873        }
24874
24875        let open_err = FrankenConnection::open(corrupt_db_path.to_string_lossy().to_string())
24876            .expect_err("orphaned fts_messages schema should fail during connection open");
24877        let integrity = fts_messages_integrity_error_from_message(open_err.to_string())
24878            .expect("open-time FTS corruption should map to the typed FTS integrity kind");
24879        assert_eq!(integrity.missing_shadow_tables(), &["fts_messages_content"]);
24880        let rendered = integrity.to_string();
24881        assert!(
24882            rendered.contains("fts_messages")
24883                && rendered.contains("required FTS5 shadow tables")
24884                && rendered.contains("fts_messages_content"),
24885            "error should be an operator-facing FTS corruption diagnosis: {rendered}"
24886        );
24887    }
24888
24889    #[test]
24890    fn franken_storage_open_fresh_db_keeps_single_franken_fts_schema_row() {
24891        let dir = TempDir::new().unwrap();
24892        let db_path = dir.path().join("fresh-franken-storage-open.db");
24893
24894        let storage = FrankenStorage::open(&db_path).unwrap();
24895        assert_eq!(storage.schema_version().unwrap(), CURRENT_SCHEMA_VERSION);
24896
24897        // The FTS5 virtual table is no longer created eagerly by the
24898        // migration runner (V14 drops the old internal-content table and the
24899        // current contentless table is recreated lazily — see MIGRATION_V14).
24900        // Invoke the repair path to match normal cass startup, then assert
24901        // there is exactly one fts_messages entry in sqlite_schema (no
24902        // duplicates).
24903        storage
24904            .ensure_search_fallback_fts_consistency()
24905            .expect("ensure FTS consistency after fresh open");
24906        drop(storage);
24907
24908        let c_reader = FrankenConnection::open(db_path.to_string_lossy().into_owned())
24909            .expect("open DB via frankensqlite for sqlite_master inspection");
24910        assert_eq!(
24911            franken_fts_schema_rows(&c_reader).unwrap(),
24912            1,
24913            "exactly one fts_messages schema row should exist after ensure_search_fallback_fts_consistency"
24914        );
24915        drop(c_reader);
24916
24917        let storage = FrankenStorage::open(&db_path).unwrap();
24918        assert!(
24919            storage
24920                .raw()
24921                .query("SELECT COUNT(*) FROM fts_messages")
24922                .is_ok(),
24923            "fts_messages must be queryable through frankensqlite after open"
24924        );
24925    }
24926
24927    #[test]
24928    fn franken_storage_open_repairs_missing_analytics_tables_when_version_markers_lie() {
24929        let dir = TempDir::new().unwrap();
24930        let db_path = dir.path().join("test_repair_missing_analytics.db");
24931
24932        {
24933            let storage = FrankenStorage::open(&db_path).unwrap();
24934            assert_eq!(storage.schema_version().unwrap(), CURRENT_SCHEMA_VERSION);
24935        }
24936
24937        {
24938            let conn = FrankenConnection::open(db_path.to_string_lossy().into_owned()).unwrap();
24939            for table in &[
24940                "usage_models_daily",
24941                "usage_daily",
24942                "usage_hourly",
24943                "message_metrics",
24944                "token_daily_stats",
24945                "token_usage",
24946                "model_pricing",
24947                "embedding_jobs",
24948                "daily_stats",
24949            ] {
24950                conn.execute(&format!("DROP TABLE IF EXISTS {table}"))
24951                    .unwrap();
24952            }
24953            conn.execute_compat(
24954                "UPDATE meta SET value = ?1 WHERE key = 'schema_version'",
24955                &[ParamValue::from(CURRENT_SCHEMA_VERSION.to_string())],
24956            )
24957            .unwrap();
24958        }
24959
24960        let repaired = FrankenStorage::open(&db_path).unwrap();
24961        assert_eq!(repaired.schema_version().unwrap(), CURRENT_SCHEMA_VERSION);
24962
24963        let analytics_count: i64 = repaired
24964            .raw()
24965            .query_row_map(
24966                "SELECT COUNT(*) FROM sqlite_master
24967                 WHERE type='table'
24968                   AND name IN (
24969                     'daily_stats',
24970                     'embedding_jobs',
24971                     'token_usage',
24972                     'token_daily_stats',
24973                     'model_pricing',
24974                     'message_metrics',
24975                     'usage_hourly',
24976                     'usage_daily',
24977                     'usage_models_daily'
24978                   )",
24979                &[],
24980                |row| row.get_typed(0),
24981            )
24982            .unwrap();
24983        assert_eq!(
24984            analytics_count, 9,
24985            "open() should recreate the missing analytics tables even when schema_version already says current"
24986        );
24987    }
24988
24989    #[test]
24990    fn current_schema_repair_batches_cover_every_required_probe() {
24991        let missing_tables: Vec<&'static str> = REQUIRED_CURRENT_SCHEMA_TABLE_PROBES
24992            .iter()
24993            .map(|(table_name, _)| *table_name)
24994            .collect();
24995
24996        let batches = current_schema_repair_batches_for_missing_tables(&missing_tables).unwrap();
24997        let covered_tables: HashSet<&'static str> = batches
24998            .iter()
24999            .flat_map(|batch| batch.tables.iter().copied())
25000            .collect();
25001
25002        for table_name in missing_tables {
25003            assert!(
25004                covered_tables.contains(table_name),
25005                "missing repair coverage for {table_name}"
25006            );
25007        }
25008    }
25009
25010    #[test]
25011    fn current_schema_repair_batches_do_not_replay_core_schema_bootstrap() {
25012        for batch in CURRENT_SCHEMA_REPAIR_BATCHES {
25013            assert!(
25014                !batch.sql.contains("CREATE TABLE IF NOT EXISTS meta"),
25015                "repair batch {} should not recreate meta",
25016                batch.name
25017            );
25018            assert!(
25019                !batch.sql.contains("CREATE TABLE IF NOT EXISTS agents"),
25020                "repair batch {} should not recreate agents",
25021                batch.name
25022            );
25023            assert!(
25024                !batch.sql.contains("CREATE TABLE IF NOT EXISTS workspaces"),
25025                "repair batch {} should not recreate workspaces",
25026                batch.name
25027            );
25028            assert!(
25029                !batch
25030                    .sql
25031                    .contains("CREATE TABLE IF NOT EXISTS conversations"),
25032                "repair batch {} should not recreate conversations",
25033                batch.name
25034            );
25035            assert!(
25036                !batch.sql.contains("CREATE TABLE IF NOT EXISTS messages"),
25037                "repair batch {} should not recreate messages",
25038                batch.name
25039            );
25040            assert!(
25041                !batch.sql.contains("CREATE TABLE IF NOT EXISTS snippets"),
25042                "repair batch {} should not recreate snippets",
25043                batch.name
25044            );
25045            assert!(
25046                !batch.sql.contains("CREATE VIRTUAL TABLE fts_messages"),
25047                "repair batch {} should not recreate FTS tables",
25048                batch.name
25049            );
25050            assert!(
25051                !batch.sql.contains("DROP TABLE"),
25052                "repair batch {} should never drop tables",
25053                batch.name
25054            );
25055        }
25056    }
25057
25058    #[test]
25059    fn build_cass_migrations_applies_combined_v13() {
25060        let conn = FrankenConnection::open(":memory:").unwrap();
25061        let base_result = build_cass_migrations_before_tail_cache()
25062            .run(&conn)
25063            .unwrap();
25064        assert!(apply_conversation_tail_state_cache_migration(&conn).unwrap());
25065        let post_result = build_cass_migrations_after_tail_cache().run(&conn).unwrap();
25066
25067        assert!(base_result.was_fresh);
25068        let mut applied = base_result.applied;
25069        applied.push(15);
25070        applied.extend(post_result.applied);
25071        assert_eq!(
25072            applied,
25073            (13..=CURRENT_SCHEMA_VERSION).collect::<Vec<i64>>(),
25074            "should apply combined V13 plus additive post-V13 migrations"
25075        );
25076        let current: i64 = conn
25077            .query("SELECT MAX(version) FROM _schema_migrations;")
25078            .unwrap()
25079            .first()
25080            .unwrap()
25081            .get_typed(0)
25082            .unwrap();
25083        assert_eq!(current, CURRENT_SCHEMA_VERSION);
25084    }
25085
25086    #[test]
25087    fn franken_insert_conversations_batched_populates_analytics_rollups() {
25088        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
25089        use frankensqlite::compat::{ConnectionExt, RowExt};
25090        use std::path::PathBuf;
25091
25092        let dir = TempDir::new().unwrap();
25093        let db_path = dir.path().join("franken-index.db");
25094        let storage = FrankenStorage::open(&db_path).unwrap();
25095
25096        let agent = Agent {
25097            id: None,
25098            slug: "claude_code".into(),
25099            name: "Claude Code".into(),
25100            version: Some("1.0".into()),
25101            kind: AgentKind::Cli,
25102        };
25103        let agent_id = storage.ensure_agent(&agent).unwrap();
25104
25105        let ts_ms = 1_770_551_400_000_i64;
25106        let usage_json = serde_json::json!({
25107            "message": {
25108                "model": "claude-opus-4-6",
25109                "usage": {
25110                    "input_tokens": 100,
25111                    "output_tokens": 50,
25112                    "cache_read_input_tokens": 25,
25113                    "cache_creation_input_tokens": 10,
25114                    "service_tier": "standard"
25115                }
25116            }
25117        });
25118
25119        let conv = Conversation {
25120            id: None,
25121            agent_slug: "claude_code".into(),
25122            workspace: Some(PathBuf::from("/tmp/workspace")),
25123            external_id: Some("franken-batch-upsert".into()),
25124            title: Some("Franken batch upsert".into()),
25125            source_path: PathBuf::from("/tmp/franken.jsonl"),
25126            started_at: Some(ts_ms),
25127            ended_at: Some(ts_ms + 60_000),
25128            approx_tokens: None,
25129            metadata_json: serde_json::Value::Null,
25130            messages: vec![
25131                Message {
25132                    id: None,
25133                    idx: 0,
25134                    role: MessageRole::User,
25135                    author: None,
25136                    created_at: Some(ts_ms),
25137                    content: "Please make a plan.".into(),
25138                    extra_json: serde_json::Value::Null,
25139                    snippets: vec![],
25140                },
25141                Message {
25142                    id: None,
25143                    idx: 1,
25144                    role: MessageRole::Agent,
25145                    author: None,
25146                    created_at: Some(ts_ms + 30_000),
25147                    content: "## Plan\n\n1. Reproduce\n2. Patch\n3. Verify".into(),
25148                    extra_json: usage_json,
25149                    snippets: vec![],
25150                },
25151            ],
25152            source_id: "local".into(),
25153            origin_host: None,
25154        };
25155
25156        let outcomes = storage
25157            .insert_conversations_batched(&[(agent_id, None, &conv)])
25158            .unwrap();
25159        assert_eq!(outcomes.len(), 1);
25160        assert_eq!(outcomes[0].inserted_indices, vec![0, 1]);
25161
25162        let conn = storage.raw();
25163        let daily_stats_rows: i64 = conn
25164            .query_row_map("SELECT COUNT(*) FROM daily_stats", fparams![], |row| {
25165                row.get_typed(0)
25166            })
25167            .unwrap();
25168        let token_daily_rows: i64 = conn
25169            .query_row_map(
25170                "SELECT COUNT(*) FROM token_daily_stats",
25171                fparams![],
25172                |row| row.get_typed(0),
25173            )
25174            .unwrap();
25175        let usage_daily_rows: i64 = conn
25176            .query_row_map("SELECT COUNT(*) FROM usage_daily", fparams![], |row| {
25177                row.get_typed(0)
25178            })
25179            .unwrap();
25180        let model_daily_rows: i64 = conn
25181            .query_row_map(
25182                "SELECT COUNT(*) FROM usage_models_daily",
25183                fparams![],
25184                |row| row.get_typed(0),
25185            )
25186            .unwrap();
25187
25188        assert!(daily_stats_rows > 0, "daily_stats should be populated");
25189        assert!(
25190            token_daily_rows > 0,
25191            "token_daily_stats should be populated"
25192        );
25193        assert!(usage_daily_rows > 0, "usage_daily should be populated");
25194        assert!(
25195            model_daily_rows > 0,
25196            "usage_models_daily should be populated"
25197        );
25198    }
25199
25200    // =========================================================================
25201    // FrankenConnectionManager tests (bead 3rlf8)
25202    // =========================================================================
25203
25204    #[test]
25205    fn connection_manager_creates_readers() {
25206        let dir = TempDir::new().unwrap();
25207        let db_path = dir.path().join("cm.db");
25208
25209        // Create the DB first
25210        let fs = FrankenStorage::open(&db_path).unwrap();
25211        drop(fs);
25212
25213        let config = ConnectionManagerConfig {
25214            reader_count: 3,
25215            max_writers: 2,
25216        };
25217        let mgr = FrankenConnectionManager::new(&db_path, config).unwrap();
25218        assert_eq!(mgr.reader_count(), 3);
25219        assert_eq!(mgr.max_writers(), 2);
25220    }
25221
25222    #[test]
25223    fn connection_manager_clamps_zero_writer_limit_to_prevent_deadlock() {
25224        let dir = TempDir::new().unwrap();
25225        let db_path = dir.path().join("cm.db");
25226
25227        let fs = FrankenStorage::open(&db_path).unwrap();
25228        drop(fs);
25229
25230        let mgr = std::sync::Arc::new(
25231            FrankenConnectionManager::new(
25232                &db_path,
25233                ConnectionManagerConfig {
25234                    reader_count: 0,
25235                    max_writers: 0,
25236                },
25237            )
25238            .unwrap(),
25239        );
25240        assert_eq!(mgr.reader_count(), 1);
25241        assert_eq!(mgr.max_writers(), 1);
25242
25243        let (tx, rx) = std::sync::mpsc::channel();
25244        let mgr_for_thread = std::sync::Arc::clone(&mgr);
25245        std::thread::spawn(move || {
25246            let result = mgr_for_thread.writer().map(|mut guard| {
25247                guard.mark_committed();
25248            });
25249            tx.send(result.is_ok()).expect("writer result send");
25250        });
25251
25252        assert!(
25253            rx.recv_timeout(Duration::from_secs(10)).unwrap(),
25254            "writer acquisition should not block forever when configured with zero writer slots"
25255        );
25256    }
25257
25258    #[test]
25259    fn connection_manager_reader_round_robin() {
25260        let dir = TempDir::new().unwrap();
25261        let db_path = dir.path().join("cm.db");
25262
25263        let fs = FrankenStorage::open(&db_path).unwrap();
25264        drop(fs);
25265
25266        let config = ConnectionManagerConfig {
25267            reader_count: 2,
25268            max_writers: 1,
25269        };
25270        let mgr = FrankenConnectionManager::new(&db_path, config).unwrap();
25271
25272        // Reader index should advance (round-robin)
25273        let idx_before = mgr.reader_idx.load(std::sync::atomic::Ordering::Relaxed);
25274        let _r1 = mgr.reader();
25275        let idx_after = mgr.reader_idx.load(std::sync::atomic::Ordering::Relaxed);
25276        assert_eq!(idx_after, idx_before + 1, "reader index should advance");
25277    }
25278
25279    #[test]
25280    fn connection_manager_writer_reads_and_writes() {
25281        use frankensqlite::compat::RowExt;
25282
25283        let dir = TempDir::new().unwrap();
25284        let db_path = dir.path().join("cm.db");
25285
25286        let fs = FrankenStorage::open(&db_path).unwrap();
25287        drop(fs);
25288
25289        let mgr = FrankenConnectionManager::new(&db_path, Default::default()).unwrap();
25290
25291        // Acquire writer and insert data
25292        {
25293            let mut guard = mgr.writer().unwrap();
25294            guard
25295                .storage()
25296                .raw()
25297                .execute("CREATE TABLE IF NOT EXISTS cm_test (id INTEGER PRIMARY KEY, val TEXT)")
25298                .unwrap();
25299            guard
25300                .storage()
25301                .raw()
25302                .execute("INSERT INTO cm_test (val) VALUES ('hello')")
25303                .unwrap();
25304            guard.mark_committed();
25305        }
25306
25307        // Verify via reader (returns MutexGuard<SendFrankenConnection>)
25308        let reader_guard = mgr.reader();
25309        let rows = reader_guard.query("SELECT val FROM cm_test").unwrap();
25310        assert_eq!(rows.len(), 1);
25311        assert_eq!(rows[0].get_typed::<String>(0).unwrap(), "hello");
25312    }
25313
25314    #[test]
25315    fn connection_manager_writer_guard_drops_releases_slot() {
25316        let dir = TempDir::new().unwrap();
25317        let db_path = dir.path().join("cm.db");
25318
25319        let fs = FrankenStorage::open(&db_path).unwrap();
25320        drop(fs);
25321
25322        let config = ConnectionManagerConfig {
25323            reader_count: 1,
25324            max_writers: 1,
25325        };
25326        let mgr = FrankenConnectionManager::new(&db_path, config).unwrap();
25327
25328        // Acquire and release writer
25329        {
25330            let mut guard = mgr.writer().unwrap();
25331            guard.mark_committed();
25332        }
25333
25334        // Should be able to acquire again (slot released)
25335        let mut guard2 = mgr.writer().unwrap();
25336        guard2.mark_committed();
25337    }
25338
25339    #[test]
25340    fn connection_manager_concurrent_writer_works() {
25341        use frankensqlite::compat::RowExt;
25342
25343        let dir = TempDir::new().unwrap();
25344        let db_path = dir.path().join("cm.db");
25345
25346        let fs = FrankenStorage::open(&db_path).unwrap();
25347        drop(fs);
25348
25349        let config = ConnectionManagerConfig {
25350            reader_count: 1,
25351            max_writers: 2,
25352        };
25353        let mgr = FrankenConnectionManager::new(&db_path, config).unwrap();
25354
25355        {
25356            let mut guard = mgr.concurrent_writer().unwrap();
25357            guard
25358                .storage()
25359                .raw()
25360                .execute("CREATE TABLE IF NOT EXISTS cm_conc (id INTEGER PRIMARY KEY, val TEXT)")
25361                .unwrap();
25362            guard
25363                .storage()
25364                .raw()
25365                .execute("INSERT INTO cm_conc (val) VALUES ('concurrent')")
25366                .unwrap();
25367            guard.mark_committed();
25368        }
25369
25370        let reader_guard = mgr.reader();
25371        let rows = reader_guard.query("SELECT val FROM cm_conc").unwrap();
25372        assert_eq!(rows.len(), 1);
25373        assert_eq!(rows[0].get_typed::<String>(0).unwrap(), "concurrent");
25374    }
25375
25376    #[test]
25377    fn connection_manager_default_config() {
25378        let config = ConnectionManagerConfig::default();
25379        assert_eq!(config.reader_count, 4);
25380        assert!(config.max_writers > 0);
25381    }
25382
25383    #[test]
25384    fn purge_agent_archive_data_removes_only_target_agent_and_rebuilds_derived_tables() {
25385        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
25386        use std::path::PathBuf;
25387
25388        fn seed_conversation(storage: &FrankenStorage, agent_slug: &str, marker: &str) {
25389            let agent = Agent {
25390                id: None,
25391                slug: agent_slug.into(),
25392                name: agent_slug.into(),
25393                version: None,
25394                kind: AgentKind::Cli,
25395            };
25396            let agent_id = storage.ensure_agent(&agent).unwrap();
25397            let conversation = Conversation {
25398                id: None,
25399                agent_slug: agent_slug.into(),
25400                workspace: Some(PathBuf::from("/tmp/workspace")),
25401                external_id: Some(format!("{agent_slug}-{marker}")),
25402                title: Some(format!("{agent_slug} {marker}")),
25403                source_path: PathBuf::from(format!("/tmp/{agent_slug}-{marker}.jsonl")),
25404                started_at: Some(1_700_000_000_000),
25405                ended_at: Some(1_700_000_000_100),
25406                approx_tokens: None,
25407                metadata_json: serde_json::Value::Null,
25408                messages: vec![
25409                    Message {
25410                        id: None,
25411                        idx: 0,
25412                        role: MessageRole::User,
25413                        author: Some("user".into()),
25414                        created_at: Some(1_700_000_000_010),
25415                        content: format!("{agent_slug} {marker} user"),
25416                        extra_json: serde_json::Value::Null,
25417                        snippets: Vec::new(),
25418                    },
25419                    Message {
25420                        id: None,
25421                        idx: 1,
25422                        role: MessageRole::Agent,
25423                        author: Some("assistant".into()),
25424                        created_at: Some(1_700_000_000_020),
25425                        content: format!("{agent_slug} {marker} assistant"),
25426                        extra_json: serde_json::Value::Null,
25427                        snippets: Vec::new(),
25428                    },
25429                ],
25430                source_id: LOCAL_SOURCE_ID.into(),
25431                origin_host: None,
25432            };
25433            storage
25434                .insert_conversation_tree(agent_id, None, &conversation)
25435                .unwrap();
25436        }
25437
25438        let dir = TempDir::new().unwrap();
25439        let db_path = dir.path().join("agent_search.db");
25440        let storage = FrankenStorage::open(&db_path).unwrap();
25441
25442        seed_conversation(&storage, "openclaw", "purge-target");
25443        seed_conversation(&storage, "codex", "keep-target");
25444
25445        let purge = storage.purge_agent_archive_data("openclaw").unwrap();
25446        assert_eq!(purge.conversations_deleted, 1);
25447        assert_eq!(purge.messages_deleted, 2);
25448
25449        storage.rebuild_fts().unwrap();
25450        storage.rebuild_analytics().unwrap();
25451        storage.rebuild_daily_stats().unwrap();
25452        storage.rebuild_token_daily_stats().unwrap();
25453
25454        let agents = storage.list_agents().unwrap();
25455        assert_eq!(agents.len(), 1);
25456        assert_eq!(agents[0].slug, "codex");
25457        assert_eq!(storage.total_conversation_count().unwrap(), 1);
25458        assert_eq!(storage.total_message_count().unwrap(), 2);
25459
25460        let fts_rows: i64 = storage
25461            .raw()
25462            .query_row_map("SELECT COUNT(*) FROM fts_messages", fparams![], |row| {
25463                row.get_typed(0)
25464            })
25465            .unwrap();
25466        assert_eq!(fts_rows, 2);
25467
25468        let total_daily_sessions: i64 = storage
25469            .raw()
25470            .query_row_map(
25471                "SELECT COALESCE(SUM(session_count), 0)
25472                 FROM daily_stats
25473                 WHERE agent_slug = 'all' AND source_id = 'all'",
25474                fparams![],
25475                |row| row.get_typed(0),
25476            )
25477            .unwrap();
25478        assert_eq!(total_daily_sessions, 1);
25479
25480        let openclaw_token_rows: i64 = storage
25481            .raw()
25482            .query_row_map(
25483                "SELECT COUNT(*) FROM token_daily_stats WHERE agent_slug = 'openclaw'",
25484                fparams![],
25485                |row| row.get_typed(0),
25486            )
25487            .unwrap();
25488        assert_eq!(openclaw_token_rows, 0);
25489    }
25490
    /// Regression for cass#202: a `Connection` dropped mid-transaction can
    /// leave child rows persisted without a matching parent. The next indexer
    /// pass then trips `FOREIGN KEY constraint failed` on every write, the
    /// session never gets marked indexed, and the pending backlog grows
    /// without bound. `cleanup_orphan_fk_rows` is the indexer-startup
    /// self-heal that breaks the cycle.
    ///
    /// The test plants one valid agent/conversation/message chain plus three
    /// orphan messages, dependent `message_metrics`/`token_usage` rows, and
    /// one directly-orphan snippet. It then checks that the cleanup removes
    /// exactly the orphaned rows, reports 3 messages + 1 snippet, and does
    /// nothing on a second run.
    #[test]
    fn cleanup_orphan_fk_rows_removes_orphans_and_is_noop_on_clean_db() {
        let dir = TempDir::new().unwrap();
        let db_path = dir.path().join("orphan_fk_self_heal.db");
        let storage = FrankenStorage::open(&db_path).unwrap();

        // Plant orphan rows directly: rows whose FK parent does not exist.
        // FK enforcement is temporarily off so the planted rows can land.
        storage.raw().execute("PRAGMA foreign_keys = OFF").unwrap();

        // Seed a real conversation so a subset of children DO have valid
        // parents — we want the cleanup to be precise, not a table-flush.
        storage
            .raw()
            .execute_compat(
                "INSERT INTO agents(id, slug, name, kind, created_at, updated_at) \
                 VALUES(1, 'test-agent', 'Test Agent', 'cli', 0, 0)",
                fparams![],
            )
            .unwrap();
        storage
            .raw()
            .execute_compat(
                "INSERT INTO conversations(id, agent_id, source_id, source_path, started_at) \
                 VALUES(1, 1, 'local', '/tmp/real.jsonl', 0)",
                fparams![],
            )
            .unwrap();
        storage
            .raw()
            .execute_compat(
                "INSERT INTO messages(id, conversation_id, idx, role, content) \
                 VALUES(1, 1, 0, 'user', 'real message')",
                fparams![],
            )
            .unwrap();

        // Plant orphan messages referencing conversation_id=99999 (does not exist)
        // and conversation_id=0 (the specific shape reported in #202). Distinct
        // (conversation_id, idx) pairs are required by the UNIQUE constraint.
        for (mid, cid, idx) in [(101_i64, 99_999_i64, 0_i64), (102, 0, 0), (103, 0, 1)] {
            storage
                .raw()
                .execute_compat(
                    "INSERT INTO messages(id, conversation_id, idx, role, content) \
                     VALUES(?1, ?2, ?3, 'user', 'orphan message')",
                    fparams![mid, cid, idx],
                )
                .unwrap();
        }

        // Dependent rows for three messages. Message 1 is the real message, so
        // its metric/token rows are valid and must survive the cleanup. The
        // rows for messages 101 and 102 are not directly orphaned because their
        // immediate `messages` parent exists, but that parent is itself
        // orphaned. The cleanup deletes them explicitly before deleting orphan
        // messages so the FK cascade engine does not have to run one delete
        // program per orphan.
        for message_id in [1_i64, 101_i64, 102_i64] {
            storage
                .raw()
                .execute_compat(
                    "INSERT INTO message_metrics(
                         message_id, created_at_ms, hour_id, day_id, agent_slug,
                         role, content_chars, content_tokens_est
                     ) VALUES(?1, 0, 0, 0, 'test-agent', 'user', 13, 2)",
                    fparams![message_id],
                )
                .unwrap();
            storage
                .raw()
                .execute_compat(
                    "INSERT INTO token_usage(
                         message_id, conversation_id, agent_id, timestamp_ms, day_id,
                         role, content_chars
                     ) VALUES(?1, 1, 1, 0, 0, 'user', 13)",
                    fparams![message_id],
                )
                .unwrap();
        }

        // Plant a directly-orphan snippet — message_id=99999 does not exist
        // anywhere, so this exercises the snippets DELETE path rather than
        // riding on the cascade from the orphan-message DELETE.
        storage
            .raw()
            .execute_compat(
                "INSERT INTO snippets(message_id, file_path, start_line, end_line, language, snippet_text) \
                 VALUES(99999, '/tmp/orphan-snippet.rs', 1, 2, 'rust', 'fn main() {}')",
                fparams![],
            )
            .unwrap();

        // Re-enable FK enforcement before exercising the cleanup.
        storage.raw().execute("PRAGMA foreign_keys = ON").unwrap();

        // Sanity: the planted rows (valid and orphaned) are all visible.
        let messages_before: i64 = storage
            .raw()
            .query_row_map("SELECT COUNT(*) FROM messages", fparams![], |row| {
                row.get_typed(0)
            })
            .unwrap();
        assert_eq!(messages_before, 4); // 1 real + 3 orphans
        let snippets_before: i64 = storage
            .raw()
            .query_row_map("SELECT COUNT(*) FROM snippets", fparams![], |row| {
                row.get_typed(0)
            })
            .unwrap();
        assert_eq!(snippets_before, 1);
        let metrics_before: i64 = storage
            .raw()
            .query_row_map("SELECT COUNT(*) FROM message_metrics", fparams![], |row| {
                row.get_typed(0)
            })
            .unwrap();
        assert_eq!(metrics_before, 3); // 1 for the real message + 2 for orphan messages
        let token_usage_before: i64 = storage
            .raw()
            .query_row_map("SELECT COUNT(*) FROM token_usage", fparams![], |row| {
                row.get_typed(0)
            })
            .unwrap();
        assert_eq!(token_usage_before, 3); // 1 for the real message + 2 for orphan messages

        // Run the self-heal.
        let report = storage.cleanup_orphan_fk_rows().unwrap();

        // 3 orphan messages + 1 directly-orphan snippet = 4 primary orphans
        // reported. Dependent message_metrics/token_usage rows for orphan
        // messages are pruned too, but they are not double-counted because the
        // orphan message is the root row that made them invalid.
        let messages_after: i64 = storage
            .raw()
            .query_row_map("SELECT COUNT(*) FROM messages", fparams![], |row| {
                row.get_typed(0)
            })
            .unwrap();
        assert_eq!(messages_after, 1, "real message must be preserved");
        let snippets_after: i64 = storage
            .raw()
            .query_row_map("SELECT COUNT(*) FROM snippets", fparams![], |row| {
                row.get_typed(0)
            })
            .unwrap();
        assert_eq!(snippets_after, 0);
        let metrics_after: i64 = storage
            .raw()
            .query_row_map("SELECT COUNT(*) FROM message_metrics", fparams![], |row| {
                row.get_typed(0)
            })
            .unwrap();
        assert_eq!(metrics_after, 1, "real message metric must be preserved");
        let token_usage_after: i64 = storage
            .raw()
            .query_row_map("SELECT COUNT(*) FROM token_usage", fparams![], |row| {
                row.get_typed(0)
            })
            .unwrap();
        assert_eq!(token_usage_after, 1, "real token row must be preserved");

        // The report lists primary orphans per table and their sum in `total`.
        assert_eq!(report.total, 4, "report total: {:?}", report);
        let messages_count = report
            .per_table
            .iter()
            .find(|(t, _)| *t == "messages")
            .map(|(_, c)| *c);
        assert_eq!(messages_count, Some(3));
        let snippets_count = report
            .per_table
            .iter()
            .find(|(t, _)| *t == "snippets")
            .map(|(_, c)| *c);
        assert_eq!(snippets_count, Some(1));

        // Second invocation on a now-clean DB must be a no-op.
        let second = storage.cleanup_orphan_fk_rows().unwrap();
        assert_eq!(second.total, 0);
        assert!(second.per_table.is_empty());
    }
25674
25675    #[test]
25676    fn cleanup_orphan_fk_rows_handles_more_than_one_delete_chunk() {
25677        let dir = TempDir::new().unwrap();
25678        let db_path = dir.path().join("orphan_fk_chunked_self_heal.db");
25679        let storage = FrankenStorage::open(&db_path).unwrap();
25680        let orphan_count = ORPHAN_FK_ID_CHUNK_SIZE + 3;
25681
25682        storage.raw().execute("PRAGMA foreign_keys = OFF").unwrap();
25683        {
25684            let mut tx = storage.raw().transaction().unwrap();
25685            for idx in 0..orphan_count {
25686                let message_id = 10_000_i64 + i64::try_from(idx).unwrap();
25687                let conversation_id = 20_000_i64 + i64::try_from(idx).unwrap();
25688                tx.execute_compat(
25689                    "INSERT INTO messages(id, conversation_id, idx, role, content) \
25690                     VALUES(?1, ?2, 0, 'user', 'orphan message')",
25691                    fparams![message_id, conversation_id],
25692                )
25693                .unwrap();
25694                tx.execute_compat(
25695                    "INSERT INTO message_metrics(
25696                         message_id, created_at_ms, hour_id, day_id, agent_slug,
25697                         role, content_chars, content_tokens_est
25698                     ) VALUES(?1, 0, 0, 0, 'test-agent', 'user', 14, 2)",
25699                    fparams![message_id],
25700                )
25701                .unwrap();
25702            }
25703            tx.commit().unwrap();
25704        }
25705        storage.raw().execute("PRAGMA foreign_keys = ON").unwrap();
25706
25707        let report = storage.cleanup_orphan_fk_rows().unwrap();
25708
25709        assert_eq!(report.total, i64::try_from(orphan_count).unwrap());
25710        let messages_count = report
25711            .per_table
25712            .iter()
25713            .find(|(table, _)| *table == "messages")
25714            .map(|(_, count)| *count);
25715        assert_eq!(messages_count, Some(i64::try_from(orphan_count).unwrap()));
25716        let messages_after: i64 = storage
25717            .raw()
25718            .query_row_map("SELECT COUNT(*) FROM messages", fparams![], |row| {
25719                row.get_typed(0)
25720            })
25721            .unwrap();
25722        assert_eq!(messages_after, 0);
25723        let metrics_after: i64 = storage
25724            .raw()
25725            .query_row_map("SELECT COUNT(*) FROM message_metrics", fparams![], |row| {
25726                row.get_typed(0)
25727            })
25728            .unwrap();
25729        assert_eq!(metrics_after, 0);
25730    }
25731
25732    #[test]
25733    fn cleanup_orphan_fk_rows_pages_direct_child_orphans() {
25734        let dir = TempDir::new().unwrap();
25735        let db_path = dir.path().join("direct_orphan_fk_paged_self_heal.db");
25736        let storage = FrankenStorage::open(&db_path).unwrap();
25737        let orphan_count = (ORPHAN_FK_ID_CHUNK_SIZE * 2) + 5;
25738
25739        storage.raw().execute("PRAGMA foreign_keys = OFF").unwrap();
25740        {
25741            let mut tx = storage.raw().transaction().unwrap();
25742            for idx in 0..orphan_count {
25743                let message_id = 50_000_i64 + i64::try_from(idx).unwrap();
25744                tx.execute_compat(
25745                    "INSERT INTO message_metrics(
25746                         message_id, created_at_ms, hour_id, day_id, agent_slug,
25747                         role, content_chars, content_tokens_est
25748                     ) VALUES(?1, 0, 0, 0, 'test-agent', 'user', 21, 3)",
25749                    fparams![message_id],
25750                )
25751                .unwrap();
25752            }
25753            tx.commit().unwrap();
25754        }
25755        storage.raw().execute("PRAGMA foreign_keys = ON").unwrap();
25756
25757        let report = storage.cleanup_orphan_fk_rows().unwrap();
25758
25759        assert_eq!(report.total, i64::try_from(orphan_count).unwrap());
25760        let metrics_count = report
25761            .per_table
25762            .iter()
25763            .filter(|(table, _)| *table == "message_metrics")
25764            .map(|(_, count)| *count)
25765            .sum::<i64>();
25766        assert_eq!(metrics_count, i64::try_from(orphan_count).unwrap());
25767        assert_eq!(
25768            report
25769                .per_table
25770                .iter()
25771                .filter(|(table, _)| *table == "message_metrics")
25772                .count(),
25773            1,
25774            "paged cleanup should aggregate report entries by table: {report:?}"
25775        );
25776        let metrics_after: i64 = storage
25777            .raw()
25778            .query_row_map("SELECT COUNT(*) FROM message_metrics", fparams![], |row| {
25779                row.get_typed(0)
25780            })
25781            .unwrap();
25782        assert_eq!(metrics_after, 0);
25783    }
25784}