Skip to main content

coding_agent_search/storage/
sqlite.rs

1//! `SQLite` backend: schema, pragmas, and migrations.
2
3use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole, Snippet};
4use crate::sources::provenance::{LOCAL_SOURCE_ID, Source, SourceKind};
5use anyhow::{Context, Result, anyhow, bail};
6use frankensqlite::{
7    Connection as FrankenConnection, Row as FrankenRow, SqliteValue,
8    compat::{
9        ConnectionExt as FrankenConnectionExt, OpenFlags as FrankenOpenFlags,
10        OptionalExtension as FrankenOptionalExtension, ParamValue, RowExt as FrankenRowExt,
11        Transaction as FrankenTransaction, TransactionExt as FrankenTransactionExt,
12        open_with_flags as open_franken_with_flags, param_slice_to_values, params_from_iter,
13    },
14    migrate::MigrationRunner,
15};
16use serde::{Deserialize, Serialize};
17use smallvec::SmallVec;
18use std::borrow::Cow;
19use std::collections::{HashMap, HashSet};
20use std::fs;
21use std::io::{BufRead, BufReader, Write};
22use std::process::{Command, Stdio};
23use std::sync::{
24    Arc,
25    atomic::{AtomicBool, AtomicI8, AtomicI64, AtomicU64, AtomicUsize, Ordering},
26};
27
/// Frankensqlite parameter list builder.
///
/// `fparams![]` expands to an empty `&[ParamValue]`.  `fparams![a, b, ...]`
/// (a trailing comma is accepted) expands to a borrowed slice in which every
/// argument has been converted with `ParamValue::from`.
macro_rules! fparams {
    // No arguments: an empty slice with an explicit element type.
    () => {
        &[] as &[ParamValue]
    };
    // One or more expressions, each converted via `ParamValue::from`.
    ($($val:expr),+ $(,)?) => {
        &[$(ParamValue::from($val)),+] as &[ParamValue]
    };
}
37use std::path::{Path, PathBuf};
38use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH};
39use thiserror::Error;
40use tracing::info;
41
42const DOCTOR_MUTATION_DB_OPEN_LOCK_TIMEOUT: Duration = Duration::from_secs(30);
43const DOCTOR_MUTATION_LOCK_MAX_METADATA_READ: u64 = 64 * 1024;
44
45// -------------------------------------------------------------------------
46// Lazy FrankenSQLite Connection (bd-1ueu)
47// -------------------------------------------------------------------------
48// Defers opening the database until first use, cutting startup cost for
49// commands that may not need the DB at all.  Thread-safe via parking_lot
50// Mutex; logs the reason and duration of the open on first access.
51
/// Error from lazy database initialization.
///
/// Returned by `LazyFrankenDb::get` and `LazyFrankenDb::get_with_timeout`
/// when the connection cannot be opened on first access.
#[derive(Debug, Error)]
pub enum LazyDbError {
    /// The database path did not exist when the open was attempted.
    #[error("Database not found at {0}")]
    NotFound(PathBuf),
    /// Opening the database failed.  This variant is also used when the doctor
    /// mutation lock could not be acquired and when an open timed out; in
    /// those cases `source` is a `FrankenError::Internal` carrying the message.
    #[error("Failed to open FrankenSQLite database at {path}: {source}")]
    FrankenOpenFailed {
        /// Path of the database that failed to open.
        path: PathBuf,
        /// Underlying frankensqlite error.
        source: frankensqlite::FrankenError,
    },
}
63
64// -------------------------------------------------------------------------
65// LazyFrankenDb — lazy wrapper around FrankenConnection
66// -------------------------------------------------------------------------
67
/// Wrapper around `FrankenConnection` that implements `Send`.
///
/// `FrankenConnection` is `!Send` because it uses `Rc` internally.
/// However, the `Rc` values are entirely self-contained within the Connection
/// and are not shared externally.  When wrapped in a `Mutex`,
/// exclusive access is guaranteed, making cross-thread transfer safe.
///
/// Tuple fields, in order: the connection itself, an index-writer
/// `checkpoint_pages` value (`i64`) and an index-writer `busy_timeout_ms`
/// value (`u64`).  `new` fills the last two with the
/// `UNSET_INDEX_WRITER_*` sentinels.
pub struct SendFrankenConnection(FrankenConnection, i64, u64);

// SAFETY: Rc fields inside FrankenConnection are not cloned or shared externally.
// The Mutex<Option<SendFrankenConnection>> ensures exclusive access.
// NOTE(review): this relies on frankensqlite never handing out clones of its
// internal `Rc` values through the connection's API — confirm against the
// frankensqlite sources whenever that dependency is upgraded.
unsafe impl Send for SendFrankenConnection {}
79
80impl SendFrankenConnection {
81    pub(crate) fn new(conn: FrankenConnection) -> Self {
82        Self(
83            conn,
84            UNSET_INDEX_WRITER_CHECKPOINT_PAGES,
85            UNSET_INDEX_WRITER_BUSY_TIMEOUT_MS,
86        )
87    }
88
89    pub(crate) fn new_with_index_writer_state(
90        conn: FrankenConnection,
91        checkpoint_pages: i64,
92        busy_timeout_ms: u64,
93    ) -> Self {
94        Self(conn, checkpoint_pages, busy_timeout_ms)
95    }
96
97    pub(crate) fn into_parts(self) -> (FrankenConnection, i64, u64) {
98        (self.0, self.1, self.2)
99    }
100}
101
102impl std::ops::Deref for SendFrankenConnection {
103    type Target = FrankenConnection;
104    fn deref(&self) -> &FrankenConnection {
105        &self.0
106    }
107}
108
/// Lazy-opening wrapper for `FrankenConnection` (frankensqlite).
///
/// Constructing a `LazyFrankenDb` is cheap (no I/O).  The underlying
/// `FrankenConnection` is opened on the first call to [`LazyFrankenDb::get`]
/// (or [`LazyFrankenDb::get_with_timeout`]).
/// Subsequent calls return the cached connection.
pub struct LazyFrankenDb {
    /// Location of the database file; fixed at construction.
    path: PathBuf,
    /// `None` until the first successful open, then the cached connection.
    conn: parking_lot::Mutex<Option<SendFrankenConnection>>,
}
118
/// RAII guard that dereferences to the inner `FrankenConnection`.
///
/// Holds the `LazyFrankenDb` mutex for as long as it is alive, so other
/// callers of `LazyFrankenDb::get` block until the guard is dropped.
pub struct LazyFrankenDbGuard<'a>(parking_lot::MutexGuard<'a, Option<SendFrankenConnection>>);
121
122impl std::fmt::Debug for LazyFrankenDbGuard<'_> {
123    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
124        f.debug_tuple("LazyFrankenDbGuard")
125            .field(&self.0.is_some())
126            .finish()
127    }
128}
129
130impl std::ops::Deref for LazyFrankenDbGuard<'_> {
131    type Target = FrankenConnection;
132    fn deref(&self) -> &FrankenConnection {
133        self.0
134            .as_ref()
135            .expect("LazyFrankenDb connection must be initialized before access")
136    }
137}
138
139impl LazyFrankenDb {
140    /// Create a lazy handle pointing at `path`.  No I/O is performed.
141    pub fn new(path: PathBuf) -> Self {
142        Self {
143            path,
144            conn: parking_lot::Mutex::new(None),
145        }
146    }
147
148    /// Resolve path from optional CLI overrides.
149    ///
150    /// Uses `data_dir / agent_search.db` as fallback.
151    pub fn from_overrides(data_dir: &Option<PathBuf>, db_override: Option<PathBuf>) -> Self {
152        let data_dir = data_dir.clone().unwrap_or_else(crate::default_data_dir);
153        let path = db_override.unwrap_or_else(|| data_dir.join("agent_search.db"));
154        Self::new(path)
155    }
156
157    /// Get the connection, opening the database on first access.
158    ///
159    /// `reason` is logged alongside the open duration so callers can
160    /// identify which command triggered the open.
161    pub fn get(&self, reason: &str) -> std::result::Result<LazyFrankenDbGuard<'_>, LazyDbError> {
162        let mut guard = self.conn.lock();
163        if guard.is_none() {
164            if !self.path.exists() {
165                return Err(LazyDbError::NotFound(self.path.clone()));
166            }
167            let start = Instant::now();
168            let _doctor_guard = acquire_doctor_mutation_db_open_guard(
169                &self.path,
170                DOCTOR_MUTATION_DB_OPEN_LOCK_TIMEOUT,
171            )
172            .map_err(|err| LazyDbError::FrankenOpenFailed {
173                path: self.path.clone(),
174                source: frankensqlite::FrankenError::Internal(err.to_string()),
175            })?;
176            let conn =
177                FrankenConnection::open(self.path.to_string_lossy().into_owned()).map_err(|e| {
178                    LazyDbError::FrankenOpenFailed {
179                        path: self.path.clone(),
180                        source: e,
181                    }
182                })?;
183            let elapsed_ms = start.elapsed().as_millis();
184            info!(
185                path = %self.path.display(),
186                elapsed_ms = elapsed_ms,
187                reason = reason,
188                "lazily opened FrankenSQLite database"
189            );
190            *guard = Some(SendFrankenConnection::new(conn));
191        }
192        Ok(LazyFrankenDbGuard(guard))
193    }
194
195    /// Get the connection with a timeout, opening the database on first access.
196    ///
197    /// Like [`get`] but spawns the open in a background thread and waits up to
198    /// `timeout` for it to complete. Returns `LazyDbError::FrankenOpenFailed`
199    /// with a descriptive message if the timeout elapses. Fix for #128.
200    pub fn get_with_timeout(
201        &self,
202        reason: &str,
203        timeout: Duration,
204    ) -> std::result::Result<LazyFrankenDbGuard<'_>, LazyDbError> {
205        let mut guard = self.conn.lock();
206        if guard.is_none() {
207            if !self.path.exists() {
208                return Err(LazyDbError::NotFound(self.path.clone()));
209            }
210            let start = Instant::now();
211            let path_owned = self.path.to_string_lossy().into_owned();
212            let path_for_guard = self.path.clone();
213            let (tx, rx) = std::sync::mpsc::channel();
214            std::thread::spawn(move || {
215                let _doctor_guard =
216                    match acquire_doctor_mutation_db_open_guard(&path_for_guard, timeout) {
217                        Ok(guard) => guard,
218                        Err(err) => {
219                            let _ = tx
220                                .send(Err(frankensqlite::FrankenError::Internal(err.to_string())));
221                            return;
222                        }
223                    };
224                let _ =
225                    tx.send(FrankenConnection::open(path_owned).map(SendFrankenConnection::new));
226            });
227            let conn = rx
228                .recv_timeout(timeout)
229                .map_err(|_| LazyDbError::FrankenOpenFailed {
230                    path: self.path.clone(),
231                    source: frankensqlite::FrankenError::Internal(format!(
232                        "database open timed out after {}s (possible corruption or lock contention)",
233                        timeout.as_secs()
234                    )),
235                })?
236                .map_err(|e| LazyDbError::FrankenOpenFailed {
237                    path: self.path.clone(),
238                    source: e,
239                })?;
240            let elapsed_ms = start.elapsed().as_millis();
241            info!(
242                path = %self.path.display(),
243                elapsed_ms = elapsed_ms,
244                reason = reason,
245                "lazily opened FrankenSQLite database (with timeout)"
246            );
247            *guard = Some(conn);
248        }
249        Ok(LazyFrankenDbGuard(guard))
250    }
251
252    /// Path to the database file (even if not yet opened).
253    pub fn path(&self) -> &Path {
254        &self.path
255    }
256
257    /// Whether the connection has been opened.
258    pub fn is_open(&self) -> bool {
259        self.conn.lock().is_some()
260    }
261}
262
/// Rolling state for `next_franken_retry_jitter_ms`; every call advances it
/// by the same constant it is seeded with.
static FRANKEN_RETRY_JITTER_STATE: AtomicU64 = AtomicU64::new(0x9e37_79b9_7f4a_7c15);
/// Number of live `DoctorMutationDbOpenBypassGuard`s; while non-zero,
/// `acquire_doctor_mutation_db_open_guard` returns without taking the lock.
static DOCTOR_MUTATION_DB_OPEN_BYPASS_DEPTH: AtomicUsize = AtomicUsize::new(0);
/// Master switch for the message-lookup trace counters below; the
/// `record_message_lookup_*` helpers are no-ops while this is `false`.
static MESSAGE_LOOKUP_TRACE_ENABLED: AtomicBool = AtomicBool::new(false);
/// Incremented by one per `record_message_lookup_exact_idx_probe` call.
static MESSAGE_LOOKUP_EXACT_IDX_PROBES: AtomicU64 = AtomicU64::new(0);
/// Accumulates the `query_count` passed to `record_message_lookup_bounded_queries`.
static MESSAGE_LOOKUP_BOUNDED_QUERIES: AtomicU64 = AtomicU64::new(0);
/// Incremented by one per `record_message_lookup_full_scan_query` call.
static MESSAGE_LOOKUP_FULL_SCAN_QUERIES: AtomicU64 = AtomicU64::new(0);
/// Accumulates the row counts reported by the bounded and full-scan recorders.
static MESSAGE_LOOKUP_ROWS_MATERIALIZED: AtomicU64 = AtomicU64::new(0);
270
/// Point-in-time copy of the global message-lookup trace counters, as
/// produced by `message_lookup_trace_snapshot`.
#[derive(Debug, Clone, Copy, Default, Serialize)]
pub(crate) struct MessageLookupTraceCounters {
    /// Value of `MESSAGE_LOOKUP_EXACT_IDX_PROBES`.
    pub exact_idx_probes: u64,
    /// Value of `MESSAGE_LOOKUP_BOUNDED_QUERIES`.
    pub bounded_lookup_queries: u64,
    /// Value of `MESSAGE_LOOKUP_FULL_SCAN_QUERIES`.
    pub full_scan_queries: u64,
    /// Value of `MESSAGE_LOOKUP_ROWS_MATERIALIZED`.
    pub rows_materialized: u64,
}
278
279impl MessageLookupTraceCounters {
280    pub(crate) fn saturating_sub(self, before: Self) -> Self {
281        Self {
282            exact_idx_probes: self
283                .exact_idx_probes
284                .saturating_sub(before.exact_idx_probes),
285            bounded_lookup_queries: self
286                .bounded_lookup_queries
287                .saturating_sub(before.bounded_lookup_queries),
288            full_scan_queries: self
289                .full_scan_queries
290                .saturating_sub(before.full_scan_queries),
291            rows_materialized: self
292                .rows_materialized
293                .saturating_sub(before.rows_materialized),
294        }
295    }
296
297    pub(crate) fn lookups_against_global(self) -> u64 {
298        self.exact_idx_probes.saturating_add(self.rows_materialized)
299    }
300}
301
302pub(crate) fn set_message_lookup_trace_enabled(enabled: bool) -> bool {
303    MESSAGE_LOOKUP_TRACE_ENABLED.swap(enabled, Ordering::Relaxed)
304}
305
306pub(crate) fn message_lookup_trace_snapshot() -> MessageLookupTraceCounters {
307    MessageLookupTraceCounters {
308        exact_idx_probes: MESSAGE_LOOKUP_EXACT_IDX_PROBES.load(Ordering::Relaxed),
309        bounded_lookup_queries: MESSAGE_LOOKUP_BOUNDED_QUERIES.load(Ordering::Relaxed),
310        full_scan_queries: MESSAGE_LOOKUP_FULL_SCAN_QUERIES.load(Ordering::Relaxed),
311        rows_materialized: MESSAGE_LOOKUP_ROWS_MATERIALIZED.load(Ordering::Relaxed),
312    }
313}
314
315fn record_message_lookup_exact_idx_probe() {
316    if MESSAGE_LOOKUP_TRACE_ENABLED.load(Ordering::Relaxed) {
317        MESSAGE_LOOKUP_EXACT_IDX_PROBES.fetch_add(1, Ordering::Relaxed);
318    }
319}
320
321fn record_message_lookup_bounded_queries(query_count: u64, rows: usize) {
322    if MESSAGE_LOOKUP_TRACE_ENABLED.load(Ordering::Relaxed) {
323        MESSAGE_LOOKUP_BOUNDED_QUERIES.fetch_add(query_count, Ordering::Relaxed);
324        MESSAGE_LOOKUP_ROWS_MATERIALIZED.fetch_add(rows as u64, Ordering::Relaxed);
325    }
326}
327
328fn record_message_lookup_full_scan_query(rows: usize) {
329    if MESSAGE_LOOKUP_TRACE_ENABLED.load(Ordering::Relaxed) {
330        MESSAGE_LOOKUP_FULL_SCAN_QUERIES.fetch_add(1, Ordering::Relaxed);
331        MESSAGE_LOOKUP_ROWS_MATERIALIZED.fetch_add(rows as u64, Ordering::Relaxed);
332    }
333}
334
335pub(crate) struct DoctorMutationDbOpenBypassGuard;
336
337impl Drop for DoctorMutationDbOpenBypassGuard {
338    fn drop(&mut self) {
339        DOCTOR_MUTATION_DB_OPEN_BYPASS_DEPTH.fetch_sub(1, Ordering::SeqCst);
340    }
341}
342
343pub(crate) fn enter_doctor_mutation_db_open_bypass() -> DoctorMutationDbOpenBypassGuard {
344    DOCTOR_MUTATION_DB_OPEN_BYPASS_DEPTH.fetch_add(1, Ordering::SeqCst);
345    DoctorMutationDbOpenBypassGuard
346}
347
348fn doctor_mutation_db_open_bypass_active() -> bool {
349    DOCTOR_MUTATION_DB_OPEN_BYPASS_DEPTH.load(Ordering::SeqCst) > 0
350}
351
352fn next_franken_retry_jitter_ms(max_inclusive: u64) -> u64 {
353    let mut value = FRANKEN_RETRY_JITTER_STATE.fetch_add(0x9e37_79b9_7f4a_7c15, Ordering::Relaxed);
354    value ^= value >> 30;
355    value = value.wrapping_mul(0xbf58_476d_1ce4_e5b9);
356    value ^= value >> 27;
357    value = value.wrapping_mul(0x94d0_49bb_1331_11eb);
358    value ^= value >> 31;
359    value % max_inclusive.saturating_add(1)
360}
361
362/// Sleep with jittered exponential backoff to avoid lock-step retry storms
363/// when many threads hit the same transient SQLite/frankensqlite contention.
364pub(crate) fn sleep_with_franken_retry_backoff(
365    backoff: &mut Duration,
366    remaining: Duration,
367    max_backoff: Duration,
368) {
369    let capped = (*backoff).min(remaining);
370    let extra_budget = remaining.saturating_sub(capped).min(capped);
371    let extra_ms = extra_budget.as_millis().min(u128::from(u64::MAX)) as u64;
372    let sleep_for = if extra_ms == 0 {
373        capped
374    } else {
375        capped
376            .saturating_add(Duration::from_millis(next_franken_retry_jitter_ms(
377                extra_ms,
378            )))
379            .min(remaining)
380    };
381    std::thread::sleep(sleep_for);
382    *backoff = backoff.saturating_mul(2).min(max_backoff);
383}
384
385struct DoctorMutationDbOpenGuard(Option<fs::File>);
386
387impl Drop for DoctorMutationDbOpenGuard {
388    fn drop(&mut self) {
389        if let Some(file) = self.0.as_ref() {
390            let _ = fs2::FileExt::unlock(file);
391        }
392    }
393}
394
/// Compute the doctor repair lock path that guards `db_path`.
///
/// Only databases whose file name is exactly `agent_search.db` are guarded;
/// for those the lock lives at `<parent>/doctor/locks/doctor-repair.lock`.
/// Returns `None` for any other file name or when the path has no parent.
fn doctor_mutation_lock_path_for_db_open(db_path: &Path) -> Option<PathBuf> {
    let is_guarded_db = matches!(
        db_path.file_name().and_then(|name| name.to_str()),
        Some("agent_search.db")
    );
    if !is_guarded_db {
        return None;
    }

    let mut lock_path = db_path.parent()?.to_path_buf();
    for segment in ["doctor", "locks", "doctor-repair.lock"] {
        lock_path.push(segment);
    }
    Some(lock_path)
}
408
/// Whether lock-file metadata names the current process as its owner.
///
/// `raw` is scanned line by line for `key=value` pairs (split at the first
/// `=`, both sides trimmed).  Returns `true` if any `pid` line parses as a
/// `u32` equal to this process's id.
fn doctor_lock_metadata_pid_is_current_process(raw: &str) -> bool {
    let own_pid = std::process::id();
    raw.lines()
        .filter_map(|line| line.split_once('='))
        .filter(|(key, _)| key.trim() == "pid")
        .any(|(_, value)| value.trim().parse::<u32>().ok() == Some(own_pid))
}
421
422fn doctor_lock_file_pid_is_current_process(file: &fs::File) -> bool {
423    use std::io::Read as _;
424
425    let Ok(mut file) = file.try_clone() else {
426        return false;
427    };
428    let mut raw = String::new();
429    let _ = std::io::Read::take(&mut file, DOCTOR_MUTATION_LOCK_MAX_METADATA_READ)
430        .read_to_string(&mut raw);
431    doctor_lock_metadata_pid_is_current_process(&raw)
432}
433
/// Take a shared lock on the doctor repair lock file before `db_path` is opened.
///
/// Returns a guard holding no lock (nothing to release) when:
/// * `db_path` is not a guarded database (see
///   `doctor_mutation_lock_path_for_db_open`),
/// * a bypass scope is active, or
/// * the lock file's metadata names the current process.
///
/// Otherwise the lock directory is created if needed and a non-blocking
/// shared lock is attempted repeatedly, sleeping with jittered backoff
/// (starting at 4ms, capped at 128ms) between attempts.
///
/// NOTE(review): presumably the doctor repair holds this file exclusively
/// while it mutates the database — confirm against the doctor module.
///
/// # Errors
///
/// Fails if the lock directory or lock file cannot be created/opened, if the
/// lock is still contended once `timeout` has elapsed, or if locking fails
/// with any error other than `WouldBlock`.
fn acquire_doctor_mutation_db_open_guard(
    db_path: &Path,
    timeout: Duration,
) -> Result<DoctorMutationDbOpenGuard> {
    let Some(lock_path) = doctor_mutation_lock_path_for_db_open(db_path) else {
        return Ok(DoctorMutationDbOpenGuard(None));
    };
    if doctor_mutation_db_open_bypass_active() {
        return Ok(DoctorMutationDbOpenGuard(None));
    }

    if let Some(parent) = lock_path.parent() {
        fs::create_dir_all(parent).with_context(|| {
            format!(
                "creating doctor mutation lock directory {} before opening {}",
                parent.display(),
                db_path.display()
            )
        })?;
    }

    let deadline = Instant::now() + timeout;
    let mut backoff = Duration::from_millis(4);
    loop {
        // The file is reopened on every attempt, so the pid check below always
        // reads from the start of the file.
        let file = fs::OpenOptions::new()
            .create(true)
            .truncate(false)
            .read(true)
            .write(true)
            .open(&lock_path)
            .with_context(|| {
                format!(
                    "opening doctor mutation lock {} before opening {}",
                    lock_path.display(),
                    db_path.display()
                )
            })?;

        // This process is recorded as the lock owner: return without locking.
        if doctor_lock_file_pid_is_current_process(&file) {
            return Ok(DoctorMutationDbOpenGuard(None));
        }

        match fs2::FileExt::try_lock_shared(&file) {
            Ok(()) => return Ok(DoctorMutationDbOpenGuard(Some(file))),
            // Lock is contended: give up at the deadline, otherwise back off and retry.
            Err(err) if err.kind() == std::io::ErrorKind::WouldBlock => {
                let now = Instant::now();
                if now >= deadline {
                    return Err(anyhow!(
                        "doctor mutation lock {} is active while opening {}; refusing to open during repair after waiting {}ms",
                        lock_path.display(),
                        db_path.display(),
                        timeout.as_millis()
                    ));
                }
                let remaining = deadline.saturating_duration_since(now);
                sleep_with_franken_retry_backoff(
                    &mut backoff,
                    remaining,
                    Duration::from_millis(128),
                );
            }
            // Any other locking failure is not retried.
            Err(err) => {
                return Err(anyhow!(
                    "failed to acquire shared doctor mutation lock {} before opening {}: {}",
                    lock_path.display(),
                    db_path.display(),
                    err
                ));
            }
        }
    }
}
506
507pub(crate) fn open_franken_storage_with_timeout(
508    path: &Path,
509    timeout: Duration,
510) -> Result<FrankenStorage> {
511    if !path.exists() {
512        return Err(anyhow!("Database not found at {}", path.display()));
513    }
514
515    let deadline = Instant::now() + timeout;
516    let mut backoff = Duration::from_millis(4);
517    loop {
518        match FrankenStorage::open(path) {
519            Ok(storage) => return Ok(storage),
520            Err(err) if retryable_franken_anyhow(&err) => {
521                let now = Instant::now();
522                if now >= deadline {
523                    return Err(err);
524                }
525                let remaining = deadline.saturating_duration_since(now);
526                sleep_with_franken_retry_backoff(
527                    &mut backoff,
528                    remaining,
529                    Duration::from_millis(128),
530                );
531            }
532            Err(err) => return Err(err),
533        }
534    }
535}
536
/// Open storage at `path` only if its recorded schema version is current.
///
/// Returns `Ok(None)` when `path` does not exist or when the
/// `schema_version` row in `meta` is anything other than
/// `CURRENT_SCHEMA_VERSION` — including when the query fails, returns no
/// row, or the value does not parse as an integer.  In the mismatch case the
/// connection is closed before returning.
///
/// On a version match, `transition_from_meta_version`,
/// `repair_missing_current_schema_objects` and `apply_config` are run before
/// the storage is handed back.
///
/// # Errors
///
/// Fails if the raw connection cannot be opened within `timeout` or if any
/// of the post-match steps fails.
pub(crate) fn open_current_schema_storage_with_timeout(
    path: &Path,
    timeout: Duration,
) -> Result<Option<FrankenStorage>> {
    if !path.exists() {
        return Ok(None);
    }

    let mut storage = FrankenStorage::new(
        open_franken_raw_connection_with_timeout(path, timeout)?,
        path.to_path_buf(),
    );
    storage.apply_open_stage_busy_timeout();

    // Every failure along this chain collapses to `None`, which is treated
    // below exactly like a version mismatch.
    let version = storage
        .raw()
        .query("SELECT value FROM meta WHERE key = 'schema_version';")
        .ok()
        .and_then(|rows| rows.first().cloned())
        .and_then(|row| row.get_typed::<String>(0).ok())
        .and_then(|raw| raw.parse::<i64>().ok());

    if version != Some(CURRENT_SCHEMA_VERSION) {
        // Prefer closing without a checkpoint; fall back to a best-effort
        // close if that fails.
        if let Err(close_err) = storage.close_without_checkpoint_in_place() {
            tracing::debug!(
                error = %close_err,
                db_path = %path.display(),
                "open_current_schema_storage_with_timeout: close_without_checkpoint_in_place failed; falling back to best-effort close"
            );
            storage.close_best_effort_in_place();
        }
        return Ok(None);
    }

    transition_from_meta_version(&storage.conn)?;
    storage.repair_missing_current_schema_objects()?;
    storage.apply_config()?;
    Ok(Some(storage))
}
576
577pub(crate) fn open_franken_readonly_storage_with_timeout(
578    path: &Path,
579    timeout: Duration,
580) -> Result<FrankenStorage> {
581    if !path.exists() {
582        return Err(anyhow!("Database not found at {}", path.display()));
583    }
584
585    let deadline = Instant::now() + timeout;
586    let mut backoff = Duration::from_millis(4);
587    loop {
588        match FrankenStorage::open_readonly(path) {
589            Ok(storage) => return Ok(storage),
590            Err(err) if retryable_franken_anyhow(&err) => {
591                let now = Instant::now();
592                if now >= deadline {
593                    return Err(err);
594                }
595                let remaining = deadline.saturating_duration_since(now);
596                sleep_with_franken_retry_backoff(
597                    &mut backoff,
598                    remaining,
599                    Duration::from_millis(128),
600                );
601            }
602            Err(err) => return Err(err),
603        }
604    }
605}
606
607pub(crate) fn open_franken_raw_connection_with_timeout(
608    path: &Path,
609    timeout: Duration,
610) -> Result<FrankenConnection> {
611    if !path.exists() {
612        return Err(anyhow!("Database not found at {}", path.display()));
613    }
614
615    let path_str = path.to_string_lossy().to_string();
616    let deadline = Instant::now() + timeout;
617    let mut backoff = Duration::from_millis(4);
618    loop {
619        let _doctor_guard = acquire_doctor_mutation_db_open_guard(path, timeout)?;
620        match FrankenConnection::open(&path_str)
621            .with_context(|| format!("opening raw frankensqlite db at {}", path.display()))
622        {
623            Ok(conn) => return Ok(conn),
624            Err(err) if retryable_franken_anyhow(&err) => {
625                let now = Instant::now();
626                if now >= deadline {
627                    return Err(err);
628                }
629                let remaining = deadline.saturating_duration_since(now);
630                sleep_with_franken_retry_backoff(
631                    &mut backoff,
632                    remaining,
633                    Duration::from_millis(128),
634                );
635            }
636            Err(err) => return Err(err),
637        }
638    }
639}
640
641pub(crate) fn open_franken_raw_readonly_connection_with_timeout(
642    path: &Path,
643    timeout: Duration,
644) -> Result<FrankenConnection> {
645    if !path.exists() {
646        return Err(anyhow!("Database not found at {}", path.display()));
647    }
648
649    let path_str = path.to_string_lossy().to_string();
650    let deadline = Instant::now() + timeout;
651    let mut backoff = Duration::from_millis(4);
652    loop {
653        let _doctor_guard = acquire_doctor_mutation_db_open_guard(path, timeout)?;
654        match open_franken_with_flags(&path_str, FrankenOpenFlags::SQLITE_OPEN_READ_ONLY)
655            .with_context(|| {
656                format!(
657                    "opening raw frankensqlite db readonly at {}",
658                    path.display()
659                )
660            }) {
661            Ok(conn) => return Ok(conn),
662            Err(err) if retryable_franken_anyhow(&err) => {
663                let now = Instant::now();
664                if now >= deadline {
665                    return Err(err);
666                }
667                let remaining = deadline.saturating_duration_since(now);
668                sleep_with_franken_retry_backoff(
669                    &mut backoff,
670                    remaining,
671                    Duration::from_millis(128),
672                );
673            }
674            Err(err) => return Err(err),
675        }
676    }
677}
678
679pub(crate) fn retryable_franken_error(err: &frankensqlite::FrankenError) -> bool {
680    matches!(
681        err,
682        frankensqlite::FrankenError::Busy
683            | frankensqlite::FrankenError::BusyRecovery
684            | frankensqlite::FrankenError::BusySnapshot { .. }
685            | frankensqlite::FrankenError::DatabaseLocked { .. }
686            | frankensqlite::FrankenError::LockFailed { .. }
687            | frankensqlite::FrankenError::WriteConflict { .. }
688            | frankensqlite::FrankenError::SerializationFailure { .. }
689    ) || retryable_storage_error_message(&err.to_string())
690}
691
/// Whether an error message mentions one of the known transient-contention
/// phrases.  Matching is ASCII case-insensitive and substring-based.
pub(crate) fn retryable_storage_error_message(message: &str) -> bool {
    const TRANSIENT_MARKERS: [&str; 6] = [
        "busy",
        "locked",
        "locking",
        "contention",
        "temporarily unavailable",
        "would block",
    ];

    let normalized = message.to_ascii_lowercase();
    TRANSIENT_MARKERS
        .iter()
        .any(|marker| normalized.contains(*marker))
}
701
702pub(crate) fn retryable_franken_anyhow(err: &anyhow::Error) -> bool {
703    err.chain().any(|cause| {
704        cause
705            .downcast_ref::<frankensqlite::FrankenError>()
706            .is_some_and(retryable_franken_error)
707            || retryable_storage_error_message(&cause.to_string())
708    })
709}
710
711impl Drop for LazyFrankenDb {
712    fn drop(&mut self) {
713        let Some(mut conn) = self.conn.get_mut().take() else {
714            return;
715        };
716        conn.0.close_best_effort_in_place();
717    }
718}
719
720// -------------------------------------------------------------------------
721// FrankenSQLite Connection Manager (bead 3rlf8)
722// -------------------------------------------------------------------------
723// Multi-connection management: reader pool + concurrent writer connections.
724// Replaces the LazyFrankenDb single-connection bottleneck for high-throughput
725// scenarios (indexer parallel writes, concurrent TUI reads + indexer writes).
726
/// Configuration for the [`FrankenConnectionManager`].
///
/// `FrankenConnectionManager::new` raises either value to at least 1.
#[derive(Debug, Clone)]
pub struct ConnectionManagerConfig {
    /// Number of pre-opened reader connections (default: 4).
    pub reader_count: usize,
    /// Maximum concurrent writer connections (default: available parallelism,
    /// or 4 when that cannot be determined).
    pub max_writers: usize,
}
735
736impl Default for ConnectionManagerConfig {
737    fn default() -> Self {
738        let cpus = std::thread::available_parallelism()
739            .map(|n| n.get())
740            .unwrap_or(4);
741        Self {
742            reader_count: 4,
743            max_writers: cpus,
744        }
745    }
746}
747
/// Multi-connection manager for frankensqlite.
///
/// Provides:
/// - A pool of pre-opened reader connections (round-robin, Mutex-protected)
/// - Controlled creation of writer connections with token-based limits
/// - RAII guards that auto-rollback uncommitted transactions on drop
///
/// Thread-safe: reader connections are wrapped in Mutex (FrankenConnection is !Sync).
/// Writer connections are created per-request (each thread gets its own).
pub struct FrankenConnectionManager {
    /// Database file every reader and writer connection is opened against.
    db_path: PathBuf,
    /// Reader pool; each connection sits behind its own mutex.
    readers: Vec<parking_lot::Mutex<SendFrankenConnection>>,
    /// Monotonic counter used to pick the next reader round-robin.
    reader_idx: std::sync::atomic::AtomicUsize,
    /// Token-based writer limit: channel pre-filled with `max_writers` tokens.
    /// `recv()` = acquire slot, `send()` = release slot.
    writer_tokens: (
        crossbeam_channel::Sender<()>,
        crossbeam_channel::Receiver<()>,
    ),
    /// Effective configuration, with both limits clamped to at least 1.
    config: ConnectionManagerConfig,
}

// SAFETY: FrankenConnectionManager is Send+Sync because:
// - readers wrapped in Mutex<SendFrankenConnection> (exclusive access)
// - writer_tokens uses crossbeam (Send+Sync)
// - db_path is PathBuf (Send+Sync)
// NOTE(review): every field looks Send + Sync on its own, so these manual
// impls may be redundant. If so, dropping them would let the compiler reject
// a future `!Send`/`!Sync` field instead of silently accepting it — confirm
// before removing.
unsafe impl Send for FrankenConnectionManager {}
unsafe impl Sync for FrankenConnectionManager {}
776
777impl FrankenConnectionManager {
778    /// Create a new connection manager.
779    ///
780    /// Opens `config.reader_count` reader connections immediately.
781    /// Writer connections are created on demand (up to `config.max_writers`).
782    pub fn new(db_path: impl Into<PathBuf>, config: ConnectionManagerConfig) -> Result<Self> {
783        let db_path = db_path.into();
784        let path_str = db_path.to_string_lossy().to_string();
785
786        let reader_count = config.reader_count.max(1);
787        let mut readers = Vec::with_capacity(reader_count);
788        for _ in 0..reader_count {
789            let conn = FrankenConnection::open(&path_str)
790                .with_context(|| format!("opening reader connection at {}", db_path.display()))?;
791            // Apply read-tuned config (no migration, no write PRAGMAs)
792            let _ = conn.execute("PRAGMA busy_timeout = 5000;"); // match writer config
793            let _ = conn.execute("PRAGMA cache_size = -16384;"); // 16MB reader cache
794            readers.push(parking_lot::Mutex::new(SendFrankenConnection::new(conn)));
795        }
796
797        let max_writers = config.max_writers.max(1);
798
799        // Pre-fill bounded channel with tokens (acts as counting semaphore).
800        // A zero-capacity channel with no initial tokens would make the first
801        // writer acquisition block forever.
802        let (tx, rx) = crossbeam_channel::bounded(max_writers);
803        for _ in 0..max_writers {
804            tx.send(())
805                .map_err(|_| anyhow!("writer token channel closed during initialization"))?;
806        }
807
808        Ok(Self {
809            db_path,
810            readers,
811            reader_idx: std::sync::atomic::AtomicUsize::new(0),
812            writer_tokens: (tx, rx),
813            config: ConnectionManagerConfig {
814                reader_count,
815                max_writers,
816            },
817        })
818    }
819
820    /// Get a reader connection (round-robin from the pool).
821    ///
822    /// Returns a mutex guard wrapping the connection. The guard prevents
823    /// concurrent access to the same connection (FrankenConnection is !Sync).
824    pub fn reader(&self) -> parking_lot::MutexGuard<'_, SendFrankenConnection> {
825        let idx = self
826            .reader_idx
827            .fetch_add(1, std::sync::atomic::Ordering::Relaxed);
828        self.readers[idx % self.readers.len()].lock()
829    }
830
831    /// Acquire a writer connection.
832    ///
833    /// Opens a new frankensqlite connection with full config (no migration).
834    /// Blocks if `max_writers` connections are already in use.
835    /// The returned [`WriterGuard`] auto-rolls back on drop.
836    pub fn writer(&self) -> Result<WriterGuard<'_>> {
837        self.writer_tokens
838            .1
839            .recv()
840            .map_err(|_| anyhow!("writer token channel closed"))?;
841        let path_str = self.db_path.to_string_lossy().to_string();
842        let conn = match FrankenConnection::open(&path_str) {
843            Ok(c) => c,
844            Err(e) => {
845                let _ = self.writer_tokens.0.send(());
846                return Err(anyhow::Error::from(e).context(format!(
847                    "opening writer connection at {}",
848                    self.db_path.display()
849                )));
850            }
851        };
852        let storage = FrankenStorage::new(conn, self.db_path.clone());
853        if let Err(e) = storage.apply_config() {
854            let _ = self.writer_tokens.0.send(());
855            return Err(e);
856        }
857        Ok(WriterGuard {
858            storage,
859            mgr: self,
860            committed: false,
861        })
862    }
863
864    /// Acquire a concurrent writer connection (BEGIN CONCURRENT via MVCC).
865    ///
866    /// Similar to [`writer`] but tuned for the parallel indexer write pool.
867    /// Uses reduced cache size and is designed for short-lived batch inserts.
868    pub fn concurrent_writer(&self) -> Result<WriterGuard<'_>> {
869        self.writer_tokens
870            .1
871            .recv()
872            .map_err(|_| anyhow!("writer token channel closed"))?;
873        let path_str = self.db_path.to_string_lossy().to_string();
874        let conn = match FrankenConnection::open(&path_str) {
875            Ok(c) => c,
876            Err(e) => {
877                let _ = self.writer_tokens.0.send(());
878                return Err(anyhow::Error::from(e).context(format!(
879                    "opening concurrent writer at {}",
880                    self.db_path.display()
881                )));
882            }
883        };
884        let storage = FrankenStorage::new(conn, self.db_path.clone());
885        if let Err(e) = storage.apply_config() {
886            let _ = self.writer_tokens.0.send(());
887            return Err(e);
888        }
889        // Reduced cache for concurrent writers (they're short-lived)
890        let _ = storage.raw().execute("PRAGMA cache_size = -4096;");
891        Ok(WriterGuard {
892            storage,
893            mgr: self,
894            committed: false,
895        })
896    }
897
898    /// Database path managed by this pool.
899    pub fn db_path(&self) -> &Path {
900        &self.db_path
901    }
902
903    /// Number of reader connections in the pool.
904    pub fn reader_count(&self) -> usize {
905        self.readers.len()
906    }
907
908    /// Maximum concurrent writers allowed.
909    pub fn max_writers(&self) -> usize {
910        self.config.max_writers
911    }
912}
913
914impl Drop for FrankenConnectionManager {
915    fn drop(&mut self) {
916        for reader in &mut self.readers {
917            reader.get_mut().0.close_best_effort_in_place();
918        }
919    }
920}
921
922/// RAII guard for a writer connection.
923///
924/// Provides access to a [`FrankenStorage`] for write operations.
925/// Releases the writer semaphore slot when dropped.
926pub struct WriterGuard<'a> {
927    storage: FrankenStorage,
928    mgr: &'a FrankenConnectionManager,
929    committed: bool,
930}
931
932impl<'a> WriterGuard<'a> {
933    /// Access the underlying storage for read/write operations.
934    pub fn storage(&self) -> &FrankenStorage {
935        &self.storage
936    }
937
938    /// Mark this writer as successfully committed.
939    ///
940    /// Call after your transaction's `commit()` succeeds. Prevents the drop
941    /// guard from attempting a rollback.
942    pub fn mark_committed(&mut self) {
943        self.committed = true;
944    }
945}
946
947impl Drop for WriterGuard<'_> {
948    fn drop(&mut self) {
949        if !self.committed {
950            // Best-effort rollback — connection may already be in autocommit
951            let _ = self.storage.raw().execute("ROLLBACK;");
952        }
953        self.storage.close_best_effort_in_place();
954        // Release writer token
955        let _ = self.mgr.writer_tokens.0.send(());
956    }
957}
958
959// -------------------------------------------------------------------------
960// Binary Metadata Serialization (Opt 3.1)
961// -------------------------------------------------------------------------
962// MessagePack provides 50-70% storage reduction vs JSON and faster parsing.
963// New rows use binary columns; existing JSON is read on fallback.
964
965/// Serialize a JSON value to MessagePack bytes.
966/// Returns None for null/empty values to save storage.
967fn serialize_json_to_msgpack(value: &serde_json::Value) -> Option<Vec<u8>> {
968    if value.is_null() || value.as_object().is_some_and(|o| o.is_empty()) {
969        return None;
970    }
971    rmp_serde::to_vec(value).ok()
972}
973
974/// Deserialize MessagePack bytes to a JSON value.
975/// Returns default Value::Object({}) on error or empty input.
976fn deserialize_msgpack_to_json(bytes: &[u8]) -> serde_json::Value {
977    if bytes.is_empty() {
978        return serde_json::Value::Object(serde_json::Map::new());
979    }
980    rmp_serde::from_slice(bytes).unwrap_or_else(|e| {
981        tracing::debug!(
982            error = %e,
983            bytes_len = bytes.len(),
984            "Failed to deserialize metadata - returning empty object"
985        );
986        serde_json::Value::Object(serde_json::Map::new())
987    })
988}
989
990/// Read metadata from a frankensqlite Row, preferring binary (msgpack) over JSON.
991fn franken_read_metadata_compat(
992    row: &FrankenRow,
993    json_idx: usize,
994    bin_idx: usize,
995) -> serde_json::Value {
996    // Try binary column first (new format)
997    if let Ok(Some(bytes)) = row.get_typed::<Option<Vec<u8>>>(bin_idx)
998        && !bytes.is_empty()
999    {
1000        return deserialize_msgpack_to_json(&bytes);
1001    }
1002
1003    // Fall back to JSON column (old format or migration in progress)
1004    if let Ok(Some(json_str)) = row.get_typed::<Option<String>>(json_idx) {
1005        return serde_json::from_str(&json_str)
1006            .unwrap_or_else(|_| serde_json::Value::Object(serde_json::Map::new()));
1007    }
1008
1009    serde_json::Value::Object(serde_json::Map::new())
1010}
1011
1012fn franken_read_message_extra_compat(
1013    row: &FrankenRow,
1014    json_idx: usize,
1015    bin_idx: usize,
1016) -> serde_json::Value {
1017    if let Ok(Some(bytes)) = row.get_typed::<Option<Vec<u8>>>(bin_idx)
1018        && !bytes.is_empty()
1019    {
1020        return deserialize_msgpack_to_json(&bytes);
1021    }
1022
1023    if let Ok(Some(json_str)) = row.get_typed::<Option<String>>(json_idx) {
1024        return serde_json::from_str(&json_str).unwrap_or(serde_json::Value::Null);
1025    }
1026
1027    serde_json::Value::Null
1028}
1029
1030// -------------------------------------------------------------------------
1031// Migration Error Types (P1.5)
1032// -------------------------------------------------------------------------
1033
/// Error type for schema migration operations.
///
/// `Database` and `Io` are built automatically by `?` from their source
/// error types via `#[from]`.
#[derive(Debug, Error)]
pub enum MigrationError {
    /// The schema requires a full rebuild. The database has been backed up.
    #[error("Rebuild required: {reason}")]
    RebuildRequired {
        /// Explanation interpolated into the `Display` message.
        reason: String,
        /// Path of the backup, if one is recorded.
        // NOTE(review): `None` presumably means no backup was produced — confirm with callers.
        backup_path: Option<std::path::PathBuf>,
    },

    /// A database error occurred during migration.
    #[error("Database error: {0}")]
    Database(#[from] frankensqlite::FrankenError),

    /// An I/O error occurred during backup.
    #[error("I/O error: {0}")]
    Io(#[from] std::io::Error),

    /// Other migration error.
    #[error("{0}")]
    Other(String),
}
1056
1057impl From<anyhow::Error> for MigrationError {
1058    fn from(e: anyhow::Error) -> Self {
1059        MigrationError::Other(e.to_string())
1060    }
1061}
1062
/// Maximum number of backup files to retain.
const MAX_BACKUPS: usize = 3;
/// PRAGMA executed on the read-only backup connection before `VACUUM INTO`;
/// sets the busy timeout to 30000 ms.
const BACKUP_VACUUM_BUSY_TIMEOUT_PRAGMA: &str = "PRAGMA busy_timeout = 30000;";
1066
/// File names holding user-authored state; these must NEVER be deleted during rebuild.
const USER_DATA_FILES: &[&str] = &["bookmarks.db", "tui_state.json", "sources.toml", ".env"];

/// Report whether `path` names user-authored data that a rebuild must preserve.
///
/// Only the final path component is compared, and it must match an entry of
/// [`USER_DATA_FILES`] exactly. Paths without a UTF-8 file name are never
/// treated as user data.
pub fn is_user_data_file(path: &Path) -> bool {
    match path.file_name().and_then(|component| component.to_str()) {
        Some(file_name) => USER_DATA_FILES
            .iter()
            .any(|protected| *protected == file_name),
        None => false,
    }
}
1077
/// SQL to register the FTS5 virtual table on a frankensqlite connection.
///
/// FrankenSQLite skips virtual-table entries (rootpage=0) when loading
/// `sqlite_master` from a stock-SQLite database.  Executing this CREATE
/// triggers the legacy FTS5 fallback path and materialises the table so
/// subsequent FTS queries work.
pub const FTS5_REGISTER_SQL: &str = "\
    CREATE VIRTUAL TABLE IF NOT EXISTS fts_messages USING fts5(\
        content, title, agent, workspace, source_path, \
        created_at UNINDEXED, \
        content='', tokenize='porter'\
    )";

// Meta-table keys and generation numbers for FTS rebuild and daily-stats
// health tracking.
// NOTE(review): presumably a stored generation is compared against the
// `*_GENERATION` constant to decide whether a rebuild is needed — confirm at
// the use sites, which are outside this section.
/// Meta key for the frankensqlite FTS rebuild generation.
const FTS_FRANKEN_REBUILD_META_KEY: &str = "fts_frankensqlite_rebuild_generation";
/// Meta key for the archive fingerprint associated with the FTS rebuild.
const FTS_FRANKEN_REBUILD_FINGERPRINT_META_KEY: &str = "fts_frankensqlite_archive_fingerprint";
/// Current FTS rebuild generation.
const FTS_FRANKEN_REBUILD_GENERATION: i64 = 1;
/// Meta key for the archive fingerprint associated with daily stats.
const DAILY_STATS_HEALTH_META_KEY: &str = "daily_stats_archive_fingerprint";
/// Meta key for the daily-stats health generation.
const DAILY_STATS_HEALTH_GENERATION_META_KEY: &str = "daily_stats_health_generation";
/// Current daily-stats health generation.
const DAILY_STATS_HEALTH_GENERATION: i64 = 1;

/// SQL to clear all rows from the contentless `fts_messages` table.
///
/// Contentless FTS5 tables reject ordinary `DELETE FROM ...` statements.
pub const FTS5_DELETE_ALL_SQL: &str =
    "INSERT INTO fts_messages(fts_messages) VALUES('delete-all');";
1103
/// Test helper: make sure the FTS schema exists and matches the message rows.
///
/// Despite the historical `_via_rusqlite` name, this runs entirely through
/// [`FrankenStorage`]: it opens the database and performs a full FTS rebuild,
/// discarding the inserted-row count.
#[cfg(test)]
pub(crate) fn materialize_fresh_fts_schema_via_rusqlite(db_path: &Path) -> Result<()> {
    // "Fresh FTS" historically means the schema exists and is consistent with
    // message rows, so a full rebuild (which also populates rows) is used
    // rather than a bare DROP + CREATE.
    let opened = FrankenStorage::open(db_path);
    let storage = opened.with_context(|| {
        format!(
            "opening frankensqlite db at {} for FTS materialization",
            db_path.display()
        )
    })?;
    storage.rebuild_fts_via_frankensqlite()?;
    Ok(())
}
1119
/// Test helper: rebuild the FTS index and record the rebuild generation.
///
/// Returns the number of rows the rebuild inserted. The generation is only
/// recorded after the rebuild itself succeeded.
#[cfg(test)]
pub(crate) fn rebuild_fts_via_rusqlite(db_path: &Path) -> Result<usize> {
    let opened = FrankenStorage::open(db_path);
    let storage = opened.with_context(|| {
        format!(
            "opening frankensqlite db at {} for FTS rebuild",
            db_path.display()
        )
    })?;
    storage.rebuild_fts_via_frankensqlite().and_then(|inserted| {
        storage.record_fts_franken_rebuild_generation()?;
        Ok(inserted)
    })
}
1132
1133pub(crate) fn ensure_fts_consistency_via_rusqlite(db_path: &Path) -> Result<FtsConsistencyRepair> {
1134    // Delegates to the FrankenStorage-native path. The function name retains
1135    // the `_via_rusqlite` suffix only for backwards compatibility with the
1136    // few test-site callers; all operations now run through frankensqlite.
1137    let storage = FrankenStorage::open(db_path).with_context(|| {
1138        format!(
1139            "opening frankensqlite db at {} for FTS consistency check",
1140            db_path.display()
1141        )
1142    })?;
1143    storage.ensure_search_fallback_fts_consistency()
1144}
1145
1146/// Create a uniquely named backup of the database file.
1147///
1148/// Returns the path to the backup file, or None if the source doesn't exist.
1149pub fn create_backup(db_path: &Path) -> Result<Option<std::path::PathBuf>, MigrationError> {
1150    if !bundle_path_exists(db_path)? {
1151        return Ok(None);
1152    }
1153
1154    if !copyable_bundle_file_exists(db_path)? {
1155        return Ok(None);
1156    }
1157    let _ = copyable_bundle_sidecar_sources(db_path)?;
1158
1159    let backup_path = unique_backup_path(db_path);
1160    let vacuum_stage_path = vacuum_stage_backup_path(&backup_path);
1161
1162    // Try to use SQLite's VACUUM INTO command first, which safely handles WAL files
1163    // and produces a clean, minimized backup.
1164    match vacuum_into_backup_stage(db_path, &vacuum_stage_path) {
1165        Ok(()) => {
1166            fs::rename(&vacuum_stage_path, &backup_path)?;
1167        }
1168        Err(err) if backup_vacuum_error_requires_consistent_retry(&err) => {
1169            tracing::warn!(
1170                db_path = %db_path.display(),
1171                error = %err,
1172                "create_backup: VACUUM INTO hit transient contention; refusing raw WAL bundle copy"
1173            );
1174            return Err(MigrationError::Database(err));
1175        }
1176        Err(err) => {
1177            tracing::warn!(
1178                db_path = %db_path.display(),
1179                error = %err,
1180                "create_backup: VACUUM INTO failed; falling back to raw evidence copy"
1181            );
1182        }
1183    }
1184
1185    if backup_path.exists() {
1186        sync_file_if_exists(&backup_path)?;
1187        if let Some(parent) = backup_path.parent() {
1188            sync_parent_directory(parent)?;
1189        }
1190        return Ok(Some(backup_path));
1191    }
1192
1193    // Fallback to a raw evidence copy if VACUUM INTO failed (e.g., older SQLite
1194    // or corruption). Keep this on the same symlink-safe bundle path as
1195    // historical seeding so a malformed archive root cannot make us copy an
1196    // arbitrary symlink target or publish a partial sidecar backup.
1197    copy_database_bundle(db_path, &backup_path)?;
1198
1199    Ok(Some(backup_path))
1200}
1201
/// Run `VACUUM INTO` from the database at `db_path` into `stage_path`.
///
/// The source is opened read-only and the busy-timeout PRAGMA is applied
/// before the vacuum. The connection is always closed afterwards; a close
/// failure is logged and followed by a best-effort close, but never changes
/// the returned result.
///
/// Returns the outcome of the PRAGMA / `VACUUM INTO` steps, or the open error.
fn vacuum_into_backup_stage(
    db_path: &Path,
    stage_path: &Path,
) -> std::result::Result<(), frankensqlite::FrankenError> {
    let mut conn = open_franken_with_flags(
        &db_path.to_string_lossy(),
        FrankenOpenFlags::SQLITE_OPEN_READ_ONLY,
    )?;
    // Run the fallible steps in a closure so the close handling below executes
    // on both success and failure.
    let result = (|| {
        conn.execute(BACKUP_VACUUM_BUSY_TIMEOUT_PRAGMA)?;
        let path_str = stage_path.to_string_lossy();
        conn.execute_compat("VACUUM INTO ?", fparams![path_str.as_ref()])?;
        Ok(())
    })();
    if let Err(close_err) = conn.close_in_place() {
        tracing::warn!(
            error = %close_err,
            db_path = %db_path.display(),
            "create_backup: close_in_place failed after VACUUM INTO; falling back to best-effort close"
        );
        conn.close_best_effort_in_place();
    }
    result
}
1226
/// Whether a `VACUUM INTO` failure must be surfaced to the caller instead of
/// falling back to a raw bundle copy.
///
/// Delegates to `retryable_franken_error`; `create_backup` treats a `true`
/// result as transient contention.
fn backup_vacuum_error_requires_consistent_retry(err: &frankensqlite::FrankenError) -> bool {
    retryable_franken_error(err)
}
1230
/// Which members of a database bundle were moved by a bundle move.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub struct DatabaseBundleMoveResult {
    /// The main database file was moved.
    pub database: bool,
    /// The `-wal` sidecar was moved.
    pub wal: bool,
    /// The `-shm` sidecar was moved.
    pub shm: bool,
}

impl DatabaseBundleMoveResult {
    /// `true` when at least one bundle member was moved.
    pub fn moved_any(&self) -> bool {
        let Self { database, wal, shm } = *self;
        [database, wal, shm].contains(&true)
    }
}
1243
/// Build the path of a database sidecar by appending `suffix` (e.g. `"-wal"`)
/// to the full database path.
///
/// The suffix is appended to the raw OS string, so paths that are not valid
/// UTF-8 are preserved byte-for-byte instead of being mangled by a lossy
/// string conversion.
fn database_sidecar_path(path: &Path, suffix: &str) -> PathBuf {
    let mut sidecar = path.as_os_str().to_os_string();
    sidecar.push(suffix);
    PathBuf::from(sidecar)
}
1247
1248/// Move a database file and its WAL/SHM sidecars to a new basename.
1249///
1250/// This is used for non-destructive quarantine of a corrupted bundle before a
1251/// rebuild. If the main database file is already missing but orphaned sidecars
1252/// remain, those sidecars are still moved so a fresh database can be created
1253/// without inheriting stale WAL state.
1254pub(crate) fn move_database_bundle(
1255    source_root: &Path,
1256    destination_root: &Path,
1257) -> std::io::Result<DatabaseBundleMoveResult> {
1258    let mut moved = DatabaseBundleMoveResult::default();
1259    if let Some(parent) = destination_root.parent() {
1260        fs::create_dir_all(parent)?;
1261        sync_parent_directory(parent)?;
1262    }
1263
1264    if bundle_path_exists(source_root)? {
1265        fs::rename(source_root, destination_root)?;
1266        moved.database = true;
1267    }
1268
1269    let wal_source = database_sidecar_path(source_root, "-wal");
1270    if bundle_path_exists(&wal_source)? {
1271        fs::rename(&wal_source, database_sidecar_path(destination_root, "-wal"))?;
1272        moved.wal = true;
1273    }
1274
1275    let shm_source = database_sidecar_path(source_root, "-shm");
1276    if bundle_path_exists(&shm_source)? {
1277        fs::rename(&shm_source, database_sidecar_path(destination_root, "-shm"))?;
1278        moved.shm = true;
1279    }
1280
1281    if moved.moved_any() {
1282        if let Some(parent) = source_root.parent() {
1283            sync_parent_directory(parent)?;
1284        }
1285        if let Some(parent) = destination_root.parent() {
1286            sync_parent_directory(parent)?;
1287        }
1288    }
1289
1290    Ok(moved)
1291}
1292
/// Check whether anything exists at `path`, without following symlinks.
///
/// A dangling symlink counts as existing. `NotFound` maps to `Ok(false)`;
/// any other metadata error is returned.
fn bundle_path_exists(path: &Path) -> std::io::Result<bool> {
    let Err(err) = fs::symlink_metadata(path) else {
        return Ok(true);
    };
    if err.kind() == std::io::ErrorKind::NotFound {
        Ok(false)
    } else {
        Err(err)
    }
}
1300
1301fn copy_database_bundle(source_root: &Path, destination_root: &Path) -> Result<()> {
1302    if let Some(parent) = destination_root.parent() {
1303        fs::create_dir_all(parent).with_context(|| {
1304            format!(
1305                "creating destination directory for database bundle copy: {}",
1306                parent.display()
1307            )
1308        })?;
1309        sync_parent_directory(parent)
1310            .with_context(|| format!("syncing destination directory {}", parent.display()))?;
1311    }
1312
1313    if !copyable_bundle_file_exists(source_root)? {
1314        bail!(
1315            "database bundle root is missing before copy: {}",
1316            source_root.display()
1317        );
1318    }
1319
1320    let sidecars = copyable_bundle_sidecar_sources(source_root)?;
1321
1322    fs::copy(source_root, destination_root).with_context(|| {
1323        format!(
1324            "copying database bundle {} -> {}",
1325            source_root.display(),
1326            destination_root.display()
1327        )
1328    })?;
1329    sync_file_if_exists(destination_root).with_context(|| {
1330        format!(
1331            "syncing copied database bundle {}",
1332            destination_root.display()
1333        )
1334    })?;
1335
1336    for (source_sidecar, suffix) in sidecars {
1337        let destination_sidecar = database_sidecar_path(destination_root, suffix);
1338        fs::copy(&source_sidecar, &destination_sidecar).with_context(|| {
1339            format!(
1340                "copying database bundle sidecar {} -> {}",
1341                source_sidecar.display(),
1342                destination_sidecar.display()
1343            )
1344        })?;
1345        sync_file_if_exists(&destination_sidecar).with_context(|| {
1346            format!(
1347                "syncing copied database bundle sidecar {}",
1348                destination_sidecar.display()
1349            )
1350        })?;
1351    }
1352
1353    if let Some(parent) = destination_root.parent() {
1354        sync_parent_directory(parent)
1355            .with_context(|| format!("syncing destination directory {}", parent.display()))?;
1356    }
1357
1358    Ok(())
1359}
1360
1361fn copyable_bundle_sidecar_sources(source_root: &Path) -> Result<Vec<(PathBuf, &'static str)>> {
1362    let mut sidecars = Vec::new();
1363    for suffix in ["-wal", "-shm"] {
1364        let source_sidecar = database_sidecar_path(source_root, suffix);
1365        if copyable_bundle_file_exists(&source_sidecar)? {
1366            sidecars.push((source_sidecar, suffix));
1367        }
1368    }
1369    Ok(sidecars)
1370}
1371
1372fn copyable_bundle_file_exists(path: &Path) -> Result<bool> {
1373    match fs::symlink_metadata(path) {
1374        Ok(metadata) => {
1375            let file_type = metadata.file_type();
1376            if file_type.is_symlink() {
1377                bail!(
1378                    "refusing to copy database bundle symlink: {}",
1379                    path.display()
1380                );
1381            }
1382            if !file_type.is_file() {
1383                bail!(
1384                    "refusing to copy non-file database bundle path: {}",
1385                    path.display()
1386                );
1387            }
1388            Ok(true)
1389        }
1390        Err(err) if err.kind() == std::io::ErrorKind::NotFound => Ok(false),
1391        Err(err) => Err(err).with_context(|| {
1392            format!(
1393                "checking database bundle path before copy: {}",
1394                path.display()
1395            )
1396        }),
1397    }
1398}
1399
1400/// Helper to safely remove a database file and its potential WAL/SHM sidecars.
1401pub(crate) fn remove_database_files(path: &Path) -> std::io::Result<()> {
1402    let mut removed_any = false;
1403
1404    match fs::remove_file(path) {
1405        Ok(()) => removed_any = true,
1406        Err(err) if err.kind() == std::io::ErrorKind::NotFound => {}
1407        Err(err) => return Err(err),
1408    }
1409
1410    // Best-effort removal of sidecar files (ignore errors if they don't exist)
1411    for suffix in ["-wal", "-shm"] {
1412        match fs::remove_file(database_sidecar_path(path, suffix)) {
1413            Ok(()) => removed_any = true,
1414            Err(err) if err.kind() == std::io::ErrorKind::NotFound => {}
1415            Err(err) => return Err(err),
1416        }
1417    }
1418
1419    if removed_any && let Some(parent) = path.parent() {
1420        sync_parent_directory(parent)?;
1421    }
1422
1423    Ok(())
1424}
1425
/// Open the directory at `path` and sync it to disk.
#[cfg(not(windows))]
fn sync_parent_directory(path: &Path) -> std::io::Result<()> {
    let dir_handle = fs::File::open(path)?;
    dir_handle.sync_all()
}

/// Windows variant: a no-op that always succeeds.
#[cfg(windows)]
fn sync_parent_directory(_path: &Path) -> std::io::Result<()> {
    Ok(())
}
1435
/// Sync the file at `path` to disk, treating a missing file as success.
///
/// The file is opened directly rather than probed with `exists()` first, so a
/// file that disappears between the check and the open can no longer produce
/// a spurious `NotFound` error.
///
/// # Errors
///
/// Returns any error other than `NotFound` from opening the file, and any
/// error from the sync itself.
fn sync_file_if_exists(path: &Path) -> std::io::Result<()> {
    match fs::File::open(path) {
        Ok(file) => file.sync_all(),
        Err(err) if err.kind() == std::io::ErrorKind::NotFound => Ok(()),
        Err(err) => Err(err),
    }
}
1442
1443/// Remove old backup files, keeping only the most recent `keep_count`.
1444pub fn cleanup_old_backups(db_path: &Path, keep_count: usize) -> Result<(), std::io::Error> {
1445    let parent = match db_path.parent() {
1446        Some(p) => p,
1447        None => return Ok(()),
1448    };
1449
1450    let db_name = db_path.file_name().and_then(|n| n.to_str()).unwrap_or("db");
1451
1452    let prefix = format!("{}.backup.", db_name);
1453
1454    // Collect backup files matching the pattern
1455    let mut backups: Vec<(std::path::PathBuf, SystemTime)> = Vec::new();
1456
1457    if let Ok(entries) = fs::read_dir(parent) {
1458        for entry in entries.flatten() {
1459            let path = entry.path();
1460            if let Some(name) = path.file_name().and_then(|n| n.to_str())
1461                && is_backup_root_name(name, &prefix)
1462                && let Ok(meta) = fs::metadata(&path)
1463                && meta.is_file()
1464                && let Ok(mtime) = meta.modified()
1465            {
1466                backups.push((path, mtime));
1467            }
1468        }
1469    }
1470
1471    // Sort by modification time, newest first
1472    backups.sort_by_key(|entry| std::cmp::Reverse(entry.1));
1473
1474    // Delete oldest backups beyond keep_count
1475    for (path, _) in backups.into_iter().skip(keep_count) {
1476        let _ = fs::remove_file(&path);
1477
1478        // Also try to cleanup potential sidecars from fs::copy fallback
1479        let _ = fs::remove_file(database_sidecar_path(&path, "-wal"));
1480        let _ = fs::remove_file(database_sidecar_path(&path, "-shm"));
1481    }
1482
1483    Ok(())
1484}
1485
/// A historical database bundle (e.g. a backup) found next to the canonical
/// database, as produced by `discover_historical_database_bundles`.
#[derive(Debug, Clone)]
pub(crate) struct HistoricalDatabaseBundle {
    /// Path of the bundle's main database file.
    root_path: PathBuf,
    /// Size of the bundle in bytes, computed by `bundle_total_bytes`
    /// (main file plus `-wal`/`-shm` sidecars).
    total_bytes: u64,
    /// Modification time of the root file, computed by `file_mtime_ms`
    /// (milliseconds since the Unix epoch, `0` when unavailable).
    modified_at_ms: i64,
    /// Result of `historical_bundle_supports_direct_readonly` for the root path.
    supports_direct_readonly: bool,
    /// Probe data gathered for this bundle.
    probe: HistoricalBundleProbe,
}
1494
/// Probe data recorded for a historical bundle.
///
/// The derived `Default` is all-`None`, `false` and `0`.
// NOTE(review): field meanings below are read off the names; the code that
// fills this struct is outside this section — confirm there.
#[derive(Debug, Clone, Copy, Default)]
struct HistoricalBundleProbe {
    /// Schema version, if one was read.
    schema_version: Option<i64>,
    /// Number of FTS-related schema rows, if counted.
    fts_schema_rows: Option<i64>,
    /// Whether the FTS table could be queried.
    fts_queryable: bool,
    /// Largest message id seen.
    max_message_id: i64,
}
1502
/// Test-only snapshot of database health indicators.
// NOTE(review): field meanings are read off the names; the producer is not
// visible in this section.
#[cfg(test)]
#[allow(dead_code)]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(crate) struct SqliteDatabaseHealthProbe {
    /// Schema version, if one was read.
    pub schema_version: Option<i64>,
    /// Whether the quick integrity check passed.
    pub quick_check_ok: bool,
    /// Number of FTS-related schema rows.
    pub fts_schema_rows: i64,
    /// Whether the FTS table could be queried.
    pub fts_queryable: bool,
    /// Number of message rows.
    pub message_count: i64,
    /// Largest message id.
    pub max_message_id: i64,
}
1514
/// Outcome of an FTS consistency check, as returned by
/// `ensure_fts_consistency_via_rusqlite`.
// NOTE(review): variant semantics are inferred from the names; the logic that
// selects a variant is outside this section.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(crate) enum FtsConsistencyRepair {
    /// Nothing needed repair; `rows` is the FTS row count.
    AlreadyHealthy {
        rows: usize,
    },
    /// Only missing rows were added.
    IncrementalCatchUp {
        /// Rows inserted by the catch-up.
        inserted_rows: usize,
        /// Row total after the catch-up.
        total_rows: usize,
    },
    /// The index was rebuilt; `inserted_rows` rows were written.
    Rebuilt {
        inserted_rows: usize,
    },
}
1528
/// Running totals for a historical salvage pass.
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
pub struct HistoricalSalvageOutcome {
    pub bundles_considered: usize,
    pub bundles_imported: usize,
    pub conversations_imported: usize,
    pub messages_imported: usize,
}

impl HistoricalSalvageOutcome {
    /// Add every counter of `other` onto the matching counter of `self`.
    pub(crate) fn accumulate(&mut self, other: Self) {
        let Self {
            bundles_considered,
            bundles_imported,
            conversations_imported,
            messages_imported,
        } = other;

        self.bundles_considered += bundles_considered;
        self.bundles_imported += bundles_imported;
        self.conversations_imported += conversations_imported;
        self.messages_imported += messages_imported;
    }
}
1545
/// An open connection to a historical bundle plus a label for how it was opened.
#[derive(Debug)]
struct HistoricalReadConnection {
    /// The open connection.
    conn: FrankenConnection,
    /// Static label naming the access method used to open the bundle.
    method: &'static str,
    /// Optional temporary directory held for the lifetime of this value.
    // NOTE(review): presumably keeps a temp copy of the bundle alive while
    // `conn` reads from it — confirm at the construction site.
    _tempdir: Option<tempfile::TempDir>,
}
1552
/// Minimal schema for the core tables: `sources`, `agents`, `workspaces`,
/// `conversations`, `messages` and `snippets`.
///
/// The tables are plain `CREATE TABLE` statements declaring only primary keys
/// (no foreign keys, indexes or other constraints).
// NOTE(review): presumably used to stage rows recovered from historical
// bundles — confirm at the use site, which is outside this section.
const HISTORICAL_RECOVERY_CORE_SCHEMA: &str = r"
CREATE TABLE sources (
    id TEXT PRIMARY KEY,
    kind TEXT,
    host_label TEXT,
    machine_id TEXT,
    platform TEXT,
    config_json TEXT,
    created_at INTEGER,
    updated_at INTEGER
);
CREATE TABLE agents (
    id INTEGER PRIMARY KEY,
    slug TEXT,
    name TEXT,
    version TEXT,
    kind TEXT,
    created_at INTEGER,
    updated_at INTEGER
);
CREATE TABLE workspaces (
    id INTEGER PRIMARY KEY,
    path TEXT,
    display_name TEXT
);
CREATE TABLE conversations (
    id INTEGER PRIMARY KEY,
    agent_id INTEGER,
    workspace_id INTEGER,
    source_id TEXT,
    external_id TEXT,
    title TEXT,
    source_path TEXT,
    started_at INTEGER,
    ended_at INTEGER,
    approx_tokens INTEGER,
    metadata_json TEXT,
    origin_host TEXT,
    metadata_bin BLOB,
    total_input_tokens INTEGER,
    total_output_tokens INTEGER,
    total_cache_read_tokens INTEGER,
    total_cache_creation_tokens INTEGER,
    grand_total_tokens INTEGER,
    estimated_cost_usd REAL,
    primary_model TEXT,
    api_call_count INTEGER,
    tool_call_count INTEGER,
    user_message_count INTEGER,
    assistant_message_count INTEGER,
    last_message_idx INTEGER,
    last_message_created_at INTEGER
);
CREATE TABLE messages (
    id INTEGER PRIMARY KEY,
    conversation_id INTEGER,
    idx INTEGER,
    role TEXT,
    author TEXT,
    created_at INTEGER,
    content TEXT,
    extra_json TEXT,
    extra_bin BLOB
);
CREATE TABLE snippets (
    id INTEGER PRIMARY KEY,
    message_id INTEGER,
    file_path TEXT,
    start_line INTEGER,
    end_line INTEGER,
    language TEXT,
    snippet_text TEXT
);
";
/// Version number of the historical salvage ledger format.
const HISTORICAL_SALVAGE_LEDGER_VERSION: u32 = 2;
/// Version number of the historical salvage progress record format.
// NOTE(review): presumably the value stored in `HistoricalBundleProgress::progress_version` — confirm.
const HISTORICAL_SALVAGE_PROGRESS_VERSION: u32 = 1;
/// Start-time tolerance for source-path merging: five minutes, in milliseconds.
const SOURCE_PATH_MERGE_START_TOLERANCE_MS: i64 = 5 * 60 * 1000;
1630
/// Serializable progress record for importing one historical bundle.
// NOTE(review): field meanings are read off the names; the code that writes
// and consumes this record is outside this section.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct HistoricalBundleProgress {
    /// Format version of this record.
    progress_version: u32,
    /// Bundle path, stored as a string.
    path: String,
    /// Bundle size in bytes.
    bytes: u64,
    /// Bundle modification time (milliseconds).
    modified_at_ms: i64,
    /// Label of the access method used to read the bundle.
    method: String,
    /// Id of the last source row that was fully imported.
    last_completed_source_row_id: i64,
    /// Conversations imported so far.
    conversations_imported: usize,
    /// Messages imported so far.
    messages_imported: usize,
    /// Time this record was last updated (milliseconds).
    updated_at_ms: i64,
}
1643
/// One conversation queued for import from a historical bundle.
// NOTE(review): whether the ids refer to the source bundle or the target
// database cannot be determined from this section — confirm at the use site.
#[derive(Debug, Clone)]
struct HistoricalBatchEntry {
    /// Row id of the conversation in the source bundle.
    source_row_id: i64,
    /// Agent id for the conversation.
    agent_id: i64,
    /// Workspace id for the conversation, if it has one.
    workspace_id: Option<i64>,
    /// The conversation payload.
    conversation: Conversation,
}
1651
/// Counters for a historical batch import; the derived `Default` is all zeros.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
struct HistoricalBatchImportTotals {
    /// Number of source rows inserted.
    inserted_source_rows: usize,
    /// Number of messages inserted.
    inserted_messages: usize,
}
1657
1658fn historical_bundle_root_paths(db_path: &Path) -> Vec<PathBuf> {
1659    let mut roots = Vec::new();
1660    let Some(parent) = db_path.parent() else {
1661        return roots;
1662    };
1663    let db_name = db_path
1664        .file_name()
1665        .and_then(|n| n.to_str())
1666        .unwrap_or("agent_search.db");
1667    let db_stem = db_path
1668        .file_stem()
1669        .and_then(|n| n.to_str())
1670        .unwrap_or("agent_search");
1671
1672    let mut push_root = |path: PathBuf| {
1673        if path == db_path {
1674            return;
1675        }
1676        if !roots.iter().any(|existing| existing == &path) {
1677            roots.push(path);
1678        }
1679    };
1680
1681    if let Ok(entries) = fs::read_dir(parent) {
1682        for entry in entries.flatten() {
1683            let path = entry.path();
1684            let Some(name) = path.file_name().and_then(|n| n.to_str()) else {
1685                continue;
1686            };
1687            if has_db_sidecar_suffix(name) {
1688                continue;
1689            }
1690            if name.starts_with(&format!("{db_name}.backup."))
1691                || name.starts_with(&format!("{db_stem}.corrupt."))
1692            {
1693                push_root(path);
1694            }
1695        }
1696    }
1697
1698    let backups_dir = parent.join("backups");
1699    if let Ok(entries) = fs::read_dir(backups_dir) {
1700        for entry in entries.flatten() {
1701            let path = entry.path();
1702            let Some(name) = path.file_name().and_then(|n| n.to_str()) else {
1703                continue;
1704            };
1705            if has_db_sidecar_suffix(name) {
1706                continue;
1707            }
1708            if name.starts_with(&format!("{db_name}.")) && name.ends_with(".bak") {
1709                push_root(path);
1710            }
1711        }
1712    }
1713
1714    push_named_database_children(&mut roots, db_path, &parent.join("repair-lab"), db_name);
1715    push_named_database_children(&mut roots, db_path, &parent.join("snapshots"), db_name);
1716
1717    roots
1718}
1719
/// For every entry of `dir`, appends `<entry>/<db_name>` to `roots` when that
/// file exists.
///
/// The canonical database path is never added, and paths already present in
/// `roots` are not duplicated. An unreadable or missing `dir` adds nothing.
fn push_named_database_children(
    roots: &mut Vec<PathBuf>,
    canonical_db_path: &Path,
    dir: &Path,
    db_name: &str,
) {
    let Ok(entries) = fs::read_dir(dir) else {
        return;
    };

    let candidates = entries.flatten().map(|entry| entry.path().join(db_name));
    for candidate in candidates {
        let is_canonical = candidate == canonical_db_path;
        if is_canonical || !candidate.exists() || roots.contains(&candidate) {
            continue;
        }
        roots.push(candidate);
    }
}
1738
/// Returns the modification time of `path` in milliseconds since the Unix
/// epoch, or `0` when the metadata cannot be read or the timestamp predates
/// the epoch.
fn file_mtime_ms(path: &Path) -> i64 {
    let Ok(modified) = fs::metadata(path).and_then(|meta| meta.modified()) else {
        return 0;
    };
    match modified.duration_since(UNIX_EPOCH) {
        Ok(elapsed) => elapsed.as_millis() as i64,
        Err(_) => 0,
    }
}
1747
1748fn bundle_total_bytes(root_path: &Path) -> u64 {
1749    let mut total = fs::metadata(root_path).map(|meta| meta.len()).unwrap_or(0);
1750    for suffix in ["-wal", "-shm"] {
1751        let sidecar = database_sidecar_path(root_path, suffix);
1752        total = total.saturating_add(fs::metadata(sidecar).map(|meta| meta.len()).unwrap_or(0));
1753    }
1754    total
1755}
1756
1757pub(crate) fn discover_historical_database_bundles(
1758    db_path: &Path,
1759) -> Vec<HistoricalDatabaseBundle> {
1760    let mut bundles: Vec<_> = historical_bundle_root_paths(db_path)
1761        .into_iter()
1762        .filter(|root| root.exists())
1763        .map(|root_path| {
1764            let modified_at_ms = file_mtime_ms(&root_path);
1765            let total_bytes = bundle_total_bytes(&root_path);
1766            let supports_direct_readonly = historical_bundle_supports_direct_readonly(&root_path);
1767            let probe = probe_historical_bundle(&root_path, supports_direct_readonly);
1768            HistoricalDatabaseBundle {
1769                modified_at_ms,
1770                total_bytes,
1771                supports_direct_readonly,
1772                root_path,
1773                probe,
1774            }
1775        })
1776        .filter(|bundle| bundle.total_bytes > 0)
1777        .collect();
1778
1779    fn bundle_priority(path: &Path) -> i32 {
1780        let path_str = path.to_string_lossy();
1781        if path_str.contains("/repair-lab/replay-") {
1782            return 5;
1783        }
1784        if path_str.contains("/repair-lab/") {
1785            return 4;
1786        }
1787        if path_str.contains("/snapshots/") {
1788            return 3;
1789        }
1790        if path_str.contains(".corrupt.") || path_str.contains("failed-baseline-seed") {
1791            return 0;
1792        }
1793        1
1794    }
1795
1796    fn bundle_health_rank(bundle: &HistoricalDatabaseBundle) -> i32 {
1797        // Classify FTS health. The probe only sets `fts_queryable = true`
1798        // when `fts_schema_rows == Some(1)` (see
1799        // `historical_bundle_fts_queryable_via_frankensqlite`), so we have
1800        // two legitimate "clean" shapes for a bundle:
1801        //
1802        //   * `fts_schema_rows == Some(1) && fts_queryable` — a pre-V14
1803        //     bundle where the FTS virtual table was eagerly created by
1804        //     migration and is queryable right now.
1805        //
1806        //   * `fts_schema_rows == Some(0) && schema_version == Some(V14+)` —
1807        //     a modern bundle where `MIGRATION_V14` dropped fts_messages on
1808        //     purpose and cass recreates it lazily via
1809        //     `ensure_search_fallback_fts_consistency` on the first open.
1810        //     Gating on `schema_version == CURRENT_SCHEMA_VERSION` is critical
1811        //     so an incomplete pre-V14 bundle with 0 fts rows is not promoted
1812        //     alongside real lazy-V14+ bundles. A `None` schema_version
1813        //     (schema marker unreadable) is excluded for the same reason.
1814        //
1815        // Everything else — `Some(1)` without queryability, `Some(n)` for
1816        // n >= 2 (duplicated CREATE VIRTUAL TABLE rows from a broken legacy
1817        // rebuild), `None` entirely, or `Some(0)` on a non-current schema —
1818        // is not "fts clean".
1819        let fts_clean = match bundle.probe.fts_schema_rows {
1820            Some(1) => bundle.probe.fts_queryable,
1821            Some(0) => bundle.probe.schema_version == Some(CURRENT_SCHEMA_VERSION),
1822            _ => false,
1823        };
1824
1825        let clean_schema14_fts =
1826            bundle.probe.schema_version == Some(CURRENT_SCHEMA_VERSION) && fts_clean;
1827        if clean_schema14_fts {
1828            return 5;
1829        }
1830
1831        if fts_clean {
1832            return 4;
1833        }
1834
1835        if bundle.probe.schema_version == Some(CURRENT_SCHEMA_VERSION)
1836            && bundle.supports_direct_readonly
1837        {
1838            return 3;
1839        }
1840
1841        if bundle.supports_direct_readonly {
1842            return 2;
1843        }
1844
1845        1
1846    }
1847
1848    bundles.sort_by(|left, right| {
1849        bundle_health_rank(right)
1850            .cmp(&bundle_health_rank(left))
1851            .then_with(|| right.probe.max_message_id.cmp(&left.probe.max_message_id))
1852            .then_with(|| bundle_priority(&right.root_path).cmp(&bundle_priority(&left.root_path)))
1853            .then_with(|| {
1854                right
1855                    .supports_direct_readonly
1856                    .cmp(&left.supports_direct_readonly)
1857            })
1858            .then_with(|| right.total_bytes.cmp(&left.total_bytes))
1859            .then_with(|| right.modified_at_ms.cmp(&left.modified_at_ms))
1860            .then_with(|| right.root_path.cmp(&left.root_path))
1861    });
1862    bundles
1863}
1864
1865fn probe_historical_bundle(
1866    root_path: &Path,
1867    supports_direct_readonly: bool,
1868) -> HistoricalBundleProbe {
1869    if !supports_direct_readonly {
1870        return HistoricalBundleProbe::default();
1871    }
1872
1873    let Ok(conn) = open_historical_bundle_readonly(root_path) else {
1874        return HistoricalBundleProbe::default();
1875    };
1876
1877    let schema_version = read_meta_schema_version(&conn).ok().flatten();
1878    let fts_schema_rows: Option<i64> = conn
1879        .query_row_map(
1880            "SELECT COUNT(*) FROM sqlite_master WHERE name = 'fts_messages'",
1881            fparams![],
1882            |row| row.get_typed(0),
1883        )
1884        .ok();
1885    let fts_queryable =
1886        historical_bundle_fts_queryable_via_frankensqlite(root_path, fts_schema_rows);
1887    let max_message_id: i64 = conn
1888        .query_row_map(
1889            "SELECT COALESCE(MAX(id), 0) FROM messages",
1890            fparams![],
1891            |row| row.get_typed(0),
1892        )
1893        .unwrap_or(0);
1894
1895    HistoricalBundleProbe {
1896        schema_version,
1897        fts_schema_rows,
1898        fts_queryable,
1899        max_message_id,
1900    }
1901}
1902
1903fn historical_bundle_fts_queryable_via_frankensqlite(
1904    root_path: &Path,
1905    fts_schema_rows: Option<i64>,
1906) -> bool {
1907    matches!(fts_schema_rows, Some(1))
1908        && FrankenStorage::open_readonly(root_path)
1909            .map(|storage| {
1910                storage
1911                    .raw()
1912                    .query("SELECT rowid FROM fts_messages LIMIT 1")
1913                    .is_ok()
1914            })
1915            .unwrap_or(false)
1916}
1917
1918fn historical_bundle_supports_direct_readonly(root_path: &Path) -> bool {
1919    open_historical_bundle_readonly(root_path)
1920        .and_then(|conn| historical_bundle_has_queryable_core_tables(&conn))
1921        .is_ok()
1922}
1923
1924fn historical_table_exists(conn: &FrankenConnection, table: &str) -> Result<bool> {
1925    let found: Option<i64> = conn
1926        .query_row_map(
1927            "SELECT 1 FROM sqlite_master WHERE type = 'table' AND name = ?1 LIMIT 1",
1928            fparams![table],
1929            |row| row.get_typed(0),
1930        )
1931        .optional()
1932        .with_context(|| format!("checking for historical table {table}"))?;
1933    Ok(found.is_some())
1934}
1935
1936fn probe_historical_table_reads(conn: &FrankenConnection, table: &str) -> Result<()> {
1937    if !historical_table_exists(conn, table)? {
1938        return Err(anyhow!(
1939            "historical database missing required table {table}"
1940        ));
1941    }
1942
1943    let sql = format!("SELECT rowid FROM {table} LIMIT 1");
1944    let _: Option<i64> = conn
1945        .query_row_map(&sql, fparams![], |row| row.get_typed(0))
1946        .optional()
1947        .with_context(|| format!("probing rows from historical table {table}"))?;
1948    Ok(())
1949}
1950
1951fn historical_bundle_has_queryable_core_tables(conn: &FrankenConnection) -> Result<()> {
1952    probe_historical_table_reads(conn, "conversations")?;
1953    probe_historical_table_reads(conn, "messages")?;
1954    Ok(())
1955}
1956
1957fn open_historical_bundle_readonly(root_path: &Path) -> Result<FrankenConnection> {
1958    let path_str = root_path.to_string_lossy();
1959    let flags = FrankenOpenFlags::SQLITE_OPEN_READ_ONLY;
1960    let conn = open_franken_with_flags(&path_str, flags)
1961        .with_context(|| format!("opening historical database {}", root_path.display()))?;
1962    Ok(conn)
1963}
1964
/// Returns `true` when `line` is an `INSERT` into one of the tables that the
/// recovery importer replays.
///
/// A line qualifies when it starts with `INSERT INTO ` or
/// `INSERT OR IGNORE INTO `, immediately followed by one of the recoverable
/// table names wrapped in matching single or double quotes. Matching is
/// case-sensitive and anchored at the very start of the line.
///
/// This runs once per line of `sqlite3 .recover` output, so it matches by
/// stripping prefixes in place instead of formatting candidate strings.
fn is_recoverable_insert_line(line: &str) -> bool {
    const RECOVERABLE_TABLES: [&str; 6] = [
        "sources",
        "agents",
        "workspaces",
        "conversations",
        "messages",
        "snippets",
    ];

    let Some(target) = line
        .strip_prefix("INSERT INTO ")
        .or_else(|| line.strip_prefix("INSERT OR IGNORE INTO "))
    else {
        return false;
    };

    let mut chars = target.chars();
    let quote = match chars.next() {
        Some(q @ ('\'' | '"')) => q,
        _ => return false,
    };
    let after_quote = chars.as_str();

    RECOVERABLE_TABLES.iter().any(|&table| {
        after_quote
            .strip_prefix(table)
            // The closing quote must match the opening one and follow the
            // table name directly, so `'messages_fts'` is rejected.
            .is_some_and(|tail| tail.starts_with(quote))
    })
}
1982
1983fn recover_historical_bundle_via_sqlite3(
1984    bundle: &HistoricalDatabaseBundle,
1985) -> Result<HistoricalReadConnection> {
1986    let tempdir = tempfile::TempDir::new().context("creating temporary salvage directory")?;
1987    let recovered_db = tempdir.path().join("historical-recovered.db");
1988    let temp_conn = FrankenConnection::open(recovered_db.to_string_lossy().as_ref())
1989        .with_context(|| format!("creating recovered database {}", recovered_db.display()))?;
1990    temp_conn
1991        .execute_batch(HISTORICAL_RECOVERY_CORE_SCHEMA)
1992        .with_context(|| format!("initializing recovered schema {}", recovered_db.display()))?;
1993    drop(temp_conn);
1994
1995    let bundle_uri = format!("file:{}?immutable=1", bundle.root_path.to_string_lossy());
1996    let mut recover = Command::new("sqlite3")
1997        .arg(&bundle_uri)
1998        .arg(".recover")
1999        .stdout(Stdio::piped())
2000        .spawn()
2001        .with_context(|| {
2002            format!(
2003                "launching sqlite3 .recover for historical bundle {}",
2004                bundle.root_path.display()
2005            )
2006        })?;
2007    let recover_stdout = recover
2008        .stdout
2009        .take()
2010        .context("capturing sqlite3 .recover stdout")?;
2011
2012    let mut importer = Command::new("sqlite3")
2013        .arg(&recovered_db)
2014        .stdin(Stdio::piped())
2015        .spawn()
2016        .with_context(|| {
2017            format!(
2018                "launching sqlite3 importer for recovered bundle {}",
2019                recovered_db.display()
2020            )
2021        })?;
2022
2023    {
2024        let importer_stdin = importer
2025            .stdin
2026            .as_mut()
2027            .context("opening sqlite3 importer stdin")?;
2028        importer_stdin
2029            .write_all(b"BEGIN;\n")
2030            .context("starting recovery import transaction")?;
2031
2032        let reader = BufReader::new(recover_stdout);
2033        for line in reader.lines() {
2034            let line = line.context("reading sqlite3 .recover output")?;
2035            if is_recoverable_insert_line(&line) {
2036                importer_stdin
2037                    .write_all(line.as_bytes())
2038                    .context("writing recovered INSERT")?;
2039                importer_stdin
2040                    .write_all(b"\n")
2041                    .context("writing recovered INSERT newline")?;
2042            }
2043        }
2044
2045        importer_stdin
2046            .write_all(b"COMMIT;\n")
2047            .context("committing recovery import transaction")?;
2048    }
2049
2050    let recover_status = recover
2051        .wait()
2052        .context("waiting for sqlite3 .recover process")?;
2053    if !recover_status.success() {
2054        anyhow::bail!(
2055            "sqlite3 .recover exited with status {} for {}",
2056            recover_status,
2057            bundle.root_path.display()
2058        );
2059    }
2060
2061    let importer_status = importer
2062        .wait()
2063        .context("waiting for sqlite3 recovery importer")?;
2064    if !importer_status.success() {
2065        anyhow::bail!(
2066            "sqlite3 recovery importer exited with status {} for {}",
2067            importer_status,
2068            recovered_db.display()
2069        );
2070    }
2071
2072    let conn = open_historical_bundle_readonly(&recovered_db)?;
2073    historical_bundle_has_queryable_core_tables(&conn)?;
2074    Ok(HistoricalReadConnection {
2075        conn,
2076        method: "sqlite3-recover",
2077        _tempdir: Some(tempdir),
2078    })
2079}
2080
2081fn open_historical_bundle_for_salvage(
2082    bundle: &HistoricalDatabaseBundle,
2083) -> Result<HistoricalReadConnection> {
2084    match open_historical_bundle_readonly(&bundle.root_path) {
2085        Ok(conn) => {
2086            if historical_bundle_has_queryable_core_tables(&conn).is_ok() {
2087                return Ok(HistoricalReadConnection {
2088                    conn,
2089                    method: "direct-readonly",
2090                    _tempdir: None,
2091                });
2092            }
2093        }
2094        Err(err) => {
2095            tracing::warn!(
2096                path = %bundle.root_path.display(),
2097                error = %err,
2098                "historical bundle direct open failed; falling back to sqlite3 .recover"
2099            );
2100        }
2101    }
2102
2103    recover_historical_bundle_via_sqlite3(bundle)
2104}
2105
2106fn historical_bundle_counts(conn: &FrankenConnection) -> Result<(usize, usize)> {
2107    let conversations: i64 =
2108        conn.query_row_map("SELECT COUNT(*) FROM conversations", fparams![], |row| {
2109            row.get_typed(0)
2110        })?;
2111    let messages: i64 = conn.query_row_map("SELECT COUNT(*) FROM messages", fparams![], |row| {
2112        row.get_typed(0)
2113    })?;
2114    Ok((
2115        usize::try_from(conversations.max(0)).unwrap_or(usize::MAX),
2116        usize::try_from(messages.max(0)).unwrap_or(usize::MAX),
2117    ))
2118}
2119
2120fn clear_seeded_runtime_meta(conn: &FrankenConnection) -> Result<()> {
2121    conn.execute(
2122        "DELETE FROM meta
2123         WHERE key LIKE 'historical_bundle_salvaged:%'
2124            OR key IN ('last_scan_ts', 'last_indexed_at', 'last_embedded_message_id')",
2125    )?;
2126    Ok(())
2127}
2128
2129fn record_historical_bundle_import(
2130    conn: &FrankenConnection,
2131    bundle: &HistoricalDatabaseBundle,
2132    method: &str,
2133    conversations_imported: usize,
2134    messages_imported: usize,
2135) -> Result<()> {
2136    let key = FrankenStorage::historical_bundle_meta_key(bundle);
2137    let value = serde_json::json!({
2138        "salvage_version": HISTORICAL_SALVAGE_LEDGER_VERSION,
2139        "path": bundle.root_path.display().to_string(),
2140        "bytes": bundle.total_bytes,
2141        "modified_at_ms": bundle.modified_at_ms,
2142        "method": method,
2143        "conversations_imported": conversations_imported,
2144        "messages_imported": messages_imported,
2145        "recorded_at_ms": FrankenStorage::now_millis(),
2146    });
2147    let value_str = serde_json::to_string(&value)?;
2148    conn.execute_compat(
2149        "INSERT OR REPLACE INTO meta(key, value) VALUES(?1, ?2)",
2150        fparams![key, value_str],
2151    )?;
2152    Ok(())
2153}
2154
2155fn finalize_seeded_canonical_bundle_via_rusqlite(
2156    canonical_db_path: &Path,
2157    bundle: &HistoricalDatabaseBundle,
2158    conversations_imported: usize,
2159    messages_imported: usize,
2160) -> Result<()> {
2161    let _fts_repair =
2162        ensure_fts_consistency_via_rusqlite(canonical_db_path).with_context(|| {
2163            format!(
2164                "repairing staged canonical FTS consistency before finalization: {}",
2165                canonical_db_path.display()
2166            )
2167        })?;
2168
2169    let path_str = canonical_db_path.to_string_lossy();
2170    let conn = FrankenConnection::open(path_str.as_ref()).with_context(|| {
2171        format!(
2172            "opening seeded canonical database for post-seed finalization: {}",
2173            canonical_db_path.display()
2174        )
2175    })?;
2176    conn.execute("PRAGMA busy_timeout = 30000;")
2177        .with_context(|| {
2178            format!(
2179                "configuring busy timeout for seeded canonical database {}",
2180                canonical_db_path.display()
2181            )
2182        })?;
2183    let schema_version = read_meta_schema_version(&conn)?;
2184
2185    if let Some(version) = schema_version
2186        && version < CURRENT_SCHEMA_VERSION
2187        && version != 13
2188    {
2189        anyhow::bail!(
2190            "seeded canonical bundle schema_version {version} is too old for baseline import and cannot be finalized automatically"
2191        );
2192    }
2193
2194    clear_seeded_runtime_meta(&conn)?;
2195
2196    conn.execute_compat(
2197        "INSERT OR REPLACE INTO meta(key, value) VALUES('schema_version', ?1)",
2198        fparams![CURRENT_SCHEMA_VERSION.to_string()],
2199    )?;
2200
2201    conn.execute_compat(
2202        "INSERT OR IGNORE INTO _schema_migrations(version, name) VALUES(?1, 'fts_contentless')",
2203        fparams![CURRENT_SCHEMA_VERSION],
2204    )?;
2205    record_historical_bundle_import(
2206        &conn,
2207        bundle,
2208        "baseline-bulk-sql-copy",
2209        conversations_imported,
2210        messages_imported,
2211    )?;
2212    Ok(())
2213}
2214
2215fn read_meta_schema_version(conn: &FrankenConnection) -> Result<Option<i64>> {
2216    let version: Option<String> = conn
2217        .query_row_map(
2218            "SELECT value FROM meta WHERE key = 'schema_version'",
2219            fparams![],
2220            |row| row.get_typed(0),
2221        )
2222        .optional()?;
2223    Ok(version.and_then(|raw| raw.parse::<i64>().ok()))
2224}
2225
/// Test helper: counts `sqlite_master` rows named `fts_messages`.
#[cfg(test)]
fn franken_fts_schema_rows(conn: &FrankenConnection) -> Result<i64> {
    const FTS_ROWS_SQL: &str = "SELECT COUNT(*) FROM sqlite_master WHERE name = 'fts_messages'";

    let counted = conn.query_row_map(FTS_ROWS_SQL, fparams![], |row| row.get_typed(0));
    counted.context("counting sqlite_master rows for fts_messages via frankensqlite")
}
2235
/// Test helper: true when a one-row read from `fts_messages` succeeds.
#[cfg(test)]
fn franken_fts_limit_probe(conn: &FrankenConnection) -> bool {
    const FTS_PROBE_SQL: &str = "SELECT rowid FROM fts_messages LIMIT 1";
    conn.query(FTS_PROBE_SQL).is_ok()
}
2240
/// Test helper: opens `db_path` and gathers a health snapshot.
///
/// Collects the schema version, the `PRAGMA quick_check(1)` verdict and the
/// FTS state. Message count and highest message id are only queried when the
/// quick check reports `ok`; otherwise both are reported as `0`.
#[cfg(test)]
#[allow(dead_code)]
pub(crate) fn probe_database_health_via_frankensqlite(
    db_path: &Path,
) -> Result<SqliteDatabaseHealthProbe> {
    let db_location = db_path.to_string_lossy();
    let conn = FrankenConnection::open(db_location.as_ref()).with_context(|| {
        format!(
            "opening frankensqlite db at {} for database health probe",
            db_path.display()
        )
    })?;
    conn.execute_batch("PRAGMA busy_timeout = 30000;")
        .with_context(|| {
            format!(
                "configuring busy timeout for database health probe at {}",
                db_path.display()
            )
        })?;

    let schema_version = read_meta_schema_version(&conn)?;

    let quick_check_status: String = conn
        .query_row_map("PRAGMA quick_check(1)", fparams![], |row| row.get_typed(0))
        .with_context(|| format!("running PRAGMA quick_check(1) for {}", db_path.display()))?;
    let quick_check_ok = quick_check_status.trim().eq_ignore_ascii_case("ok");

    let fts_schema_rows = franken_fts_schema_rows(&conn)?;
    let fts_queryable = fts_schema_rows == 1 && franken_fts_limit_probe(&conn);

    // Row statistics are skipped entirely on a database that failed the check.
    let (message_count, max_message_id) = if quick_check_ok {
        let count: i64 = conn
            .query_row_map("SELECT COUNT(*) FROM messages", fparams![], |row| {
                row.get_typed(0)
            })
            .context("counting messages during frankensqlite database health probe")?;
        let highest_id: i64 = conn
            .query_row_map(
                "SELECT COALESCE(MAX(id), 0) FROM messages",
                fparams![],
                |row| row.get_typed(0),
            )
            .context("reading max message id during frankensqlite database health probe")?;
        (count, highest_id)
    } else {
        (0, 0)
    };

    Ok(SqliteDatabaseHealthProbe {
        schema_version,
        quick_check_ok,
        fts_schema_rows,
        fts_queryable,
        message_count,
        max_message_id,
    })
}
2302
/// A historical bundle copied into a temporary staging directory, ready to be
/// finalized and promoted to the canonical database path.
struct StagedHistoricalSeed {
    // Staging directory that contains `db_path`; promotion also places the
    // pre-seed backup of the canonical database inside it.
    tempdir: tempfile::TempDir,
    // Location of the staged database copy inside `tempdir`.
    db_path: PathBuf,
}
2307
2308fn stage_historical_bundle_for_seed(
2309    canonical_db_path: &Path,
2310    bundle: &HistoricalDatabaseBundle,
2311) -> Result<StagedHistoricalSeed> {
2312    let canonical_parent = canonical_db_path.parent().unwrap_or_else(|| Path::new("."));
2313    fs::create_dir_all(canonical_parent).with_context(|| {
2314        format!(
2315            "creating canonical database directory before bulk historical seed import: {}",
2316            canonical_parent.display()
2317        )
2318    })?;
2319    let tempdir = tempfile::TempDir::new_in(canonical_parent)
2320        .context("creating temporary baseline seed directory")?;
2321    let staged_seed_db = tempdir.path().join("baseline-seed-output.db");
2322    copy_database_bundle(&bundle.root_path, &staged_seed_db)?;
2323
2324    Ok(StagedHistoricalSeed {
2325        tempdir,
2326        db_path: staged_seed_db,
2327    })
2328}
2329
2330fn promote_staged_historical_seed(
2331    canonical_db_path: &Path,
2332    staged_seed: &StagedHistoricalSeed,
2333) -> Result<()> {
2334    let canonical_backup = staged_seed
2335        .tempdir
2336        .path()
2337        .join("pre-seed-canonical-backup.db");
2338    let had_canonical = canonical_db_path.exists()
2339        || database_sidecar_path(canonical_db_path, "-wal").exists()
2340        || database_sidecar_path(canonical_db_path, "-shm").exists();
2341
2342    if had_canonical {
2343        move_database_bundle(canonical_db_path, &canonical_backup).with_context(|| {
2344            format!(
2345                "backing up canonical database before promoting staged historical seed import: {}",
2346                canonical_db_path.display()
2347            )
2348        })?;
2349    }
2350
2351    if let Err(err) =
2352        move_database_bundle(&staged_seed.db_path, canonical_db_path).with_context(|| {
2353            format!(
2354                "promoting staged historical seed database bundle {} into canonical path {}",
2355                staged_seed.db_path.display(),
2356                canonical_db_path.display()
2357            )
2358        })
2359    {
2360        if had_canonical {
2361            let _ = move_database_bundle(&canonical_backup, canonical_db_path);
2362        }
2363        return Err(err);
2364    }
2365
2366    Ok(())
2367}
2368
2369pub(crate) fn seed_canonical_from_best_historical_bundle(
2370    canonical_db_path: &Path,
2371) -> Result<Option<HistoricalSalvageOutcome>> {
2372    let ordered_bundles = discover_historical_database_bundles(canonical_db_path);
2373    let mut last_seed_error: Option<anyhow::Error> = None;
2374    for bundle in ordered_bundles
2375        .into_iter()
2376        .filter(|bundle| bundle.supports_direct_readonly)
2377    {
2378        if let Some(version) = bundle.probe.schema_version
2379            && version < 13
2380        {
2381            let err = anyhow!(
2382                "historical bundle {} schema_version {version} is too old for baseline import",
2383                bundle.root_path.display()
2384            );
2385            tracing::warn!(
2386                path = %bundle.root_path.display(),
2387                schema_version = version,
2388                "historical bundle is too old for baseline seed import"
2389            );
2390            last_seed_error = Some(err);
2391            continue;
2392        }
2393
2394        let source = open_historical_bundle_for_salvage(&bundle).with_context(|| {
2395            format!(
2396                "opening historical seed bundle {} for baseline import",
2397                bundle.root_path.display()
2398            )
2399        })?;
2400        let (conversations_imported, messages_imported) = historical_bundle_counts(&source.conn)?;
2401
2402        let staged_seed = match stage_historical_bundle_for_seed(canonical_db_path, &bundle) {
2403            Ok(staged_seed) => staged_seed,
2404            Err(err) => {
2405                tracing::warn!(
2406                    path = %bundle.root_path.display(),
2407                    error = %err,
2408                    "bulk baseline seed staging from historical bundle failed; trying next candidate"
2409                );
2410                last_seed_error = Some(err);
2411                continue;
2412            }
2413        };
2414
2415        if let Err(err) = finalize_seeded_canonical_bundle_via_rusqlite(
2416            &staged_seed.db_path,
2417            &bundle,
2418            conversations_imported,
2419            messages_imported,
2420        ) {
2421            tracing::warn!(
2422                path = %bundle.root_path.display(),
2423                error = %err,
2424                "finalizing staged historical seed import failed; trying next candidate"
2425            );
2426            last_seed_error = Some(err);
2427            continue;
2428        }
2429
2430        if let Err(err) = promote_staged_historical_seed(canonical_db_path, &staged_seed) {
2431            tracing::warn!(
2432                path = %bundle.root_path.display(),
2433                error = %err,
2434                "promoting staged historical seed import failed; trying next candidate"
2435            );
2436            last_seed_error = Some(err);
2437            continue;
2438        }
2439
2440        tracing::info!(
2441            path = %bundle.root_path.display(),
2442            conversations_imported,
2443            messages_imported,
2444            "seeded empty canonical database from largest healthy historical bundle"
2445        );
2446
2447        return Ok(Some(HistoricalSalvageOutcome {
2448            bundles_considered: 0,
2449            bundles_imported: 1,
2450            conversations_imported,
2451            messages_imported,
2452        }));
2453    }
2454    if let Some(err) = last_seed_error {
2455        return Err(err);
2456    }
2457    Ok(None)
2458}
2459
2460fn parse_json_column(value: Option<String>) -> serde_json::Value {
2461    value
2462        .and_then(|raw| serde_json::from_str(&raw).ok())
2463        .unwrap_or(serde_json::Value::Null)
2464}
2465
/// Object key used to carry an unparsed JSON string inside a
/// `serde_json::Value`: `wrap_historical_raw_json` builds a single-entry object
/// under this key and `historical_raw_json` recognises it again.
const HISTORICAL_RAW_JSON_SENTINEL_KEY: &str = "__cass_historical_raw_json__";
2467
2468fn wrap_historical_raw_json(raw: String) -> serde_json::Value {
2469    serde_json::json!({ HISTORICAL_RAW_JSON_SENTINEL_KEY: raw })
2470}
2471
2472fn historical_raw_json(value: &serde_json::Value) -> Option<&str> {
2473    match value {
2474        serde_json::Value::Object(map) if map.len() == 1 => map
2475            .get(HISTORICAL_RAW_JSON_SENTINEL_KEY)
2476            .and_then(serde_json::Value::as_str),
2477        _ => None,
2478    }
2479}
2480
2481fn parse_historical_json_column(value: Option<String>) -> serde_json::Value {
2482    match value {
2483        Some(raw) if raw.trim().is_empty() => serde_json::Value::Null,
2484        Some(raw) => wrap_historical_raw_json(raw),
2485        None => serde_json::Value::Null,
2486    }
2487}
2488
/// True when the `CASS_DEBUG_HISTORICAL_SALVAGE` environment variable is set,
/// regardless of its value.
fn historical_salvage_debug_enabled() -> bool {
    matches!(
        std::env::var_os("CASS_DEBUG_HISTORICAL_SALVAGE"),
        Some(_)
    )
}
2492
/// Size limits for one historical import batch, as produced by
/// `historical_import_batch_limits`.
// NOTE(review): how each limit is enforced is outside this section; the field
// notes below are inferred from names and the environment overrides.
#[derive(Debug, Clone, Copy)]
struct HistoricalImportBatchLimits {
    // Presumably the maximum number of conversations per batch
    // (override: CASS_HISTORICAL_IMPORT_BATCH_CONVERSATIONS).
    conversations: usize,
    // Presumably the maximum number of messages per batch
    // (override: CASS_HISTORICAL_IMPORT_BATCH_MESSAGES).
    messages: usize,
    // Presumably the maximum payload size per batch
    // (override: CASS_HISTORICAL_IMPORT_BATCH_CHARS).
    payload_chars: usize,
}
2499
2500fn env_positive_usize(key: &str) -> Option<usize> {
2501    dotenvy::var(key)
2502        .ok()
2503        .and_then(|value| value.parse::<usize>().ok())
2504        .filter(|value| *value > 0)
2505}
2506
2507fn historical_import_batch_limits() -> HistoricalImportBatchLimits {
2508    let cpu_count = std::thread::available_parallelism()
2509        .map(std::num::NonZeroUsize::get)
2510        .unwrap_or(1);
2511
2512    let default_limits = if cpu_count >= 32 {
2513        HistoricalImportBatchLimits {
2514            conversations: 128,
2515            messages: 16_384,
2516            payload_chars: 12_000_000,
2517        }
2518    } else {
2519        HistoricalImportBatchLimits {
2520            conversations: 32,
2521            messages: 4_096,
2522            payload_chars: 3_000_000,
2523        }
2524    };
2525
2526    HistoricalImportBatchLimits {
2527        conversations: env_positive_usize("CASS_HISTORICAL_IMPORT_BATCH_CONVERSATIONS")
2528            .unwrap_or(default_limits.conversations),
2529        messages: env_positive_usize("CASS_HISTORICAL_IMPORT_BATCH_MESSAGES")
2530            .unwrap_or(default_limits.messages),
2531        payload_chars: env_positive_usize("CASS_HISTORICAL_IMPORT_BATCH_CHARS")
2532            .unwrap_or(default_limits.payload_chars),
2533    }
2534}
2535
2536fn json_value_size_hint(value: &serde_json::Value) -> usize {
2537    if let Some(raw) = historical_raw_json(value) {
2538        return raw.len();
2539    }
2540    match value {
2541        serde_json::Value::Null => 0,
2542        other => serde_json::to_string(other)
2543            .map(|raw| raw.len())
2544            .unwrap_or(0),
2545    }
2546}
2547
2548fn message_payload_size_hint(message: &Message) -> usize {
2549    message
2550        .content
2551        .len()
2552        .saturating_add(json_value_size_hint(&message.extra_json))
2553}
2554
/// True when `name` starts with `prefix` and is not a `-wal` / `-shm` sidecar.
fn is_backup_root_name(name: &str, prefix: &str) -> bool {
    if !name.starts_with(prefix) {
        return false;
    }
    !["-wal", "-shm"]
        .iter()
        .any(|&sidecar| name.ends_with(sidecar))
}
2558
// Detects sqlite sidecar files, which must never be re-opened as a DB root.
// Covers the standard -wal/-shm pair as well as frankensqlite's Windows
// advisory-lock sidecars (-lock-shared/-lock-reserved/-lock-pending). Used by
// the directory enumeration in `historical_bundle_root_paths`; deliberately
// NOT used by `is_backup_root_name`, because the existing backup-rotation
// cleanup must continue to sweep up any pre-existing orphan lock sidecars.
fn has_db_sidecar_suffix(name: &str) -> bool {
    for suffix in [
        "-wal",
        "-shm",
        "-lock-shared",
        "-lock-reserved",
        "-lock-pending",
    ] {
        if name.ends_with(suffix) {
            return true;
        }
    }
    false
}
2575
/// Public schema version constant for external checks.
pub const CURRENT_SCHEMA_VERSION: i64 = 20;
/// Oldest stored schema version that `check_schema_compatibility` classifies
/// as upgradable in place; positive versions below this are reported as
/// needing a rebuild.
const MIN_IN_PLACE_MIGRATION_SCHEMA_VERSION: i64 = 13;
2579
/// Result of checking schema compatibility.
///
/// Produced by `check_schema_compatibility`, which only reads the database.
#[derive(Debug, Clone)]
pub enum SchemaCheck {
    /// Schema is up to date, no migration needed.
    Compatible,
    /// Schema needs migration but can be done incrementally.
    /// Also reported for a database file that contains no tables at all.
    NeedsMigration,
    /// Schema is incompatible and needs a full rebuild (with reason).
    /// The payload is a human-readable explanation.
    NeedsRebuild(String),
}
2590
2591fn schema_check_error_requires_rebuild(err: &frankensqlite::FrankenError) -> bool {
2592    // Only on-disk corruption classes justify destructive rebuild.
2593    // Locking, open, and generic I/O failures are often transient and must
2594    // surface as errors rather than deleting the database under the caller.
2595    matches!(
2596        err,
2597        frankensqlite::FrankenError::DatabaseCorrupt { .. }
2598            | frankensqlite::FrankenError::WalCorrupt { .. }
2599            | frankensqlite::FrankenError::NotADatabase { .. }
2600            | frankensqlite::FrankenError::ShortRead { .. }
2601    )
2602}
2603
/// Builds a sibling path of `path` named
/// `<file_name>.backup.<pid>.<nanos-since-epoch>.<counter>`.
///
/// The counter is a process-wide atomic, so two calls within one process
/// never produce the same name. If `path` has no UTF-8 file name the literal
/// `db` is used, and a clock before the Unix epoch yields a timestamp of `0`.
fn unique_backup_path(path: &Path) -> PathBuf {
    static BACKUP_SEQUENCE: std::sync::atomic::AtomicU64 = std::sync::atomic::AtomicU64::new(0);

    let nanos_since_epoch = SystemTime::now()
        .duration_since(UNIX_EPOCH)
        .map_or(0, |elapsed| elapsed.as_nanos());
    let sequence = BACKUP_SEQUENCE.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
    let pid = std::process::id();
    let base_name = path
        .file_name()
        .and_then(|name| name.to_str())
        .unwrap_or("db");

    let backup_name = format!("{base_name}.backup.{pid}.{nanos_since_epoch}.{sequence}");
    path.with_file_name(backup_name)
}
2621
/// Returns the hidden staging path that sits next to `backup_path`:
/// `.<file_name>.vacuum-in-progress`.
///
/// Falls back to the name `db.backup` when `backup_path` has no UTF-8 file
/// name.
fn vacuum_stage_backup_path(backup_path: &Path) -> PathBuf {
    let base_name = backup_path
        .file_name()
        .and_then(std::ffi::OsStr::to_str)
        .unwrap_or("db.backup");
    let staged_name = format!(".{base_name}.vacuum-in-progress");
    backup_path.with_file_name(staged_name)
}
2629
2630/// Check schema compatibility without modifying the database.
2631///
2632/// Opens the database read-only and checks the schema version.
2633fn check_schema_compatibility(
2634    path: &Path,
2635) -> std::result::Result<SchemaCheck, frankensqlite::FrankenError> {
2636    let mut conn = open_franken_with_flags(
2637        &path.to_string_lossy(),
2638        FrankenOpenFlags::SQLITE_OPEN_READ_ONLY,
2639    )?;
2640
2641    let result = (|| {
2642        // Check if meta table exists
2643        let meta_exists: i32 = conn.query_row_map(
2644            "SELECT COUNT(*) FROM sqlite_master WHERE type='table' AND name='meta'",
2645            fparams![],
2646            |row| row.get_typed(0),
2647        )?;
2648
2649        if meta_exists == 0 {
2650            // No meta table - could be empty or very old schema, needs rebuild
2651            // But first check if there are any tables at all
2652            let table_count: i32 = conn.query_row_map(
2653                "SELECT COUNT(*) FROM sqlite_master WHERE type='table'",
2654                fparams![],
2655                |row| row.get_typed(0),
2656            )?;
2657
2658            if table_count == 0 {
2659                // Empty database, will be initialized fresh
2660                return Ok(SchemaCheck::NeedsMigration);
2661            }
2662
2663            // Has tables but no meta - very old or corrupted
2664            return Ok(SchemaCheck::NeedsRebuild(
2665                "Database missing schema version metadata".to_string(),
2666            ));
2667        }
2668
2669        // Get the schema version
2670        let version: Option<i64> = conn
2671            .query_row_map(
2672                "SELECT value FROM meta WHERE key = 'schema_version'",
2673                fparams![],
2674                |row| Ok(row.get_typed::<String>(0)?.parse().ok()),
2675            )
2676            .ok()
2677            .flatten();
2678
2679        match version {
2680            Some(v) if v == SCHEMA_VERSION => Ok(SchemaCheck::Compatible),
2681            Some(v) if (MIN_IN_PLACE_MIGRATION_SCHEMA_VERSION..SCHEMA_VERSION).contains(&v) => {
2682                Ok(SchemaCheck::NeedsMigration)
2683            }
2684            Some(v) if v > 0 && v < MIN_IN_PLACE_MIGRATION_SCHEMA_VERSION => {
2685                Ok(SchemaCheck::NeedsRebuild(format!(
2686                    "Schema version {} is too old for in-place migration; supported upgrade path starts at version {}",
2687                    v, MIN_IN_PLACE_MIGRATION_SCHEMA_VERSION
2688                )))
2689            }
2690            Some(v) => {
2691                // v > SCHEMA_VERSION - database is from a newer version
2692                Ok(SchemaCheck::NeedsRebuild(format!(
2693                    "Schema version {} is newer than supported version {}",
2694                    v, SCHEMA_VERSION
2695                )))
2696            }
2697            None => Ok(SchemaCheck::NeedsRebuild(
2698                "Schema version not found or invalid".to_string(),
2699            )),
2700        }
2701    })();
2702
2703    if let Err(close_err) = conn.close_in_place() {
2704        tracing::warn!(
2705            error = %close_err,
2706            db_path = %path.display(),
2707            "check_schema_compatibility: close_in_place failed; falling back to best-effort close"
2708        );
2709        conn.close_best_effort_in_place();
2710    }
2711
2712    result
2713}
2714
/// Module-private alias of [`CURRENT_SCHEMA_VERSION`].
const SCHEMA_VERSION: i64 = CURRENT_SCHEMA_VERSION;
2716
/// Test-only V1 schema: creates the `meta`, `agents`, `workspaces`,
/// `conversations`, `messages`, `snippets`, `tags` and `conversation_tags`
/// tables plus the two original lookup indexes.
#[cfg(test)]
const MIGRATION_V1: &str = r"
PRAGMA foreign_keys = ON;

CREATE TABLE IF NOT EXISTS meta (
    key TEXT PRIMARY KEY,
    value TEXT NOT NULL
);

CREATE TABLE IF NOT EXISTS agents (
    id INTEGER PRIMARY KEY,
    slug TEXT NOT NULL UNIQUE,
    name TEXT NOT NULL,
    version TEXT,
    kind TEXT NOT NULL,
    created_at INTEGER NOT NULL,
    updated_at INTEGER NOT NULL
);

CREATE TABLE IF NOT EXISTS workspaces (
    id INTEGER PRIMARY KEY,
    path TEXT NOT NULL UNIQUE,
    display_name TEXT
);

CREATE TABLE IF NOT EXISTS conversations (
    id INTEGER PRIMARY KEY,
    agent_id INTEGER NOT NULL REFERENCES agents(id),
    workspace_id INTEGER REFERENCES workspaces(id),
    external_id TEXT,
    title TEXT,
    source_path TEXT NOT NULL,
    started_at INTEGER,
    ended_at INTEGER,
    approx_tokens INTEGER,
    metadata_json TEXT,
    UNIQUE(agent_id, external_id)
);

CREATE TABLE IF NOT EXISTS messages (
    id INTEGER PRIMARY KEY,
    conversation_id INTEGER NOT NULL REFERENCES conversations(id) ON DELETE CASCADE,
    idx INTEGER NOT NULL,
    role TEXT NOT NULL,
    author TEXT,
    created_at INTEGER,
    content TEXT NOT NULL,
    extra_json TEXT,
    UNIQUE(conversation_id, idx)
);

CREATE TABLE IF NOT EXISTS snippets (
    id INTEGER PRIMARY KEY,
    message_id INTEGER NOT NULL REFERENCES messages(id) ON DELETE CASCADE,
    file_path TEXT,
    start_line INTEGER,
    end_line INTEGER,
    language TEXT,
    snippet_text TEXT
);

CREATE TABLE IF NOT EXISTS tags (
    id INTEGER PRIMARY KEY,
    name TEXT NOT NULL UNIQUE
);

CREATE TABLE IF NOT EXISTS conversation_tags (
    conversation_id INTEGER NOT NULL REFERENCES conversations(id) ON DELETE CASCADE,
    tag_id INTEGER NOT NULL REFERENCES tags(id) ON DELETE CASCADE,
    PRIMARY KEY (conversation_id, tag_id)
);

CREATE INDEX IF NOT EXISTS idx_conversations_agent_started
    ON conversations(agent_id, started_at DESC);

CREATE INDEX IF NOT EXISTS idx_messages_conv_idx
    ON messages(conversation_id, idx);

";
2796
/// Test-only V2 migration: creates the `fts_messages` FTS5 table (porter
/// tokenizer) if it is missing and fills it from the existing messages joined
/// with their conversation, agent and (optional) workspace.
#[cfg(test)]
const MIGRATION_V2: &str = r"
CREATE VIRTUAL TABLE IF NOT EXISTS fts_messages USING fts5(
    content,
    title,
    agent,
    workspace,
    source_path,
    created_at UNINDEXED,
    message_id UNINDEXED,
    tokenize='porter'
);
INSERT INTO fts_messages(content, title, agent, workspace, source_path, created_at, message_id)
SELECT
    m.content,
    c.title,
    a.slug,
    w.path,
    c.source_path,
    m.created_at,
    m.id
FROM messages m
JOIN conversations c ON m.conversation_id = c.id
JOIN agents a ON c.agent_id = a.id
LEFT JOIN workspaces w ON c.workspace_id = w.id;
";
2823
/// Test-only V3 migration: drops `fts_messages`, recreates it with the same
/// column layout as V2, and refills it from the existing messages.
// NOTE(review): `dead_code` is allowed without a stated reason; presumably the
// constant is kept for reference even when no test uses it — confirm.
#[cfg(test)]
#[allow(dead_code)]
const MIGRATION_V3: &str = r"
DROP TABLE IF EXISTS fts_messages;
CREATE VIRTUAL TABLE fts_messages USING fts5(
    content,
    title,
    agent,
    workspace,
    source_path,
    created_at UNINDEXED,
    message_id UNINDEXED,
    tokenize='porter'
);
INSERT INTO fts_messages(content, title, agent, workspace, source_path, created_at, message_id)
SELECT
    m.content,
    c.title,
    a.slug,
    w.path,
    c.source_path,
    m.created_at,
    m.id
FROM messages m
JOIN conversations c ON m.conversation_id = c.id
JOIN agents a ON c.agent_id = a.id
LEFT JOIN workspaces w ON c.workspace_id = w.id;
";
2852
/// Test-only V4 migration: creates the `sources` table and inserts the
/// default `'local'` source row if it is not already present.
#[cfg(test)]
const MIGRATION_V4: &str = r"
-- Sources table for tracking where conversations come from
CREATE TABLE IF NOT EXISTS sources (
    id TEXT PRIMARY KEY,           -- source_id (e.g., 'local', 'work-laptop')
    kind TEXT NOT NULL,            -- 'local', 'ssh', etc.
    host_label TEXT,               -- display label
    machine_id TEXT,               -- optional stable machine id
    platform TEXT,                 -- 'macos', 'linux', 'windows'
    config_json TEXT,              -- JSON blob for extra config (SSH params, path rewrites)
    created_at INTEGER NOT NULL,
    updated_at INTEGER NOT NULL
);

-- Bootstrap: Insert the default 'local' source
INSERT OR IGNORE INTO sources (id, kind, host_label, created_at, updated_at)
VALUES ('local', 'local', NULL, strftime('%s','now')*1000, strftime('%s','now')*1000);
";
2871
/// Test-only V5 migration: rebuilds `conversations` with the `source_id` and
/// `origin_host` columns and the `UNIQUE(source_id, agent_id, external_id)`
/// constraint, copying every existing row with `source_id = 'local'`.
#[cfg(test)]
const MIGRATION_V5: &str = r"
-- Add provenance columns to conversations table
-- SQLite cannot alter unique constraints, so we need to recreate the table

-- Create new table with provenance columns and updated unique constraint
CREATE TABLE conversations_new (
    id INTEGER PRIMARY KEY,
    agent_id INTEGER NOT NULL REFERENCES agents(id),
    workspace_id INTEGER REFERENCES workspaces(id),
    source_id TEXT NOT NULL DEFAULT 'local' REFERENCES sources(id),
    external_id TEXT,
    title TEXT,
    source_path TEXT NOT NULL,
    started_at INTEGER,
    ended_at INTEGER,
    approx_tokens INTEGER,
    metadata_json TEXT,
    origin_host TEXT,
    UNIQUE(source_id, agent_id, external_id)
);

-- Copy data from old table (all existing conversations get source_id='local')
INSERT INTO conversations_new (id, agent_id, workspace_id, source_id, external_id, title,
                               source_path, started_at, ended_at, approx_tokens, metadata_json, origin_host)
SELECT id, agent_id, workspace_id, 'local', external_id, title,
       source_path, started_at, ended_at, approx_tokens, metadata_json, NULL
FROM conversations;

-- Drop old table and rename new
DROP TABLE conversations;
ALTER TABLE conversations_new RENAME TO conversations;

-- Recreate indexes
CREATE INDEX IF NOT EXISTS idx_conversations_agent_started ON conversations(agent_id, started_at DESC);
CREATE INDEX IF NOT EXISTS idx_conversations_source_id ON conversations(source_id);
";
2909
/// Test-only V6 migration: adds an index on `conversations(source_path)`.
#[cfg(test)]
const MIGRATION_V6: &str = r"
-- Optimize lookup by source_path (used by TUI detail view)
CREATE INDEX IF NOT EXISTS idx_conversations_source_path ON conversations(source_path);
";
2915
/// Test-only V7 migration: adds the `metadata_bin` BLOB column to
/// `conversations` and the `extra_bin` BLOB column to `messages`.
#[cfg(test)]
const MIGRATION_V7: &str = r"
-- Add binary columns for MessagePack serialization (Opt 3.1)
-- Binary format is 50-70% smaller than JSON and faster to parse
ALTER TABLE conversations ADD COLUMN metadata_bin BLOB;
ALTER TABLE messages ADD COLUMN extra_bin BLOB;
";
2923
/// Test-only V8 migration: creates the `daily_stats` rollup table keyed by
/// `(day_id, agent_slug, source_id)` and its two secondary indexes.
#[cfg(test)]
const MIGRATION_V8: &str = r"
-- Opt 3.2: Daily stats materialized table for O(1) time-range histograms
-- Provides fast aggregated queries for stats/dashboard without full table scans

CREATE TABLE IF NOT EXISTS daily_stats (
    day_id INTEGER NOT NULL,              -- Days since 2020-01-01 (Unix epoch + offset)
    agent_slug TEXT NOT NULL,             -- 'all' for totals, or specific agent slug
    source_id TEXT NOT NULL DEFAULT 'all', -- 'all' for totals, or specific source
    session_count INTEGER NOT NULL DEFAULT 0,
    message_count INTEGER NOT NULL DEFAULT 0,
    total_chars INTEGER NOT NULL DEFAULT 0,
    last_updated INTEGER NOT NULL,
    PRIMARY KEY (day_id, agent_slug, source_id)
);

CREATE INDEX IF NOT EXISTS idx_daily_stats_agent ON daily_stats(agent_slug, day_id);
CREATE INDEX IF NOT EXISTS idx_daily_stats_source ON daily_stats(source_id, day_id);
";
2943
/// Test-only V9 migration: creates the `embedding_jobs` table and a partial
/// unique index that allows at most one pending/running job per
/// `(db_path, model_id)`.
#[cfg(test)]
const MIGRATION_V9: &str = r"
-- Background embedding jobs tracking table
CREATE TABLE IF NOT EXISTS embedding_jobs (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    db_path TEXT NOT NULL,
    model_id TEXT NOT NULL,
    status TEXT NOT NULL DEFAULT 'pending',
    total_docs INTEGER NOT NULL DEFAULT 0,
    completed_docs INTEGER NOT NULL DEFAULT 0,
    error_message TEXT,
    created_at TEXT NOT NULL DEFAULT (datetime('now')),
    started_at TEXT,
    completed_at TEXT
);

-- Only one pending or running job per (db_path, model_id) at a time.
-- Multiple completed/failed/cancelled jobs are allowed for history.
CREATE UNIQUE INDEX IF NOT EXISTS idx_embedding_jobs_active
ON embedding_jobs(db_path, model_id)
WHERE status IN ('pending', 'running');
";
2966
/// Test-only V10 migration: creates the `token_usage` ledger, the
/// `token_daily_stats` rollup and the seeded `model_pricing` table, then adds
/// the per-conversation token summary columns to `conversations`.
#[cfg(test)]
const MIGRATION_V10: &str = r"
-- Token analytics: per-message token usage ledger
CREATE TABLE IF NOT EXISTS token_usage (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    message_id INTEGER NOT NULL REFERENCES messages(id) ON DELETE CASCADE,
    conversation_id INTEGER NOT NULL,
    agent_id INTEGER NOT NULL,
    workspace_id INTEGER,
    source_id TEXT NOT NULL DEFAULT 'local',

    -- Timing
    timestamp_ms INTEGER NOT NULL,
    day_id INTEGER NOT NULL,

    -- Model identification
    model_name TEXT,
    model_family TEXT,
    model_tier TEXT,
    service_tier TEXT,
    provider TEXT,

    -- Token counts (nullable — not all agents provide all fields)
    input_tokens INTEGER,
    output_tokens INTEGER,
    cache_read_tokens INTEGER,
    cache_creation_tokens INTEGER,
    thinking_tokens INTEGER,
    total_tokens INTEGER,

    -- Cost estimation
    estimated_cost_usd REAL,

    -- Message context
    role TEXT NOT NULL,
    content_chars INTEGER NOT NULL,
    has_tool_calls INTEGER NOT NULL DEFAULT 0,
    tool_call_count INTEGER NOT NULL DEFAULT 0,

    -- Data quality
    data_source TEXT NOT NULL DEFAULT 'api',

    UNIQUE(message_id)
);

CREATE INDEX IF NOT EXISTS idx_token_usage_day ON token_usage(day_id, agent_id);
CREATE INDEX IF NOT EXISTS idx_token_usage_conv ON token_usage(conversation_id);
CREATE INDEX IF NOT EXISTS idx_token_usage_model ON token_usage(model_family, day_id);
CREATE INDEX IF NOT EXISTS idx_token_usage_workspace ON token_usage(workspace_id, day_id);
CREATE INDEX IF NOT EXISTS idx_token_usage_timestamp ON token_usage(timestamp_ms);

-- Token analytics: pre-aggregated daily rollups
CREATE TABLE IF NOT EXISTS token_daily_stats (
    day_id INTEGER NOT NULL,
    agent_slug TEXT NOT NULL,
    source_id TEXT NOT NULL DEFAULT 'all',
    model_family TEXT NOT NULL DEFAULT 'all',

    api_call_count INTEGER NOT NULL DEFAULT 0,
    user_message_count INTEGER NOT NULL DEFAULT 0,
    assistant_message_count INTEGER NOT NULL DEFAULT 0,
    tool_message_count INTEGER NOT NULL DEFAULT 0,

    total_input_tokens INTEGER NOT NULL DEFAULT 0,
    total_output_tokens INTEGER NOT NULL DEFAULT 0,
    total_cache_read_tokens INTEGER NOT NULL DEFAULT 0,
    total_cache_creation_tokens INTEGER NOT NULL DEFAULT 0,
    total_thinking_tokens INTEGER NOT NULL DEFAULT 0,
    grand_total_tokens INTEGER NOT NULL DEFAULT 0,

    total_content_chars INTEGER NOT NULL DEFAULT 0,
    total_tool_calls INTEGER NOT NULL DEFAULT 0,

    estimated_cost_usd REAL NOT NULL DEFAULT 0.0,

    session_count INTEGER NOT NULL DEFAULT 0,

    last_updated INTEGER NOT NULL,

    PRIMARY KEY (day_id, agent_slug, source_id, model_family)
);

CREATE INDEX IF NOT EXISTS idx_token_daily_stats_agent ON token_daily_stats(agent_slug, day_id);
CREATE INDEX IF NOT EXISTS idx_token_daily_stats_model ON token_daily_stats(model_family, day_id);

-- Model pricing lookup table
CREATE TABLE IF NOT EXISTS model_pricing (
    model_pattern TEXT NOT NULL,
    provider TEXT NOT NULL,
    input_cost_per_mtok REAL NOT NULL,
    output_cost_per_mtok REAL NOT NULL,
    cache_read_cost_per_mtok REAL,
    cache_creation_cost_per_mtok REAL,
    effective_date TEXT NOT NULL,
    PRIMARY KEY (model_pattern, effective_date)
);

-- Seed with current pricing (as of 2026-02)
INSERT OR IGNORE INTO model_pricing VALUES
    ('claude-opus-4%', 'anthropic', 15.0, 75.0, 1.5, 18.75, '2025-10-01'),
    ('claude-sonnet-4%', 'anthropic', 3.0, 15.0, 0.3, 3.75, '2025-10-01'),
    ('claude-haiku-4%', 'anthropic', 0.80, 4.0, 0.08, 1.0, '2025-10-01'),
    ('gpt-4o%', 'openai', 2.50, 10.0, NULL, NULL, '2025-01-01'),
    ('gpt-4-turbo%', 'openai', 10.0, 30.0, NULL, NULL, '2024-04-01'),
    ('gpt-4.1%', 'openai', 2.0, 8.0, NULL, NULL, '2025-04-01'),
    ('o3%', 'openai', 2.0, 8.0, NULL, NULL, '2025-04-01'),
    ('o4-mini%', 'openai', 1.10, 4.40, NULL, NULL, '2025-04-01'),
    ('gemini-2%flash%', 'google', 0.075, 0.30, NULL, NULL, '2025-01-01'),
    ('gemini-2%pro%', 'google', 1.25, 10.0, NULL, NULL, '2025-01-01');

-- Extend conversations table with token summary columns
ALTER TABLE conversations ADD COLUMN total_input_tokens INTEGER;
ALTER TABLE conversations ADD COLUMN total_output_tokens INTEGER;
ALTER TABLE conversations ADD COLUMN total_cache_read_tokens INTEGER;
ALTER TABLE conversations ADD COLUMN total_cache_creation_tokens INTEGER;
ALTER TABLE conversations ADD COLUMN grand_total_tokens INTEGER;
ALTER TABLE conversations ADD COLUMN estimated_cost_usd REAL;
ALTER TABLE conversations ADD COLUMN primary_model TEXT;
ALTER TABLE conversations ADD COLUMN api_call_count INTEGER;
ALTER TABLE conversations ADD COLUMN tool_call_count INTEGER;
ALTER TABLE conversations ADD COLUMN user_message_count INTEGER;
ALTER TABLE conversations ADD COLUMN assistant_message_count INTEGER;
";
3090
/// V14 migration: drops the `fts_messages` table; this script does not
/// recreate it.
const MIGRATION_V14: &str = r"
-- Switch FTS5 from internal-content to contentless mode (CASS #163).
-- Drop the old V13 internal-content fts_messages first so that
-- sqlite_schema does not contain two conflicting CREATE VIRTUAL TABLE
-- entries, which makes the database completely unreadable.
-- The current contentless table is recreated lazily after open() only when the
-- frankensqlite FTS consistency check finds it missing or malformed.
DROP TABLE IF EXISTS fts_messages;
";
3100
/// Creates the `conversation_tail_state` table if it does not exist, without
/// backfilling any rows. The table definition matches the one in
/// `MIGRATION_V18`.
const MIGRATION_V15_TAIL_STATE_TABLE: &str = r"
CREATE TABLE IF NOT EXISTS conversation_tail_state (
    -- Deliberately no FOREIGN KEY: this hot row is maintained by insert/append
    -- paths, and FK metadata keeps frankensqlite off the direct rowid update path.
    conversation_id INTEGER PRIMARY KEY,
    ended_at INTEGER,
    last_message_idx INTEGER,
    last_message_created_at INTEGER
);
";
3111
/// V16 migration: drops the redundant `idx_messages_conv_idx` index.
const MIGRATION_V16: &str = r"
-- UNIQUE(conversation_id, idx) already creates sqlite_autoindex_messages_1,
-- which covers the same lookup/order key as idx_messages_conv_idx. Keeping both
-- doubles message insert index maintenance on the hot indexing path.
DROP INDEX IF EXISTS idx_messages_conv_idx;
";
3118
/// V17 migration: drops the `idx_messages_created` index.
const MIGRATION_V17: &str = r"
-- Drop the global messages(created_at) secondary index from the ingest hot
-- path. Search/time filters are served by the derived search layer and
-- conversation/analytics indexes, while this index is maintained on every
-- message insert.
DROP INDEX IF EXISTS idx_messages_created;
";
3126
/// V18 migration: creates `conversation_tail_state` if missing and backfills
/// it from every `conversations` row that has at least one non-NULL tail
/// column (`ended_at`, `last_message_idx`, `last_message_created_at`).
const MIGRATION_V18: &str = r"
-- Move append-tail state out of the wide, indexed conversations row. The hot
-- append path updates this cache for every appended conversation; keeping it in
-- a tiny rowid table avoids rewriting the large conversation record.
CREATE TABLE IF NOT EXISTS conversation_tail_state (
    -- Deliberately no FOREIGN KEY: this hot row is maintained by insert/append
    -- paths, and FK metadata keeps frankensqlite off the direct rowid update path.
    conversation_id INTEGER PRIMARY KEY,
    ended_at INTEGER,
    last_message_idx INTEGER,
    last_message_created_at INTEGER
);

INSERT OR REPLACE INTO conversation_tail_state (
    conversation_id, ended_at, last_message_idx, last_message_created_at
)
SELECT id, ended_at, last_message_idx, last_message_created_at
FROM conversations
WHERE ended_at IS NOT NULL
   OR last_message_idx IS NOT NULL
   OR last_message_created_at IS NOT NULL;
";
3149
/// V19 migration: creates `conversation_external_lookup` and backfills it for
/// every conversation with a non-NULL `external_id`.
///
/// The lookup key is built as
/// `<len(source_id)>:<source_id>:<agent_id>:<len(external_id)>:<external_id>`.
const MIGRATION_V19: &str = r"
-- Materialize external conversation provenance into one compact lookup key.
-- This keeps the hot append/new-conversation probe on a single primary-key
-- lookup instead of a composite conversations-table predicate.
CREATE TABLE IF NOT EXISTS conversation_external_lookup (
    lookup_key TEXT PRIMARY KEY,
    conversation_id INTEGER NOT NULL
);

INSERT OR REPLACE INTO conversation_external_lookup (lookup_key, conversation_id)
SELECT
    CAST(length(source_id) AS TEXT) || ':' || source_id || ':' ||
    CAST(agent_id AS TEXT) || ':' ||
    CAST(length(external_id) AS TEXT) || ':' || external_id,
    id
FROM conversations
WHERE external_id IS NOT NULL;
";
3168
/// V20 migration: creates `conversation_external_tail_lookup` and backfills
/// it for every conversation with a non-NULL `external_id`, using the same
/// key format as `MIGRATION_V19`.
///
/// Tail columns come from a LEFT JOIN on `conversation_tail_state`, so they
/// are NULL for conversations without a tail-state row.
const MIGRATION_V20: &str = r"
-- Fuse external conversation lookup with append-tail state. Append-heavy
-- workloads can resolve both the conversation id and tail plan from one
-- primary-key probe.
CREATE TABLE IF NOT EXISTS conversation_external_tail_lookup (
    lookup_key TEXT PRIMARY KEY,
    conversation_id INTEGER NOT NULL,
    ended_at INTEGER,
    last_message_idx INTEGER,
    last_message_created_at INTEGER
);

INSERT OR REPLACE INTO conversation_external_tail_lookup (
    lookup_key,
    conversation_id,
    ended_at,
    last_message_idx,
    last_message_created_at
)
SELECT
    CAST(length(c.source_id) AS TEXT) || ':' || c.source_id || ':' ||
    CAST(c.agent_id AS TEXT) || ':' ||
    CAST(length(c.external_id) AS TEXT) || ':' || c.external_id,
    c.id,
    ts.ended_at,
    ts.last_message_idx,
    ts.last_message_created_at
FROM conversations c
LEFT JOIN conversation_tail_state ts ON ts.conversation_id = c.id
WHERE c.external_id IS NOT NULL;
";
3200
/// Row from the embedding_jobs table.
///
/// Field names match the columns that `MIGRATION_V9` creates; the timestamp
/// columns are declared as TEXT there and are carried as strings here.
#[derive(Debug, Clone)]
pub struct EmbeddingJobRow {
    pub id: i64,
    pub db_path: String,
    pub model_id: String,
    /// Job state; the schema default is `'pending'`.
    pub status: String,
    pub total_docs: i64,
    pub completed_docs: i64,
    pub error_message: Option<String>,
    pub created_at: String,
    pub started_at: Option<String>,
    pub completed_at: Option<String>,
}
3215
/// Lightweight conversation projection used while rebuilding the lexical index.
///
/// This intentionally omits `metadata_json` / `metadata_bin` and other bulky
/// fields because Tantivy only needs the stable envelope plus provenance
/// identifiers. Reading full metadata here can force frankensqlite to traverse
/// large overflow chains before the first lexical checkpoint is committed.
#[derive(Debug, Clone)]
pub struct LexicalRebuildConversationRow {
    // NOTE(review): presumably the `conversations.id` rowid when known — confirm.
    pub id: Option<i64>,
    pub agent_slug: String,
    pub workspace: Option<PathBuf>,
    pub external_id: Option<String>,
    pub title: Option<String>,
    pub source_path: PathBuf,
    pub started_at: Option<i64>,
    pub ended_at: Option<i64>,
    // Provenance identifiers mentioned in the type-level docs.
    pub source_id: String,
    pub origin_host: Option<String>,
}
3235
/// Lightweight per-conversation footprint used to pre-plan lexical rebuild
/// shard boundaries without re-reading full message bodies in the hot path.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct LexicalRebuildConversationFootprintRow {
    pub conversation_id: i64,
    /// Number of messages attributed to the conversation.
    pub message_count: usize,
    /// Byte footprint of those messages; it is an estimate when built by
    /// `lexical_rebuild_conversation_footprint_from_count`.
    pub message_bytes: usize,
}
3244
/// Per-message byte estimate (4 KiB) that
/// `lexical_rebuild_conversation_footprint_from_count` multiplies by the
/// message count.
pub(crate) const LEXICAL_REBUILD_PLANNER_ESTIMATED_BYTES_PER_MESSAGE: usize = 4 * 1024;
/// Largest number of conversations that may lack tail metadata while
/// `lexical_rebuild_tail_metadata_coverage_is_sufficient` still returns true.
const LEXICAL_REBUILD_FOOTPRINT_POINT_TAIL_FALLBACK_LIMIT: usize = 64;
3247
3248fn lexical_rebuild_tail_metadata_coverage_is_sufficient(
3249    total_conversations: usize,
3250    covered_conversations: usize,
3251) -> bool {
3252    total_conversations == 0
3253        || total_conversations.saturating_sub(covered_conversations.min(total_conversations))
3254            <= LEXICAL_REBUILD_FOOTPRINT_POINT_TAIL_FALLBACK_LIMIT
3255}
3256
/// Converts a last message index into a message count (`index + 1`).
///
/// Returns `None` when the index is absent or negative, or when the
/// incremented value overflows `u64` or does not fit in `usize`.
fn lexical_rebuild_message_count_from_tail_idx(last_message_idx: Option<i64>) -> Option<usize> {
    last_message_idx
        .and_then(|raw_idx| u64::try_from(raw_idx).ok())
        .and_then(|tail_idx| tail_idx.checked_add(1))
        .and_then(|count| usize::try_from(count).ok())
}
3262
3263fn lexical_rebuild_conversation_footprint_from_count(
3264    conversation_id: i64,
3265    message_count: usize,
3266) -> LexicalRebuildConversationFootprintRow {
3267    LexicalRebuildConversationFootprintRow {
3268        conversation_id,
3269        message_count,
3270        message_bytes: message_count
3271            .saturating_mul(LEXICAL_REBUILD_PLANNER_ESTIMATED_BYTES_PER_MESSAGE),
3272    }
3273}
3274
/// Lightweight message projection used by the streaming lexical rebuild path.
#[derive(Debug, Clone)]
pub struct LexicalRebuildMessageRow {
    pub conversation_id: i64,
    // NOTE(review): presumably the `messages.id` rowid — confirm.
    pub id: i64,
    // NOTE(review): presumably the message's position within its conversation
    // (`messages.idx`) — confirm.
    pub idx: i64,
    pub role: String,
    pub author: Option<String>,
    pub created_at: Option<i64>,
    pub content: String,
}
3286
/// Even lighter message projection used only by the grouped lexical rebuild
/// stream hot path. It keeps just the per-message fields the rebuild consumes
/// and tracks the final message id at conversation scope instead.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct LexicalRebuildGroupedMessageRow {
    pub idx: i64,
    // NOTE(review): presumably true when the message role is the tool role;
    // this replaces the `role` string carried by `LexicalRebuildMessageRow`.
    pub is_tool_role: bool,
    pub created_at: Option<i64>,
    pub content: String,
}
3297
/// Group of grouped-rebuild message rows, stored in a `SmallVec` with inline
/// capacity for 32 rows.
pub type LexicalRebuildGroupedMessageRows = SmallVec<[LexicalRebuildGroupedMessageRow; 32]>;

/// Compatibility alias retained while call sites finish converging on `FrankenStorage`.
pub type SqliteStorage = FrankenStorage;
3302
/// Primary frankensqlite-backed storage backend.
///
/// Bundles the frankensqlite connection and database path with writer tuning
/// atomics and the `ensured_*` caches. The caches sit behind
/// `Arc<parking_lot::Mutex<..>>`, so the same cache can be handed to more than
/// one storage instance.
pub struct FrankenStorage {
    conn: FrankenConnection,
    db_path: PathBuf,
    ephemeral_writer_preflight_verified: AtomicBool,
    // NOTE(review): presumably `UNSET_INDEX_WRITER_CHECKPOINT_PAGES` means
    // "no override" — confirm against the writer setup code.
    index_writer_checkpoint_pages: AtomicI64,
    // NOTE(review): presumably `UNSET_INDEX_WRITER_BUSY_TIMEOUT_MS` means
    // "no override" — confirm.
    index_writer_busy_timeout_ms: AtomicU64,
    cached_ephemeral_writer: parking_lot::Mutex<CachedEphemeralWriter>,
    ensured_agents: Arc<parking_lot::Mutex<HashMap<EnsuredAgentKey, i64>>>,
    ensured_workspaces: Arc<parking_lot::Mutex<HashMap<EnsuredWorkspaceKey, i64>>>,
    ensured_conversation_sources: Arc<parking_lot::Mutex<HashSet<EnsuredConversationSourceKey>>>,
    ensured_daily_stats_keys: Arc<parking_lot::Mutex<HashSet<EnsuredDailyStatsKey>>>,
    // NOTE(review): presumably holds one of the `FTS_MESSAGES_PRESENT_*`
    // values — confirm.
    fts_messages_present_cache: AtomicI8,
}
3317
/// Keep ordinary storage commits from tripping over frequent auto-checkpoints
/// while still bounding WAL growth. Bulk index paths may override this through
/// their explicit checkpoint policy.
const DEFAULT_WAL_AUTOCHECKPOINT_PAGES: i64 = 4096;
// NOTE(review): the two UNSET_* values look like "not configured" sentinels
// for the `index_writer_*` atomics on `FrankenStorage` — confirm at use sites.
const UNSET_INDEX_WRITER_CHECKPOINT_PAGES: i64 = i64::MIN;
const UNSET_INDEX_WRITER_BUSY_TIMEOUT_MS: u64 = 0;
// NOTE(review): presumably the three states stored in
// `FrankenStorage::fts_messages_present_cache` (an `AtomicI8`) — confirm.
const FTS_MESSAGES_PRESENT_UNKNOWN: i8 = 0;
const FTS_MESSAGES_PRESENT_ABSENT: i8 = 1;
const FTS_MESSAGES_PRESENT_PRESENT: i8 = 2;
3327
/// State of the `cached_ephemeral_writer` slot on `FrankenStorage`.
enum CachedEphemeralWriter {
    // NOTE(review): presumably no writer connection has been created yet — confirm.
    Uninitialized,
    /// A boxed writer connection is parked in the slot.
    Cached(Box<SendFrankenConnection>),
    // NOTE(review): presumably the writer is currently checked out — confirm.
    InUse,
}
3333
3334#[derive(Debug, Clone, PartialEq, Eq, Hash)]
3335struct EnsuredAgentKey {
3336    slug: String,
3337    name: String,
3338    version: Option<String>,
3339    kind: String,
3340}
3341
3342impl EnsuredAgentKey {
3343    fn from_agent(agent: &Agent) -> Self {
3344        Self {
3345            slug: agent.slug.clone(),
3346            name: agent.name.clone(),
3347            version: agent.version.clone(),
3348            kind: agent_kind_str(agent.kind.clone()),
3349        }
3350    }
3351}
3352
/// Hashable key for the `ensured_workspaces` map on `FrankenStorage`: a
/// workspace path plus its optional display name.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
struct EnsuredWorkspaceKey {
    path: String,
    display_name: Option<String>,
}

impl EnsuredWorkspaceKey {
    /// Takes ownership of `path` and copies `display_name` when present.
    fn new(path: String, display_name: Option<&str>) -> Self {
        let display_name = display_name.map(|label| label.to_owned());
        Self { path, display_name }
    }
}
3367
3368#[derive(Debug, Clone, PartialEq, Eq, Hash)]
3369struct EnsuredConversationSourceKey {
3370    id: String,
3371    kind: SourceKind,
3372    host_label: Option<String>,
3373}
3374
3375impl EnsuredConversationSourceKey {
3376    fn from_source(source: &Source) -> Self {
3377        Self {
3378            id: source.id.clone(),
3379            kind: source.kind,
3380            host_label: source.host_label.clone(),
3381        }
3382    }
3383}
3384
/// Hashable key for the `ensured_daily_stats_keys` set on `FrankenStorage`:
/// a `(day_id, agent_slug, source_id)` triple.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
struct EnsuredDailyStatsKey {
    day_id: i64,
    agent_slug: String,
    source_id: String,
}

impl EnsuredDailyStatsKey {
    /// Builds a key, copying both string slices into owned `String`s.
    fn new(day_id: i64, agent_slug: &str, source_id: &str) -> Self {
        let agent_slug = String::from(agent_slug);
        let source_id = String::from(source_id);
        Self {
            day_id,
            agent_slug,
            source_id,
        }
    }
}
3401
/// PRAGMA spellings that turn `autocommit_retain` off, in the order
/// `disable_autocommit_retain` tries them (namespaced variant first).
const AUTOCOMMIT_RETAIN_OFF_PRAGMAS: [&str; 2] = [
    "PRAGMA fsqlite.autocommit_retain = OFF;",
    "PRAGMA autocommit_retain = OFF;",
];
3406
3407fn disable_autocommit_retain<E>(
3408    mut execute: impl FnMut(&'static str) -> std::result::Result<(), E>,
3409) -> Result<&'static str>
3410where
3411    E: std::fmt::Display,
3412{
3413    let mut failures = Vec::new();
3414    for pragma in AUTOCOMMIT_RETAIN_OFF_PRAGMAS {
3415        match execute(pragma) {
3416            Ok(()) => return Ok(pragma),
3417            Err(err) => {
3418                let error = err.to_string();
3419                tracing::debug!(
3420                    %pragma,
3421                    error = %error,
3422                    "autocommit_retain PRAGMA variant not supported"
3423                );
3424                failures.push(format!("{pragma}: {error}"));
3425            }
3426        }
3427    }
3428
3429    Err(anyhow!(
3430        "failed to disable autocommit_retain on frankensqlite connection; \
3431         refusing to keep a long-lived MVCC connection that may accumulate \
3432         unbounded write snapshots. Upgrade frankensqlite to a version that \
3433         supports one of these PRAGMAs or use a short-lived connection path. \
3434         attempts: {}",
3435        failures.join("; ")
3436    ))
3437}
3438
3439impl FrankenStorage {
3440    fn new(conn: FrankenConnection, db_path: PathBuf) -> Self {
3441        Self::new_with_shared_caches(
3442            conn,
3443            db_path,
3444            Arc::new(parking_lot::Mutex::new(HashMap::new())),
3445            Arc::new(parking_lot::Mutex::new(HashMap::new())),
3446            Arc::new(parking_lot::Mutex::new(HashSet::new())),
3447            Arc::new(parking_lot::Mutex::new(HashSet::new())),
3448        )
3449    }
3450
3451    fn new_with_shared_caches(
3452        conn: FrankenConnection,
3453        db_path: PathBuf,
3454        ensured_agents: Arc<parking_lot::Mutex<HashMap<EnsuredAgentKey, i64>>>,
3455        ensured_workspaces: Arc<parking_lot::Mutex<HashMap<EnsuredWorkspaceKey, i64>>>,
3456        ensured_conversation_sources: Arc<
3457            parking_lot::Mutex<HashSet<EnsuredConversationSourceKey>>,
3458        >,
3459        ensured_daily_stats_keys: Arc<parking_lot::Mutex<HashSet<EnsuredDailyStatsKey>>>,
3460    ) -> Self {
3461        Self {
3462            conn,
3463            db_path,
3464            ephemeral_writer_preflight_verified: AtomicBool::new(false),
3465            index_writer_checkpoint_pages: AtomicI64::new(UNSET_INDEX_WRITER_CHECKPOINT_PAGES),
3466            index_writer_busy_timeout_ms: AtomicU64::new(UNSET_INDEX_WRITER_BUSY_TIMEOUT_MS),
3467            cached_ephemeral_writer: parking_lot::Mutex::new(CachedEphemeralWriter::Uninitialized),
3468            ensured_agents,
3469            ensured_workspaces,
3470            ensured_conversation_sources,
3471            ensured_daily_stats_keys,
3472            fts_messages_present_cache: AtomicI8::new(FTS_MESSAGES_PRESENT_UNKNOWN),
3473        }
3474    }
3475
3476    fn apply_open_stage_busy_timeout(&self) {
3477        if let Err(err) = self.conn.execute("PRAGMA busy_timeout = 5000;") {
3478            tracing::debug!(
3479                error = %err,
3480                "failed to apply open-stage busy_timeout before migrations"
3481            );
3482        }
3483    }
3484
3485    /// Open a frankensqlite connection, run migrations, and apply config.
3486    ///
3487    /// This initializes canonical schema state only. Derived fallback search
3488    /// structures like the in-database `fts_messages` table are repaired
3489    /// separately so ordinary opens never block on heavyweight maintenance.
3490    pub fn open(path: &Path) -> Result<Self> {
3491        if let Some(parent) = path.parent() {
3492            fs::create_dir_all(parent)
3493                .with_context(|| format!("creating db directory {}", parent.display()))?;
3494        }
3495
3496        let path_str = path.to_string_lossy().to_string();
3497        let _doctor_guard =
3498            acquire_doctor_mutation_db_open_guard(path, DOCTOR_MUTATION_DB_OPEN_LOCK_TIMEOUT)?;
3499        let conn = FrankenConnection::open(&path_str)
3500            .with_context(|| format!("opening frankensqlite db at {}", path.display()))?;
3501        let storage = Self::new(conn, path.to_path_buf());
3502        storage.apply_open_stage_busy_timeout();
3503        storage.run_migrations()?;
3504        storage.repair_missing_current_schema_objects()?;
3505        storage.apply_config()?;
3506        Ok(storage)
3507    }
3508
3509    /// Open a writer connection that skips migration (assumes DB already migrated).
3510    ///
3511    /// Used by the BEGIN CONCURRENT parallel writer pool: each writer needs its
3512    /// own connection with config applied, but migrations have already been run
3513    /// by the primary connection.
3514    pub fn open_writer(path: &Path) -> Result<Self> {
3515        Self::open_writer_with_shared_caches(
3516            path,
3517            Arc::new(parking_lot::Mutex::new(HashMap::new())),
3518            Arc::new(parking_lot::Mutex::new(HashMap::new())),
3519            Arc::new(parking_lot::Mutex::new(HashSet::new())),
3520            Arc::new(parking_lot::Mutex::new(HashSet::new())),
3521        )
3522    }
3523
3524    fn open_writer_with_shared_caches(
3525        path: &Path,
3526        ensured_agents: Arc<parking_lot::Mutex<HashMap<EnsuredAgentKey, i64>>>,
3527        ensured_workspaces: Arc<parking_lot::Mutex<HashMap<EnsuredWorkspaceKey, i64>>>,
3528        ensured_conversation_sources: Arc<
3529            parking_lot::Mutex<HashSet<EnsuredConversationSourceKey>>,
3530        >,
3531        ensured_daily_stats_keys: Arc<parking_lot::Mutex<HashSet<EnsuredDailyStatsKey>>>,
3532    ) -> Result<Self> {
3533        let path_str = path.to_string_lossy().to_string();
3534        let _doctor_guard =
3535            acquire_doctor_mutation_db_open_guard(path, DOCTOR_MUTATION_DB_OPEN_LOCK_TIMEOUT)?;
3536        let conn = FrankenConnection::open(&path_str)
3537            .with_context(|| format!("opening frankensqlite writer at {}", path.display()))?;
3538        let storage = Self::new_with_shared_caches(
3539            conn,
3540            path.to_path_buf(),
3541            ensured_agents,
3542            ensured_workspaces,
3543            ensured_conversation_sources,
3544            ensured_daily_stats_keys,
3545        );
3546        storage.apply_config()?;
3547        Ok(storage)
3548    }
3549
3550    pub(crate) fn acquire_cached_ephemeral_writer(&self) -> Result<(Self, bool)> {
3551        let mut cached = self.cached_ephemeral_writer.lock();
3552        match std::mem::replace(&mut *cached, CachedEphemeralWriter::InUse) {
3553            CachedEphemeralWriter::Cached(conn) => {
3554                let (conn, checkpoint_pages, busy_timeout_ms) = (*conn).into_parts();
3555                let writer = Self::new_with_shared_caches(
3556                    conn,
3557                    self.db_path.clone(),
3558                    Arc::clone(&self.ensured_agents),
3559                    Arc::clone(&self.ensured_workspaces),
3560                    Arc::clone(&self.ensured_conversation_sources),
3561                    Arc::clone(&self.ensured_daily_stats_keys),
3562                );
3563                writer
3564                    .index_writer_checkpoint_pages
3565                    .store(checkpoint_pages, Ordering::Relaxed);
3566                writer
3567                    .index_writer_busy_timeout_ms
3568                    .store(busy_timeout_ms, Ordering::Relaxed);
3569                Ok((writer, true))
3570            }
3571            CachedEphemeralWriter::Uninitialized => {
3572                drop(cached);
3573                match Self::open_writer_with_shared_caches(
3574                    &self.db_path,
3575                    Arc::clone(&self.ensured_agents),
3576                    Arc::clone(&self.ensured_workspaces),
3577                    Arc::clone(&self.ensured_conversation_sources),
3578                    Arc::clone(&self.ensured_daily_stats_keys),
3579                ) {
3580                    Ok(writer) => Ok((writer, true)),
3581                    Err(err) => {
3582                        let mut cached = self.cached_ephemeral_writer.lock();
3583                        if matches!(&*cached, CachedEphemeralWriter::InUse) {
3584                            *cached = CachedEphemeralWriter::Uninitialized;
3585                        }
3586                        Err(err)
3587                    }
3588                }
3589            }
3590            CachedEphemeralWriter::InUse => {
3591                *cached = CachedEphemeralWriter::InUse;
3592                drop(cached);
3593                Ok((
3594                    Self::open_writer_with_shared_caches(
3595                        &self.db_path,
3596                        Arc::clone(&self.ensured_agents),
3597                        Arc::clone(&self.ensured_workspaces),
3598                        Arc::clone(&self.ensured_conversation_sources),
3599                        Arc::clone(&self.ensured_daily_stats_keys),
3600                    )?,
3601                    false,
3602                ))
3603            }
3604        }
3605    }
3606
3607    pub(crate) fn release_cached_ephemeral_writer(&self, writer: Self) {
3608        let checkpoint_pages = writer.index_writer_checkpoint_pages.load(Ordering::Relaxed);
3609        let busy_timeout_ms = writer.index_writer_busy_timeout_ms.load(Ordering::Relaxed);
3610        let conn = writer.into_raw();
3611        let mut cached = self.cached_ephemeral_writer.lock();
3612        debug_assert!(
3613            matches!(&*cached, CachedEphemeralWriter::InUse),
3614            "cached ephemeral writer state should be in-use when releasing"
3615        );
3616        *cached = CachedEphemeralWriter::Cached(Box::new(
3617            SendFrankenConnection::new_with_index_writer_state(
3618                conn,
3619                checkpoint_pages,
3620                busy_timeout_ms,
3621            ),
3622        ));
3623    }
3624
3625    pub(crate) fn discard_cached_ephemeral_writer(&self, mut writer: Self) {
3626        writer.close_best_effort_in_place();
3627        let mut cached = self.cached_ephemeral_writer.lock();
3628        if matches!(&*cached, CachedEphemeralWriter::InUse) {
3629            *cached = CachedEphemeralWriter::Uninitialized;
3630        }
3631    }
3632
3633    fn cached_agent_id(&self, key: &EnsuredAgentKey) -> Option<i64> {
3634        self.ensured_agents.lock().get(key).copied()
3635    }
3636
3637    fn mark_agent_ensured(&self, key: EnsuredAgentKey, id: i64) {
3638        self.ensured_agents.lock().insert(key, id);
3639    }
3640
3641    fn cached_workspace_id(&self, key: &EnsuredWorkspaceKey) -> Option<i64> {
3642        self.ensured_workspaces.lock().get(key).copied()
3643    }
3644
3645    fn mark_workspace_ensured(&self, key: EnsuredWorkspaceKey, id: i64) {
3646        self.ensured_workspaces.lock().insert(key, id);
3647    }
3648
3649    fn conversation_source_already_ensured(&self, key: &EnsuredConversationSourceKey) -> bool {
3650        self.ensured_conversation_sources.lock().contains(key)
3651    }
3652
3653    fn mark_conversation_source_ensured(&self, key: EnsuredConversationSourceKey) {
3654        self.ensured_conversation_sources.lock().insert(key);
3655    }
3656
3657    fn daily_stats_key_already_ensured(&self, key: &EnsuredDailyStatsKey) -> bool {
3658        self.ensured_daily_stats_keys.lock().contains(key)
3659    }
3660
3661    fn daily_stats_keys_already_ensured(&self, keys: &[EnsuredDailyStatsKey; 4]) -> bool {
3662        let ensured = self.ensured_daily_stats_keys.lock();
3663        keys.iter().all(|key| ensured.contains(key))
3664    }
3665
3666    fn mark_daily_stats_key_ensured(&self, key: EnsuredDailyStatsKey) {
3667        self.ensured_daily_stats_keys.lock().insert(key);
3668    }
3669
3670    fn fts_messages_present_cached(&self, tx: &FrankenTransaction<'_>) -> bool {
3671        match self.fts_messages_present_cache.load(Ordering::Acquire) {
3672            FTS_MESSAGES_PRESENT_PRESENT => return true,
3673            FTS_MESSAGES_PRESENT_ABSENT => return false,
3674            _ => {}
3675        }
3676
3677        let present = tx
3678            .query_row_map(
3679                "SELECT COUNT(*) FROM sqlite_master
3680                 WHERE name = 'fts_messages'
3681                   AND rootpage > 0",
3682                fparams![],
3683                |row| row.get_typed::<i64>(0),
3684            )
3685            .map(|count| count > 0)
3686            .unwrap_or_else(|err| {
3687                tracing::debug!(
3688                    error = %err,
3689                    "failed to probe fts_messages presence; skipping db-resident FTS maintenance"
3690                );
3691                false
3692            });
3693        self.set_fts_messages_present_cache(present);
3694        present
3695    }
3696
3697    fn set_fts_messages_present_cache(&self, present: bool) {
3698        self.fts_messages_present_cache.store(
3699            if present {
3700                FTS_MESSAGES_PRESENT_PRESENT
3701            } else {
3702                FTS_MESSAGES_PRESENT_ABSENT
3703            },
3704            Ordering::Release,
3705        );
3706    }
3707
3708    fn invalidate_fts_messages_present_cache(&self) {
3709        self.fts_messages_present_cache
3710            .store(FTS_MESSAGES_PRESENT_UNKNOWN, Ordering::Release);
3711    }
3712
3713    fn invalidate_conversation_source_cache(&self, source_id: &str) {
3714        self.ensured_conversation_sources
3715            .lock()
3716            .retain(|key| key.id != source_id);
3717    }
3718
3719    fn close_cached_ephemeral_writer_best_effort_in_place(&mut self) {
3720        let cached = self.cached_ephemeral_writer.get_mut();
3721        if let CachedEphemeralWriter::Cached(conn) =
3722            std::mem::replace(cached, CachedEphemeralWriter::Uninitialized)
3723        {
3724            let mut conn = conn;
3725            conn.0.close_best_effort_in_place();
3726        }
3727    }
3728
3729    fn close_cached_ephemeral_writer_without_checkpoint_in_place(&mut self) -> Result<()> {
3730        let cached = self.cached_ephemeral_writer.get_mut();
3731        match std::mem::replace(cached, CachedEphemeralWriter::Uninitialized) {
3732            CachedEphemeralWriter::Cached(mut conn) => conn
3733                .0
3734                .close_without_checkpoint_in_place()
3735                .with_context(|| "closing cached frankensqlite writer without final checkpoint"),
3736            CachedEphemeralWriter::Uninitialized | CachedEphemeralWriter::InUse => Ok(()),
3737        }
3738    }
3739
3740    /// Open in read-only mode using frankensqlite compat flags.
3741    pub fn open_readonly(path: &Path) -> Result<Self> {
3742        Self::open_readonly_with_doctor_lock_timeout(path, DOCTOR_MUTATION_DB_OPEN_LOCK_TIMEOUT)
3743    }
3744
3745    /// Open in read-only mode with an explicit doctor mutation-lock timeout.
3746    ///
3747    /// This is primarily useful for probes that need to prove a reader would
3748    /// not enter the archive while `cass doctor --fix` owns the repair lock.
3749    pub fn open_readonly_with_doctor_lock_timeout(path: &Path, timeout: Duration) -> Result<Self> {
3750        let path_str = path.to_string_lossy().to_string();
3751        let _doctor_guard = acquire_doctor_mutation_db_open_guard(path, timeout)?;
3752        let conn = open_franken_with_flags(&path_str, FrankenOpenFlags::SQLITE_OPEN_READ_ONLY)
3753            .with_context(|| format!("opening frankensqlite db readonly at {}", path.display()))?;
3754        let storage = Self::new(conn, path.to_path_buf());
3755        storage.apply_readonly_config()?;
3756        Ok(storage)
3757    }
3758
3759    pub fn close(self) -> Result<()> {
3760        let mut this = self;
3761        this.close_cached_ephemeral_writer_best_effort_in_place();
3762        this.conn
3763            .close()
3764            .with_context(|| "closing frankensqlite connection")
3765    }
3766
3767    pub fn close_without_checkpoint(self) -> Result<()> {
3768        let mut this = self;
3769        this.close_cached_ephemeral_writer_without_checkpoint_in_place()?;
3770        this.conn
3771            .close_without_checkpoint()
3772            .with_context(|| "closing frankensqlite connection without final checkpoint")
3773    }
3774
3775    pub fn close_best_effort_in_place(&mut self) {
3776        self.close_cached_ephemeral_writer_best_effort_in_place();
3777        self.conn.close_best_effort_in_place();
3778    }
3779
3780    pub fn close_without_checkpoint_in_place(&mut self) -> Result<()> {
3781        self.close_cached_ephemeral_writer_without_checkpoint_in_place()?;
3782        self.conn
3783            .close_without_checkpoint_in_place()
3784            .with_context(|| "closing frankensqlite connection without final checkpoint")
3785    }
3786
3787    /// Access the raw frankensqlite connection.
3788    pub fn raw(&self) -> &FrankenConnection {
3789        &self.conn
3790    }
3791
3792    /// Consume the storage wrapper and return the underlying frankensqlite
3793    /// connection after migrations/repair have already been applied.
3794    pub fn into_raw(self) -> FrankenConnection {
3795        let mut this = self;
3796        this.close_cached_ephemeral_writer_best_effort_in_place();
3797        this.conn
3798    }
3799
3800    /// Apply connection PRAGMAs for parity with SqliteStorage's `apply_pragmas()`.
3801    ///
3802    /// Frankensqlite supports all PRAGMAs cass uses (journal_mode, synchronous,
3803    /// cache_size, foreign_keys, busy_timeout). Its default journal_mode is already
3804    /// WAL and default synchronous is NORMAL, matching cass's requirements.
3805    ///
3806    pub fn apply_config(&self) -> Result<()> {
3807        // journal_mode: frankensqlite defaults to WAL, same as cass.
3808        // synchronous: frankensqlite defaults to NORMAL, same as cass.
3809        // Both are set explicitly for clarity.
3810        self.conn
3811            .execute("PRAGMA journal_mode = WAL;")
3812            .with_context(|| "setting journal_mode")?;
3813        self.conn
3814            .execute("PRAGMA synchronous = NORMAL;")
3815            .with_context(|| "setting synchronous")?;
3816
3817        // cache_size: 64MB (negative value = KiB).
3818        self.conn
3819            .execute("PRAGMA cache_size = -65536;")
3820            .with_context(|| "setting cache_size")?;
3821
3822        // foreign_keys: enable constraint enforcement.
3823        self.conn
3824            .execute("PRAGMA foreign_keys = ON;")
3825            .with_context(|| "setting foreign_keys")?;
3826
3827        // busy_timeout: 5 seconds (in milliseconds).
3828        self.conn
3829            .execute("PRAGMA busy_timeout = 5000;")
3830            .with_context(|| "setting busy_timeout")?;
3831
3832        // temp_store = MEMORY and mmap_size are C SQLite performance knobs.
3833        // In frankensqlite's architecture (in-memory MVCC engine with pager
3834        // backend), temp_store is always memory-resident and mmap_size does not
3835        // apply. Skipped intentionally — these are no-ops or errors.
3836
3837        // wal_autocheckpoint: use a bounded cadence that avoids checkpointing
3838        // inside common append batches without deferring checkpoints forever.
3839        let checkpoint_pragma =
3840            format!("PRAGMA wal_autocheckpoint = {DEFAULT_WAL_AUTOCHECKPOINT_PAGES};");
3841        let _ = self.conn.execute(&checkpoint_pragma);
3842        self.index_writer_checkpoint_pages
3843            .store(DEFAULT_WAL_AUTOCHECKPOINT_PAGES, Ordering::Relaxed);
3844        // Explicitly enable concurrent writer mode for BEGIN/transaction paths.
3845        // Try both namespace variants for compatibility across fsqlite builds.
3846        let _ = self.conn.execute("PRAGMA fsqlite.concurrent_mode = ON;");
3847        let _ = self.conn.execute("PRAGMA concurrent_mode = ON;");
3848        // Frankensqlite retained autocommit currently mis-serves same-connection
3849        // read-after-write queries on cass's storage paths; keep it off here
3850        // until the upstream visibility bug is fixed.
3851        //
3852        // CASS #163 item 3: If neither PRAGMA variant succeeds, the MVCC engine
3853        // will accumulate write snapshots for the lifetime of the connection,
3854        // causing unbounded memory growth on long-lived watch-mode handles.
3855        // Log at warn level so the failure is visible instead of silently
3856        // swallowed, and set a flag for callers that need to periodically
3857        // recycle the connection.
3858        let autocommit_pragma =
3859            disable_autocommit_retain(|pragma| self.conn.execute(pragma).map(|_| ()))?;
3860        tracing::debug!(
3861            pragma = autocommit_pragma,
3862            "disabled frankensqlite autocommit_retain for storage connection"
3863        );
3864
3865        Ok(())
3866    }
3867
3868    fn apply_readonly_config(&self) -> Result<()> {
3869        self.conn
3870            .execute("PRAGMA query_only = 1;")
3871            .with_context(|| "setting query_only")?;
3872        self.conn
3873            .execute("PRAGMA busy_timeout = 5000;")
3874            .with_context(|| "setting busy_timeout")?;
3875        self.conn
3876            .execute("PRAGMA cache_size = -65536;")
3877            .with_context(|| "setting cache_size")?;
3878        self.conn
3879            .execute("PRAGMA foreign_keys = ON;")
3880            .with_context(|| "setting foreign_keys")?;
3881        Ok(())
3882    }
3883
3884    /// Run all schema migrations, handling transition from meta table versioning.
3885    ///
3886    /// The existing `SqliteStorage` tracks schema version in a `meta` table entry.
3887    /// The new `MigrationRunner` uses a `_schema_migrations` table. This method:
3888    /// 1. Transitions existing databases from meta table → `_schema_migrations`
3889    /// 2. Runs pending migrations via `MigrationRunner`
3890    /// 3. Syncs `meta.schema_version` for backward compatibility
3891    ///
3892    /// # Fresh vs existing databases
3893    ///
3894    /// Fresh databases use a single combined migration (`MIGRATION_FRESH_SCHEMA`)
3895    /// that creates the complete V13 schema directly. This avoids the incremental
3896    /// V5 migration which uses `DROP TABLE` — an operation that triggers a known
3897    /// frankensqlite autoindex limitation.
3898    ///
3899    /// Existing databases (transitioned from SqliteStorage) are typically at
3900    /// V13 or newer already; additive post-V13 migrations are applied normally.
3901    pub fn run_migrations(&self) -> Result<()> {
3902        transition_from_meta_version(&self.conn)?;
3903
3904        let base_result = build_cass_migrations_before_tail_cache()
3905            .run(&self.conn)
3906            .with_context(|| "running base schema migrations")?;
3907
3908        let mut applied = base_result.applied;
3909        if apply_conversation_tail_state_cache_migration(&self.conn)
3910            .with_context(|| "running conversation tail-state cache migration")?
3911        {
3912            applied.push(15);
3913        }
3914
3915        let post_result = build_cass_migrations_after_tail_cache()
3916            .run(&self.conn)
3917            .with_context(|| "running post-tail-cache schema migrations")?;
3918        applied.extend(post_result.applied);
3919
3920        let current = self.schema_version()?;
3921        if !applied.is_empty() {
3922            info!(
3923                applied = ?applied,
3924                current,
3925                was_fresh = base_result.was_fresh,
3926                "frankensqlite schema migrations applied"
3927            );
3928        }
3929
3930        // Keep meta.schema_version in sync for backward compatibility.
3931        self.sync_meta_schema_version(current)?;
3932
3933        Ok(())
3934    }
3935
3936    /// Some historical canonical rebuild paths produced databases whose
3937    /// version markers claim the current schema while post-V10 analytics
3938    /// tables were never materialized. Detect that drift and backfill the
3939    /// idempotent table/index set from the combined schema migration.
3940    fn repair_missing_current_schema_objects(&self) -> Result<()> {
3941        let mut missing_tables = Vec::new();
3942        for &(table_name, probe_sql) in REQUIRED_CURRENT_SCHEMA_TABLE_PROBES {
3943            if let Err(err) = self.conn.query(probe_sql) {
3944                if error_indicates_missing_table(&err) {
3945                    missing_tables.push(table_name);
3946                    continue;
3947                }
3948                return Err(err).with_context(|| {
3949                    format!("probing required schema table {table_name} for completeness")
3950                });
3951            }
3952        }
3953
3954        if !missing_tables.is_empty() {
3955            info!(
3956                missing_tables = ?missing_tables,
3957                "repairing missing current-schema tables on an already-versioned cass database"
3958            );
3959
3960            for batch in current_schema_repair_batches_for_missing_tables(&missing_tables)? {
3961                self.conn
3962                    .execute_batch(batch.sql)
3963                    .with_context(|| format!("repairing current-schema batch {}", batch.name))?;
3964            }
3965
3966            for &(table_name, probe_sql) in REQUIRED_CURRENT_SCHEMA_TABLE_PROBES {
3967                if !missing_tables.contains(&table_name) {
3968                    continue;
3969                }
3970                self.conn
3971                    .query(probe_sql)
3972                    .with_context(|| format!("verifying repaired schema table {table_name}"))?;
3973            }
3974        }
3975        self.repair_missing_conversation_token_columns()?;
3976        Ok(())
3977    }
3978
3979    fn repair_missing_conversation_token_columns(&self) -> Result<()> {
3980        let columns = franken_table_column_names(&self.conn, "conversations")
3981            .with_context(|| "inspecting conversations columns for token-summary repair")?;
3982        let mut missing_columns = Vec::new();
3983        for &(column_name, column_type) in REQUIRED_CONVERSATION_TOKEN_COLUMNS {
3984            if columns.contains(column_name) {
3985                continue;
3986            }
3987            let sql = format!("ALTER TABLE conversations ADD COLUMN {column_name} {column_type};");
3988            self.conn.execute(&sql).with_context(|| {
3989                format!("adding missing conversations.{column_name} token-summary column")
3990            })?;
3991            missing_columns.push(column_name);
3992        }
3993        if !missing_columns.is_empty() {
3994            tracing::warn!(
3995                target: "cass::schema_repair",
3996                db_path = %self.db_path.display(),
3997                missing_columns = ?missing_columns,
3998                "cass#222: repaired missing conversations token-summary columns"
3999            );
4000        }
4001        Ok(())
4002    }
4003
4004    /// Detect and remove orphan rows whose FK parent has gone missing.
4005    ///
4006    /// A `Connection` dropped mid-transaction (the `drop_close` warning emitted
4007    /// by frankensqlite's `Drop` impl) can leave child rows persisted without a
4008    /// matching parent — `messages` referencing a `conversation_id` that does
4009    /// not exist, `message_metrics`/`token_usage`/`snippets` referencing a
4010    /// `message_id` that does not exist, etc. With `PRAGMA foreign_keys = ON`,
4011    /// every subsequent indexer pass then trips `FOREIGN KEY constraint failed`
4012    /// on the next write, the session never gets marked indexed, and the
4013    /// pending backlog grows without bound (issue #202).
4014    ///
4015    /// This pass runs at indexer startup as defense in depth: it scans each
4016    /// child table for rows whose parent row has gone missing and removes them
4017    /// in bounded committed chunks, breaking the failure cycle even when the
4018    /// underlying transaction-discipline bug has not been fully root-caused.
4019    /// The pass is idempotent (a clean database is a no-op), and emits a
4020    /// `WARN` after successful cleanup so the upstream `drop_close` condition
4021    /// stays visible.
4022    pub(crate) fn cleanup_orphan_fk_rows(&self) -> Result<OrphanFkCleanupReport> {
4023        let mut report = OrphanFkCleanupReport::default();
4024        let orphan_message_ids = match collect_orphan_message_ids(&self.conn) {
4025            Ok(ids) => ids,
4026            Err(err) if error_indicates_missing_table(&err) => {
4027                tracing::debug!(
4028                    target: "cass::fk_repair",
4029                    child_table = "messages",
4030                    error = %err,
4031                    "skipping orphan-message probe (table or column unavailable)"
4032                );
4033                Vec::new()
4034            }
4035            Err(err) => return Err(err),
4036        };
4037        if !orphan_message_ids.is_empty() {
4038            report.record("messages", orphan_message_ids.len() as i64);
4039        }
4040
4041        if !orphan_message_ids.is_empty() {
4042            delete_orphan_message_ids_bisecting_oom(&self.conn, &orphan_message_ids)
4043                .context("deleting orphan message rows and dependent children")?;
4044        }
4045
4046        for entry in ORPHAN_DIRECT_CHILD_TABLES {
4047            loop {
4048                let ids = match collect_direct_orphan_id_page(&self.conn, entry) {
4049                    Ok(ids) => ids,
4050                    Err(err)
4051                        if error_indicates_missing_table(&err)
4052                            || error_indicates_missing_column(&err) =>
4053                    {
4054                        // Tolerant probe: a missing child/parent table or FK
4055                        // column on older schemas means there is nothing to
4056                        // clean up for this table.
4057                        tracing::debug!(
4058                            target: "cass::fk_repair",
4059                            child_table = entry.child_table,
4060                            error = %err,
4061                            "skipping orphan probe (table or column unavailable)"
4062                        );
4063                        break;
4064                    }
4065                    Err(err) => {
4066                        return Err(err).with_context(|| {
4067                            format!("probing orphan rows in {}", entry.child_table)
4068                        });
4069                    }
4070                };
4071                if ids.is_empty() {
4072                    break;
4073                }
4074
4075                let deleted = delete_direct_orphan_ids_bisecting_oom(&self.conn, entry, &ids)
4076                    .with_context(|| format!("deleting orphan rows from {}", entry.child_table))?;
4077                if deleted == 0 {
4078                    break;
4079                }
4080                report.record(
4081                    entry.child_table,
4082                    i64::try_from(deleted).unwrap_or(i64::MAX),
4083                );
4084            }
4085        }
4086
4087        if report.total == 0 {
4088            return Ok(report);
4089        }
4090
4091        // WARN only fires after a successful commit so the message accurately
4092        // reflects what actually happened on disk. db_path is included so logs
4093        // from concurrent indexers against different databases stay
4094        // disambiguated.
4095        tracing::warn!(
4096            target: "cass::fk_repair",
4097            db_path = %self.db_path.display(),
4098            total_orphans = report.total,
4099            per_table = ?report.per_table,
4100            "cass#202: removed orphan rows left behind by interrupted index transactions"
4101        );
4102
4103        Ok(report)
4104    }
4105
4106    /// Return the current schema version from `_schema_migrations`.
4107    pub fn schema_version(&self) -> Result<i64> {
4108        let rows = self
4109            .conn
4110            .query("SELECT MAX(version) FROM _schema_migrations;")
4111            .with_context(|| "reading schema version from _schema_migrations")?;
4112
4113        if let Some(row) = rows.first()
4114            && let Ok(v) = row.get_typed::<Option<i64>>(0)
4115        {
4116            return Ok(v.unwrap_or(0));
4117        }
4118        Ok(0)
4119    }
4120
4121    /// Keep `meta.schema_version` in sync for backward compatibility with `SqliteStorage`.
4122    fn sync_meta_schema_version(&self, version: i64) -> Result<()> {
4123        // The meta table is created by V1 migration. If it doesn't exist yet,
4124        // there's nothing to sync.
4125        if self.conn.query("SELECT key FROM meta LIMIT 1;").is_err() {
4126            return Ok(());
4127        }
4128
4129        // Only write if the version needs updating to avoid write lock contention
4130        if let Ok(rows) = self
4131            .conn
4132            .query("SELECT value FROM meta WHERE key = 'schema_version';")
4133            && let Some(row) = rows.first()
4134            && let Ok(val) = row.get_typed::<String>(0)
4135            && val == version.to_string()
4136        {
4137            return Ok(()); // Already up to date
4138        }
4139
4140        self.conn
4141            .execute_compat(
4142                "INSERT OR REPLACE INTO meta(key, value) VALUES('schema_version', ?1);",
4143                &[ParamValue::from(version.to_string())],
4144            )
4145            .with_context(|| "syncing meta schema_version")?;
4146
4147        Ok(())
4148    }
4149
4150    /// Resolve the database file path for this connection.
4151    pub fn database_path(&self) -> Result<PathBuf> {
4152        Ok(self.db_path.clone())
4153    }
4154
4155    pub(crate) fn ephemeral_writer_preflight_verified(&self) -> bool {
4156        self.ephemeral_writer_preflight_verified
4157            .load(Ordering::Relaxed)
4158    }
4159
4160    pub(crate) fn mark_ephemeral_writer_preflight_verified(&self) {
4161        self.ephemeral_writer_preflight_verified
4162            .store(true, Ordering::Relaxed);
4163    }
4164
4165    pub(crate) fn index_writer_checkpoint_pages(&self) -> Option<i64> {
4166        let pages = self.index_writer_checkpoint_pages.load(Ordering::Relaxed);
4167        (pages != UNSET_INDEX_WRITER_CHECKPOINT_PAGES).then_some(pages)
4168    }
4169
4170    pub(crate) fn mark_index_writer_checkpoint_pages(&self, pages: i64) {
4171        self.index_writer_checkpoint_pages
4172            .store(pages, Ordering::Relaxed);
4173    }
4174
4175    pub(crate) fn index_writer_busy_timeout_ms(&self) -> Option<u64> {
4176        let timeout_ms = self.index_writer_busy_timeout_ms.load(Ordering::Relaxed);
4177        (timeout_ms != UNSET_INDEX_WRITER_BUSY_TIMEOUT_MS).then_some(timeout_ms)
4178    }
4179
4180    pub(crate) fn mark_index_writer_busy_timeout_ms(&self, timeout_ms: u64) {
4181        self.index_writer_busy_timeout_ms
4182            .store(timeout_ms, Ordering::Relaxed);
4183    }
4184
4185    /// Open database with migration, backing up if schema is incompatible.
4186    pub fn open_or_rebuild(path: &Path) -> std::result::Result<Self, MigrationError> {
4187        if let Some(parent) = path.parent() {
4188            fs::create_dir_all(parent)?;
4189        }
4190
4191        if path.exists() {
4192            let check_result = check_schema_compatibility(path);
4193            match check_result {
4194                Ok(SchemaCheck::Compatible) | Ok(SchemaCheck::NeedsMigration) => {
4195                    // Continue with normal open
4196                }
4197                Ok(SchemaCheck::NeedsRebuild(reason)) => {
4198                    let backup_path = create_backup(path)?;
4199                    cleanup_old_backups(path, MAX_BACKUPS)?;
4200                    remove_database_files(path)?;
4201                    return Err(MigrationError::RebuildRequired {
4202                        reason,
4203                        backup_path,
4204                    });
4205                }
4206                Err(err) if schema_check_error_requires_rebuild(&err) => {
4207                    let backup_path = create_backup(path)?;
4208                    cleanup_old_backups(path, MAX_BACKUPS)?;
4209                    remove_database_files(path)?;
4210                    return Err(MigrationError::RebuildRequired {
4211                        reason: format!("Database appears corrupted: {err}"),
4212                        backup_path,
4213                    });
4214                }
4215                Err(err) => return Err(MigrationError::Database(err)),
4216            }
4217        }
4218
4219        let storage = Self::open(path).map_err(|e| MigrationError::Other(e.to_string()))?;
4220        Ok(storage)
4221    }
4222}
4223
4224// -------------------------------------------------------------------------
4225// Frankensqlite migration helpers
4226// -------------------------------------------------------------------------
4227
4228/// Build the `MigrationRunner` for the frankensqlite migration path.
4229///
4230/// Uses a single combined migration (version 13) that creates the complete
4231/// final schema in one step. This avoids the V5 `DROP TABLE conversations`
4232/// operation which triggers a known frankensqlite limitation: autoindex entries
4233/// in sqlite_master are not properly cleaned up during DROP TABLE, causing
4234/// "sqlite_master entry not found" errors.
4235///
4236/// For existing databases transitioned from SqliteStorage, the transition
4237/// function backfills `_schema_migrations`; post-V13 additive migrations then
4238/// run normally.
4239fn build_cass_migrations_before_tail_cache() -> MigrationRunner {
4240    MigrationRunner::new()
4241        .add(13, "full_schema_v13", MIGRATION_FRESH_SCHEMA)
4242        .add(14, "fts_contentless", MIGRATION_V14)
4243}
4244
4245fn build_cass_migrations_after_tail_cache() -> MigrationRunner {
4246    MigrationRunner::new()
4247        .add(16, "drop_redundant_message_conv_idx", MIGRATION_V16)
4248        .add(17, "drop_message_created_idx", MIGRATION_V17)
4249        .add(18, "conversation_tail_state_hot_table", MIGRATION_V18)
4250        .add(19, "conversation_external_lookup", MIGRATION_V19)
4251        .add(20, "conversation_external_tail_lookup", MIGRATION_V20)
4252}
4253
4254fn schema_migration_is_applied(conn: &FrankenConnection, version: i64) -> Result<bool> {
4255    let rows = conn
4256        .query_with_params(
4257            "SELECT 1 FROM _schema_migrations WHERE version = ?1 LIMIT 1;",
4258            &[SqliteValue::from(version)],
4259        )
4260        .with_context(|| format!("checking schema migration version {version}"))?;
4261    Ok(!rows.is_empty())
4262}
4263
4264fn apply_conversation_tail_state_cache_migration(conn: &FrankenConnection) -> Result<bool> {
4265    conn.execute("BEGIN IMMEDIATE;")
4266        .with_context(|| "starting v15 conversation tail-state migration transaction")?;
4267
4268    let result = (|| -> Result<bool> {
4269        if schema_migration_is_applied(conn, 15)? {
4270            conn.execute("COMMIT;")
4271                .with_context(|| "committing already-applied v15 migration transaction")?;
4272            return Ok(false);
4273        }
4274
4275        let started = Instant::now();
4276        let conversation_columns = franken_table_column_names(conn, "conversations")
4277            .with_context(|| "inspecting conversations columns before v15 migration")?;
4278        if !conversation_columns.contains("last_message_idx") {
4279            conn.execute("ALTER TABLE conversations ADD COLUMN last_message_idx INTEGER;")
4280                .with_context(|| "adding v15 conversations.last_message_idx column")?;
4281        }
4282        if !conversation_columns.contains("last_message_created_at") {
4283            conn.execute("ALTER TABLE conversations ADD COLUMN last_message_created_at INTEGER;")
4284                .with_context(|| "adding v15 conversations.last_message_created_at column")?;
4285        }
4286        conn.execute_batch(MIGRATION_V15_TAIL_STATE_TABLE)
4287            .with_context(|| "applying v15 conversation tail-state table schema")?;
4288        conn.execute_compat(
4289            "INSERT INTO _schema_migrations (version, name) VALUES (?1, ?2);",
4290            fparams![15_i64, "conversation_tail_state_cache"],
4291        )
4292        .with_context(|| "recording v15 conversation tail-state migration")?;
4293        conn.execute("COMMIT;")
4294            .with_context(|| "committing v15 conversation tail-state migration")?;
4295        info!(
4296            elapsed_ms = started.elapsed().as_millis(),
4297            "applied v15 conversation tail-state cache migration"
4298        );
4299        Ok(true)
4300    })();
4301
4302    if result.is_err() {
4303        let _ = conn.execute("ROLLBACK;");
4304    }
4305
4306    result
4307}
4308
4309fn franken_table_column_names(
4310    conn: &FrankenConnection,
4311    table_name: &str,
4312) -> Result<HashSet<String>> {
4313    if !table_name
4314        .chars()
4315        .all(|c| c.is_ascii_alphanumeric() || c == '_')
4316    {
4317        return Err(anyhow!(
4318            "unsafe table name for PRAGMA table_info: {table_name}"
4319        ));
4320    }
4321
4322    conn.query_map_collect(
4323        &format!("PRAGMA table_info({table_name})"),
4324        fparams![],
4325        |row: &FrankenRow| row.get_typed::<String>(1),
4326    )
4327    .with_context(|| format!("reading PRAGMA table_info({table_name})"))
4328    .map(|columns| columns.into_iter().collect())
4329}
4330
/// Combined schema for fresh databases, registered as migration version 13
/// by `build_cass_migrations_before_tail_cache`.
///
/// Creates the complete schema in a single migration, avoiding the
/// incremental V5 `DROP TABLE conversations` which triggers a frankensqlite
/// autoindex limitation. All columns from V1-V13 are included in their
/// respective CREATE TABLE statements. `conversations` additionally carries
/// the two V15 tail-state columns (`last_message_idx`,
/// `last_message_created_at`), so a fresh database never needs `ALTER TABLE`
/// on that table.
///
/// Every `CREATE TABLE` / `CREATE INDEX` is guarded by `IF NOT EXISTS`, and
/// the seed rows (`sources` 'local', `model_pricing`) use `INSERT OR IGNORE`.
///
/// Tables are created before the tables that reference them:
/// agents, workspaces, sources → conversations → messages → snippets,
/// token_usage, message_metrics; tags is created before conversation_tags.
const MIGRATION_FRESH_SCHEMA: &str = r"
-- Core tables (V1)
CREATE TABLE IF NOT EXISTS meta (
    key TEXT PRIMARY KEY,
    value TEXT NOT NULL
);

CREATE TABLE IF NOT EXISTS agents (
    id INTEGER PRIMARY KEY,
    slug TEXT NOT NULL UNIQUE,
    name TEXT NOT NULL,
    version TEXT,
    kind TEXT NOT NULL,
    created_at INTEGER NOT NULL,
    updated_at INTEGER NOT NULL
);

CREATE TABLE IF NOT EXISTS workspaces (
    id INTEGER PRIMARY KEY,
    path TEXT NOT NULL UNIQUE,
    display_name TEXT
);

-- Sources (V4)
CREATE TABLE IF NOT EXISTS sources (
    id TEXT PRIMARY KEY,
    kind TEXT NOT NULL,
    host_label TEXT,
    machine_id TEXT,
    platform TEXT,
    config_json TEXT,
    created_at INTEGER NOT NULL,
    updated_at INTEGER NOT NULL
);

INSERT OR IGNORE INTO sources (id, kind, host_label, created_at, updated_at)
VALUES ('local', 'local', NULL, strftime('%s','now')*1000, strftime('%s','now')*1000);

-- Conversations: V1 base + V5 provenance + V7 metadata_bin + V10 token summary
CREATE TABLE IF NOT EXISTS conversations (
    id INTEGER PRIMARY KEY,
    agent_id INTEGER NOT NULL REFERENCES agents(id),
    workspace_id INTEGER REFERENCES workspaces(id),
    source_id TEXT NOT NULL DEFAULT 'local' REFERENCES sources(id),
    external_id TEXT,
    title TEXT,
    source_path TEXT NOT NULL,
    started_at INTEGER,
    ended_at INTEGER,
    approx_tokens INTEGER,
    metadata_json TEXT,
    origin_host TEXT,
    metadata_bin BLOB,
    total_input_tokens INTEGER,
    total_output_tokens INTEGER,
    total_cache_read_tokens INTEGER,
    total_cache_creation_tokens INTEGER,
    grand_total_tokens INTEGER,
    estimated_cost_usd REAL,
    primary_model TEXT,
    api_call_count INTEGER,
    tool_call_count INTEGER,
    user_message_count INTEGER,
    assistant_message_count INTEGER,
    -- V15 columns are included in the fresh schema so fresh DB creation does
    -- not need ALTER TABLE on conversations. That ALTER path can duplicate
    -- provenance autoindex state in frankensqlite when the named unique
    -- provenance index already exists.
    last_message_idx INTEGER,
    last_message_created_at INTEGER
);

-- Named unique index avoids autoindex issues if table is ever recreated
CREATE UNIQUE INDEX IF NOT EXISTS idx_conversations_provenance
    ON conversations(source_id, agent_id, external_id);

-- Messages: V1 base + V7 extra_bin
CREATE TABLE IF NOT EXISTS messages (
    id INTEGER PRIMARY KEY,
    conversation_id INTEGER NOT NULL REFERENCES conversations(id) ON DELETE CASCADE,
    idx INTEGER NOT NULL,
    role TEXT NOT NULL,
    author TEXT,
    created_at INTEGER,
    content TEXT NOT NULL,
    extra_json TEXT,
    extra_bin BLOB,
    UNIQUE(conversation_id, idx)
);

CREATE TABLE IF NOT EXISTS snippets (
    id INTEGER PRIMARY KEY,
    message_id INTEGER NOT NULL REFERENCES messages(id) ON DELETE CASCADE,
    file_path TEXT,
    start_line INTEGER,
    end_line INTEGER,
    language TEXT,
    snippet_text TEXT
);

CREATE TABLE IF NOT EXISTS tags (
    id INTEGER PRIMARY KEY,
    name TEXT NOT NULL UNIQUE
);

CREATE TABLE IF NOT EXISTS conversation_tags (
    conversation_id INTEGER NOT NULL REFERENCES conversations(id) ON DELETE CASCADE,
    tag_id INTEGER NOT NULL REFERENCES tags(id) ON DELETE CASCADE,
    PRIMARY KEY (conversation_id, tag_id)
);

-- Daily stats (V8)
CREATE TABLE IF NOT EXISTS daily_stats (
    day_id INTEGER NOT NULL,
    agent_slug TEXT NOT NULL,
    source_id TEXT NOT NULL DEFAULT 'all',
    session_count INTEGER NOT NULL DEFAULT 0,
    message_count INTEGER NOT NULL DEFAULT 0,
    total_chars INTEGER NOT NULL DEFAULT 0,
    last_updated INTEGER NOT NULL,
    PRIMARY KEY (day_id, agent_slug, source_id)
);

-- Embedding jobs (V9)
CREATE TABLE IF NOT EXISTS embedding_jobs (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    db_path TEXT NOT NULL,
    model_id TEXT NOT NULL,
    status TEXT NOT NULL DEFAULT 'pending',
    total_docs INTEGER NOT NULL DEFAULT 0,
    completed_docs INTEGER NOT NULL DEFAULT 0,
    error_message TEXT,
    created_at TEXT NOT NULL DEFAULT (datetime('now')),
    started_at TEXT,
    completed_at TEXT
);

CREATE UNIQUE INDEX IF NOT EXISTS idx_embedding_jobs_active
ON embedding_jobs(db_path, model_id)
WHERE status IN ('pending', 'running');

-- Token usage ledger (V10)
CREATE TABLE IF NOT EXISTS token_usage (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    message_id INTEGER NOT NULL REFERENCES messages(id) ON DELETE CASCADE,
    conversation_id INTEGER NOT NULL,
    agent_id INTEGER NOT NULL,
    workspace_id INTEGER,
    source_id TEXT NOT NULL DEFAULT 'local',
    timestamp_ms INTEGER NOT NULL,
    day_id INTEGER NOT NULL,
    model_name TEXT,
    model_family TEXT,
    model_tier TEXT,
    service_tier TEXT,
    provider TEXT,
    input_tokens INTEGER,
    output_tokens INTEGER,
    cache_read_tokens INTEGER,
    cache_creation_tokens INTEGER,
    thinking_tokens INTEGER,
    total_tokens INTEGER,
    estimated_cost_usd REAL,
    role TEXT NOT NULL,
    content_chars INTEGER NOT NULL,
    has_tool_calls INTEGER NOT NULL DEFAULT 0,
    tool_call_count INTEGER NOT NULL DEFAULT 0,
    data_source TEXT NOT NULL DEFAULT 'api',
    UNIQUE(message_id)
);

-- Token daily stats (V10)
CREATE TABLE IF NOT EXISTS token_daily_stats (
    day_id INTEGER NOT NULL,
    agent_slug TEXT NOT NULL,
    source_id TEXT NOT NULL DEFAULT 'all',
    model_family TEXT NOT NULL DEFAULT 'all',
    api_call_count INTEGER NOT NULL DEFAULT 0,
    user_message_count INTEGER NOT NULL DEFAULT 0,
    assistant_message_count INTEGER NOT NULL DEFAULT 0,
    tool_message_count INTEGER NOT NULL DEFAULT 0,
    total_input_tokens INTEGER NOT NULL DEFAULT 0,
    total_output_tokens INTEGER NOT NULL DEFAULT 0,
    total_cache_read_tokens INTEGER NOT NULL DEFAULT 0,
    total_cache_creation_tokens INTEGER NOT NULL DEFAULT 0,
    total_thinking_tokens INTEGER NOT NULL DEFAULT 0,
    grand_total_tokens INTEGER NOT NULL DEFAULT 0,
    total_content_chars INTEGER NOT NULL DEFAULT 0,
    total_tool_calls INTEGER NOT NULL DEFAULT 0,
    estimated_cost_usd REAL NOT NULL DEFAULT 0.0,
    session_count INTEGER NOT NULL DEFAULT 0,
    last_updated INTEGER NOT NULL,
    PRIMARY KEY (day_id, agent_slug, source_id, model_family)
);

-- Model pricing (V10)
CREATE TABLE IF NOT EXISTS model_pricing (
    model_pattern TEXT NOT NULL,
    provider TEXT NOT NULL,
    input_cost_per_mtok REAL NOT NULL,
    output_cost_per_mtok REAL NOT NULL,
    cache_read_cost_per_mtok REAL,
    cache_creation_cost_per_mtok REAL,
    effective_date TEXT NOT NULL,
    PRIMARY KEY (model_pattern, effective_date)
);

INSERT OR IGNORE INTO model_pricing VALUES
    ('claude-opus-4%', 'anthropic', 15.0, 75.0, 1.5, 18.75, '2025-10-01'),
    ('claude-sonnet-4%', 'anthropic', 3.0, 15.0, 0.3, 3.75, '2025-10-01'),
    ('claude-haiku-4%', 'anthropic', 0.80, 4.0, 0.08, 1.0, '2025-10-01'),
    ('gpt-4o%', 'openai', 2.50, 10.0, NULL, NULL, '2025-01-01'),
    ('gpt-4-turbo%', 'openai', 10.0, 30.0, NULL, NULL, '2024-04-01'),
    ('gpt-4.1%', 'openai', 2.0, 8.0, NULL, NULL, '2025-04-01'),
    ('o3%', 'openai', 2.0, 8.0, NULL, NULL, '2025-04-01'),
    ('o4-mini%', 'openai', 1.10, 4.40, NULL, NULL, '2025-04-01'),
    ('gemini-2%flash%', 'google', 0.075, 0.30, NULL, NULL, '2025-01-01'),
    ('gemini-2%pro%', 'google', 1.25, 10.0, NULL, NULL, '2025-01-01');

-- Message metrics: V11 base + V12 model dimensions
CREATE TABLE IF NOT EXISTS message_metrics (
    message_id INTEGER PRIMARY KEY REFERENCES messages(id) ON DELETE CASCADE,
    created_at_ms INTEGER NOT NULL,
    hour_id INTEGER NOT NULL,
    day_id INTEGER NOT NULL,
    agent_slug TEXT NOT NULL,
    workspace_id INTEGER NOT NULL DEFAULT 0,
    source_id TEXT NOT NULL DEFAULT 'local',
    role TEXT NOT NULL,
    content_chars INTEGER NOT NULL,
    content_tokens_est INTEGER NOT NULL,
    api_input_tokens INTEGER,
    api_output_tokens INTEGER,
    api_cache_read_tokens INTEGER,
    api_cache_creation_tokens INTEGER,
    api_thinking_tokens INTEGER,
    api_service_tier TEXT,
    api_data_source TEXT NOT NULL DEFAULT 'estimated',
    tool_call_count INTEGER NOT NULL DEFAULT 0,
    has_tool_calls INTEGER NOT NULL DEFAULT 0,
    has_plan INTEGER NOT NULL DEFAULT 0,
    model_name TEXT,
    model_family TEXT NOT NULL DEFAULT 'unknown',
    model_tier TEXT NOT NULL DEFAULT 'unknown',
    provider TEXT NOT NULL DEFAULT 'unknown'
);

-- Hourly rollups: V11 base + V13 plan columns
CREATE TABLE IF NOT EXISTS usage_hourly (
    hour_id INTEGER NOT NULL,
    agent_slug TEXT NOT NULL,
    workspace_id INTEGER NOT NULL DEFAULT 0,
    source_id TEXT NOT NULL DEFAULT 'local',
    message_count INTEGER NOT NULL DEFAULT 0,
    user_message_count INTEGER NOT NULL DEFAULT 0,
    assistant_message_count INTEGER NOT NULL DEFAULT 0,
    tool_call_count INTEGER NOT NULL DEFAULT 0,
    plan_message_count INTEGER NOT NULL DEFAULT 0,
    api_coverage_message_count INTEGER NOT NULL DEFAULT 0,
    content_tokens_est_total INTEGER NOT NULL DEFAULT 0,
    content_tokens_est_user INTEGER NOT NULL DEFAULT 0,
    content_tokens_est_assistant INTEGER NOT NULL DEFAULT 0,
    api_tokens_total INTEGER NOT NULL DEFAULT 0,
    api_input_tokens_total INTEGER NOT NULL DEFAULT 0,
    api_output_tokens_total INTEGER NOT NULL DEFAULT 0,
    api_cache_read_tokens_total INTEGER NOT NULL DEFAULT 0,
    api_cache_creation_tokens_total INTEGER NOT NULL DEFAULT 0,
    api_thinking_tokens_total INTEGER NOT NULL DEFAULT 0,
    last_updated INTEGER NOT NULL DEFAULT 0,
    plan_content_tokens_est_total INTEGER NOT NULL DEFAULT 0,
    plan_api_tokens_total INTEGER NOT NULL DEFAULT 0,
    PRIMARY KEY (hour_id, agent_slug, workspace_id, source_id)
);

-- Daily rollups: V11 base + V13 plan columns
CREATE TABLE IF NOT EXISTS usage_daily (
    day_id INTEGER NOT NULL,
    agent_slug TEXT NOT NULL,
    workspace_id INTEGER NOT NULL DEFAULT 0,
    source_id TEXT NOT NULL DEFAULT 'local',
    message_count INTEGER NOT NULL DEFAULT 0,
    user_message_count INTEGER NOT NULL DEFAULT 0,
    assistant_message_count INTEGER NOT NULL DEFAULT 0,
    tool_call_count INTEGER NOT NULL DEFAULT 0,
    plan_message_count INTEGER NOT NULL DEFAULT 0,
    api_coverage_message_count INTEGER NOT NULL DEFAULT 0,
    content_tokens_est_total INTEGER NOT NULL DEFAULT 0,
    content_tokens_est_user INTEGER NOT NULL DEFAULT 0,
    content_tokens_est_assistant INTEGER NOT NULL DEFAULT 0,
    api_tokens_total INTEGER NOT NULL DEFAULT 0,
    api_input_tokens_total INTEGER NOT NULL DEFAULT 0,
    api_output_tokens_total INTEGER NOT NULL DEFAULT 0,
    api_cache_read_tokens_total INTEGER NOT NULL DEFAULT 0,
    api_cache_creation_tokens_total INTEGER NOT NULL DEFAULT 0,
    api_thinking_tokens_total INTEGER NOT NULL DEFAULT 0,
    last_updated INTEGER NOT NULL DEFAULT 0,
    plan_content_tokens_est_total INTEGER NOT NULL DEFAULT 0,
    plan_api_tokens_total INTEGER NOT NULL DEFAULT 0,
    PRIMARY KEY (day_id, agent_slug, workspace_id, source_id)
);

-- Model daily rollups (V12)
CREATE TABLE IF NOT EXISTS usage_models_daily (
    day_id INTEGER NOT NULL,
    agent_slug TEXT NOT NULL,
    workspace_id INTEGER NOT NULL DEFAULT 0,
    source_id TEXT NOT NULL DEFAULT 'local',
    model_family TEXT NOT NULL DEFAULT 'unknown',
    model_tier TEXT NOT NULL DEFAULT 'unknown',
    message_count INTEGER NOT NULL DEFAULT 0,
    user_message_count INTEGER NOT NULL DEFAULT 0,
    assistant_message_count INTEGER NOT NULL DEFAULT 0,
    tool_call_count INTEGER NOT NULL DEFAULT 0,
    plan_message_count INTEGER NOT NULL DEFAULT 0,
    api_coverage_message_count INTEGER NOT NULL DEFAULT 0,
    content_tokens_est_total INTEGER NOT NULL DEFAULT 0,
    content_tokens_est_user INTEGER NOT NULL DEFAULT 0,
    content_tokens_est_assistant INTEGER NOT NULL DEFAULT 0,
    api_tokens_total INTEGER NOT NULL DEFAULT 0,
    api_input_tokens_total INTEGER NOT NULL DEFAULT 0,
    api_output_tokens_total INTEGER NOT NULL DEFAULT 0,
    api_cache_read_tokens_total INTEGER NOT NULL DEFAULT 0,
    api_cache_creation_tokens_total INTEGER NOT NULL DEFAULT 0,
    api_thinking_tokens_total INTEGER NOT NULL DEFAULT 0,
    last_updated INTEGER NOT NULL DEFAULT 0,
    PRIMARY KEY (day_id, agent_slug, workspace_id, source_id, model_family, model_tier)
);

-- All indexes
CREATE INDEX IF NOT EXISTS idx_conversations_agent_started ON conversations(agent_id, started_at DESC);
CREATE INDEX IF NOT EXISTS idx_conversations_source_id ON conversations(source_id);
CREATE INDEX IF NOT EXISTS idx_conversations_source_path ON conversations(source_path);
CREATE INDEX IF NOT EXISTS idx_daily_stats_agent ON daily_stats(agent_slug, day_id);
CREATE INDEX IF NOT EXISTS idx_daily_stats_source ON daily_stats(source_id, day_id);
CREATE INDEX IF NOT EXISTS idx_token_usage_day ON token_usage(day_id, agent_id);
CREATE INDEX IF NOT EXISTS idx_token_usage_conv ON token_usage(conversation_id);
CREATE INDEX IF NOT EXISTS idx_token_usage_model ON token_usage(model_family, day_id);
CREATE INDEX IF NOT EXISTS idx_token_usage_workspace ON token_usage(workspace_id, day_id);
CREATE INDEX IF NOT EXISTS idx_token_usage_timestamp ON token_usage(timestamp_ms);
CREATE INDEX IF NOT EXISTS idx_token_daily_stats_agent ON token_daily_stats(agent_slug, day_id);
CREATE INDEX IF NOT EXISTS idx_token_daily_stats_model ON token_daily_stats(model_family, day_id);
CREATE INDEX IF NOT EXISTS idx_mm_hour ON message_metrics(hour_id);
CREATE INDEX IF NOT EXISTS idx_mm_day ON message_metrics(day_id);
CREATE INDEX IF NOT EXISTS idx_mm_agent_hour ON message_metrics(agent_slug, hour_id);
CREATE INDEX IF NOT EXISTS idx_mm_agent_day ON message_metrics(agent_slug, day_id);
CREATE INDEX IF NOT EXISTS idx_mm_workspace_hour ON message_metrics(workspace_id, hour_id);
CREATE INDEX IF NOT EXISTS idx_mm_source_hour ON message_metrics(source_id, hour_id);
CREATE INDEX IF NOT EXISTS idx_mm_model_family_day ON message_metrics(model_family, day_id);
CREATE INDEX IF NOT EXISTS idx_mm_provider_day ON message_metrics(provider, day_id);
CREATE INDEX IF NOT EXISTS idx_uh_agent ON usage_hourly(agent_slug, hour_id);
CREATE INDEX IF NOT EXISTS idx_uh_workspace ON usage_hourly(workspace_id, hour_id);
CREATE INDEX IF NOT EXISTS idx_uh_source ON usage_hourly(source_id, hour_id);
CREATE INDEX IF NOT EXISTS idx_ud_agent ON usage_daily(agent_slug, day_id);
CREATE INDEX IF NOT EXISTS idx_ud_workspace ON usage_daily(workspace_id, day_id);
CREATE INDEX IF NOT EXISTS idx_ud_source ON usage_daily(source_id, day_id);
CREATE INDEX IF NOT EXISTS idx_umd_model_day ON usage_models_daily(model_family, day_id);
CREATE INDEX IF NOT EXISTS idx_umd_agent_day ON usage_models_daily(agent_slug, day_id);
CREATE INDEX IF NOT EXISTS idx_umd_workspace_day ON usage_models_daily(workspace_id, day_id);
CREATE INDEX IF NOT EXISTS idx_umd_source_day ON usage_models_daily(source_id, day_id);
";
4700
/// A named bundle of SQL text together with a list of table names.
///
/// NOTE(review): presumably one entry per `CURRENT_SCHEMA_REPAIR_*_SQL`
/// constant; the consumer is not visible here, so confirm how `tables` is
/// interpreted.
#[derive(Clone, Copy)]
struct SchemaRepairBatch {
    /// Identifier for the batch (presumably used in logs/errors; confirm).
    name: &'static str,
    /// Table names associated with the batch (presumably the tables whose
    /// absence triggers it; confirm against the caller).
    tables: &'static [&'static str],
    /// SQL text of the batch.
    sql: &'static str,
}
4707
/// SQL that creates the `sources` table when it is absent and seeds the
/// `'local'` source row.
///
/// The table definition and seed statement match the `sources` section of
/// `MIGRATION_FRESH_SCHEMA`. `INSERT OR IGNORE` leaves an existing `'local'`
/// row untouched.
const CURRENT_SCHEMA_REPAIR_SOURCES_SQL: &str = r"
CREATE TABLE IF NOT EXISTS sources (
    id TEXT PRIMARY KEY,
    kind TEXT NOT NULL,
    host_label TEXT,
    machine_id TEXT,
    platform TEXT,
    config_json TEXT,
    created_at INTEGER NOT NULL,
    updated_at INTEGER NOT NULL
);

INSERT OR IGNORE INTO sources (id, kind, host_label, created_at, updated_at)
VALUES ('local', 'local', NULL, strftime('%s','now')*1000, strftime('%s','now')*1000);
";
4723
/// SQL that creates the `daily_stats` table and its two indexes
/// (`idx_daily_stats_agent`, `idx_daily_stats_source`) when they are absent.
///
/// The definitions match the `daily_stats` parts of `MIGRATION_FRESH_SCHEMA`.
/// No rows are inserted.
const CURRENT_SCHEMA_REPAIR_DAILY_STATS_SQL: &str = r"
CREATE TABLE IF NOT EXISTS daily_stats (
    day_id INTEGER NOT NULL,
    agent_slug TEXT NOT NULL,
    source_id TEXT NOT NULL DEFAULT 'all',
    session_count INTEGER NOT NULL DEFAULT 0,
    message_count INTEGER NOT NULL DEFAULT 0,
    total_chars INTEGER NOT NULL DEFAULT 0,
    last_updated INTEGER NOT NULL,
    PRIMARY KEY (day_id, agent_slug, source_id)
);

CREATE INDEX IF NOT EXISTS idx_daily_stats_agent ON daily_stats(agent_slug, day_id);
CREATE INDEX IF NOT EXISTS idx_daily_stats_source ON daily_stats(source_id, day_id);
";
4739
/// SQL that creates `conversation_external_lookup` when absent and
/// (re)populates it from `conversations`.
///
/// One row is written per conversation whose `external_id` is not NULL.
/// The key is length-prefixed:
/// `<length(source_id)>:<source_id>:<agent_id>:<length(external_id)>:<external_id>`.
/// `INSERT OR REPLACE` overwrites any existing row with the same key.
const CURRENT_SCHEMA_REPAIR_CONVERSATION_EXTERNAL_LOOKUP_SQL: &str = r"
CREATE TABLE IF NOT EXISTS conversation_external_lookup (
    lookup_key TEXT PRIMARY KEY,
    conversation_id INTEGER NOT NULL
);

INSERT OR REPLACE INTO conversation_external_lookup (lookup_key, conversation_id)
SELECT
    CAST(length(source_id) AS TEXT) || ':' || source_id || ':' ||
    CAST(agent_id AS TEXT) || ':' ||
    CAST(length(external_id) AS TEXT) || ':' || external_id,
    id
FROM conversations
WHERE external_id IS NOT NULL;
";
4755
/// SQL that creates `conversation_tail_state` and
/// `conversation_external_tail_lookup` when absent, then (re)populates the
/// lookup table.
///
/// One row is written per conversation whose `external_id` is not NULL, keyed
/// `<length(source_id)>:<source_id>:<agent_id>:<length(external_id)>:<external_id>`.
/// The tail columns (`ended_at`, `last_message_idx`,
/// `last_message_created_at`) come from `conversation_tail_state` through a
/// `LEFT JOIN`, so conversations without a tail-state row get NULL tail
/// values. `INSERT OR REPLACE` overwrites any existing row with the same key.
const CURRENT_SCHEMA_REPAIR_CONVERSATION_EXTERNAL_TAIL_LOOKUP_SQL: &str = r"
CREATE TABLE IF NOT EXISTS conversation_tail_state (
    conversation_id INTEGER PRIMARY KEY,
    ended_at INTEGER,
    last_message_idx INTEGER,
    last_message_created_at INTEGER
);

CREATE TABLE IF NOT EXISTS conversation_external_tail_lookup (
    lookup_key TEXT PRIMARY KEY,
    conversation_id INTEGER NOT NULL,
    ended_at INTEGER,
    last_message_idx INTEGER,
    last_message_created_at INTEGER
);

INSERT OR REPLACE INTO conversation_external_tail_lookup (
    lookup_key,
    conversation_id,
    ended_at,
    last_message_idx,
    last_message_created_at
)
SELECT
    CAST(length(c.source_id) AS TEXT) || ':' || c.source_id || ':' ||
    CAST(c.agent_id AS TEXT) || ':' ||
    CAST(length(c.external_id) AS TEXT) || ':' || c.external_id,
    c.id,
    ts.ended_at,
    ts.last_message_idx,
    ts.last_message_created_at
FROM conversations c
LEFT JOIN conversation_tail_state ts ON ts.conversation_id = c.id
WHERE c.external_id IS NOT NULL;
";
4791
/// SQL that creates the `embedding_jobs` table and its partial unique index
/// `idx_embedding_jobs_active` when they are absent.
///
/// The index covers `(db_path, model_id)` only for rows whose `status` is
/// `'pending'` or `'running'`. The definitions match the `embedding_jobs`
/// section of `MIGRATION_FRESH_SCHEMA`. No rows are inserted.
const CURRENT_SCHEMA_REPAIR_EMBEDDING_JOBS_SQL: &str = r"
CREATE TABLE IF NOT EXISTS embedding_jobs (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    db_path TEXT NOT NULL,
    model_id TEXT NOT NULL,
    status TEXT NOT NULL DEFAULT 'pending',
    total_docs INTEGER NOT NULL DEFAULT 0,
    completed_docs INTEGER NOT NULL DEFAULT 0,
    error_message TEXT,
    created_at TEXT NOT NULL DEFAULT (datetime('now')),
    started_at TEXT,
    completed_at TEXT
);

CREATE UNIQUE INDEX IF NOT EXISTS idx_embedding_jobs_active
ON embedding_jobs(db_path, model_id)
WHERE status IN ('pending', 'running');
";
4810
/// Repair DDL for the token-analytics tables: `token_usage`,
/// `token_daily_stats`, and `model_pricing`, plus their secondary indexes.
///
/// Registered under the `"token_analytics"` entry of
/// `CURRENT_SCHEMA_REPAIR_BATCHES`. All `CREATE` statements use
/// `IF NOT EXISTS`, so existing objects are left as they are.
///
/// The trailing `INSERT OR IGNORE` seeds `model_pricing` with default rows.
/// Because the table's primary key is `(model_pattern, effective_date)`, a seed
/// row whose key is already present is skipped rather than overwritten. The
/// insert has no column list, so the value order must match the column order of
/// the `CREATE TABLE model_pricing` statement above it.
const CURRENT_SCHEMA_REPAIR_TOKEN_ANALYTICS_SQL: &str = r"
CREATE TABLE IF NOT EXISTS token_usage (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    message_id INTEGER NOT NULL REFERENCES messages(id) ON DELETE CASCADE,
    conversation_id INTEGER NOT NULL,
    agent_id INTEGER NOT NULL,
    workspace_id INTEGER,
    source_id TEXT NOT NULL DEFAULT 'local',
    timestamp_ms INTEGER NOT NULL,
    day_id INTEGER NOT NULL,
    model_name TEXT,
    model_family TEXT,
    model_tier TEXT,
    service_tier TEXT,
    provider TEXT,
    input_tokens INTEGER,
    output_tokens INTEGER,
    cache_read_tokens INTEGER,
    cache_creation_tokens INTEGER,
    thinking_tokens INTEGER,
    total_tokens INTEGER,
    estimated_cost_usd REAL,
    role TEXT NOT NULL,
    content_chars INTEGER NOT NULL,
    has_tool_calls INTEGER NOT NULL DEFAULT 0,
    tool_call_count INTEGER NOT NULL DEFAULT 0,
    data_source TEXT NOT NULL DEFAULT 'api',
    UNIQUE(message_id)
);

CREATE INDEX IF NOT EXISTS idx_token_usage_day ON token_usage(day_id, agent_id);
CREATE INDEX IF NOT EXISTS idx_token_usage_conv ON token_usage(conversation_id);
CREATE INDEX IF NOT EXISTS idx_token_usage_model ON token_usage(model_family, day_id);
CREATE INDEX IF NOT EXISTS idx_token_usage_workspace ON token_usage(workspace_id, day_id);
CREATE INDEX IF NOT EXISTS idx_token_usage_timestamp ON token_usage(timestamp_ms);

CREATE TABLE IF NOT EXISTS token_daily_stats (
    day_id INTEGER NOT NULL,
    agent_slug TEXT NOT NULL,
    source_id TEXT NOT NULL DEFAULT 'all',
    model_family TEXT NOT NULL DEFAULT 'all',
    api_call_count INTEGER NOT NULL DEFAULT 0,
    user_message_count INTEGER NOT NULL DEFAULT 0,
    assistant_message_count INTEGER NOT NULL DEFAULT 0,
    tool_message_count INTEGER NOT NULL DEFAULT 0,
    total_input_tokens INTEGER NOT NULL DEFAULT 0,
    total_output_tokens INTEGER NOT NULL DEFAULT 0,
    total_cache_read_tokens INTEGER NOT NULL DEFAULT 0,
    total_cache_creation_tokens INTEGER NOT NULL DEFAULT 0,
    total_thinking_tokens INTEGER NOT NULL DEFAULT 0,
    grand_total_tokens INTEGER NOT NULL DEFAULT 0,
    total_content_chars INTEGER NOT NULL DEFAULT 0,
    total_tool_calls INTEGER NOT NULL DEFAULT 0,
    estimated_cost_usd REAL NOT NULL DEFAULT 0.0,
    session_count INTEGER NOT NULL DEFAULT 0,
    last_updated INTEGER NOT NULL,
    PRIMARY KEY (day_id, agent_slug, source_id, model_family)
);

CREATE INDEX IF NOT EXISTS idx_token_daily_stats_agent ON token_daily_stats(agent_slug, day_id);
CREATE INDEX IF NOT EXISTS idx_token_daily_stats_model ON token_daily_stats(model_family, day_id);

CREATE TABLE IF NOT EXISTS model_pricing (
    model_pattern TEXT NOT NULL,
    provider TEXT NOT NULL,
    input_cost_per_mtok REAL NOT NULL,
    output_cost_per_mtok REAL NOT NULL,
    cache_read_cost_per_mtok REAL,
    cache_creation_cost_per_mtok REAL,
    effective_date TEXT NOT NULL,
    PRIMARY KEY (model_pattern, effective_date)
);

INSERT OR IGNORE INTO model_pricing VALUES
    ('claude-opus-4%', 'anthropic', 15.0, 75.0, 1.5, 18.75, '2025-10-01'),
    ('claude-sonnet-4%', 'anthropic', 3.0, 15.0, 0.3, 3.75, '2025-10-01'),
    ('claude-haiku-4%', 'anthropic', 0.80, 4.0, 0.08, 1.0, '2025-10-01'),
    ('gpt-4o%', 'openai', 2.50, 10.0, NULL, NULL, '2025-01-01'),
    ('gpt-4-turbo%', 'openai', 10.0, 30.0, NULL, NULL, '2024-04-01'),
    ('gpt-4.1%', 'openai', 2.0, 8.0, NULL, NULL, '2025-04-01'),
    ('o3%', 'openai', 2.0, 8.0, NULL, NULL, '2025-04-01'),
    ('o4-mini%', 'openai', 1.10, 4.40, NULL, NULL, '2025-04-01'),
    ('gemini-2%flash%', 'google', 0.075, 0.30, NULL, NULL, '2025-01-01'),
    ('gemini-2%pro%', 'google', 1.25, 10.0, NULL, NULL, '2025-01-01');
";
4896
/// Repair DDL for the per-message metrics table and its rollup tables:
/// `message_metrics`, `usage_hourly`, `usage_daily`, and `usage_models_daily`,
/// followed by their secondary indexes.
///
/// Registered under the `"message_rollups"` entry of
/// `CURRENT_SCHEMA_REPAIR_BATCHES`. All statements use `IF NOT EXISTS`, so
/// existing objects are left as they are.
///
/// `usage_hourly` and `usage_daily` carry the two `plan_*_total` columns after
/// `last_updated`; `usage_models_daily` does not have them and is additionally
/// keyed by `model_family` and `model_tier`.
const CURRENT_SCHEMA_REPAIR_MESSAGE_METRICS_SQL: &str = r"
CREATE TABLE IF NOT EXISTS message_metrics (
    message_id INTEGER PRIMARY KEY REFERENCES messages(id) ON DELETE CASCADE,
    created_at_ms INTEGER NOT NULL,
    hour_id INTEGER NOT NULL,
    day_id INTEGER NOT NULL,
    agent_slug TEXT NOT NULL,
    workspace_id INTEGER NOT NULL DEFAULT 0,
    source_id TEXT NOT NULL DEFAULT 'local',
    role TEXT NOT NULL,
    content_chars INTEGER NOT NULL,
    content_tokens_est INTEGER NOT NULL,
    api_input_tokens INTEGER,
    api_output_tokens INTEGER,
    api_cache_read_tokens INTEGER,
    api_cache_creation_tokens INTEGER,
    api_thinking_tokens INTEGER,
    api_service_tier TEXT,
    api_data_source TEXT NOT NULL DEFAULT 'estimated',
    tool_call_count INTEGER NOT NULL DEFAULT 0,
    has_tool_calls INTEGER NOT NULL DEFAULT 0,
    has_plan INTEGER NOT NULL DEFAULT 0,
    model_name TEXT,
    model_family TEXT NOT NULL DEFAULT 'unknown',
    model_tier TEXT NOT NULL DEFAULT 'unknown',
    provider TEXT NOT NULL DEFAULT 'unknown'
);

CREATE TABLE IF NOT EXISTS usage_hourly (
    hour_id INTEGER NOT NULL,
    agent_slug TEXT NOT NULL,
    workspace_id INTEGER NOT NULL DEFAULT 0,
    source_id TEXT NOT NULL DEFAULT 'local',
    message_count INTEGER NOT NULL DEFAULT 0,
    user_message_count INTEGER NOT NULL DEFAULT 0,
    assistant_message_count INTEGER NOT NULL DEFAULT 0,
    tool_call_count INTEGER NOT NULL DEFAULT 0,
    plan_message_count INTEGER NOT NULL DEFAULT 0,
    api_coverage_message_count INTEGER NOT NULL DEFAULT 0,
    content_tokens_est_total INTEGER NOT NULL DEFAULT 0,
    content_tokens_est_user INTEGER NOT NULL DEFAULT 0,
    content_tokens_est_assistant INTEGER NOT NULL DEFAULT 0,
    api_tokens_total INTEGER NOT NULL DEFAULT 0,
    api_input_tokens_total INTEGER NOT NULL DEFAULT 0,
    api_output_tokens_total INTEGER NOT NULL DEFAULT 0,
    api_cache_read_tokens_total INTEGER NOT NULL DEFAULT 0,
    api_cache_creation_tokens_total INTEGER NOT NULL DEFAULT 0,
    api_thinking_tokens_total INTEGER NOT NULL DEFAULT 0,
    last_updated INTEGER NOT NULL DEFAULT 0,
    plan_content_tokens_est_total INTEGER NOT NULL DEFAULT 0,
    plan_api_tokens_total INTEGER NOT NULL DEFAULT 0,
    PRIMARY KEY (hour_id, agent_slug, workspace_id, source_id)
);

CREATE TABLE IF NOT EXISTS usage_daily (
    day_id INTEGER NOT NULL,
    agent_slug TEXT NOT NULL,
    workspace_id INTEGER NOT NULL DEFAULT 0,
    source_id TEXT NOT NULL DEFAULT 'local',
    message_count INTEGER NOT NULL DEFAULT 0,
    user_message_count INTEGER NOT NULL DEFAULT 0,
    assistant_message_count INTEGER NOT NULL DEFAULT 0,
    tool_call_count INTEGER NOT NULL DEFAULT 0,
    plan_message_count INTEGER NOT NULL DEFAULT 0,
    api_coverage_message_count INTEGER NOT NULL DEFAULT 0,
    content_tokens_est_total INTEGER NOT NULL DEFAULT 0,
    content_tokens_est_user INTEGER NOT NULL DEFAULT 0,
    content_tokens_est_assistant INTEGER NOT NULL DEFAULT 0,
    api_tokens_total INTEGER NOT NULL DEFAULT 0,
    api_input_tokens_total INTEGER NOT NULL DEFAULT 0,
    api_output_tokens_total INTEGER NOT NULL DEFAULT 0,
    api_cache_read_tokens_total INTEGER NOT NULL DEFAULT 0,
    api_cache_creation_tokens_total INTEGER NOT NULL DEFAULT 0,
    api_thinking_tokens_total INTEGER NOT NULL DEFAULT 0,
    last_updated INTEGER NOT NULL DEFAULT 0,
    plan_content_tokens_est_total INTEGER NOT NULL DEFAULT 0,
    plan_api_tokens_total INTEGER NOT NULL DEFAULT 0,
    PRIMARY KEY (day_id, agent_slug, workspace_id, source_id)
);

CREATE TABLE IF NOT EXISTS usage_models_daily (
    day_id INTEGER NOT NULL,
    agent_slug TEXT NOT NULL,
    workspace_id INTEGER NOT NULL DEFAULT 0,
    source_id TEXT NOT NULL DEFAULT 'local',
    model_family TEXT NOT NULL DEFAULT 'unknown',
    model_tier TEXT NOT NULL DEFAULT 'unknown',
    message_count INTEGER NOT NULL DEFAULT 0,
    user_message_count INTEGER NOT NULL DEFAULT 0,
    assistant_message_count INTEGER NOT NULL DEFAULT 0,
    tool_call_count INTEGER NOT NULL DEFAULT 0,
    plan_message_count INTEGER NOT NULL DEFAULT 0,
    api_coverage_message_count INTEGER NOT NULL DEFAULT 0,
    content_tokens_est_total INTEGER NOT NULL DEFAULT 0,
    content_tokens_est_user INTEGER NOT NULL DEFAULT 0,
    content_tokens_est_assistant INTEGER NOT NULL DEFAULT 0,
    api_tokens_total INTEGER NOT NULL DEFAULT 0,
    api_input_tokens_total INTEGER NOT NULL DEFAULT 0,
    api_output_tokens_total INTEGER NOT NULL DEFAULT 0,
    api_cache_read_tokens_total INTEGER NOT NULL DEFAULT 0,
    api_cache_creation_tokens_total INTEGER NOT NULL DEFAULT 0,
    api_thinking_tokens_total INTEGER NOT NULL DEFAULT 0,
    last_updated INTEGER NOT NULL DEFAULT 0,
    PRIMARY KEY (day_id, agent_slug, workspace_id, source_id, model_family, model_tier)
);

CREATE INDEX IF NOT EXISTS idx_mm_hour ON message_metrics(hour_id);
CREATE INDEX IF NOT EXISTS idx_mm_day ON message_metrics(day_id);
CREATE INDEX IF NOT EXISTS idx_mm_agent_hour ON message_metrics(agent_slug, hour_id);
CREATE INDEX IF NOT EXISTS idx_mm_agent_day ON message_metrics(agent_slug, day_id);
CREATE INDEX IF NOT EXISTS idx_mm_workspace_hour ON message_metrics(workspace_id, hour_id);
CREATE INDEX IF NOT EXISTS idx_mm_source_hour ON message_metrics(source_id, hour_id);
CREATE INDEX IF NOT EXISTS idx_mm_model_family_day ON message_metrics(model_family, day_id);
CREATE INDEX IF NOT EXISTS idx_mm_provider_day ON message_metrics(provider, day_id);
CREATE INDEX IF NOT EXISTS idx_uh_agent ON usage_hourly(agent_slug, hour_id);
CREATE INDEX IF NOT EXISTS idx_uh_workspace ON usage_hourly(workspace_id, hour_id);
CREATE INDEX IF NOT EXISTS idx_uh_source ON usage_hourly(source_id, hour_id);
CREATE INDEX IF NOT EXISTS idx_ud_agent ON usage_daily(agent_slug, day_id);
CREATE INDEX IF NOT EXISTS idx_ud_workspace ON usage_daily(workspace_id, day_id);
CREATE INDEX IF NOT EXISTS idx_ud_source ON usage_daily(source_id, day_id);
CREATE INDEX IF NOT EXISTS idx_umd_model_day ON usage_models_daily(model_family, day_id);
CREATE INDEX IF NOT EXISTS idx_umd_agent_day ON usage_models_daily(agent_slug, day_id);
CREATE INDEX IF NOT EXISTS idx_umd_workspace_day ON usage_models_daily(workspace_id, day_id);
CREATE INDEX IF NOT EXISTS idx_umd_source_day ON usage_models_daily(source_id, day_id);
";
5022
/// Registry of current-schema repair batches.
///
/// Each entry pairs a short `name` with the `tables` the batch covers and the
/// `sql` script that recreates them. A batch may cover several tables (for
/// example `"token_analytics"` and `"message_rollups"`), in which case the
/// whole script is the unit of repair.
///
/// `current_schema_repair_batches_for_missing_tables` walks this slice front to
/// back, so the order of entries here is the order in which selected batches
/// are returned.
const CURRENT_SCHEMA_REPAIR_BATCHES: &[SchemaRepairBatch] = &[
    SchemaRepairBatch {
        name: "sources",
        tables: &["sources"],
        sql: CURRENT_SCHEMA_REPAIR_SOURCES_SQL,
    },
    SchemaRepairBatch {
        name: "daily_stats",
        tables: &["daily_stats"],
        sql: CURRENT_SCHEMA_REPAIR_DAILY_STATS_SQL,
    },
    SchemaRepairBatch {
        name: "conversation_external_lookup",
        tables: &["conversation_external_lookup"],
        sql: CURRENT_SCHEMA_REPAIR_CONVERSATION_EXTERNAL_LOOKUP_SQL,
    },
    // One script covers both the tail-state table and the tail lookup object.
    SchemaRepairBatch {
        name: "conversation_external_tail_lookup",
        tables: &[
            "conversation_tail_state",
            "conversation_external_tail_lookup",
        ],
        sql: CURRENT_SCHEMA_REPAIR_CONVERSATION_EXTERNAL_TAIL_LOOKUP_SQL,
    },
    SchemaRepairBatch {
        name: "embedding_jobs",
        tables: &["embedding_jobs"],
        sql: CURRENT_SCHEMA_REPAIR_EMBEDDING_JOBS_SQL,
    },
    SchemaRepairBatch {
        name: "token_analytics",
        tables: &["token_usage", "token_daily_stats", "model_pricing"],
        sql: CURRENT_SCHEMA_REPAIR_TOKEN_ANALYTICS_SQL,
    },
    SchemaRepairBatch {
        name: "message_rollups",
        tables: &[
            "message_metrics",
            "usage_hourly",
            "usage_daily",
            "usage_models_daily",
        ],
        sql: CURRENT_SCHEMA_REPAIR_MESSAGE_METRICS_SQL,
    },
];
5068
5069fn current_schema_repair_batches_for_missing_tables(
5070    missing_tables: &[&'static str],
5071) -> Result<Vec<&'static SchemaRepairBatch>> {
5072    let missing_set: HashSet<&'static str> = missing_tables.iter().copied().collect();
5073    let mut selected_batches = Vec::new();
5074    let mut covered_tables = HashSet::new();
5075
5076    for batch in CURRENT_SCHEMA_REPAIR_BATCHES {
5077        if !batch
5078            .tables
5079            .iter()
5080            .any(|table_name| missing_set.contains(table_name))
5081        {
5082            continue;
5083        }
5084        selected_batches.push(batch);
5085        covered_tables.extend(batch.tables.iter().copied());
5086    }
5087
5088    for &table_name in missing_tables {
5089        if !covered_tables.contains(table_name) {
5090            return Err(anyhow!(
5091                "no current-schema repair batch registered for missing table {table_name}"
5092            ));
5093        }
5094    }
5095
5096    Ok(selected_batches)
5097}
5098
/// Migration name lookup for backfilling `_schema_migrations` during transition.
///
/// Each entry is `(version, name)`. Entries are listed in ascending version
/// order; `transition_from_meta_version` depends on that ordering because it
/// stops iterating at the first version greater than the recorded one.
const MIGRATION_NAMES: [(i64, &str); 20] = [
    (1, "core_tables"),
    (2, "fts_messages"),
    (3, "fts_messages_rebuild"),
    (4, "sources"),
    (5, "provenance_columns"),
    (6, "source_path_index"),
    (7, "msgpack_columns"),
    (8, "daily_stats"),
    (9, "embedding_jobs"),
    (10, "token_analytics"),
    (11, "message_metrics"),
    (12, "model_dimensions"),
    (13, "plan_token_rollups"),
    (14, "fts_contentless"),
    (15, "conversation_tail_state_cache"),
    (16, "drop_redundant_message_conv_idx"),
    (17, "drop_message_created_idx"),
    (18, "conversation_tail_state_hot_table"),
    (19, "conversation_external_lookup"),
    (20, "conversation_external_tail_lookup"),
];
5122
5123/// Transitions an existing database from `meta` table schema versioning to the
5124/// `_schema_migrations` table used by `MigrationRunner`.
5125///
5126/// The existing `SqliteStorage` tracks schema version as a string value in
5127/// `meta WHERE key = 'schema_version'`. The bead spec references
5128/// `PRAGMA user_version`, but the actual cass code uses the `meta` table.
5129/// This function handles the real code path.
5130///
5131/// Behavior:
5132/// - If `_schema_migrations` already exists → skip (already transitioned)
5133/// - If `meta` table has `schema_version > 0` → create `_schema_migrations`
5134///   and backfill entries for versions `1..=current_version`
5135/// - If `meta` table missing or `schema_version = 0` with no tables → fresh DB,
5136///   let `MigrationRunner` handle it
5137/// - If `schema_version = 0` but tables exist → corrupted state, log warning
5138fn transition_from_meta_version(conn: &FrankenConnection) -> Result<()> {
5139    // Avoid sqlite_master enumeration here. Databases with FTS virtual tables
5140    // can trigger frankensqlite parse-recovery on sqlite_master reads, which is
5141    // enough to break the transition on otherwise-healthy legacy cass DBs.
5142    if conn
5143        .query("SELECT version FROM \"_schema_migrations\";")
5144        .is_ok()
5145    {
5146        return Ok(());
5147    }
5148
5149    // Check if the meta table exists.
5150    if conn.query("SELECT key FROM meta;").is_err() {
5151        // No meta table → fresh database, let MigrationRunner handle it.
5152        return Ok(());
5153    }
5154
5155    // Read the current schema version from the meta table.
5156    let rows = conn
5157        .query("SELECT value FROM meta WHERE key = 'schema_version';")
5158        .with_context(|| "reading schema_version from meta")?;
5159
5160    let current_version: i64 = rows
5161        .first()
5162        .and_then(|row| row.get_typed::<String>(0).ok())
5163        .and_then(|s| s.parse().ok())
5164        .unwrap_or(0);
5165
5166    if current_version == 0 {
5167        // Check if tables actually exist (corrupted state: tables present but version=0).
5168        if conn.query("SELECT id FROM conversations LIMIT 1;").is_err() {
5169            // Truly fresh DB (meta table exists but empty/reset). Let MigrationRunner handle it.
5170            return Ok(());
5171        }
5172
5173        // Tables exist but version=0: corrupted state. Log and skip transition;
5174        // MigrationRunner will fail on "table already exists" and surface the error.
5175        info!("meta.schema_version=0 but tables exist; skipping transition (corrupted state)");
5176        return Ok(());
5177    }
5178
5179    // Create _schema_migrations and backfill entries for all applied versions.
5180    info!(
5181        current_version,
5182        "transitioning schema tracking from meta table to _schema_migrations"
5183    );
5184
5185    conn.execute(
5186        "CREATE TABLE IF NOT EXISTS _schema_migrations (\
5187            version INTEGER PRIMARY KEY, \
5188            name TEXT NOT NULL, \
5189            applied_at TEXT NOT NULL DEFAULT (strftime('%Y-%m-%dT%H:%M:%SZ', 'now'))\
5190        );",
5191    )
5192    .with_context(|| "creating _schema_migrations table for transition")?;
5193
5194    for &(version, name) in &MIGRATION_NAMES {
5195        if version > current_version {
5196            break;
5197        }
5198        conn.execute_compat(
5199            "INSERT INTO _schema_migrations (version, name) VALUES (?1, ?2);",
5200            &[ParamValue::from(version), ParamValue::from(name)],
5201        )
5202        .with_context(|| format!("backfilling _schema_migrations version {version}"))?;
5203    }
5204
5205    info!(
5206        current_version,
5207        "schema version transition complete: backfilled entries for versions 1..={current_version}"
5208    );
5209
5210    Ok(())
5211}
5212
/// `(table name, probe SQL)` pairs for tables the current schema requires.
///
/// Every probe is a single-column (or two-column) `SELECT … LIMIT 1` against
/// the named table. The table names listed here are the same names covered by
/// the entries of `CURRENT_SCHEMA_REPAIR_BATCHES`.
// NOTE(review): the code that runs these probes is outside this view;
// presumably a probe failing with "no such table" marks the table as missing
// — confirm against the caller.
const REQUIRED_CURRENT_SCHEMA_TABLE_PROBES: &[(&str, &str)] = &[
    ("sources", "SELECT id FROM sources LIMIT 1;"),
    ("daily_stats", "SELECT day_id FROM daily_stats LIMIT 1;"),
    (
        "conversation_external_lookup",
        "SELECT lookup_key FROM conversation_external_lookup LIMIT 1;",
    ),
    (
        "conversation_tail_state",
        "SELECT conversation_id FROM conversation_tail_state LIMIT 1;",
    ),
    (
        "conversation_external_tail_lookup",
        "SELECT lookup_key, last_message_idx FROM conversation_external_tail_lookup LIMIT 1;",
    ),
    ("embedding_jobs", "SELECT id FROM embedding_jobs LIMIT 1;"),
    ("token_usage", "SELECT id FROM token_usage LIMIT 1;"),
    (
        "token_daily_stats",
        "SELECT day_id FROM token_daily_stats LIMIT 1;",
    ),
    (
        "model_pricing",
        "SELECT model_pattern FROM model_pricing LIMIT 1;",
    ),
    (
        "message_metrics",
        "SELECT message_id FROM message_metrics LIMIT 1;",
    ),
    ("usage_hourly", "SELECT hour_id FROM usage_hourly LIMIT 1;"),
    ("usage_daily", "SELECT day_id FROM usage_daily LIMIT 1;"),
    (
        "usage_models_daily",
        "SELECT day_id FROM usage_models_daily LIMIT 1;",
    ),
];
5249
/// `(column name, SQL type)` pairs for the token-summary columns.
// NOTE(review): the consumer is outside this view. Judging by the constant's
// name these are columns expected on the `conversations` table, with the
// second element being the declared type to use if one has to be added —
// confirm against the caller.
const REQUIRED_CONVERSATION_TOKEN_COLUMNS: &[(&str, &str)] = &[
    ("total_input_tokens", "INTEGER"),
    ("total_output_tokens", "INTEGER"),
    ("total_cache_read_tokens", "INTEGER"),
    ("total_cache_creation_tokens", "INTEGER"),
    ("grand_total_tokens", "INTEGER"),
    ("estimated_cost_usd", "REAL"),
    ("primary_model", "TEXT"),
    ("api_call_count", "INTEGER"),
    ("tool_call_count", "INTEGER"),
    ("user_message_count", "INTEGER"),
    ("assistant_message_count", "INTEGER"),
];
5263
/// Reports whether the rendered error text mentions a missing table.
///
/// The error is formatted via `Display` and searched for the phrase
/// `no such table`, ignoring ASCII letter case. Only the text produced by
/// `Display` is inspected.
fn error_indicates_missing_table(err: &impl std::fmt::Display) -> bool {
    const NEEDLE: &[u8] = b"no such table";

    let rendered = err.to_string();
    // Slide a needle-sized window over the bytes; the needle is pure ASCII, so
    // a byte-wise case-insensitive comparison is sufficient.
    rendered
        .as_bytes()
        .windows(NEEDLE.len())
        .any(|window| window.eq_ignore_ascii_case(NEEDLE))
}
5269
/// Reports whether the rendered error text mentions a missing column.
///
/// The error is formatted via `Display` and searched for the phrase
/// `no such column`, ignoring ASCII letter case. Only the text produced by
/// `Display` is inspected.
fn error_indicates_missing_column(err: &impl std::fmt::Display) -> bool {
    const NEEDLE: &[u8] = b"no such column";

    let rendered = err.to_string();
    // Slide a needle-sized window over the bytes; the needle is pure ASCII, so
    // a byte-wise case-insensitive comparison is sufficient.
    rendered
        .as_bytes()
        .windows(NEEDLE.len())
        .any(|window| window.eq_ignore_ascii_case(NEEDLE))
}
5275
/// Batch size shared by the orphan-FK cleanup helpers.
///
/// It is the slice chunk length used when splitting id lists for deletion and
/// the `LIMIT` bound passed to each `orphan_id_page_sql` query. Must stay
/// non-zero because it is passed to `slice::chunks`.
const ORPHAN_FK_ID_CHUNK_SIZE: usize = 256;
5277
5278fn collect_orphan_message_ids(conn: &FrankenConnection) -> Result<Vec<i64>> {
5279    let min_conversation_id = conn
5280        .query_map_collect(
5281            "SELECT conversation_id
5282             FROM messages
5283             ORDER BY conversation_id ASC
5284             LIMIT 1",
5285            fparams![],
5286            |row| row.get_typed(0),
5287        )
5288        .context("finding minimum message conversation id for orphan FK cleanup")?
5289        .into_iter()
5290        .next();
5291    let Some(min_conversation_id) = min_conversation_id else {
5292        return Ok(Vec::new());
5293    };
5294    let max_conversation_id: i64 = conn
5295        .query_row_map(
5296            "SELECT conversation_id
5297             FROM messages
5298             ORDER BY conversation_id DESC
5299             LIMIT 1",
5300            fparams![],
5301            |row| row.get_typed(0),
5302        )
5303        .context("finding maximum message conversation id for orphan FK cleanup")?;
5304
5305    let parent_conversation_ids: Vec<i64> = conn
5306        .query_map_collect(
5307            "SELECT id
5308             FROM conversations
5309             WHERE id BETWEEN ?1 AND ?2
5310             ORDER BY id",
5311            fparams![min_conversation_id, max_conversation_id],
5312            |row| row.get_typed(0),
5313        )
5314        .context("listing parent conversation ids for orphan FK cleanup")?;
5315
5316    let mut message_ids = Vec::new();
5317    let mut gap_start = min_conversation_id;
5318    for parent_id in parent_conversation_ids {
5319        if parent_id < gap_start {
5320            continue;
5321        }
5322        if parent_id > max_conversation_id {
5323            break;
5324        }
5325        if gap_start < parent_id {
5326            collect_message_ids_for_conversation_gap(
5327                conn,
5328                gap_start,
5329                parent_id.saturating_sub(1),
5330                &mut message_ids,
5331            )?;
5332        }
5333        if parent_id == i64::MAX {
5334            return Ok(message_ids);
5335        }
5336        gap_start = parent_id + 1;
5337    }
5338    if gap_start <= max_conversation_id {
5339        collect_message_ids_for_conversation_gap(
5340            conn,
5341            gap_start,
5342            max_conversation_id,
5343            &mut message_ids,
5344        )?;
5345    }
5346
5347    Ok(message_ids)
5348}
5349
5350fn collect_message_ids_for_conversation_gap(
5351    conn: &FrankenConnection,
5352    gap_start: i64,
5353    gap_end: i64,
5354    message_ids: &mut Vec<i64>,
5355) -> Result<()> {
5356    let (sql, params) = if gap_start == gap_end {
5357        (
5358            "SELECT id FROM messages WHERE conversation_id = ?1",
5359            vec![SqliteValue::from(gap_start)],
5360        )
5361    } else {
5362        (
5363            "SELECT id FROM messages WHERE conversation_id BETWEEN ?1 AND ?2",
5364            vec![SqliteValue::from(gap_start), SqliteValue::from(gap_end)],
5365        )
5366    };
5367    let rows = conn.query_with_params(sql, &params).with_context(|| {
5368        format!("listing orphan message ids for conversation-id gap {gap_start}..={gap_end}")
5369    })?;
5370    message_ids.reserve(rows.len());
5371    for row in rows {
5372        message_ids.push(row.get_typed(0)?);
5373    }
5374    Ok(())
5375}
5376
5377fn delete_rows_by_i64_chunks(
5378    tx: &FrankenTransaction<'_>,
5379    delete_sql: &'static str,
5380    ids: &[i64],
5381) -> Result<usize> {
5382    let mut deleted = 0;
5383    for chunk in ids.chunks(ORPHAN_FK_ID_CHUNK_SIZE) {
5384        for id in chunk {
5385            deleted += tx.execute_with_params(delete_sql, &[SqliteValue::from(*id)])?;
5386        }
5387    }
5388    Ok(deleted)
5389}
5390
5391fn delete_orphan_message_ids_bisecting_oom(conn: &FrankenConnection, ids: &[i64]) -> Result<usize> {
5392    let mut deleted = 0usize;
5393    for chunk in ids.chunks(ORPHAN_FK_ID_CHUNK_SIZE) {
5394        deleted = deleted.saturating_add(delete_orphan_message_id_chunk(conn, chunk)?);
5395    }
5396    Ok(deleted)
5397}
5398
5399fn delete_orphan_message_id_chunk(conn: &FrankenConnection, ids: &[i64]) -> Result<usize> {
5400    if ids.is_empty() {
5401        return Ok(0);
5402    }
5403
5404    match delete_orphan_message_id_chunk_once(conn, ids) {
5405        Ok(deleted) => Ok(deleted),
5406        Err(err) if is_out_of_memory_error(&err) && ids.len() > 1 => {
5407            let split_at = ids.len() / 2;
5408            tracing::warn!(
5409                target: "cass::fk_repair",
5410                rows = ids.len(),
5411                left = split_at,
5412                right = ids.len().saturating_sub(split_at),
5413                error = %err,
5414                "orphan-message cleanup ran out of memory; retrying as smaller batches"
5415            );
5416            let left = delete_orphan_message_id_chunk(conn, &ids[..split_at])?;
5417            let right = delete_orphan_message_id_chunk(conn, &ids[split_at..])?;
5418            Ok(left.saturating_add(right))
5419        }
5420        Err(err) => Err(err),
5421    }
5422}
5423
5424fn delete_orphan_message_id_chunk_once(conn: &FrankenConnection, ids: &[i64]) -> Result<usize> {
5425    let mut tx = conn.transaction()?;
5426    let mut deleted = 0usize;
5427    for entry in ORPHAN_MESSAGE_DEPENDENT_TABLES {
5428        match delete_rows_by_i64_chunks(&tx, entry.delete_sql, ids) {
5429            Ok(count) => {
5430                deleted = deleted.saturating_add(count);
5431            }
5432            Err(err) if error_indicates_missing_table(&err) => {
5433                tracing::debug!(
5434                    target: "cass::fk_repair",
5435                    child_table = entry.child_table,
5436                    error = %err,
5437                    "skipping orphan-message dependent cleanup (table unavailable)"
5438                );
5439            }
5440            Err(err) => {
5441                return Err(err).with_context(|| {
5442                    format!(
5443                        "deleting rows from {} that depend on orphan messages",
5444                        entry.child_table
5445                    )
5446                });
5447            }
5448        }
5449    }
5450    deleted = deleted.saturating_add(
5451        delete_rows_by_i64_chunks(&tx, "DELETE FROM messages WHERE id = ?1", ids)
5452            .context("deleting orphan rows from messages")?,
5453    );
5454    tx.commit()?;
5455    Ok(deleted)
5456}
5457
5458fn collect_direct_orphan_id_page(
5459    conn: &FrankenConnection,
5460    entry: &'static OrphanFkTable,
5461) -> Result<Vec<i64>> {
5462    Ok(conn.query_map_collect(
5463        entry.orphan_id_page_sql,
5464        fparams![i64::try_from(ORPHAN_FK_ID_CHUNK_SIZE).unwrap_or(i64::MAX)],
5465        |row| row.get_typed(0),
5466    )?)
5467}
5468
5469fn delete_direct_orphan_ids_bisecting_oom(
5470    conn: &FrankenConnection,
5471    entry: &'static OrphanFkTable,
5472    ids: &[i64],
5473) -> Result<usize> {
5474    let mut deleted = 0usize;
5475    for chunk in ids.chunks(ORPHAN_FK_ID_CHUNK_SIZE) {
5476        deleted = deleted.saturating_add(delete_direct_orphan_id_chunk(conn, entry, chunk)?);
5477    }
5478    Ok(deleted)
5479}
5480
5481fn delete_direct_orphan_id_chunk(
5482    conn: &FrankenConnection,
5483    entry: &'static OrphanFkTable,
5484    ids: &[i64],
5485) -> Result<usize> {
5486    if ids.is_empty() {
5487        return Ok(0);
5488    }
5489
5490    match delete_direct_orphan_id_chunk_once(conn, entry, ids) {
5491        Ok(deleted) => Ok(deleted),
5492        Err(err) if is_out_of_memory_error(&err) && ids.len() > 1 => {
5493            let split_at = ids.len() / 2;
5494            tracing::warn!(
5495                target: "cass::fk_repair",
5496                child_table = entry.child_table,
5497                rows = ids.len(),
5498                left = split_at,
5499                right = ids.len().saturating_sub(split_at),
5500                error = %err,
5501                "direct orphan cleanup ran out of memory; retrying as smaller batches"
5502            );
5503            let left = delete_direct_orphan_id_chunk(conn, entry, &ids[..split_at])?;
5504            let right = delete_direct_orphan_id_chunk(conn, entry, &ids[split_at..])?;
5505            Ok(left.saturating_add(right))
5506        }
5507        Err(err) => Err(err),
5508    }
5509}
5510
5511fn delete_direct_orphan_id_chunk_once(
5512    conn: &FrankenConnection,
5513    entry: &'static OrphanFkTable,
5514    ids: &[i64],
5515) -> Result<usize> {
5516    let mut tx = conn.transaction()?;
5517    let deleted = delete_rows_by_i64_chunk_bulk(&tx, entry.delete_many_sql_prefix, ids)?;
5518    tx.commit()?;
5519    Ok(deleted)
5520}
5521
5522fn delete_rows_by_i64_chunk_bulk(
5523    tx: &FrankenTransaction<'_>,
5524    delete_many_sql_prefix: &'static str,
5525    ids: &[i64],
5526) -> Result<usize> {
5527    if ids.is_empty() {
5528        return Ok(0);
5529    }
5530
5531    let placeholders = (1..=ids.len())
5532        .map(|idx| format!("?{idx}"))
5533        .collect::<Vec<_>>()
5534        .join(", ");
5535    let sql = format!("{delete_many_sql_prefix} ({placeholders})");
5536    let params = ids
5537        .iter()
5538        .map(|id| SqliteValue::from(*id))
5539        .collect::<Vec<_>>();
5540    Ok(tx.execute_with_params(&sql, &params)?)
5541}
5542
/// Tables whose FK parent rows can go missing when an index transaction is
/// dropped mid-flight. The select and delete SQL strings are intentionally
/// static (no dynamic table names) so they can be audited at a glance and so
/// they cannot be subverted by injected identifiers. The select statement
/// yields the integer FK key used by the matching chunked delete.
struct OrphanFkTable {
    /// Name of the child table; used for logging and reporting.
    child_table: &'static str,
    /// Query returning one page of orphaned FK values. It takes a single
    /// parameter, `?1`, which bounds the page size.
    orphan_id_page_sql: &'static str,
    /// Leading part of the bulk delete, ending in `IN`; the parenthesised
    /// placeholder list is appended by `delete_rows_by_i64_chunk_bulk`.
    delete_many_sql_prefix: &'static str,
}
5553
/// Child tables cleaned up directly by their foreign-key value.
///
/// The first three entries key on `message_id` and look for values absent from
/// `messages`; `conversation_tags` keys on `conversation_id` and looks for
/// values absent from `conversations`. Every page query is ordered by the key
/// and limited by `?1`, and every delete prefix targets the same key column
/// that its page query selects.
const ORPHAN_DIRECT_CHILD_TABLES: &[OrphanFkTable] = &[
    OrphanFkTable {
        child_table: "message_metrics",
        orphan_id_page_sql: "SELECT message_id FROM message_metrics \
                             WHERE message_id NOT IN (SELECT id FROM messages) \
                             ORDER BY message_id \
                             LIMIT ?1",
        delete_many_sql_prefix: "DELETE FROM message_metrics WHERE message_id IN",
    },
    OrphanFkTable {
        child_table: "token_usage",
        orphan_id_page_sql: "SELECT message_id FROM token_usage \
                             WHERE message_id NOT IN (SELECT id FROM messages) \
                             ORDER BY message_id \
                             LIMIT ?1",
        delete_many_sql_prefix: "DELETE FROM token_usage WHERE message_id IN",
    },
    OrphanFkTable {
        child_table: "snippets",
        orphan_id_page_sql: "SELECT message_id FROM snippets \
                             WHERE message_id NOT IN (SELECT id FROM messages) \
                             ORDER BY message_id \
                             LIMIT ?1",
        delete_many_sql_prefix: "DELETE FROM snippets WHERE message_id IN",
    },
    OrphanFkTable {
        child_table: "conversation_tags",
        orphan_id_page_sql: "SELECT conversation_id FROM conversation_tags \
                             WHERE conversation_id NOT IN (SELECT id FROM conversations) \
                             ORDER BY conversation_id \
                             LIMIT ?1",
        delete_many_sql_prefix: "DELETE FROM conversation_tags WHERE conversation_id IN",
    },
];
5588
/// A table whose rows hang off a message and must be removed before an orphan
/// message itself is deleted.
struct OrphanMessageDependentTable {
    /// Name of the dependent table; used in log fields and error context.
    child_table: &'static str,
    /// Single-row delete statement taking the message id as `?1`.
    delete_sql: &'static str,
}
5593
/// Dependent tables purged, in this order, by
/// `delete_orphan_message_id_chunk_once` before it deletes the orphan
/// `messages` rows. Each statement deletes by `message_id = ?1`.
const ORPHAN_MESSAGE_DEPENDENT_TABLES: &[OrphanMessageDependentTable] = &[
    OrphanMessageDependentTable {
        child_table: "message_metrics",
        delete_sql: "DELETE FROM message_metrics WHERE message_id = ?1",
    },
    OrphanMessageDependentTable {
        child_table: "token_usage",
        delete_sql: "DELETE FROM token_usage WHERE message_id = ?1",
    },
    OrphanMessageDependentTable {
        child_table: "snippets",
        delete_sql: "DELETE FROM snippets WHERE message_id = ?1",
    },
];
5608
/// Summary of orphan rows detected and removed by `cleanup_orphan_fk_rows`.
///
/// Message-root counts come from the probe phase, while direct child counts
/// come from bounded page deletes. Under the function's intended use — a single
/// indexer-startup pass holding the index run lock — no concurrent writers
/// exist, so these counts match the primary orphan roots identified and
/// removed during cleanup. Dependent rows below an orphan message
/// (`message_metrics` / `token_usage` / `snippets`) are an expected consequence
/// of removing that root orphan and are *not* separately counted in `total` or
/// `per_table`.
#[derive(Debug, Default, Clone)]
pub(crate) struct OrphanFkCleanupReport {
    pub total: i64,
    pub per_table: Vec<(&'static str, i64)>,
}

impl OrphanFkCleanupReport {
    /// Adds `count` to the running tally for `child_table` and to `total`.
    ///
    /// A table seen for the first time is appended to `per_table`, so entries
    /// keep first-seen order. Both additions saturate instead of overflowing.
    fn record(&mut self, child_table: &'static str, count: i64) {
        self.total = self.total.saturating_add(count);

        let slot = self
            .per_table
            .iter()
            .position(|&(table, _)| table == child_table);
        match slot {
            Some(index) => {
                let tally = &mut self.per_table[index].1;
                *tally = tally.saturating_add(count);
            }
            None => self.per_table.push((child_table, count)),
        }
    }
}
5639
/// Outcome reported for one conversation insert.
///
/// NOTE(review): the code that fills this struct is outside this section, so
/// the per-field notes are inferred from the field names — confirm against
/// the insert path.
pub struct InsertOutcome {
    /// Id of the conversation row the insert resolved to.
    pub conversation_id: i64,
    /// Presumably `true` when a new conversation row was created.
    pub conversation_inserted: bool,
    /// Presumably the message `idx` values written by this insert.
    pub inserted_indices: Vec<i64>,
}
5645
/// Test-only counters and timings for the message-insert sub-stages.
#[cfg(test)]
#[derive(Debug, Clone, Default)]
struct MessageInsertSubstageProfile {
    single_row_calls: usize,
    batch_calls: usize,
    batch_rows: usize,
    payload_duration: Duration,
    sql_build_duration: Duration,
    param_build_duration: Duration,
    execute_duration: Duration,
    rowid_duration: Duration,
}

/// Test-only accumulated timings for the stages of a conversation-tree insert.
///
/// `log_summary` prints every stage, the message-insert breakdown, the
/// unaccounted residual, and a few per-invocation averages.
#[cfg(test)]
#[derive(Debug, Clone, Default)]
struct InsertConversationTreePerfProfile {
    invocations: usize,
    messages: usize,
    inserted_messages: usize,
    total_duration: Duration,
    source_duration: Duration,
    tx_open_duration: Duration,
    existing_lookup_duration: Duration,
    existing_idx_lookup_duration: Duration,
    existing_replay_lookup_duration: Duration,
    dedupe_filter_duration: Duration,
    conversation_row_duration: Duration,
    message_insert_duration: Duration,
    message_insert_breakdown: MessageInsertSubstageProfile,
    snippet_insert_duration: Duration,
    fts_entry_duration: Duration,
    fts_flush_duration: Duration,
    analytics_duration: Duration,
    commit_duration: Duration,
}

#[cfg(test)]
impl InsertConversationTreePerfProfile {
    /// Converts a duration to fractional milliseconds.
    fn millis(duration: Duration) -> f64 {
        duration.as_secs_f64() * 1000.0
    }

    /// Writes a single `CASS_INSERT_TREE_STAGE_PROFILE` line to stderr.
    ///
    /// Averages divide by `invocations`, treating zero invocations as one so
    /// the division is always defined. `residual_ms` is the part of
    /// `total_duration` not covered by the individually tracked stages
    /// (saturating at zero).
    fn log_summary(&self, label: &str) {
        let calls = self.invocations.max(1) as f64;
        let ms = Self::millis;
        let per_call = |duration: Duration| Self::millis(duration) / calls;
        let breakdown = &self.message_insert_breakdown;

        // Stages that are timed individually; the breakdown durations are a
        // refinement of `message_insert_duration` and are not added again.
        let tracked_stages = [
            self.source_duration,
            self.tx_open_duration,
            self.existing_lookup_duration,
            self.existing_idx_lookup_duration,
            self.existing_replay_lookup_duration,
            self.dedupe_filter_duration,
            self.conversation_row_duration,
            self.message_insert_duration,
            self.snippet_insert_duration,
            self.fts_entry_duration,
            self.fts_flush_duration,
            self.analytics_duration,
            self.commit_duration,
        ];
        let accounted_duration: Duration = tracked_stages.iter().sum();
        let residual_duration = self.total_duration.saturating_sub(accounted_duration);

        eprintln!(
            concat!(
                "CASS_INSERT_TREE_STAGE_PROFILE ",
                "label={} calls={} messages={} inserted_messages={} ",
                "total_ms={:.3} source_ms={:.3} tx_open_ms={:.3} existing_lookup_ms={:.3} ",
                "existing_idx_lookup_ms={:.3} existing_replay_lookup_ms={:.3} dedupe_filter_ms={:.3} ",
                "conversation_row_ms={:.3} message_insert_ms={:.3} snippet_insert_ms={:.3} ",
                "fts_entry_ms={:.3} fts_flush_ms={:.3} analytics_ms={:.3} commit_ms={:.3} ",
                "msg_payload_ms={:.3} msg_sql_ms={:.3} msg_param_ms={:.3} msg_execute_ms={:.3} msg_rowid_ms={:.3} ",
                "residual_ms={:.3} avg_total_ms={:.3} avg_message_insert_ms={:.3} ",
                "avg_msg_execute_ms={:.3} avg_msg_payload_ms={:.3} avg_snippet_insert_ms={:.3} avg_fts_entry_ms={:.3} avg_commit_ms={:.3}"
            ),
            label,
            self.invocations,
            self.messages,
            self.inserted_messages,
            ms(self.total_duration),
            ms(self.source_duration),
            ms(self.tx_open_duration),
            ms(self.existing_lookup_duration),
            ms(self.existing_idx_lookup_duration),
            ms(self.existing_replay_lookup_duration),
            ms(self.dedupe_filter_duration),
            ms(self.conversation_row_duration),
            ms(self.message_insert_duration),
            ms(self.snippet_insert_duration),
            ms(self.fts_entry_duration),
            ms(self.fts_flush_duration),
            ms(self.analytics_duration),
            ms(self.commit_duration),
            ms(breakdown.payload_duration),
            ms(breakdown.sql_build_duration),
            ms(breakdown.param_build_duration),
            ms(breakdown.execute_duration),
            ms(breakdown.rowid_duration),
            ms(residual_duration),
            per_call(self.total_duration),
            per_call(self.message_insert_duration),
            per_call(breakdown.execute_duration),
            per_call(breakdown.payload_duration),
            per_call(self.snippet_insert_duration),
            per_call(self.fts_entry_duration),
            per_call(self.commit_duration),
        );
    }
}
5750
/// Identity used to match a conversation against others.
///
/// `conversation_merge_key` builds the `External` form when the conversation
/// has an external id and the `SourcePath` form otherwise.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
enum PendingConversationKey {
    /// Keyed by the source, the agent, and the conversation's external id.
    External {
        source_id: String,
        agent_id: i64,
        external_id: String,
    },
    /// Keyed by the source, the agent, the source file path, and the
    /// (optional) start timestamp.
    SourcePath {
        source_id: String,
        agent_id: i64,
        source_path: String,
        started_at: Option<i64>,
    },
}
5765
/// Builds the string key for an externally identified conversation.
///
/// Layout: `<source chars>:<source_id>:<agent_id>:<external chars>:<external_id>`,
/// where each length is a count of Unicode scalar values (not bytes).
fn conversation_external_lookup_key(source_id: &str, agent_id: i64, external_id: &str) -> String {
    let source_len = source_id.chars().count();
    let external_len = external_id.chars().count();
    format!("{source_len}:{source_id}:{agent_id}:{external_len}:{external_id}")
}
5773
5774fn conversation_external_lookup_key_for_conv(agent_id: i64, conv: &Conversation) -> Option<String> {
5775    conv.external_id
5776        .as_deref()
5777        .map(|external_id| conversation_external_lookup_key(&conv.source_id, agent_id, external_id))
5778}
5779
5780#[derive(Debug, Clone, PartialEq, Eq, Hash)]
5781struct MessageMergeFingerprint {
5782    idx: i64,
5783    created_at: Option<i64>,
5784    role: MessageRole,
5785    author: Option<String>,
5786    content_hash: [u8; 32],
5787}
5788
5789#[derive(Debug, Clone, PartialEq, Eq, Hash)]
5790struct MessageReplayFingerprint {
5791    created_at: Option<i64>,
5792    role: MessageRole,
5793    author: Option<String>,
5794    content_hash: [u8; 32],
5795}
5796
/// Measurements produced by `conversation_merge_evidence` when two
/// conversations are judged similar enough to merge.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
struct ConversationMergeEvidence {
    /// Number of merge (idx-aware) fingerprints present in both conversations.
    exact_overlap: usize,
    /// Number of replay (idx-free) fingerprints present in both conversations.
    replay_overlap: usize,
    /// Size of the smaller of the two replay-fingerprint sets.
    smaller_replay_set: usize,
    /// Whether the two start timestamps fell within the merge tolerance.
    started_close: bool,
    /// Absolute distance between the start timestamps; `i64::MAX` when either
    /// is unknown.
    start_distance_ms: i64,
}
5805
5806struct ExistingConversationNewMessages<'a> {
5807    messages: Vec<&'a Message>,
5808    new_chars: i64,
5809    idx_collision_count: usize,
5810    first_collision_idx: Option<i64>,
5811}
5812
/// Tail markers of a stored conversation: the highest message index, the
/// latest message timestamp, and the conversation end timestamp if recorded.
#[derive(Debug, Clone, Copy)]
struct ExistingConversationTailState {
    last_message_idx: i64,
    last_message_created_at: i64,
    ended_at: Option<i64>,
}

/// A stored conversation id together with its tail markers, when they could
/// be determined.
#[derive(Debug, Clone, Copy)]
struct ExistingConversationWithTail {
    id: i64,
    tail_state: Option<ExistingConversationTailState>,
}
5825
5826fn conversation_effective_started_at(conv: &Conversation) -> Option<i64> {
5827    conv.started_at
5828        .or_else(|| conv.messages.iter().filter_map(|msg| msg.created_at).min())
5829}
5830
5831fn conversation_tail_state(conv: &Conversation) -> (Option<i64>, Option<i64>) {
5832    (
5833        conv.messages.iter().map(|msg| msg.idx).max(),
5834        conv.messages.iter().filter_map(|msg| msg.created_at).max(),
5835    )
5836}
5837
5838fn borrowed_messages_tail_state(messages: &[&Message]) -> (Option<i64>, Option<i64>) {
5839    (
5840        messages.iter().map(|msg| msg.idx).max(),
5841        messages.iter().filter_map(|msg| msg.created_at).max(),
5842    )
5843}
5844
5845fn role_from_str(role: &str) -> MessageRole {
5846    match role {
5847        "user" => MessageRole::User,
5848        "agent" | "assistant" => MessageRole::Agent,
5849        "tool" => MessageRole::Tool,
5850        "system" => MessageRole::System,
5851        other => MessageRole::Other(other.to_string()),
5852    }
5853}
5854
5855fn message_merge_fingerprint(msg: &Message) -> MessageMergeFingerprint {
5856    MessageMergeFingerprint {
5857        idx: msg.idx,
5858        created_at: msg.created_at,
5859        role: msg.role.clone(),
5860        author: msg.author.clone(),
5861        content_hash: *blake3::hash(msg.content.as_bytes()).as_bytes(),
5862    }
5863}
5864
5865fn message_replay_fingerprint(msg: &Message) -> MessageReplayFingerprint {
5866    MessageReplayFingerprint {
5867        created_at: msg.created_at,
5868        role: msg.role.clone(),
5869        author: msg.author.clone(),
5870        content_hash: *blake3::hash(msg.content.as_bytes()).as_bytes(),
5871    }
5872}
5873
5874fn conversation_message_fingerprints(conv: &Conversation) -> HashSet<MessageMergeFingerprint> {
5875    conv.messages
5876        .iter()
5877        .map(message_merge_fingerprint)
5878        .collect()
5879}
5880
5881fn conversation_message_replay_fingerprints(
5882    conv: &Conversation,
5883) -> HashSet<MessageReplayFingerprint> {
5884    conv.messages
5885        .iter()
5886        .map(message_replay_fingerprint)
5887        .collect()
5888}
5889
5890fn replay_fingerprint_from_merge(
5891    fingerprint: &MessageMergeFingerprint,
5892) -> MessageReplayFingerprint {
5893    MessageReplayFingerprint {
5894        created_at: fingerprint.created_at,
5895        role: fingerprint.role.clone(),
5896        author: fingerprint.author.clone(),
5897        content_hash: fingerprint.content_hash,
5898    }
5899}
5900
5901fn replay_fingerprints_from_merge_set(
5902    fingerprints: &HashSet<MessageMergeFingerprint>,
5903) -> HashSet<MessageReplayFingerprint> {
5904    fingerprints
5905        .iter()
5906        .map(replay_fingerprint_from_merge)
5907        .collect()
5908}
5909
5910fn collect_new_messages_for_existing_conversation<'a>(
5911    conversation_id: i64,
5912    conv: &'a Conversation,
5913    existing_messages: &mut HashMap<i64, MessageMergeFingerprint>,
5914    existing_replay_fingerprints: &mut HashSet<MessageReplayFingerprint>,
5915    replay_skip_log: &'static str,
5916) -> ExistingConversationNewMessages<'a> {
5917    let mut idx_collision_count = 0usize;
5918    let mut first_collision_idx: Option<i64> = None;
5919    let mut new_chars: i64 = 0;
5920    let mut messages = Vec::new();
5921
5922    for msg in &conv.messages {
5923        let incoming_fingerprint = message_merge_fingerprint(msg);
5924        if let Some(existing_fingerprint) = existing_messages.get(&msg.idx) {
5925            if existing_fingerprint != &incoming_fingerprint {
5926                idx_collision_count = idx_collision_count.saturating_add(1);
5927                first_collision_idx.get_or_insert(msg.idx);
5928            }
5929            continue;
5930        }
5931
5932        let incoming_replay = replay_fingerprint_from_merge(&incoming_fingerprint);
5933        if existing_replay_fingerprints.contains(&incoming_replay) {
5934            tracing::debug!(
5935                conversation_id,
5936                idx = msg.idx,
5937                source_path = %conv.source_path.display(),
5938                "{replay_skip_log}"
5939            );
5940            continue;
5941        }
5942
5943        existing_messages.insert(msg.idx, incoming_fingerprint);
5944        existing_replay_fingerprints.insert(incoming_replay);
5945        new_chars += msg.content.len() as i64;
5946        messages.push(msg);
5947    }
5948
5949    ExistingConversationNewMessages {
5950        messages,
5951        new_chars,
5952        idx_collision_count,
5953        first_collision_idx,
5954    }
5955}
5956
5957fn franken_existing_conversation_append_tail_state(
5958    tx: &FrankenTransaction<'_>,
5959    conversation_id: i64,
5960) -> Result<Option<ExistingConversationTailState>> {
5961    let cached: Option<(Option<i64>, Option<i64>, Option<i64>)> = tx
5962        .query_row_map(
5963            "SELECT last_message_idx, last_message_created_at, ended_at
5964             FROM conversation_tail_state
5965             WHERE conversation_id = ?1",
5966            fparams![conversation_id],
5967            |row| Ok((row.get_typed(0)?, row.get_typed(1)?, row.get_typed(2)?)),
5968        )
5969        .optional()?;
5970    if let Some(cached) = cached {
5971        let (_, _, cached_ended_at) = cached;
5972        if let Some(tail_state) =
5973            existing_conversation_tail_state_from_cached(cached.0, cached.1, cached_ended_at)
5974        {
5975            return Ok(Some(tail_state));
5976        }
5977    }
5978
5979    let legacy_cached: (Option<i64>, Option<i64>, Option<i64>) = tx.query_row_map(
5980        "SELECT last_message_idx, last_message_created_at, ended_at
5981         FROM conversations
5982         WHERE id = ?1",
5983        fparams![conversation_id],
5984        |row| Ok((row.get_typed(0)?, row.get_typed(1)?, row.get_typed(2)?)),
5985    )?;
5986    let (_, _, cached_ended_at) = legacy_cached;
5987    if let Some(tail_state) = existing_conversation_tail_state_from_cached(
5988        legacy_cached.0,
5989        legacy_cached.1,
5990        cached_ended_at,
5991    ) {
5992        franken_insert_conversation_tail_state(
5993            tx,
5994            conversation_id,
5995            cached_ended_at,
5996            Some(tail_state.last_message_idx),
5997            Some(tail_state.last_message_created_at),
5998        )?;
5999        return Ok(Some(tail_state));
6000    }
6001
6002    let (max_idx, max_created_at): (Option<i64>, Option<i64>) = tx.query_row_map(
6003        "SELECT MAX(idx), MAX(created_at)
6004         FROM messages
6005         WHERE conversation_id = ?1",
6006        fparams![conversation_id],
6007        |row| Ok((row.get_typed(0)?, row.get_typed(1)?)),
6008    )?;
6009    if let Some((last_message_idx, last_message_created_at)) = max_idx.zip(max_created_at) {
6010        franken_update_conversation_tail_state(
6011            tx,
6012            conversation_id,
6013            None,
6014            Some(last_message_idx),
6015            Some(last_message_created_at),
6016        )?;
6017        return Ok(Some(ExistingConversationTailState {
6018            last_message_idx,
6019            last_message_created_at,
6020            ended_at: cached_ended_at,
6021        }));
6022    }
6023    Ok(None)
6024}
6025
6026fn existing_conversation_tail_state_from_cached(
6027    last_message_idx: Option<i64>,
6028    last_message_created_at: Option<i64>,
6029    ended_at: Option<i64>,
6030) -> Option<ExistingConversationTailState> {
6031    let (last_message_idx, last_message_created_at) =
6032        last_message_idx.zip(last_message_created_at)?;
6033    Some(ExistingConversationTailState {
6034        last_message_idx,
6035        last_message_created_at,
6036        ended_at,
6037    })
6038}
6039
6040fn franken_find_existing_conversation_with_tail_by_key(
6041    tx: &FrankenTransaction<'_>,
6042    key: &PendingConversationKey,
6043    conv: Option<&Conversation>,
6044) -> Result<Option<ExistingConversationWithTail>> {
6045    if let PendingConversationKey::External {
6046        source_id,
6047        agent_id,
6048        external_id,
6049    } = key
6050    {
6051        let lookup_key = conversation_external_lookup_key(source_id, *agent_id, external_id);
6052        if let Some(existing) = franken_find_external_conversation_tail_lookup(tx, &lookup_key)? {
6053            return Ok(Some(existing));
6054        }
6055        return Ok(None);
6056    }
6057
6058    let Some(id) = franken_find_existing_conversation_by_key(tx, key, conv)? else {
6059        return Ok(None);
6060    };
6061    let tail_state = franken_existing_conversation_append_tail_state(tx, id)?;
6062    Ok(Some(ExistingConversationWithTail { id, tail_state }))
6063}
6064
6065fn franken_insert_conversation_tail_state(
6066    tx: &FrankenTransaction<'_>,
6067    conversation_id: i64,
6068    ended_at: Option<i64>,
6069    last_message_idx: Option<i64>,
6070    last_message_created_at: Option<i64>,
6071) -> Result<()> {
6072    if ended_at.is_none() && last_message_idx.is_none() && last_message_created_at.is_none() {
6073        return Ok(());
6074    }
6075    tx.execute_compat(
6076        "INSERT OR REPLACE INTO conversation_tail_state (
6077             conversation_id, ended_at, last_message_idx, last_message_created_at
6078         ) VALUES (?1, ?2, ?3, ?4)",
6079        fparams![
6080            conversation_id,
6081            ended_at,
6082            last_message_idx,
6083            last_message_created_at
6084        ],
6085    )?;
6086    Ok(())
6087}
6088
6089fn franken_update_conversation_tail_columns(
6090    tx: &FrankenTransaction<'_>,
6091    conversation_id: i64,
6092    ended_at_candidate: Option<i64>,
6093    last_message_idx_candidate: Option<i64>,
6094    last_message_created_at_candidate: Option<i64>,
6095) -> Result<()> {
6096    if ended_at_candidate.is_none()
6097        && last_message_idx_candidate.is_none()
6098        && last_message_created_at_candidate.is_none()
6099    {
6100        return Ok(());
6101    }
6102
6103    tx.execute_compat(
6104        "UPDATE conversations
6105         SET ended_at = CASE
6106                 WHEN ?1 IS NULL THEN ended_at
6107                 WHEN ended_at IS NULL OR ended_at < ?1 THEN ?1
6108                 ELSE ended_at
6109             END,
6110             last_message_idx = CASE
6111                 WHEN ?2 IS NULL THEN last_message_idx
6112                 WHEN last_message_idx IS NULL OR last_message_idx < ?2 THEN ?2
6113                 ELSE last_message_idx
6114             END,
6115             last_message_created_at = CASE
6116                 WHEN ?3 IS NULL THEN last_message_created_at
6117                 WHEN last_message_created_at IS NULL OR last_message_created_at < ?3 THEN ?3
6118                 ELSE last_message_created_at
6119             END
6120         WHERE id = ?4",
6121        fparams![
6122            ended_at_candidate,
6123            last_message_idx_candidate,
6124            last_message_created_at_candidate,
6125            conversation_id
6126        ],
6127    )?;
6128    Ok(())
6129}
6130
6131fn franken_tail_state_insert_ended_at(
6132    tx: &FrankenTransaction<'_>,
6133    conversation_id: i64,
6134    candidate: Option<i64>,
6135) -> Result<Option<i64>> {
6136    let canonical: Option<i64> = tx
6137        .query_row_map(
6138            "SELECT ended_at FROM conversations WHERE id = ?1",
6139            fparams![conversation_id],
6140            |row| row.get_typed(0),
6141        )
6142        .optional()?
6143        .flatten();
6144    Ok(canonical.max(candidate))
6145}
6146
6147fn franken_update_conversation_tail_state(
6148    tx: &FrankenTransaction<'_>,
6149    conversation_id: i64,
6150    ended_at_candidate: Option<i64>,
6151    last_message_idx_candidate: Option<i64>,
6152    last_message_created_at_candidate: Option<i64>,
6153) -> Result<()> {
6154    if ended_at_candidate.is_none()
6155        && last_message_idx_candidate.is_none()
6156        && last_message_created_at_candidate.is_none()
6157    {
6158        return Ok(());
6159    }
6160
6161    let changed = tx.execute_compat(
6162        "UPDATE conversation_tail_state
6163         SET ended_at = CASE
6164                 WHEN ?1 IS NULL THEN ended_at
6165                 ELSE MAX(IFNULL(ended_at, 0), ?1)
6166             END,
6167             last_message_idx = CASE
6168                 WHEN ?2 IS NULL THEN last_message_idx
6169                 WHEN last_message_idx IS NULL OR last_message_idx < ?2 THEN ?2
6170                 ELSE last_message_idx
6171             END,
6172             last_message_created_at = CASE
6173                 WHEN ?3 IS NULL THEN last_message_created_at
6174                 WHEN last_message_created_at IS NULL OR last_message_created_at < ?3 THEN ?3
6175                 ELSE last_message_created_at
6176             END
6177         WHERE conversation_id = ?4",
6178        fparams![
6179            ended_at_candidate,
6180            last_message_idx_candidate,
6181            last_message_created_at_candidate,
6182            conversation_id
6183        ],
6184    )?;
6185    if changed == 0 {
6186        let insert_ended_at =
6187            franken_tail_state_insert_ended_at(tx, conversation_id, ended_at_candidate)?;
6188        franken_insert_conversation_tail_state(
6189            tx,
6190            conversation_id,
6191            insert_ended_at,
6192            last_message_idx_candidate,
6193            last_message_created_at_candidate,
6194        )?;
6195    }
6196    franken_update_conversation_tail_columns(
6197        tx,
6198        conversation_id,
6199        ended_at_candidate,
6200        last_message_idx_candidate,
6201        last_message_created_at_candidate,
6202    )?;
6203    Ok(())
6204}
6205
6206fn franken_set_conversation_tail_state_after_append(
6207    tx: &FrankenTransaction<'_>,
6208    conversation_id: i64,
6209    ended_at: i64,
6210    last_message_idx: i64,
6211    last_message_created_at: i64,
6212) -> Result<()> {
6213    let changed = tx.execute_compat(
6214        "UPDATE conversation_tail_state
6215         SET ended_at = ?1,
6216             last_message_idx = ?2,
6217             last_message_created_at = ?3
6218         WHERE conversation_id = ?4",
6219        fparams![
6220            ended_at,
6221            last_message_idx,
6222            last_message_created_at,
6223            conversation_id
6224        ],
6225    )?;
6226    if changed == 0 {
6227        let insert_ended_at =
6228            franken_tail_state_insert_ended_at(tx, conversation_id, Some(ended_at))?;
6229        franken_insert_conversation_tail_state(
6230            tx,
6231            conversation_id,
6232            insert_ended_at,
6233            Some(last_message_idx),
6234            Some(last_message_created_at),
6235        )?;
6236    }
6237    franken_update_conversation_tail_columns(
6238        tx,
6239        conversation_id,
6240        Some(ended_at),
6241        Some(last_message_idx),
6242        Some(last_message_created_at),
6243    )?;
6244    Ok(())
6245}
6246
6247fn collect_append_only_tail_messages<'a>(
6248    conv: &'a Conversation,
6249    existing_max_idx: i64,
6250    existing_max_created_at: i64,
6251) -> Option<ExistingConversationNewMessages<'a>> {
6252    if conv.messages.is_empty() {
6253        return Some(ExistingConversationNewMessages {
6254            messages: Vec::new(),
6255            new_chars: 0,
6256            idx_collision_count: 0,
6257            first_collision_idx: None,
6258        });
6259    }
6260
6261    let mut split_idx = None;
6262    let mut prev_idx = None;
6263    for (pos, msg) in conv.messages.iter().enumerate() {
6264        if prev_idx.is_some_and(|prev| msg.idx < prev) {
6265            return None;
6266        }
6267        prev_idx = Some(msg.idx);
6268        if split_idx.is_none() && msg.idx > existing_max_idx {
6269            split_idx = Some(pos);
6270        }
6271    }
6272    let split_idx = split_idx?;
6273
6274    let mut seen_tail_idx = HashSet::new();
6275    let mut seen_tail_replay = HashSet::new();
6276    let mut new_chars = 0i64;
6277    let mut messages = Vec::new();
6278    for msg in &conv.messages[split_idx..] {
6279        let created_at = msg.created_at?;
6280        if created_at <= existing_max_created_at {
6281            return None;
6282        }
6283
6284        if !seen_tail_idx.insert(msg.idx) {
6285            return None;
6286        }
6287
6288        let replay_fingerprint = message_replay_fingerprint(msg);
6289        if !seen_tail_replay.insert(replay_fingerprint) {
6290            return None;
6291        }
6292
6293        new_chars += msg.content.len() as i64;
6294        messages.push(msg);
6295    }
6296
6297    Some(ExistingConversationNewMessages {
6298        messages,
6299        new_chars,
6300        idx_collision_count: 0,
6301        first_collision_idx: None,
6302    })
6303}
6304
/// Absolute distance between two optional timestamps.
///
/// Returns `i64::MAX` when either timestamp is missing, and also when the
/// true distance does not fit in an `i64`.
fn start_distance_ms(left: Option<i64>, right: Option<i64>) -> i64 {
    let (Some(left), Some(right)) = (left, right) else {
        return i64::MAX;
    };
    // `abs_diff` cannot overflow; only the narrowing back to i64 can fail.
    i64::try_from(left.abs_diff(right)).unwrap_or(i64::MAX)
}
6314
6315fn conversation_merge_evidence(
6316    incoming_exact: &HashSet<MessageMergeFingerprint>,
6317    incoming_replay: &HashSet<MessageReplayFingerprint>,
6318    existing_exact: &HashSet<MessageMergeFingerprint>,
6319    existing_replay: &HashSet<MessageReplayFingerprint>,
6320    incoming_started_at: Option<i64>,
6321    existing_started_at: Option<i64>,
6322) -> Option<ConversationMergeEvidence> {
6323    let exact_overlap = incoming_exact.intersection(existing_exact).count();
6324    let replay_overlap = incoming_replay.intersection(existing_replay).count();
6325    if exact_overlap == 0 && replay_overlap == 0 {
6326        return None;
6327    }
6328
6329    let smaller_replay_set = incoming_replay.len().min(existing_replay.len());
6330    let started_close = timestamps_within_tolerance(
6331        incoming_started_at,
6332        existing_started_at,
6333        SOURCE_PATH_MERGE_START_TOLERANCE_MS,
6334    );
6335    let full_replay_subset_match = smaller_replay_set >= 2 && replay_overlap == smaller_replay_set;
6336
6337    let merge_allowed = if started_close {
6338        exact_overlap >= 1 || replay_overlap >= 2
6339    } else {
6340        exact_overlap >= 2 || full_replay_subset_match
6341    };
6342
6343    merge_allowed.then_some(ConversationMergeEvidence {
6344        exact_overlap,
6345        replay_overlap,
6346        smaller_replay_set,
6347        started_close,
6348        start_distance_ms: start_distance_ms(incoming_started_at, existing_started_at),
6349    })
6350}
6351
/// Returns `true` when both timestamps are present and differ by at most
/// `tolerance_ms` (inclusive).
///
/// A missing timestamp or a negative tolerance always yields `false`.
fn timestamps_within_tolerance(left: Option<i64>, right: Option<i64>, tolerance_ms: i64) -> bool {
    let (Some(left), Some(right)) = (left, right) else {
        return false;
    };
    // A negative tolerance fails the conversion and so can never be met.
    u64::try_from(tolerance_ms).is_ok_and(|tolerance| left.abs_diff(right) <= tolerance)
}
6360
6361fn conversation_merge_key(agent_id: i64, conv: &Conversation) -> PendingConversationKey {
6362    if let Some(external_id) = conv.external_id.clone() {
6363        PendingConversationKey::External {
6364            source_id: conv.source_id.clone(),
6365            agent_id,
6366            external_id,
6367        }
6368    } else {
6369        PendingConversationKey::SourcePath {
6370            source_id: conv.source_id.clone(),
6371            agent_id,
6372            source_path: path_to_string(&conv.source_path),
6373            started_at: conversation_effective_started_at(conv),
6374        }
6375    }
6376}
6377
/// The subset of a stored message needed to generate a semantic embedding.
///
/// NOTE(review): the query that fills this struct is outside this section;
/// the field notes follow the field names — confirm against the loader.
pub struct MessageForEmbedding {
    /// Id of the message row.
    pub message_id: i64,
    /// Message timestamp, when known.
    pub created_at: Option<i64>,
    /// Id of the agent that owns the message's conversation.
    pub agent_id: i64,
    /// Id of the workspace, when the conversation has one.
    pub workspace_id: Option<i64>,
    /// 32-bit hash of the source id.
    pub source_id_hash: u32,
    /// Role of the message as a string.
    pub role: String,
    /// Text content to embed.
    pub content: String,
}
6388
6389// =========================================================================
6390// FrankenStorage CRUD operations
6391// =========================================================================
6392
6393impl FrankenStorage {
6394    /// Ensure an agent exists in the database, returning its ID.
6395    pub fn ensure_agent(&self, agent: &Agent) -> Result<i64> {
6396        let cache_key = EnsuredAgentKey::from_agent(agent);
6397        if let Some(id) = self.cached_agent_id(&cache_key) {
6398            return Ok(id);
6399        }
6400
6401        let now = Self::now_millis();
6402        self.conn.execute_compat(
6403            "INSERT INTO agents(slug, name, version, kind, created_at, updated_at)
6404             VALUES(?1, ?2, ?3, ?4, ?5, ?6)
6405             ON CONFLICT(slug) DO UPDATE SET
6406                 name = excluded.name,
6407                 version = excluded.version,
6408                 kind = excluded.kind,
6409                 updated_at = excluded.updated_at
6410             WHERE NOT (
6411                 agents.name IS excluded.name
6412                 AND agents.version IS excluded.version
6413                 AND agents.kind IS excluded.kind
6414             )",
6415            fparams![
6416                agent.slug.as_str(),
6417                agent.name.as_str(),
6418                agent.version.as_deref(),
6419                cache_key.kind.as_str(),
6420                now,
6421                now
6422            ],
6423        )?;
6424
6425        let id = self
6426            .conn
6427            .query_row_map(
6428                "SELECT id FROM agents WHERE slug = ?1 LIMIT 1",
6429                fparams![agent.slug.as_str()],
6430                |row| row.get_typed(0),
6431            )
6432            .with_context(|| format!("fetching agent id for {}", agent.slug))?;
6433        self.mark_agent_ensured(cache_key, id);
6434        Ok(id)
6435    }
6436
6437    /// Ensure a workspace exists in the database, returning its ID.
6438    pub fn ensure_workspace(&self, path: &Path, display_name: Option<&str>) -> Result<i64> {
6439        let path_str = path.to_string_lossy().to_string();
6440        let cache_key = EnsuredWorkspaceKey::new(path_str.clone(), display_name);
6441        if let Some(id) = self.cached_workspace_id(&cache_key) {
6442            return Ok(id);
6443        }
6444
6445        if let Some(display_name) = display_name {
6446            self.conn.execute_compat(
6447                "INSERT INTO workspaces(path, display_name)
6448                 VALUES(?1, ?2)
6449                 ON CONFLICT(path) DO UPDATE SET
6450                     display_name = excluded.display_name
6451                 WHERE NOT (workspaces.display_name IS excluded.display_name)",
6452                fparams![path_str.as_str(), display_name],
6453            )?;
6454        } else {
6455            self.conn.execute_compat(
6456                "INSERT OR IGNORE INTO workspaces(path, display_name) VALUES(?1, NULL)",
6457                fparams![path_str.as_str()],
6458            )?;
6459        }
6460
6461        let id = self
6462            .conn
6463            .query_row_map(
6464                "SELECT id FROM workspaces WHERE path = ?1 LIMIT 1",
6465                fparams![path_str.as_str()],
6466                |row| row.get_typed(0),
6467            )
6468            .with_context(|| format!("fetching workspace id for {path_str}"))?;
6469        self.mark_workspace_ensured(cache_key, id);
6470        Ok(id)
6471    }
6472
6473    /// Get current time as milliseconds since epoch.
6474    pub fn now_millis() -> i64 {
6475        SystemTime::now()
6476            .duration_since(UNIX_EPOCH)
6477            .map(|d| i64::try_from(d.as_millis()).unwrap_or(i64::MAX))
6478            .unwrap_or(0)
6479    }
6480
6481    /// Convert a millisecond timestamp to a day ID (days since 2020-01-01).
6482    pub fn day_id_from_millis(timestamp_ms: i64) -> i64 {
6483        const EPOCH_2020_SECS: i64 = 1_577_836_800;
6484        let secs = timestamp_ms.div_euclid(1000);
6485        (secs - EPOCH_2020_SECS).div_euclid(86400)
6486    }
6487
6488    /// Convert a millisecond timestamp to an hour ID (hours since 2020-01-01 00:00 UTC).
6489    pub fn hour_id_from_millis(timestamp_ms: i64) -> i64 {
6490        const EPOCH_2020_SECS: i64 = 1_577_836_800;
6491        let secs = timestamp_ms.div_euclid(1000);
6492        (secs - EPOCH_2020_SECS).div_euclid(3600)
6493    }
6494
6495    /// Convert a day ID back to milliseconds (start of day).
6496    pub fn millis_from_day_id(day_id: i64) -> i64 {
6497        const EPOCH_2020_SECS: i64 = 1_577_836_800;
6498        (EPOCH_2020_SECS + day_id * 86400) * 1000
6499    }
6500
6501    /// Convert an hour ID back to milliseconds (start of hour).
6502    pub fn millis_from_hour_id(hour_id: i64) -> i64 {
6503        const EPOCH_2020_SECS: i64 = 1_577_836_800;
6504        (EPOCH_2020_SECS + hour_id * 3600) * 1000
6505    }
6506
6507    /// Get the timestamp of the last successful scan.
6508    pub fn get_last_scan_ts(&self) -> Result<Option<i64>> {
6509        let result: Result<String, _> = self.conn.query_row_map(
6510            "SELECT value FROM meta WHERE key = 'last_scan_ts'",
6511            fparams![],
6512            |row| row.get_typed(0),
6513        );
6514        match result.optional() {
6515            Ok(Some(s)) => Ok(s.parse().ok()),
6516            Ok(None) => Ok(None),
6517            Err(e) => Err(e.into()),
6518        }
6519    }
6520
6521    /// Set the timestamp of the last successful scan (milliseconds since epoch).
6522    pub fn set_last_scan_ts(&self, ts: i64) -> Result<()> {
6523        self.conn.execute_compat(
6524            "INSERT OR REPLACE INTO meta(key, value) VALUES('last_scan_ts', ?1)",
6525            fparams![ts.to_string()],
6526        )?;
6527        Ok(())
6528    }
6529
6530    /// Get the timestamp of the last successful index completion.
6531    pub fn get_last_indexed_at(&self) -> Result<Option<i64>> {
6532        let result: Result<String, _> = self.conn.query_row_map(
6533            "SELECT value FROM meta WHERE key = 'last_indexed_at'",
6534            fparams![],
6535            |row| row.get_typed(0),
6536        );
6537        match result.optional() {
6538            Ok(Some(s)) => Ok(s.parse().ok()),
6539            Ok(None) => Ok(None),
6540            Err(e) => Err(e.into()),
6541        }
6542    }
6543
6544    /// Set the timestamp of the last successful index completion (milliseconds since epoch).
6545    pub fn set_last_indexed_at(&self, ts: i64) -> Result<()> {
6546        self.conn.execute_compat(
6547            "INSERT OR REPLACE INTO meta(key, value) VALUES('last_indexed_at', ?1)",
6548            fparams![ts.to_string()],
6549        )?;
6550        Ok(())
6551    }
6552
6553    /// List all registered agents.
6554    pub fn list_agents(&self) -> Result<Vec<Agent>> {
6555        self.conn
6556            .query_map_collect(
6557                "SELECT id, slug, name, version, kind FROM agents ORDER BY slug",
6558                fparams![],
6559                |row| {
6560                    let kind: String = row.get_typed(4)?;
6561                    Ok(Agent {
6562                        id: Some(row.get_typed(0)?),
6563                        slug: row.get_typed(1)?,
6564                        name: row.get_typed(2)?,
6565                        version: row.get_typed(3)?,
6566                        kind: match kind.as_str() {
6567                            "cli" => AgentKind::Cli,
6568                            "vscode" => AgentKind::VsCode,
6569                            _ => AgentKind::Hybrid,
6570                        },
6571                    })
6572                },
6573            )
6574            .with_context(|| "listing agents")
6575    }
6576
6577    /// Count all archived conversations.
6578    pub fn total_conversation_count(&self) -> Result<usize> {
6579        let count: i64 =
6580            self.conn
6581                .query_row_map("SELECT COUNT(*) FROM conversations", fparams![], |row| {
6582                    row.get_typed(0)
6583                })?;
6584        Ok(count.max(0) as usize)
6585    }
6586
6587    /// Count all archived messages.
6588    pub fn total_message_count(&self) -> Result<usize> {
6589        let count: i64 =
6590            self.conn
6591                .query_row_map("SELECT COUNT(*) FROM messages", fparams![], |row| {
6592                    row.get_typed(0)
6593                })?;
6594        Ok(count.max(0) as usize)
6595    }
6596
    /// Remove all archived conversations/messages for one agent slug.
    ///
    /// This only affects cass's local archive database. Source session files on
    /// disk are untouched.
    ///
    /// The slug is trimmed and ASCII-lowercased before lookup. An empty slug is an
    /// error; an unknown slug, or an agent with no conversations, returns the
    /// default (all-zero) result without modifying anything. Otherwise the
    /// returned counts are the conversation and message totals observed just
    /// before the deleting transaction starts.
    pub fn purge_agent_archive_data(&self, agent_slug: &str) -> Result<AgentArchivePurgeResult> {
        let normalized = agent_slug.trim().to_ascii_lowercase();
        if normalized.is_empty() {
            return Err(anyhow!("agent slug cannot be empty"));
        }

        // Unknown agent: nothing to purge.
        let Some(agent_id) = self
            .conn
            .query_row_map(
                "SELECT id FROM agents WHERE slug = ?1",
                fparams![normalized.as_str()],
                |row| row.get_typed::<i64>(0),
            )
            .optional()?
        else {
            return Ok(AgentArchivePurgeResult::default());
        };

        let conversations_deleted: i64 = self.conn.query_row_map(
            "SELECT COUNT(*) FROM conversations WHERE agent_id = ?1",
            fparams![agent_id],
            |row| row.get_typed(0),
        )?;
        // Agent exists but owns no conversations: the agent row is kept as-is.
        if conversations_deleted == 0 {
            return Ok(AgentArchivePurgeResult::default());
        }

        // NOTE(review): both counts are read before the transaction begins, so
        // they are a snapshot; they could differ from the rows actually removed
        // if another writer intervenes — confirm whether that matters here.
        let messages_deleted: i64 = self.conn.query_row_map(
            "SELECT COUNT(*)
             FROM messages
             WHERE conversation_id IN (
                 SELECT id FROM conversations WHERE agent_id = ?1
             )",
            fparams![agent_id],
            |row| row.get_typed(0),
        )?;

        // Lookup rows are deleted first because their subqueries depend on the
        // conversation rows still being present.
        let mut tx = self.conn.transaction()?;
        tx.execute_compat(
            "DELETE FROM conversation_external_lookup
             WHERE conversation_id IN (
                 SELECT id FROM conversations WHERE agent_id = ?1
             )",
            fparams![agent_id],
        )?;
        tx.execute_compat(
            "DELETE FROM conversation_external_tail_lookup
             WHERE conversation_id IN (
                 SELECT id FROM conversations WHERE agent_id = ?1
             )",
            fparams![agent_id],
        )?;
        // NOTE(review): no statement here deletes from `messages` directly;
        // presumably those rows go away via a cascade or trigger on
        // `conversations` — confirm against the schema.
        tx.execute_compat(
            "DELETE FROM conversations WHERE agent_id = ?1",
            fparams![agent_id],
        )?;
        // The agent row is removed only if no conversation still references it.
        // NOTE(review): `ensure_agent` memoizes agent ids via
        // `mark_agent_ensured`; nothing here invalidates that entry — confirm a
        // stale id cannot be handed out after the agent row is deleted.
        tx.execute_compat(
            "DELETE FROM agents
             WHERE id = ?1
               AND NOT EXISTS (
                   SELECT 1 FROM conversations WHERE agent_id = ?1
               )",
            fparams![agent_id],
        )?;
        tx.commit()?;

        Ok(AgentArchivePurgeResult {
            conversations_deleted: conversations_deleted.max(0) as usize,
            messages_deleted: messages_deleted.max(0) as usize,
        })
    }
6672
6673    /// List all registered workspaces.
6674    pub fn list_workspaces(&self) -> Result<Vec<crate::model::types::Workspace>> {
6675        self.conn
6676            .query_map_collect(
6677                "SELECT id, path, display_name FROM workspaces ORDER BY path",
6678                fparams![],
6679                |row| {
6680                    let path_str: String = row.get_typed(1)?;
6681                    Ok(crate::model::types::Workspace {
6682                        id: Some(row.get_typed(0)?),
6683                        path: Path::new(&path_str).to_path_buf(),
6684                        display_name: row.get_typed(2)?,
6685                    })
6686                },
6687            )
6688            .with_context(|| "listing workspaces")
6689    }
6690
6691    /// List conversations with pagination.
6692    pub fn list_conversations(&self, limit: i64, offset: i64) -> Result<Vec<Conversation>> {
6693        // Avoid the multi-table JOIN with LIMIT/OFFSET that triggers
6694        // frankensqlite's materialization fallback (see c38edcd9, 860acb12).
6695        // Use correlated subqueries for the tiny agents (~20 rows) and
6696        // workspaces (~30 rows) lookup tables and degrade NULL agent_id to
6697        // the same 'unknown' sentinel that 8a0c547c established for the
6698        // lexical rebuild path.
6699        self.conn
6700            .query_map_collect(
6701                r"SELECT c.id,
6702                         COALESCE((SELECT a.slug FROM agents a WHERE a.id = c.agent_id), 'unknown'),
6703                         (SELECT w.path FROM workspaces w WHERE w.id = c.workspace_id),
6704                         c.external_id, c.title, c.source_path,
6705                         c.started_at,
6706                         COALESCE(
6707                             (SELECT ts.ended_at
6708                              FROM conversation_tail_state ts
6709                              WHERE ts.conversation_id = c.id),
6710                             c.ended_at
6711                         ),
6712                         c.approx_tokens, c.metadata_json,
6713                         c.source_id, c.origin_host, c.metadata_bin
6714                FROM conversations c
6715                ORDER BY CASE WHEN c.started_at IS NULL THEN 1 ELSE 0 END, c.started_at DESC, c.id DESC
6716                LIMIT ?1 OFFSET ?2",
6717                fparams![limit, offset],
6718                |row| {
6719                    let workspace_path: Option<String> = row.get_typed(2)?;
6720                    let source_path: String = row.get_typed(5)?;
6721                    let raw_source_id: Option<String> = row.get_typed(10)?;
6722                    let raw_origin_host: Option<String> = row.get_typed(11)?;
6723                    let (source_id, _, origin_host) = normalized_storage_source_parts(
6724                        raw_source_id.as_deref(),
6725                        None,
6726                        raw_origin_host.as_deref(),
6727                    );
6728                    Ok(Conversation {
6729                        id: Some(row.get_typed(0)?),
6730                        agent_slug: row.get_typed(1)?,
6731                        workspace: workspace_path.map(|p| Path::new(&p).to_path_buf()),
6732                        external_id: row.get_typed(3)?,
6733                        title: row.get_typed(4)?,
6734                        source_path: Path::new(&source_path).to_path_buf(),
6735                        started_at: row.get_typed(6)?,
6736                        ended_at: row.get_typed(7)?,
6737                        approx_tokens: row.get_typed(8)?,
6738                        metadata_json: franken_read_metadata_compat(row, 9, 12),
6739                        messages: Vec::new(),
6740                        source_id,
6741                        origin_host,
6742                    })
6743                },
6744            )
6745            .with_context(|| "listing conversations")
6746    }
6747
6748    /// Build lookup maps for agents and workspaces to avoid JOINs in
6749    /// paged conversation queries.  Both tables are tiny (tens of rows)
6750    /// so this is effectively free.
6751    pub fn build_lexical_rebuild_lookups(
6752        &self,
6753    ) -> Result<(HashMap<i64, String>, HashMap<i64, PathBuf>)> {
6754        let agents: HashMap<i64, String> = self
6755            .conn
6756            .query_map_collect("SELECT id, slug FROM agents", fparams![], |row| {
6757                Ok((row.get_typed::<i64>(0)?, row.get_typed::<String>(1)?))
6758            })
6759            .with_context(|| "loading agent lookup for lexical rebuild")?
6760            .into_iter()
6761            .collect();
6762        let workspaces: HashMap<i64, PathBuf> = self
6763            .conn
6764            .query_map_collect("SELECT id, path FROM workspaces", fparams![], |row| {
6765                let path_str: String = row.get_typed(1)?;
6766                Ok((row.get_typed::<i64>(0)?, PathBuf::from(path_str)))
6767            })
6768            .with_context(|| "loading workspace lookup for lexical rebuild")?
6769            .into_iter()
6770            .collect();
6771        Ok((agents, workspaces))
6772    }
6773
    /// List per-conversation message footprints in primary-key order.
    ///
    /// This deliberately avoids rebuild-path JOINs. Instead we merge ordered
    /// single-table reads over `conversations` and the narrow
    /// `conversation_tail_state` cache in Rust, then use `last_message_idx + 1`
    /// as a planning estimate.
    ///
    /// The planner only needs a sizing heuristic; exact message and byte
    /// accounting is performed later by the rebuild packet pipeline as it reads
    /// message content for indexing. Rows missing both tail-cache sources fall
    /// back to `MAX(messages.idx) + 1`, which preserves legacy upgraded
    /// databases without treating populated conversations as empty.
    pub fn list_conversation_footprints_for_lexical_rebuild(
        &self,
    ) -> Result<Vec<LexicalRebuildConversationFootprintRow>> {
        // Source 1: the tail-state cache. A missing table is tolerated and
        // treated as "no cached tails".
        let tail_state_rows: Vec<(i64, Option<i64>)> = match self.conn.query_map_collect(
            "SELECT conversation_id, last_message_idx
             FROM conversation_tail_state
             ORDER BY conversation_id ASC",
            fparams![],
            |row| Ok((row.get_typed::<i64>(0)?, row.get_typed::<Option<i64>>(1)?)),
        ) {
            Ok(rows) => rows,
            Err(err) if error_indicates_missing_table(&err) => Vec::new(),
            Err(err) => {
                return Err(err).with_context(|| "listing lexical rebuild tail-state estimates");
            }
        };
        let tail_state_by_conversation: HashMap<i64, Option<i64>> =
            tail_state_rows.into_iter().collect();

        // Source 2: the tail column on `conversations`. If that column does not
        // exist, fall back to listing ids only, with no tail value.
        let rows: Vec<(i64, Option<i64>)> = match self.conn.query_map_collect(
            "SELECT id, last_message_idx
             FROM conversations
             ORDER BY id ASC",
            fparams![],
            |row| Ok((row.get_typed::<i64>(0)?, row.get_typed::<Option<i64>>(1)?)),
        ) {
            Ok(rows) => rows,
            Err(err) if error_indicates_missing_column(&err) => self
                .conn
                .query_map_collect(
                    "SELECT id
                     FROM conversations
                     ORDER BY id ASC",
                    fparams![],
                    |row| Ok((row.get_typed::<i64>(0)?, None)),
                )
                .with_context(|| {
                    "listing lexical rebuild conversation ids after missing tail column fallback"
                })?,
            Err(err) => {
                return Err(err)
                    .with_context(|| "listing lexical rebuild conversation footprint estimates");
            }
        };

        let mut footprints = Vec::with_capacity(rows.len());
        // conversation id -> index into `footprints` of its zeroed placeholder.
        let mut missing_tail_positions = HashMap::new();
        for (conversation_id, conversation_last_message_idx) in rows {
            // A non-NULL tail-state value wins; otherwise use the conversation
            // row's own column.
            let last_message_idx = tail_state_by_conversation
                .get(&conversation_id)
                .copied()
                .flatten()
                .or(conversation_last_message_idx);
            let Some(message_count) = lexical_rebuild_message_count_from_tail_idx(last_message_idx)
            else {
                // No usable tail index: push a zeroed placeholder and remember
                // where it sits so it can be filled from `messages` below.
                missing_tail_positions.insert(conversation_id, footprints.len());
                footprints.push(LexicalRebuildConversationFootprintRow {
                    conversation_id,
                    message_count: 0,
                    message_bytes: 0,
                });
                continue;
            };
            footprints.push(lexical_rebuild_conversation_footprint_from_count(
                conversation_id,
                message_count,
            ));
        }

        // Evaluated before the fill step. When every footprint was a placeholder
        // (which includes the empty case) the exact-count pass below is skipped.
        let every_footprint_was_missing_tail = missing_tail_positions.len() == footprints.len();
        if !missing_tail_positions.is_empty() {
            self.fill_missing_lexical_rebuild_footprint_tails(
                &mut footprints,
                &missing_tail_positions,
            )?;
        }
        if !every_footprint_was_missing_tail {
            self.raise_lexical_rebuild_footprints_to_exact_message_counts(&mut footprints)?;
        }

        Ok(footprints)
    }
6868
6869    pub fn lexical_rebuild_has_tail_footprint_metadata(&self) -> Result<bool> {
6870        let total_conversations: i64 = self
6871            .conn
6872            .query_row_map("SELECT COUNT(*) FROM conversations", fparams![], |row| {
6873                row.get_typed(0)
6874            })
6875            .with_context(|| "counting conversations for lexical rebuild tail metadata coverage")?;
6876        let total_conversations = usize::try_from(total_conversations.max(0)).unwrap_or(usize::MAX);
6877        if total_conversations == 0 {
6878            return Ok(true);
6879        }
6880
6881        let conversation_columns = franken_table_column_names(&self.conn, "conversations")?;
6882        let conversations_have_tail_column = conversation_columns.contains("last_message_idx");
6883        let tail_state_has_tail_column =
6884            match franken_table_column_names(&self.conn, "conversation_tail_state") {
6885                Ok(columns) => columns.contains("last_message_idx"),
6886                Err(err) if error_indicates_missing_table(&err) => false,
6887                Err(err) => {
6888                    return Err(err)
6889                        .with_context(|| "reading lexical rebuild tail-state metadata columns");
6890                }
6891            };
6892        if !conversations_have_tail_column && !tail_state_has_tail_column {
6893            return Ok(false);
6894        }
6895
6896        let covered_sql = match (conversations_have_tail_column, tail_state_has_tail_column) {
6897            (true, true) => {
6898                "SELECT COUNT(*)
6899                 FROM conversations c
6900                 LEFT JOIN conversation_tail_state ts ON ts.conversation_id = c.id
6901                 WHERE c.last_message_idx IS NOT NULL
6902                    OR ts.last_message_idx IS NOT NULL"
6903            }
6904            (true, false) => {
6905                "SELECT COUNT(*)
6906                 FROM conversations
6907                 WHERE last_message_idx IS NOT NULL"
6908            }
6909            (false, true) => {
6910                "SELECT COUNT(*)
6911                 FROM conversations c
6912                 WHERE EXISTS (
6913                     SELECT 1
6914                     FROM conversation_tail_state ts
6915                     WHERE ts.conversation_id = c.id
6916                       AND ts.last_message_idx IS NOT NULL
6917                 )"
6918            }
6919            (false, false) => unreachable!("checked before covered_sql selection"),
6920        };
6921        let covered_conversations: i64 = self
6922            .conn
6923            .query_row_map(covered_sql, fparams![], |row| row.get_typed(0))
6924            .with_context(
6925                || "counting conversations covered by lexical rebuild tail footprint metadata",
6926            )?;
6927        let covered_conversations =
6928            usize::try_from(covered_conversations.max(0)).unwrap_or(usize::MAX);
6929
6930        Ok(lexical_rebuild_tail_metadata_coverage_is_sufficient(
6931            total_conversations,
6932            covered_conversations,
6933        ))
6934    }
6935
6936    fn raise_lexical_rebuild_footprints_to_exact_message_counts(
6937        &self,
6938        footprints: &mut [LexicalRebuildConversationFootprintRow],
6939    ) -> Result<()> {
6940        if footprints.is_empty() {
6941            return Ok(());
6942        }
6943
6944        let positions_by_conversation: HashMap<i64, usize> = footprints
6945            .iter()
6946            .enumerate()
6947            .map(|(position, footprint)| (footprint.conversation_id, position))
6948            .collect();
6949        self.conn
6950            .query_with_params_for_each(
6951                "SELECT conversation_id, COUNT(*) AS message_count
6952                 FROM messages
6953                 GROUP BY conversation_id
6954                 ORDER BY conversation_id ASC",
6955                &[] as &[SqliteValue],
6956                |row| {
6957                    let conversation_id: i64 = row.get_typed(0)?;
6958                    let exact_count: i64 = row.get_typed(1)?;
6959                    let Some(position) = positions_by_conversation.get(&conversation_id) else {
6960                        return Ok(());
6961                    };
6962                    let exact_count = usize::try_from(exact_count.max(0)).unwrap_or(usize::MAX);
6963                    let footprint = &mut footprints[*position];
6964                    if exact_count > footprint.message_count {
6965                        footprint.message_count = exact_count;
6966                        footprint.message_bytes =
6967                            footprint.message_bytes.max(exact_count.saturating_mul(
6968                                LEXICAL_REBUILD_PLANNER_ESTIMATED_BYTES_PER_MESSAGE,
6969                            ));
6970                    }
6971                    Ok(())
6972                },
6973            )
6974            .with_context(|| "raising lexical rebuild footprints to exact message counts")?;
6975        Ok(())
6976    }
6977
6978    fn fill_missing_lexical_rebuild_footprint_tails(
6979        &self,
6980        footprints: &mut [LexicalRebuildConversationFootprintRow],
6981        missing_tail_positions: &HashMap<i64, usize>,
6982    ) -> Result<()> {
6983        if missing_tail_positions.len() <= LEXICAL_REBUILD_FOOTPRINT_POINT_TAIL_FALLBACK_LIMIT {
6984            for (conversation_id, position) in missing_tail_positions {
6985                let last_message_idx: Option<i64> = self
6986                    .conn
6987                    .query_row_map(
6988                        "SELECT MAX(idx) FROM messages WHERE conversation_id = ?1",
6989                        fparams![*conversation_id],
6990                        |row| row.get_typed(0),
6991                    )
6992                    .with_context(|| {
6993                        format!(
6994                            "looking up missing lexical rebuild tail estimate for conversation {conversation_id}"
6995                        )
6996                    })?;
6997                if let Some(message_count) =
6998                    lexical_rebuild_message_count_from_tail_idx(last_message_idx)
6999                {
7000                    footprints[*position] = lexical_rebuild_conversation_footprint_from_count(
7001                        *conversation_id,
7002                        message_count,
7003                    );
7004                }
7005            }
7006            return Ok(());
7007        }
7008
7009        self.fill_missing_lexical_rebuild_footprint_tails_from_grouped_messages(
7010            footprints,
7011            missing_tail_positions,
7012            "SELECT conversation_id, MAX(idx) AS last_message_idx
7013             FROM messages INDEXED BY idx_messages_conv_idx
7014             GROUP BY conversation_id
7015             ORDER BY conversation_id ASC",
7016        )
7017        .or_else(|err| {
7018            if err
7019                .to_string()
7020                .contains("no such index: idx_messages_conv_idx")
7021            {
7022                return self.fill_missing_lexical_rebuild_footprint_tails_from_grouped_messages(
7023                    footprints,
7024                    missing_tail_positions,
7025                    "SELECT conversation_id, MAX(idx) AS last_message_idx
7026                     FROM messages
7027                     GROUP BY conversation_id
7028                     ORDER BY conversation_id ASC",
7029                );
7030            }
7031            Err(err)
7032        })
7033        .with_context(|| "grouping missing lexical rebuild tail estimates from messages")?;
7034
7035        Ok(())
7036    }
7037
7038    fn fill_missing_lexical_rebuild_footprint_tails_from_grouped_messages(
7039        &self,
7040        footprints: &mut [LexicalRebuildConversationFootprintRow],
7041        missing_tail_positions: &HashMap<i64, usize>,
7042        sql: &str,
7043    ) -> Result<()> {
7044        self.conn
7045            .query_with_params_for_each(sql, &[] as &[SqliteValue], |row| {
7046                let conversation_id: i64 = row.get_typed(0)?;
7047                let last_message_idx: Option<i64> = row.get_typed(1)?;
7048                let Some(position) = missing_tail_positions.get(&conversation_id) else {
7049                    return Ok(());
7050                };
7051                if let Some(message_count) =
7052                    lexical_rebuild_message_count_from_tail_idx(last_message_idx)
7053                {
7054                    footprints[*position] = lexical_rebuild_conversation_footprint_from_count(
7055                        conversation_id,
7056                        message_count,
7057                    );
7058                }
7059                Ok(())
7060            })
7061            .with_context(|| "grouping lexical rebuild missing tail estimates")
7062    }
7063
7064    /// List conversation ids in the stable order used by lexical rebuilds.
7065    pub fn list_conversation_ids_for_lexical_rebuild(&self) -> Result<Vec<i64>> {
7066        self.conn
7067            .query_map_collect(
7068                "SELECT id FROM conversations ORDER BY id ASC",
7069                fparams![],
7070                |row| row.get_typed(0),
7071            )
7072            .with_context(|| "listing conversation ids for lexical rebuild")
7073    }
7074    /// Legacy OFFSET-based traversal for one-time checkpoint migration only.
7075    ///
7076    /// New code must use `list_conversations_for_lexical_rebuild_after_id`
7077    /// for keyset pagination.
7078    pub fn list_conversations_for_lexical_rebuild_by_offset(
7079        &self,
7080        limit: i64,
7081        offset: i64,
7082        agent_slugs: &HashMap<i64, String>,
7083        workspace_paths: &HashMap<i64, PathBuf>,
7084    ) -> Result<Vec<LexicalRebuildConversationRow>> {
7085        // Single-table query avoids the 3-table JOIN that triggers
7086        // frankensqlite's full-materialization fallback path.
7087        self.conn
7088            .query_map_collect(
7089                r"SELECT id, agent_id, workspace_id, external_id, title, source_path,
7090                       started_at,
7091                       COALESCE(
7092                           (SELECT ts.ended_at
7093                            FROM conversation_tail_state ts
7094                            WHERE ts.conversation_id = conversations.id),
7095                           ended_at
7096                       ),
7097                       source_id, origin_host
7098                FROM conversations
7099                ORDER BY id ASC
7100                LIMIT ?1 OFFSET ?2",
7101                fparams![limit, offset],
7102                |row| {
7103                    let agent_id: Option<i64> = row.get_typed(1)?;
7104                    let workspace_id: Option<i64> = row.get_typed(2)?;
7105                    let source_path: String = row.get_typed(5)?;
7106                    let raw_source_id: Option<String> = row.get_typed(8)?;
7107                    let raw_origin_host: Option<String> = row.get_typed(9)?;
7108                    let (source_id, _, origin_host) = normalized_storage_source_parts(
7109                        raw_source_id.as_deref(),
7110                        None,
7111                        raw_origin_host.as_deref(),
7112                    );
7113                    Ok(LexicalRebuildConversationRow {
7114                        id: Some(row.get_typed(0)?),
7115                        agent_slug: agent_id
7116                            .and_then(|aid| agent_slugs.get(&aid).cloned())
7117                            .unwrap_or_else(|| "unknown".to_string()),
7118                        workspace: workspace_id.and_then(|wid| workspace_paths.get(&wid).cloned()),
7119                        external_id: row.get_typed(3)?,
7120                        title: row.get_typed(4)?,
7121                        source_path: Path::new(&source_path).to_path_buf(),
7122                        started_at: row.get_typed(6)?,
7123                        ended_at: row.get_typed(7)?,
7124                        source_id,
7125                        origin_host,
7126                    })
7127                },
7128            )
7129            .with_context(|| "listing conversations for lexical rebuild")
7130    }
7131
7132    /// List lexical rebuild conversations strictly after the given primary key.
7133    ///
7134    /// Keyset pagination keeps later rebuild pages as cheap as earlier ones,
7135    /// avoiding the ever-growing `OFFSET` scan cost during large rebuilds.
7136    pub fn list_conversations_for_lexical_rebuild_after_id(
7137        &self,
7138        limit: i64,
7139        after_conversation_id: i64,
7140        agent_slugs: &HashMap<i64, String>,
7141        workspace_paths: &HashMap<i64, PathBuf>,
7142    ) -> Result<Vec<LexicalRebuildConversationRow>> {
7143        self.conn
7144            .query_map_collect(
7145                r"SELECT id, agent_id, workspace_id, external_id, title, source_path,
7146                       started_at,
7147                       COALESCE(
7148                           (SELECT ts.ended_at
7149                            FROM conversation_tail_state ts
7150                            WHERE ts.conversation_id = conversations.id),
7151                           ended_at
7152                       ),
7153                       source_id, origin_host
7154                FROM conversations
7155                WHERE id > ?2
7156                ORDER BY id ASC
7157                LIMIT ?1",
7158                fparams![limit, after_conversation_id],
7159                |row| {
7160                    let agent_id: Option<i64> = row.get_typed(1)?;
7161                    let workspace_id: Option<i64> = row.get_typed(2)?;
7162                    let source_path: String = row.get_typed(5)?;
7163                    let raw_source_id: Option<String> = row.get_typed(8)?;
7164                    let raw_origin_host: Option<String> = row.get_typed(9)?;
7165                    let (source_id, _, origin_host) = normalized_storage_source_parts(
7166                        raw_source_id.as_deref(),
7167                        None,
7168                        raw_origin_host.as_deref(),
7169                    );
7170                    Ok(LexicalRebuildConversationRow {
7171                        id: Some(row.get_typed(0)?),
7172                        agent_slug: agent_id
7173                            .and_then(|aid| agent_slugs.get(&aid).cloned())
7174                            .unwrap_or_else(|| "unknown".to_string()),
7175                        workspace: workspace_id.and_then(|wid| workspace_paths.get(&wid).cloned()),
7176                        external_id: row.get_typed(3)?,
7177                        title: row.get_typed(4)?,
7178                        source_path: Path::new(&source_path).to_path_buf(),
7179                        started_at: row.get_typed(6)?,
7180                        ended_at: row.get_typed(7)?,
7181                        source_id,
7182                        origin_host,
7183                    })
7184                },
7185            )
7186            .with_context(|| {
7187                format!(
7188                    "listing conversations for lexical rebuild after id {after_conversation_id}"
7189                )
7190            })
7191    }
7192
7193    /// List lexical rebuild conversations inside an `(after_id, through_id]`
7194    /// primary-key window.
7195    ///
7196    /// This lets the rebuild producer respect planned shard boundaries without
7197    /// falling back to client-side trimming or multi-table joins.
7198    pub fn list_conversations_for_lexical_rebuild_after_id_through_id(
7199        &self,
7200        limit: i64,
7201        after_conversation_id: i64,
7202        through_conversation_id: i64,
7203        agent_slugs: &HashMap<i64, String>,
7204        workspace_paths: &HashMap<i64, PathBuf>,
7205    ) -> Result<Vec<LexicalRebuildConversationRow>> {
7206        if through_conversation_id <= after_conversation_id {
7207            return Ok(Vec::new());
7208        }
7209        self.conn
7210            .query_map_collect(
7211                r"SELECT id, agent_id, workspace_id, external_id, title, source_path,
7212                       started_at,
7213                       COALESCE(
7214                           (SELECT ts.ended_at
7215                            FROM conversation_tail_state ts
7216                            WHERE ts.conversation_id = conversations.id),
7217                           ended_at
7218                       ),
7219                       source_id, origin_host
7220                FROM conversations
7221                WHERE id > ?2 AND id <= ?3
7222                ORDER BY id ASC
7223                LIMIT ?1",
7224                fparams![limit, after_conversation_id, through_conversation_id],
7225                |row| {
7226                    let agent_id: Option<i64> = row.get_typed(1)?;
7227                    let workspace_id: Option<i64> = row.get_typed(2)?;
7228                    let source_path: String = row.get_typed(5)?;
7229                    let raw_source_id: Option<String> = row.get_typed(8)?;
7230                    let raw_origin_host: Option<String> = row.get_typed(9)?;
7231                    let (source_id, _, origin_host) = normalized_storage_source_parts(
7232                        raw_source_id.as_deref(),
7233                        None,
7234                        raw_origin_host.as_deref(),
7235                    );
7236                    Ok(LexicalRebuildConversationRow {
7237                        id: Some(row.get_typed(0)?),
7238                        agent_slug: agent_id
7239                            .and_then(|aid| agent_slugs.get(&aid).cloned())
7240                            .unwrap_or_else(|| "unknown".to_string()),
7241                        workspace: workspace_id.and_then(|wid| workspace_paths.get(&wid).cloned()),
7242                        external_id: row.get_typed(3)?,
7243                        title: row.get_typed(4)?,
7244                        source_path: Path::new(&source_path).to_path_buf(),
7245                        started_at: row.get_typed(6)?,
7246                        ended_at: row.get_typed(7)?,
7247                        source_id,
7248                        origin_host,
7249                    })
7250                },
7251            )
7252            .with_context(|| {
7253                format!(
7254                    "listing conversations for lexical rebuild after id {after_conversation_id} through id {through_conversation_id}"
7255                )
7256            })
7257    }
7258
7259    /// Fetch messages for a conversation.
7260    pub fn fetch_messages(&self, conversation_id: i64) -> Result<Vec<Message>> {
7261        let hinted_sql = "SELECT id, idx, role, author, created_at, content, extra_json, extra_bin \
7262             FROM messages INDEXED BY sqlite_autoindex_messages_1 \
7263             WHERE conversation_id = ?1 ORDER BY idx";
7264        let fallback_sql = "SELECT id, idx, role, author, created_at, content, extra_json, extra_bin \
7265             FROM messages \
7266             WHERE conversation_id = ?1 ORDER BY idx";
7267
7268        self.conn
7269            .query_map_collect(hinted_sql, fparams![conversation_id], |row| {
7270                let role: String = row.get_typed(2)?;
7271                Ok(Message {
7272                    id: Some(row.get_typed(0)?),
7273                    idx: row.get_typed(1)?,
7274                    role: match role.as_str() {
7275                        "user" => MessageRole::User,
7276                        "agent" | "assistant" => MessageRole::Agent,
7277                        "tool" => MessageRole::Tool,
7278                        "system" => MessageRole::System,
7279                        other => MessageRole::Other(other.to_string()),
7280                    },
7281                    author: row.get_typed(3)?,
7282                    created_at: row.get_typed(4)?,
7283                    content: row.get_typed(5)?,
7284                    extra_json: franken_read_message_extra_compat(row, 6, 7),
7285                    snippets: Vec::new(),
7286                })
7287            })
7288            .or_else(|err| {
7289                if err
7290                    .to_string()
7291                    .contains("no such index: sqlite_autoindex_messages_1")
7292                {
7293                    return self.conn.query_map_collect(
7294                        fallback_sql,
7295                        fparams![conversation_id],
7296                        |row| {
7297                            let role: String = row.get_typed(2)?;
7298                            Ok(Message {
7299                                id: Some(row.get_typed(0)?),
7300                                idx: row.get_typed(1)?,
7301                                role: match role.as_str() {
7302                                    "user" => MessageRole::User,
7303                                    "agent" | "assistant" => MessageRole::Agent,
7304                                    "tool" => MessageRole::Tool,
7305                                    "system" => MessageRole::System,
7306                                    other => MessageRole::Other(other.to_string()),
7307                                },
7308                                author: row.get_typed(3)?,
7309                                created_at: row.get_typed(4)?,
7310                                content: row.get_typed(5)?,
7311                                extra_json: franken_read_message_extra_compat(row, 6, 7),
7312                                snippets: Vec::new(),
7313                            })
7314                        },
7315                    );
7316                }
7317                Err(err)
7318            })
7319            .with_context(|| format!("fetching messages for conversation {conversation_id}"))
7320    }
7321
7322    /// Fetch messages for lexical index rebuilds without deserializing extra metadata.
7323    ///
7324    /// Tantivy only needs message text and core envelope fields, so avoiding
7325    /// `extra_json` here prevents rebuilds from rehydrating enormous historical
7326    /// payloads that are irrelevant to lexical search.
7327    pub fn fetch_messages_for_lexical_rebuild(&self, conversation_id: i64) -> Result<Vec<Message>> {
7328        let hinted_sql = "SELECT id, idx, role, author, created_at, content \
7329                 FROM messages INDEXED BY sqlite_autoindex_messages_1 \
7330                 WHERE conversation_id = ?1 ORDER BY idx";
7331        let fallback_sql = "SELECT id, idx, role, author, created_at, content \
7332                 FROM messages \
7333                 WHERE conversation_id = ?1 ORDER BY idx";
7334
7335        self.conn
7336            .query_map_collect(hinted_sql, fparams![conversation_id], |row| {
7337                let role: String = row.get_typed(2)?;
7338                Ok(Message {
7339                    id: Some(row.get_typed(0)?),
7340                    idx: row.get_typed(1)?,
7341                    role: match role.as_str() {
7342                        "user" => MessageRole::User,
7343                        "agent" | "assistant" => MessageRole::Agent,
7344                        "tool" => MessageRole::Tool,
7345                        "system" => MessageRole::System,
7346                        other => MessageRole::Other(other.to_string()),
7347                    },
7348                    author: row.get_typed(3)?,
7349                    created_at: row.get_typed(4)?,
7350                    content: row.get_typed(5)?,
7351                    extra_json: serde_json::Value::Null,
7352                    snippets: Vec::new(),
7353                })
7354            })
7355            .or_else(|err| {
7356                if err
7357                    .to_string()
7358                    .contains("no such index: sqlite_autoindex_messages_1")
7359                {
7360                    return self.conn.query_map_collect(
7361                        fallback_sql,
7362                        fparams![conversation_id],
7363                        |row| {
7364                            let role: String = row.get_typed(2)?;
7365                            Ok(Message {
7366                                id: Some(row.get_typed(0)?),
7367                                idx: row.get_typed(1)?,
7368                                role: match role.as_str() {
7369                                    "user" => MessageRole::User,
7370                                    "agent" | "assistant" => MessageRole::Agent,
7371                                    "tool" => MessageRole::Tool,
7372                                    "system" => MessageRole::System,
7373                                    other => MessageRole::Other(other.to_string()),
7374                                },
7375                                author: row.get_typed(3)?,
7376                                created_at: row.get_typed(4)?,
7377                                content: row.get_typed(5)?,
7378                                extra_json: serde_json::Value::Null,
7379                                snippets: Vec::new(),
7380                            })
7381                        },
7382                    );
7383                }
7384                Err(err)
7385            })
7386            .with_context(|| {
7387                format!("fetching messages for lexical rebuild of conversation {conversation_id}")
7388            })
7389    }
7390
7391    /// Fetch messages for multiple conversations during lexical rebuilds.
7392    ///
7393    /// This preserves the lightweight lexical-rebuild projection while avoiding
7394    /// one round-trip per conversation when rebuilding large canonical indexes.
7395    pub fn fetch_messages_for_lexical_rebuild_batch(
7396        &self,
7397        conversation_ids: &[i64],
7398        max_messages: Option<usize>,
7399        max_content_bytes: Option<usize>,
7400    ) -> Result<HashMap<i64, Vec<Message>>> {
7401        if conversation_ids.is_empty() {
7402            return Ok(HashMap::new());
7403        }
7404
7405        let mut grouped: HashMap<i64, Vec<Message>> =
7406            HashMap::with_capacity(conversation_ids.len());
7407        let mut fetched_conversation_ids = HashSet::with_capacity(conversation_ids.len());
7408        let mut total_messages = 0usize;
7409        let mut total_content_bytes = 0usize;
7410
7411        // The apparent single-query shape (`WHERE conversation_id IN (...) ORDER BY ...`)
7412        // is a bad frankensqlite plan for large live databases: it can
7413        // materialize far more of `messages` than the requested conversations.
7414        // Reuse the hinted per-conversation primary-key lookup instead.
7415        for conversation_id in conversation_ids {
7416            if !fetched_conversation_ids.insert(*conversation_id) {
7417                continue;
7418            }
7419
7420            let messages = self
7421                .fetch_messages_for_lexical_rebuild(*conversation_id)
7422                .with_context(|| {
7423                    format!("fetching lexical rebuild messages for conversation {conversation_id}")
7424                })?;
7425            total_messages = total_messages.saturating_add(messages.len());
7426            if let Some(limit) = max_messages
7427                && total_messages > limit
7428            {
7429                return Err(anyhow!(
7430                    "lexical rebuild batch fetch exceeded message guardrail: messages={total_messages} limit={limit} conversations={}",
7431                    conversation_ids.len()
7432                ));
7433            }
7434
7435            let message_bytes = messages
7436                .iter()
7437                .map(|message| message.content.len())
7438                .sum::<usize>();
7439            total_content_bytes = total_content_bytes.saturating_add(message_bytes);
7440            if let Some(limit) = max_content_bytes
7441                && total_content_bytes > limit
7442            {
7443                return Err(anyhow!(
7444                    "lexical rebuild batch fetch exceeded content-byte guardrail: bytes={total_content_bytes} limit={limit} conversations={}",
7445                    conversation_ids.len()
7446                ));
7447            }
7448
7449            if !messages.is_empty() {
7450                grouped.insert(*conversation_id, messages);
7451            }
7452        }
7453
7454        Ok(grouped)
7455    }
7456
7457    /// Stream lexical rebuild message rows in `(conversation_id, idx)` order
7458    /// without materializing the full result set.
7459    pub fn stream_messages_for_lexical_rebuild_between_conversation_ids<F>(
7460        &self,
7461        start_conversation_id: i64,
7462        end_conversation_id: i64,
7463        mut f: F,
7464    ) -> Result<()>
7465    where
7466        F: FnMut(LexicalRebuildMessageRow) -> Result<()>,
7467    {
7468        if end_conversation_id < start_conversation_id {
7469            return Ok(());
7470        }
7471
7472        let conversation_ids: Vec<i64> = self
7473            .conn
7474            .query_map_collect(
7475                "SELECT id FROM conversations WHERE id >= ?1 AND id <= ?2 ORDER BY id ASC",
7476                fparams![start_conversation_id, end_conversation_id],
7477                |row| row.get_typed(0),
7478            )
7479            .with_context(|| "listing conversation ids for streamed lexical rebuild")?;
7480
7481        for conversation_id in conversation_ids {
7482            let messages = self
7483                .fetch_messages_for_lexical_rebuild(conversation_id)
7484                .with_context(|| {
7485                    format!("streaming lexical rebuild messages for conversation {conversation_id}")
7486                })?;
7487
7488            for message in messages {
7489                let message_id = message.id.ok_or_else(|| {
7490                    anyhow!(
7491                        "lexical rebuild message missing id for conversation {conversation_id} idx {}",
7492                        message.idx
7493                    )
7494                })?;
7495                f(LexicalRebuildMessageRow {
7496                    conversation_id,
7497                    id: message_id,
7498                    idx: message.idx,
7499                    role: role_str(&message.role),
7500                    author: message.author,
7501                    created_at: message.created_at,
7502                    content: message.content,
7503                })?;
7504            }
7505        }
7506
7507        Ok(())
7508    }
7509
7510    /// Stream grouped lexical rebuild message rows in `(conversation_id, idx)`
7511    /// order by reusing the canonical per-message stream and coalescing rows
7512    /// per conversation.
7513    pub fn stream_grouped_messages_for_lexical_rebuild_between_conversation_ids<F>(
7514        &self,
7515        start_conversation_id: i64,
7516        end_conversation_id: i64,
7517        mut f: F,
7518    ) -> Result<()>
7519    where
7520        F: FnMut(i64, LexicalRebuildGroupedMessageRows, i64) -> Result<()>,
7521    {
7522        if end_conversation_id < start_conversation_id {
7523            return Ok(());
7524        }
7525
7526        let mut current_conversation_id: Option<i64> = None;
7527        let mut current_messages: LexicalRebuildGroupedMessageRows = SmallVec::new();
7528        let mut current_last_message_id = 0i64;
7529        let mut flush_current = |current_conversation_id: &mut Option<i64>,
7530                                 current_messages: &mut LexicalRebuildGroupedMessageRows,
7531                                 current_last_message_id: &mut i64|
7532         -> Result<()> {
7533            let Some(conversation_id) = current_conversation_id.take() else {
7534                return Ok(());
7535            };
7536            let messages = std::mem::take(current_messages);
7537            let last_message_id = std::mem::take(current_last_message_id);
7538            f(conversation_id, messages, last_message_id)
7539        };
7540
7541        self.stream_messages_for_lexical_rebuild_between_conversation_ids(
7542            start_conversation_id,
7543            end_conversation_id,
7544            |row| {
7545                if current_conversation_id != Some(row.conversation_id) {
7546                    flush_current(
7547                        &mut current_conversation_id,
7548                        &mut current_messages,
7549                        &mut current_last_message_id,
7550                    )?;
7551                    current_conversation_id = Some(row.conversation_id);
7552                }
7553                current_last_message_id = row.id;
7554                current_messages.push(LexicalRebuildGroupedMessageRow {
7555                    idx: row.idx,
7556                    is_tool_role: row.role == "tool",
7557                    created_at: row.created_at,
7558                    content: row.content,
7559                });
7560                Ok(())
7561            },
7562        )
7563        .with_context(|| "streaming grouped lexical rebuild messages")?;
7564
7565        flush_current(
7566            &mut current_conversation_id,
7567            &mut current_messages,
7568            &mut current_last_message_id,
7569        )
7570        .with_context(|| "flushing grouped lexical rebuild messages")
7571    }
7572
7573    /// Stream grouped lexical rebuild message rows from a starting conversation
7574    /// id to the end of the table.
7575    pub fn stream_grouped_messages_for_lexical_rebuild_from_conversation_id<F>(
7576        &self,
7577        start_conversation_id: i64,
7578        f: F,
7579    ) -> Result<()>
7580    where
7581        F: FnMut(i64, LexicalRebuildGroupedMessageRows, i64) -> Result<()>,
7582    {
7583        self.stream_grouped_messages_for_lexical_rebuild_between_conversation_ids(
7584            start_conversation_id,
7585            i64::MAX,
7586            f,
7587        )
7588    }
7589
7590    /// Stream lexical rebuild message rows from a starting conversation id to
7591    /// the end of the table.
7592    pub fn stream_messages_for_lexical_rebuild_from_conversation_id<F>(
7593        &self,
7594        start_conversation_id: i64,
7595        f: F,
7596    ) -> Result<()>
7597    where
7598        F: FnMut(LexicalRebuildMessageRow) -> Result<()>,
7599    {
7600        self.stream_messages_for_lexical_rebuild_between_conversation_ids(
7601            start_conversation_id,
7602            i64::MAX,
7603            f,
7604        )
7605    }
7606
7607    /// Get a source by ID.
7608    pub fn get_source(&self, id: &str) -> Result<Option<Source>> {
7609        let result = self.conn.query_row_map(
7610            "SELECT id, kind, host_label, machine_id, platform, config_json, created_at, updated_at FROM sources WHERE id = ?1",
7611            fparams![id],
7612            |row| {
7613                let kind_str: String = row.get_typed(1)?;
7614                let config_json_str: Option<String> = row.get_typed(5)?;
7615                Ok(Source {
7616                    id: row.get_typed(0)?,
7617                    kind: SourceKind::parse(&kind_str).unwrap_or_default(),
7618                    host_label: row.get_typed(2)?,
7619                    machine_id: row.get_typed(3)?,
7620                    platform: row.get_typed(4)?,
7621                    config_json: config_json_str.and_then(|s| serde_json::from_str(&s).ok()),
7622                    created_at: row.get_typed(6)?,
7623                    updated_at: row.get_typed(7)?,
7624                })
7625            },
7626        );
7627        Ok(result.optional()?)
7628    }
7629
7630    /// List all sources.
7631    pub fn list_sources(&self) -> Result<Vec<Source>> {
7632        self.conn
7633            .query_map_collect(
7634                "SELECT id, kind, host_label, machine_id, platform, config_json, created_at, updated_at FROM sources ORDER BY id",
7635                fparams![],
7636                |row| {
7637                    let kind_str: String = row.get_typed(1)?;
7638                    let config_json_str: Option<String> = row.get_typed(5)?;
7639                    Ok(Source {
7640                        id: row.get_typed(0)?,
7641                        kind: SourceKind::parse(&kind_str).unwrap_or_default(),
7642                        host_label: row.get_typed(2)?,
7643                        machine_id: row.get_typed(3)?,
7644                        platform: row.get_typed(4)?,
7645                        config_json: config_json_str.and_then(|s| serde_json::from_str(&s).ok()),
7646                        created_at: row.get_typed(6)?,
7647                        updated_at: row.get_typed(7)?,
7648                    })
7649                },
7650            )
7651            .with_context(|| "listing sources")
7652    }
7653
7654    /// Get IDs of all non-local sources.
7655    pub fn get_source_ids(&self) -> Result<Vec<String>> {
7656        self.conn
7657            .query_map_collect(
7658                "SELECT id FROM sources WHERE id != 'local' ORDER BY id",
7659                fparams![],
7660                |row| row.get_typed(0),
7661            )
7662            .with_context(|| "listing source ids")
7663    }
7664
7665    /// Create or update a source.
7666    pub fn upsert_source(&self, source: &Source) -> Result<()> {
7667        self.invalidate_conversation_source_cache(source.id.as_str());
7668        let now = Self::now_millis();
7669        let kind_str = source.kind.to_string();
7670        let config_json_str = source
7671            .config_json
7672            .as_ref()
7673            .map(serde_json::to_string)
7674            .transpose()?;
7675
7676        // Re-indexing commonly reuses the same normalized source metadata
7677        // across many conversations. Skip the write entirely when the row is
7678        // already identical so we avoid needless WAL churn and timestamp bumps.
7679        self.conn.execute_compat(
7680            "INSERT INTO sources(id, kind, host_label, machine_id, platform, config_json, created_at, updated_at)
7681             VALUES(?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8)
7682             ON CONFLICT(id) DO UPDATE SET
7683                 kind = excluded.kind,
7684                 host_label = excluded.host_label,
7685                 machine_id = excluded.machine_id,
7686                 platform = excluded.platform,
7687                 config_json = excluded.config_json,
7688                 updated_at = excluded.updated_at
7689             WHERE NOT (
7690                 sources.kind IS excluded.kind
7691                 AND sources.host_label IS excluded.host_label
7692                 AND sources.machine_id IS excluded.machine_id
7693                 AND sources.platform IS excluded.platform
7694                 AND sources.config_json IS excluded.config_json
7695             )",
7696            fparams![
7697                source.id.as_str(),
7698                kind_str.as_str(),
7699                source.host_label.as_deref(),
7700                source.machine_id.as_deref(),
7701                source.platform.as_deref(),
7702                config_json_str.as_deref(),
7703                source.created_at.unwrap_or(now),
7704                now
7705            ],
7706        )?;
7707        Ok(())
7708    }
7709
7710    fn historical_bundle_key_hash(
7711        version: u32,
7712        bundle: &HistoricalDatabaseBundle,
7713        include_bundle_stats: bool,
7714    ) -> String {
7715        let signature = if include_bundle_stats {
7716            format!(
7717                "{}:{}:{}:{}",
7718                version,
7719                bundle.root_path.display(),
7720                bundle.total_bytes,
7721                bundle.modified_at_ms
7722            )
7723        } else {
7724            format!("{}:{}", version, bundle.root_path.display())
7725        };
7726        blake3::hash(signature.as_bytes()).to_hex().to_string()
7727    }
7728
7729    fn historical_bundle_meta_key(bundle: &HistoricalDatabaseBundle) -> String {
7730        format!(
7731            "historical_bundle_salvaged:{}",
7732            Self::historical_bundle_key_hash(HISTORICAL_SALVAGE_LEDGER_VERSION, bundle, false)
7733        )
7734    }
7735
7736    fn historical_bundle_legacy_meta_key(bundle: &HistoricalDatabaseBundle) -> String {
7737        let signature = format!(
7738            "{}:{}:{}:{}",
7739            HISTORICAL_SALVAGE_LEDGER_VERSION,
7740            bundle.root_path.display(),
7741            bundle.total_bytes,
7742            bundle.modified_at_ms
7743        );
7744        format!(
7745            "historical_bundle_salvaged:{}",
7746            blake3::hash(signature.as_bytes()).to_hex()
7747        )
7748    }
7749
7750    fn historical_bundle_progress_key(bundle: &HistoricalDatabaseBundle) -> String {
7751        format!(
7752            "historical_bundle_progress:{}",
7753            Self::historical_bundle_key_hash(HISTORICAL_SALVAGE_PROGRESS_VERSION, bundle, false)
7754        )
7755    }
7756
7757    fn historical_bundle_legacy_progress_key(bundle: &HistoricalDatabaseBundle) -> String {
7758        let signature = format!(
7759            "{}:{}:{}:{}",
7760            HISTORICAL_SALVAGE_PROGRESS_VERSION,
7761            bundle.root_path.display(),
7762            bundle.total_bytes,
7763            bundle.modified_at_ms
7764        );
7765        format!(
7766            "historical_bundle_progress:{}",
7767            blake3::hash(signature.as_bytes()).to_hex()
7768        )
7769    }
7770
7771    fn historical_bundle_already_imported(
7772        &self,
7773        bundle: &HistoricalDatabaseBundle,
7774    ) -> Result<bool> {
7775        for key in [
7776            Self::historical_bundle_meta_key(bundle),
7777            Self::historical_bundle_legacy_meta_key(bundle),
7778        ] {
7779            let existing: Option<String> = self
7780                .conn
7781                .query_row_map(
7782                    "SELECT value FROM meta WHERE key = ?1",
7783                    fparams![key.as_str()],
7784                    |row| row.get_typed(0),
7785                )
7786                .optional()?;
7787            if existing.is_some() {
7788                return Ok(true);
7789            }
7790        }
7791        Ok(false)
7792    }
7793
7794    pub(crate) fn has_pending_historical_bundles(&self, canonical_db_path: &Path) -> Result<bool> {
7795        for bundle in discover_historical_database_bundles(canonical_db_path) {
7796            if !self.historical_bundle_already_imported(&bundle)? {
7797                return Ok(true);
7798            }
7799        }
7800        Ok(false)
7801    }
7802
7803    fn load_historical_bundle_progress(
7804        &self,
7805        bundle: &HistoricalDatabaseBundle,
7806    ) -> Result<Option<HistoricalBundleProgress>> {
7807        for key in [
7808            Self::historical_bundle_progress_key(bundle),
7809            Self::historical_bundle_legacy_progress_key(bundle),
7810        ] {
7811            let raw: Option<String> = self
7812                .conn
7813                .query_row_map(
7814                    "SELECT value FROM meta WHERE key = ?1",
7815                    fparams![key.as_str()],
7816                    |row| row.get_typed(0),
7817                )
7818                .optional()?;
7819            let Some(raw) = raw else {
7820                continue;
7821            };
7822            let parsed: HistoricalBundleProgress =
7823                serde_json::from_str(&raw).with_context(|| {
7824                    format!(
7825                        "parsing historical salvage progress checkpoint for {}",
7826                        bundle.root_path.display()
7827                    )
7828                })?;
7829            if parsed.progress_version == HISTORICAL_SALVAGE_PROGRESS_VERSION {
7830                return Ok(Some(parsed));
7831            }
7832        }
7833        Ok(None)
7834    }
7835
7836    fn record_historical_bundle_progress(
7837        &self,
7838        bundle: &HistoricalDatabaseBundle,
7839        method: &str,
7840        last_completed_source_row_id: i64,
7841        conversations_imported: usize,
7842        messages_imported: usize,
7843    ) -> Result<()> {
7844        let key = Self::historical_bundle_progress_key(bundle);
7845        let value = HistoricalBundleProgress {
7846            progress_version: HISTORICAL_SALVAGE_PROGRESS_VERSION,
7847            path: bundle.root_path.display().to_string(),
7848            bytes: bundle.total_bytes,
7849            modified_at_ms: bundle.modified_at_ms,
7850            method: method.to_string(),
7851            last_completed_source_row_id,
7852            conversations_imported,
7853            messages_imported,
7854            updated_at_ms: Self::now_millis(),
7855        };
7856        let value_str = serde_json::to_string(&value)?;
7857        self.conn.execute_compat(
7858            "INSERT OR REPLACE INTO meta(key, value) VALUES(?1, ?2)",
7859            fparams![key.as_str(), value_str.as_str()],
7860        )?;
7861        Ok(())
7862    }
7863
7864    fn clear_historical_bundle_progress(&self, bundle: &HistoricalDatabaseBundle) -> Result<()> {
7865        for key in [
7866            Self::historical_bundle_progress_key(bundle),
7867            Self::historical_bundle_legacy_progress_key(bundle),
7868        ] {
7869            self.conn
7870                .execute_compat("DELETE FROM meta WHERE key = ?1", fparams![key.as_str()])?;
7871        }
7872        Ok(())
7873    }
7874
7875    fn record_historical_bundle_import(
7876        &self,
7877        bundle: &HistoricalDatabaseBundle,
7878        method: &str,
7879        conversations_imported: usize,
7880        messages_imported: usize,
7881    ) -> Result<()> {
7882        let key = Self::historical_bundle_meta_key(bundle);
7883        let value = serde_json::json!({
7884            "salvage_version": HISTORICAL_SALVAGE_LEDGER_VERSION,
7885            "path": bundle.root_path.display().to_string(),
7886            "bytes": bundle.total_bytes,
7887            "modified_at_ms": bundle.modified_at_ms,
7888            "method": method,
7889            "conversations_imported": conversations_imported,
7890            "messages_imported": messages_imported,
7891            "recorded_at_ms": Self::now_millis(),
7892        });
7893        let value_str = serde_json::to_string(&value)?;
7894        self.conn.execute_compat(
7895            "INSERT OR REPLACE INTO meta(key, value) VALUES(?1, ?2)",
7896            fparams![key.as_str(), value_str.as_str()],
7897        )?;
7898        Ok(())
7899    }
7900
7901    fn historical_import_error_is_split_retryable(err: &anyhow::Error) -> bool {
7902        const RETRYABLE_PATTERNS: &[&str] = &[
7903            "out of memory",
7904            "string or blob too big",
7905            "too many sql variables",
7906        ];
7907        err.chain().any(|cause| {
7908            let rendered = cause.to_string().to_ascii_lowercase();
7909            RETRYABLE_PATTERNS
7910                .iter()
7911                .any(|pattern| rendered.contains(pattern))
7912        })
7913    }
7914
7915    fn split_historical_batch_entry_messages(
7916        entry: &HistoricalBatchEntry,
7917    ) -> Option<(HistoricalBatchEntry, HistoricalBatchEntry)> {
7918        if entry.conversation.messages.len() < 2 {
7919            return None;
7920        }
7921        let split_at = entry.conversation.messages.len() / 2;
7922        if split_at == 0 || split_at >= entry.conversation.messages.len() {
7923            return None;
7924        }
7925
7926        let mut left = entry.clone();
7927        left.conversation.messages = entry.conversation.messages[..split_at].to_vec();
7928
7929        let mut right = entry.clone();
7930        right.conversation.messages = entry.conversation.messages[split_at..].to_vec();
7931
7932        Some((left, right))
7933    }
7934
7935    fn import_historical_batch_with_retry<F>(
7936        entries: &[HistoricalBatchEntry],
7937        insert_batch: &mut F,
7938    ) -> Result<HistoricalBatchImportTotals>
7939    where
7940        F: FnMut(&[HistoricalBatchEntry]) -> Result<HistoricalBatchImportTotals>,
7941    {
7942        match insert_batch(entries) {
7943            Ok(totals) => Ok(totals),
7944            Err(err) if Self::historical_import_error_is_split_retryable(&err) => {
7945                if entries.len() > 1 {
7946                    let mid = entries.len() / 2;
7947                    tracing::warn!(
7948                        batch_entries = entries.len(),
7949                        split_left = mid,
7950                        split_right = entries.len() - mid,
7951                        error = %err,
7952                        "historical salvage batch failed; retrying in smaller sub-batches"
7953                    );
7954                    let left =
7955                        Self::import_historical_batch_with_retry(&entries[..mid], insert_batch)?;
7956                    let right =
7957                        Self::import_historical_batch_with_retry(&entries[mid..], insert_batch)?;
7958                    return Ok(HistoricalBatchImportTotals {
7959                        inserted_source_rows: left.inserted_source_rows
7960                            + right.inserted_source_rows,
7961                        inserted_messages: left.inserted_messages + right.inserted_messages,
7962                    });
7963                }
7964
7965                if let Some(entry) = entries.first()
7966                    && let Some((left, right)) = Self::split_historical_batch_entry_messages(entry)
7967                {
7968                    tracing::warn!(
7969                        source_row_id = entry.source_row_id,
7970                        message_count = entry.conversation.messages.len(),
7971                        error = %err,
7972                        "historical salvage conversation failed; retrying in smaller message slices"
7973                    );
7974                    let left_totals = Self::import_historical_batch_with_retry(
7975                        std::slice::from_ref(&left),
7976                        insert_batch,
7977                    )?;
7978                    let right_totals = Self::import_historical_batch_with_retry(
7979                        std::slice::from_ref(&right),
7980                        insert_batch,
7981                    )?;
7982                    return Ok(HistoricalBatchImportTotals {
7983                        inserted_source_rows: usize::from(
7984                            left_totals.inserted_source_rows > 0
7985                                || right_totals.inserted_source_rows > 0,
7986                        ),
7987                        inserted_messages: left_totals
7988                            .inserted_messages
7989                            .saturating_add(right_totals.inserted_messages),
7990                    });
7991                }
7992
7993                Err(err)
7994            }
7995            Err(err) => Err(err),
7996        }
7997    }
7998
7999    fn import_historical_sources(&self, source_conn: &FrankenConnection) -> Result<()> {
8000        let sources: Vec<Source> = match source_conn.query_map_collect(
8001            "SELECT id, kind, host_label, machine_id, platform, config_json, created_at, updated_at
8002             FROM sources",
8003            fparams![],
8004            |row| {
8005                let raw_source_id: String = row.get_typed(0)?;
8006                let kind_str: String = row.get_typed(1)?;
8007                let raw_host_label: Option<String> = row.get_typed(2)?;
8008                let config_json_raw: Option<String> = row.get_typed(5)?;
8009                let (source_id, source_kind, host_label) = normalized_storage_source_parts(
8010                    Some(raw_source_id.as_str()),
8011                    Some(kind_str.as_str()),
8012                    raw_host_label.as_deref(),
8013                );
8014                Ok(Source {
8015                    id: source_id,
8016                    kind: source_kind,
8017                    host_label,
8018                    machine_id: row.get_typed(3)?,
8019                    platform: row.get_typed(4)?,
8020                    config_json: config_json_raw.and_then(|raw| serde_json::from_str(&raw).ok()),
8021                    created_at: row.get_typed(6)?,
8022                    updated_at: row.get_typed(7)?,
8023                })
8024            },
8025        ) {
8026            Ok(rows) => rows,
8027            Err(err) => {
8028                tracing::warn!(error = %err, "historical sources table unavailable; skipping source import");
8029                return Ok(());
8030            }
8031        };
8032
8033        for source in sources {
8034            self.upsert_source(&source)?;
8035        }
8036        Ok(())
8037    }
8038
/// Imports every conversation (with its messages) from an opened historical
/// bundle into this storage, resuming from a saved checkpoint when one exists.
///
/// Conversations are read from `source_conn` in ascending `id` order; when a
/// checkpoint with a positive `last_completed_source_row_id` is present, only
/// rows with a larger id are read. Conversations without messages are skipped.
/// Rows are grouped into batches bounded by `historical_import_batch_limits()`
/// and every flushed batch records a progress checkpoint for `bundle`.
///
/// Returns `(conversations, messages)` imported; both counters start from the
/// totals stored in the checkpoint when resuming.
///
/// # Errors
/// Propagates failures from the source queries, source/agent/workspace
/// upserts, batch inserts, and checkpoint writes.
8039    fn import_historical_conversations(
8040        &self,
8041        bundle: &HistoricalDatabaseBundle,
8042        salvage_method: &str,
8043        source_conn: &FrankenConnection,
8044    ) -> Result<(usize, usize)> {
8045        let batch_limits = historical_import_batch_limits();
8046        let cache_enabled = IndexingCache::is_enabled();
8047        let mut indexing_cache = IndexingCache::new();
8048        let mut known_sources: HashSet<String> = self
8049            .list_sources()?
8050            .into_iter()
8051            .map(|source| source.id)
8052            .collect();
8053        let resume_progress = self.load_historical_bundle_progress(bundle)?;
8054        let resume_after_row_id = resume_progress
8055            .as_ref()
8056            .map(|progress| progress.last_completed_source_row_id)
8057            .filter(|row_id| *row_id > 0);
8058
8059        tracing::info!(
8060            target: "cass::historical_salvage",
8061            batch_conversations = batch_limits.conversations,
8062            batch_messages = batch_limits.messages,
8063            batch_payload_chars = batch_limits.payload_chars,
8064            cache_enabled,
8065            resume_after_row_id,
8066            "configured historical salvage batch limits"
8067        );
8068
8069        if let Some(progress) = &resume_progress {
8070            tracing::info!(
8071                target: "cass::historical_salvage",
8072                path = %bundle.root_path.display(),
8073                resume_after_row_id = progress.last_completed_source_row_id,
8074                prior_conversations_imported = progress.conversations_imported,
8075                prior_messages_imported = progress.messages_imported,
8076                "resuming historical salvage bundle from durable checkpoint"
8077            );
8078        }
8079
8080        // LEFT JOIN + COALESCE on agents so legacy source databases with NULL
8081        // agent_id (the V1 schema did not require NOT NULL) still have their
8082        // conversations imported, degrading to 'unknown' slug like the other
8083        // rebuild paths.  Using INNER JOIN here would silently drop those
8084        // conversations during historical salvage, which is data loss.
8085        let conv_sql = if resume_after_row_id.is_some() {
8086            "SELECT
8087                c.id,
8088                COALESCE(a.slug, 'unknown'),
8089                w.path,
8090                c.external_id,
8091                c.title,
8092                c.source_path,
8093                c.started_at,
8094                c.ended_at,
8095                c.approx_tokens,
8096                c.metadata_json,
8097                c.source_id,
8098                c.origin_host
8099             FROM conversations c
8100             LEFT JOIN agents a ON c.agent_id = a.id
8101             LEFT JOIN workspaces w ON c.workspace_id = w.id
8102             WHERE c.id > ?1
8103             ORDER BY c.id"
8104        } else {
8105            "SELECT
8106                c.id,
8107                COALESCE(a.slug, 'unknown'),
8108                w.path,
8109                c.external_id,
8110                c.title,
8111                c.source_path,
8112                c.started_at,
8113                c.ended_at,
8114                c.approx_tokens,
8115                c.metadata_json,
8116                c.source_id,
8117                c.origin_host
8118             FROM conversations c
8119             LEFT JOIN agents a ON c.agent_id = a.id
8120             LEFT JOIN workspaces w ON c.workspace_id = w.id
8121             ORDER BY c.id"
8122        };
            // Bind the resume row id only when the query variant containing `?1`
            // was selected above; the full-scan variant takes no parameters.
8123        let conv_params: &[ParamValue] =
8124            if let Some(last_completed_source_row_id) = resume_after_row_id {
8125                &[ParamValue::from(last_completed_source_row_id)]
8126            } else {
8127                &[]
8128            };
8129
8130        #[allow(clippy::type_complexity)]
8131        let conv_rows: Vec<(
8132            i64,
8133            String,
8134            Option<String>,
8135            Option<String>,
8136            Option<String>,
8137            String,
8138            Option<i64>,
8139            Option<i64>,
8140            Option<i64>,
8141            Option<String>,
8142            Option<String>,
8143            Option<String>,
8144        )> = source_conn
8145            .query_map_collect(conv_sql, conv_params, |row| {
8146                Ok((
8147                    row.get_typed::<i64>(0)?,
8148                    row.get_typed::<String>(1)?,
8149                    row.get_typed::<Option<String>>(2)?,
8150                    row.get_typed::<Option<String>>(3)?,
8151                    row.get_typed::<Option<String>>(4)?,
8152                    row.get_typed::<String>(5)?,
8153                    row.get_typed::<Option<i64>>(6)?,
8154                    row.get_typed::<Option<i64>>(7)?,
8155                    row.get_typed::<Option<i64>>(8)?,
8156                    row.get_typed::<Option<String>>(9)?,
8157                    row.get_typed::<Option<String>>(10)?,
8158                    row.get_typed::<Option<String>>(11)?,
8159                ))
8160            })
8161            .context("querying historical conversations")?;
8162
8163        let msg_sql = "SELECT idx, role, author, created_at, content, extra_json
8164             FROM messages
8165             WHERE conversation_id = ?1
8166             ORDER BY idx";
8167
            // Seed the running totals from the checkpoint so a resumed run reports
            // cumulative counts rather than only what this pass inserted.
8168        let mut imported_conversations = resume_progress
8169            .as_ref()
8170            .map(|progress| progress.conversations_imported)
8171            .unwrap_or(0);
8172        let mut imported_messages = resume_progress
8173            .as_ref()
8174            .map(|progress| progress.messages_imported)
8175            .unwrap_or(0);
8176        let mut pending_batch: Vec<HistoricalBatchEntry> = Vec::new();
8177        let mut pending_batch_messages = 0usize;
8178        let mut pending_batch_chars = 0usize;
8179        let mut pending_batch_first_row_id: Option<i64> = None;
8180        let mut pending_batch_last_row_id: Option<i64> = None;
8181
            // Inserts the pending batch (with split-and-retry on size/memory
            // errors), adds the inserted counts to the running totals, writes a
            // checkpoint at the batch's last source row id, and finally resets the
            // pending-batch accumulators. An empty batch is a no-op.
8182        let flush_batch = |storage: &FrankenStorage,
8183                           batch: &mut Vec<HistoricalBatchEntry>,
8184                           pending_messages: &mut usize,
8185                           pending_chars: &mut usize,
8186                           first_row_id: &mut Option<i64>,
8187                           last_row_id: &mut Option<i64>,
8188                           imported_conversations: &mut usize,
8189                           imported_messages: &mut usize|
8190         -> Result<()> {
8191            if batch.is_empty() {
8192                return Ok(());
8193            }
8194
8195            let batch_first_row_id = *first_row_id;
8196            let batch_last_row_id = *last_row_id;
8197            if historical_salvage_debug_enabled() {
8198                eprintln!(
8199                    "[historical-salvage] flushing batch rows {:?}..{:?} conversations={} messages={} payload_chars={}",
8200                    batch_first_row_id,
8201                    batch_last_row_id,
8202                    batch.len(),
8203                    *pending_messages,
8204                    *pending_chars
8205                );
8206            }
8207            tracing::info!(
8208                target: "cass::historical_salvage",
8209                batch_conversations = batch.len(),
8210                batch_messages = *pending_messages,
8211                batch_payload_chars = *pending_chars,
8212                first_source_row_id = batch_first_row_id,
8213                last_source_row_id = batch_last_row_id,
8214                "flushing historical salvage batch"
8215            );
8216
                // Adapter handed to the retry helper: inserts a slice of entries and
                // counts an entry as imported only if at least one of its messages
                // was actually inserted.
8217            let mut insert_batch =
8218                |entries: &[HistoricalBatchEntry]| -> Result<HistoricalBatchImportTotals> {
8219                    let borrowed_batch: Vec<(i64, Option<i64>, &Conversation)> = entries
8220                        .iter()
8221                        .map(|entry| (entry.agent_id, entry.workspace_id, &entry.conversation))
8222                        .collect();
8223                    let outcomes = storage
8224                        .insert_conversations_batched(&borrowed_batch)
8225                        .with_context(|| {
8226                            let first_source_row_id =
8227                                entries.first().map(|entry| entry.source_row_id);
8228                            let last_source_row_id =
8229                                entries.last().map(|entry| entry.source_row_id);
8230                            format!(
8231                                "inserting historical salvage batch source rows {:?}..{:?}",
8232                                first_source_row_id, last_source_row_id
8233                            )
8234                        })?;
8235                    let mut totals = HistoricalBatchImportTotals::default();
8236                    for outcome in outcomes {
8237                        if !outcome.inserted_indices.is_empty() {
8238                            totals.inserted_source_rows += 1;
8239                            totals.inserted_messages += outcome.inserted_indices.len();
8240                        }
8241                    }
8242                    Ok(totals)
8243                };
8244            let totals =
8245                Self::import_historical_batch_with_retry(batch.as_slice(), &mut insert_batch)?;
8246            *imported_conversations =
8247                (*imported_conversations).saturating_add(totals.inserted_source_rows);
8248            *imported_messages = (*imported_messages).saturating_add(totals.inserted_messages);
8249            if let Some(last_completed_row_id) = batch_last_row_id {
8250                storage.record_historical_bundle_progress(
8251                    bundle,
8252                    salvage_method,
8253                    last_completed_row_id,
8254                    *imported_conversations,
8255                    *imported_messages,
8256                )?;
8257            }
8258            tracing::info!(
8259                target: "cass::historical_salvage",
8260                batch_conversations = batch.len(),
8261                batch_messages = *pending_messages,
8262                imported_conversations = *imported_conversations,
8263                imported_messages = *imported_messages,
8264                first_source_row_id = batch_first_row_id,
8265                last_source_row_id = batch_last_row_id,
8266                "historical salvage batch committed"
8267            );
8268            if historical_salvage_debug_enabled() {
8269                eprintln!(
8270                    "[historical-salvage] committed batch rows {:?}..{:?} imported_conversations={} imported_messages={}",
8271                    batch_first_row_id,
8272                    batch_last_row_id,
8273                    *imported_conversations,
8274                    *imported_messages
8275                );
8276            }
8277            batch.clear();
8278            *pending_messages = 0;
8279            *pending_chars = 0;
8280            *first_row_id = None;
8281            *last_row_id = None;
8282            Ok(())
8283        };
8284
8285        for (
8286            conversation_row_id,
8287            agent_slug,
8288            workspace_path,
8289            external_id,
8290            title,
8291            source_path,
8292            started_at,
8293            ended_at,
8294            approx_tokens,
8295            metadata_json_raw,
8296            raw_source_id,
8297            raw_origin_host,
8298        ) in conv_rows
8299        {
8300            let source_id = crate::search::tantivy::normalized_index_source_id(
8301                raw_source_id.as_deref(),
8302                None,
8303                raw_origin_host.as_deref(),
8304            );
8305            let origin_host =
8306                crate::search::tantivy::normalized_index_origin_host(raw_origin_host.as_deref());
8307
8308            let messages: Vec<Message> = source_conn
8309                .query_map_collect(msg_sql, fparams![conversation_row_id], |msg_row| {
8310                    let role: String = msg_row.get_typed(1)?;
8311                    Ok(Message {
8312                        id: None,
8313                        idx: msg_row.get_typed(0)?,
8314                        role: match role.as_str() {
8315                            "user" => MessageRole::User,
8316                            "agent" | "assistant" => MessageRole::Agent,
8317                            "tool" => MessageRole::Tool,
8318                            "system" => MessageRole::System,
8319                            other => MessageRole::Other(other.to_string()),
8320                        },
8321                        author: msg_row.get_typed(2)?,
8322                        created_at: msg_row.get_typed(3)?,
8323                        content: msg_row.get_typed(4)?,
8324                        extra_json: parse_historical_json_column(msg_row.get_typed(5)?),
8325                        snippets: Vec::new(),
8326                    })
8327                })
8328                .context("collecting historical message rows")?;
8329
                // Conversations with no message rows are not imported.
8330            if messages.is_empty() {
8331                continue;
8332            }
8333
8334            let conversation_message_count = messages.len();
8335            let conversation_chars = messages
8336                .iter()
8337                .map(message_payload_size_hint)
8338                .sum::<usize>();
8339
8340            let conversation = Conversation {
8341                id: None,
8342                agent_slug: agent_slug.clone(),
8343                workspace: workspace_path.map(PathBuf::from),
8344                external_id,
8345                title,
8346                source_path: PathBuf::from(source_path),
8347                started_at,
8348                ended_at,
8349                approx_tokens,
8350                metadata_json: parse_json_column(metadata_json_raw),
8351                messages,
8352                source_id,
8353                origin_host,
8354            };
8355
                // Make sure a source record exists for this conversation's source
                // id: the local id maps to `Source::local()`, any other unknown id
                // gets a placeholder SSH source. Each id is upserted once per run.
8356            if !known_sources.contains(&conversation.source_id) {
8357                let placeholder = if conversation.source_id == LOCAL_SOURCE_ID {
8358                    Source::local()
8359                } else {
8360                    Source {
8361                        id: conversation.source_id.clone(),
8362                        kind: SourceKind::Ssh,
8363                        host_label: conversation.origin_host.clone(),
8364                        machine_id: None,
8365                        platform: None,
8366                        config_json: None,
8367                        created_at: None,
8368                        updated_at: None,
8369                    }
8370                };
8371                self.upsert_source(&placeholder)?;
8372                known_sources.insert(conversation.source_id.clone());
8373            }
8374
8375            let agent = Agent {
8376                id: None,
8377                slug: agent_slug.clone(),
8378                name: agent_slug,
8379                version: None,
8380                kind: AgentKind::Cli,
8381            };
8382            let agent_id = if cache_enabled {
8383                indexing_cache.get_or_insert_agent(self, &agent)?
8384            } else {
8385                self.ensure_agent(&agent)?
8386            };
8387            let workspace_id = if let Some(workspace) = &conversation.workspace {
8388                if cache_enabled {
8389                    Some(indexing_cache.get_or_insert_workspace(self, workspace, None)?)
8390                } else {
8391                    Some(self.ensure_workspace(workspace, None)?)
8392                }
8393            } else {
8394                None
8395            };
8396
                // Flush before adding when the batch is already at the conversation
                // cap, or when adding this conversation would push it past the
                // message or payload cap. An empty batch always accepts the
                // conversation, even if that conversation alone exceeds a cap.
8397            let exceeds_pending_limits = !pending_batch.is_empty()
8398                && (pending_batch.len() >= batch_limits.conversations
8399                    || pending_batch_messages.saturating_add(conversation_message_count)
8400                        > batch_limits.messages
8401                    || pending_batch_chars.saturating_add(conversation_chars)
8402                        > batch_limits.payload_chars);
8403            if exceeds_pending_limits {
8404                flush_batch(
8405                    self,
8406                    &mut pending_batch,
8407                    &mut pending_batch_messages,
8408                    &mut pending_batch_chars,
8409                    &mut pending_batch_first_row_id,
8410                    &mut pending_batch_last_row_id,
8411                    &mut imported_conversations,
8412                    &mut imported_messages,
8413                )?;
8414            }
8415
8416            if pending_batch_first_row_id.is_none() {
8417                pending_batch_first_row_id = Some(conversation_row_id);
8418            }
8419            pending_batch_last_row_id = Some(conversation_row_id);
8420            pending_batch_messages =
8421                pending_batch_messages.saturating_add(conversation_message_count);
8422            pending_batch_chars = pending_batch_chars.saturating_add(conversation_chars);
8423            pending_batch.push(HistoricalBatchEntry {
8424                source_row_id: conversation_row_id,
8425                agent_id,
8426                workspace_id,
8427                conversation,
8428            });
8429
                // Flush right away once any cap has been reached.
8430            if pending_batch.len() >= batch_limits.conversations
8431                || pending_batch_messages >= batch_limits.messages
8432                || pending_batch_chars >= batch_limits.payload_chars
8433            {
8434                flush_batch(
8435                    self,
8436                    &mut pending_batch,
8437                    &mut pending_batch_messages,
8438                    &mut pending_batch_chars,
8439                    &mut pending_batch_first_row_id,
8440                    &mut pending_batch_last_row_id,
8441                    &mut imported_conversations,
8442                    &mut imported_messages,
8443                )?;
8444            }
8445        }
8446
            // Flush whatever is still pending after the last source row.
8447        flush_batch(
8448            self,
8449            &mut pending_batch,
8450            &mut pending_batch_messages,
8451            &mut pending_batch_chars,
8452            &mut pending_batch_first_row_id,
8453            &mut pending_batch_last_row_id,
8454            &mut imported_conversations,
8455            &mut imported_messages,
8456        )?;
8457
8458        if cache_enabled {
8459            let (hits, misses, hit_rate) = indexing_cache.stats();
8460            tracing::info!(
8461                target: "cass::historical_salvage",
8462                hits,
8463                misses,
8464                hit_rate = format!("{:.1}%", hit_rate * 100.0),
8465                agents = indexing_cache.agent_count(),
8466                workspaces = indexing_cache.workspace_count(),
8467                sources = known_sources.len(),
8468                "historical salvage cache stats"
8469            );
8470        }
8471
8472        Ok((imported_conversations, imported_messages))
8473    }
8474
8475    pub fn salvage_historical_databases(
8476        &self,
8477        canonical_db_path: &Path,
8478    ) -> Result<HistoricalSalvageOutcome> {
8479        let ordered_bundles = discover_historical_database_bundles(canonical_db_path);
8480        let mut outcome = HistoricalSalvageOutcome {
8481            bundles_considered: ordered_bundles.len(),
8482            ..HistoricalSalvageOutcome::default()
8483        };
8484
8485        for bundle in ordered_bundles {
8486            if self.historical_bundle_already_imported(&bundle)? {
8487                self.clear_historical_bundle_progress(&bundle)?;
8488                continue;
8489            }
8490
8491            let source = match open_historical_bundle_for_salvage(&bundle).with_context(|| {
8492                format!(
8493                    "opening historical bundle {} for salvage",
8494                    bundle.root_path.display()
8495                )
8496            }) {
8497                Ok(source) => source,
8498                Err(err) => {
8499                    tracing::warn!(
8500                        path = %bundle.root_path.display(),
8501                        error = %err,
8502                        "skipping unreadable historical cass database bundle during salvage"
8503                    );
8504                    self.clear_historical_bundle_progress(&bundle)?;
8505                    continue;
8506                }
8507            };
8508
8509            // #247 (coding_agent_session_search-r8pcy): if a per-bundle progress
8510            // checkpoint already covers the backup's entire conversation row-id
8511            // space, the bundle was effectively fully imported but the daemon was
8512            // killed (e.g. OOM) before the completion ledger marker landed.
8513            // Re-scanning it is a pure O(n) no-op — every batch commits
8514            // imported=0 while taking 5-12 min. Detect it via the high-water
8515            // checkpoint, write the ledger marker, drop the checkpoint, and skip.
8516            if let Some(progress) = self.load_historical_bundle_progress(&bundle)? {
8517                let backup_max_conversation_id: i64 = source
8518                    .conn
8519                    .query_row_map(
8520                        "SELECT COALESCE(MAX(id), 0) FROM conversations",
8521                        fparams![],
8522                        |row| row.get_typed(0),
8523                    )
8524                    .unwrap_or(0);
8525                if backup_max_conversation_id > 0
8526                    && progress.last_completed_source_row_id >= backup_max_conversation_id
8527                {
8528                    self.record_historical_bundle_import(
8529                        &bundle,
8530                        source.method,
8531                        progress.conversations_imported,
8532                        progress.messages_imported,
8533                    )?;
8534                    self.clear_historical_bundle_progress(&bundle)?;
8535                    tracing::info!(
8536                        path = %bundle.root_path.display(),
8537                        last_completed_source_row_id = progress.last_completed_source_row_id,
8538                        backup_max_conversation_id,
8539                        conversations_imported = progress.conversations_imported,
8540                        messages_imported = progress.messages_imported,
8541                        "historical bundle already fully imported per checkpoint; marking salvaged and skipping O(n) re-scan"
8542                    );
8543                    continue;
8544                }
8545            }
8546
8547            self.import_historical_sources(&source.conn)?;
8548            let (imported_conversations, imported_messages) =
8549                self.import_historical_conversations(&bundle, source.method, &source.conn)?;
8550            self.record_historical_bundle_import(
8551                &bundle,
8552                source.method,
8553                imported_conversations,
8554                imported_messages,
8555            )?;
8556            self.clear_historical_bundle_progress(&bundle)?;
8557
8558            outcome.bundles_imported += 1;
8559            outcome.conversations_imported += imported_conversations;
8560            outcome.messages_imported += imported_messages;
8561
8562            tracing::info!(
8563                path = %bundle.root_path.display(),
8564                bytes = bundle.total_bytes,
8565                method = source.method,
8566                imported_conversations,
8567                imported_messages,
8568                "salvaged historical cass database bundle"
8569            );
8570        }
8571
8572        Ok(outcome)
8573    }
8574
8575    /// Delete a source by ID. Returns true if a row was deleted.
8576    pub fn delete_source(&self, id: &str, _cascade: bool) -> Result<bool> {
8577        if id == LOCAL_SOURCE_ID {
8578            anyhow::bail!("cannot delete the local source");
8579        }
8580        let count = self
8581            .conn
8582            .execute_compat("DELETE FROM sources WHERE id = ?1", fparams![id])?;
8583        if count > 0 {
8584            self.invalidate_conversation_source_cache(id);
8585        }
8586        Ok(count > 0)
8587    }
8588
8589    /// Insert a conversation tree (conversation + messages + snippets + FTS).
8590    pub fn insert_conversation_tree(
8591        &self,
8592        agent_id: i64,
8593        workspace_id: Option<i64>,
8594        conv: &Conversation,
8595    ) -> Result<InsertOutcome> {
8596        let normalized_conv = normalized_conversation_for_storage(conv);
8597        let conv = normalized_conv.as_ref();
8598        self.ensure_source_for_conversation(conv)?;
8599        let defer_lexical_updates = defer_storage_lexical_updates_enabled();
8600        let defer_analytics_updates = defer_analytics_updates_enabled();
8601        let conversation_key = conversation_merge_key(agent_id, conv);
8602        let mut tx = self.conn.transaction()?;
8603        let existing = franken_find_existing_conversation_with_tail_by_key(
8604            &tx,
8605            &conversation_key,
8606            Some(conv),
8607        )?;
8608        if let Some(existing) = existing {
8609            let outcome = self.franken_append_messages_with_tail_in_tx(
8610                &tx,
8611                agent_id,
8612                existing.id,
8613                conv,
8614                existing.tail_state,
8615                defer_lexical_updates,
8616                defer_analytics_updates,
8617            )?;
8618            tx.commit()?;
8619            return Ok(outcome);
8620        }
8621
8622        let conv_id = match franken_insert_conversation_or_get_existing_after_miss(
8623            &tx,
8624            agent_id,
8625            workspace_id,
8626            conv,
8627            &conversation_key,
8628        )? {
8629            ConversationInsertStatus::Inserted(conv_id) => conv_id,
8630            ConversationInsertStatus::Existing(existing_id) => {
8631                let ExistingMessageLookup {
8632                    by_idx: mut existing_messages,
8633                    replay: mut existing_replay_fingerprints,
8634                } = franken_existing_message_lookup(&tx, existing_id, &conv.messages)?;
8635                let ExistingConversationNewMessages {
8636                    messages: new_messages,
8637                    new_chars,
8638                    idx_collision_count,
8639                    first_collision_idx,
8640                } = collect_new_messages_for_existing_conversation(
8641                    existing_id,
8642                    conv,
8643                    &mut existing_messages,
8644                    &mut existing_replay_fingerprints,
8645                    "skipping replay-equivalent recovered message with shifted idx",
8646                );
8647                let (inserted_last_idx, inserted_last_created_at) =
8648                    borrowed_messages_tail_state(&new_messages);
8649                let mut inserted_indices = Vec::new();
8650                let mut fts_entries = Vec::new();
8651                let mut fts_pending_chars = 0usize;
8652                let mut _fts_inserted_total = 0usize;
8653                let inserted_message_ids =
8654                    franken_append_insert_new_messages(&tx, existing_id, &new_messages)?;
8655                for (msg_id, msg) in inserted_message_ids.into_iter().zip(new_messages) {
8656                    franken_insert_snippets(&tx, msg_id, &msg.snippets)?;
8657                    if !defer_lexical_updates {
8658                        fts_entries.push(FtsEntry::from_message(msg_id, msg, conv));
8659                        fts_pending_chars = fts_pending_chars.saturating_add(msg.content.len());
8660                        if fts_entries.len() >= FTS_ENTRY_BATCH_MAX_DOCS
8661                            || fts_pending_chars >= FTS_ENTRY_BATCH_MAX_CHARS
8662                        {
8663                            flush_pending_fts_entries(
8664                                self,
8665                                &tx,
8666                                &mut fts_entries,
8667                                &mut fts_pending_chars,
8668                                &mut _fts_inserted_total,
8669                            )?;
8670                        }
8671                    }
8672                    inserted_indices.push(msg.idx);
8673                }
8674
8675                if idx_collision_count > 0 {
8676                    tracing::warn!(
8677                        conversation_id = existing_id,
8678                        collision_count = idx_collision_count,
8679                        first_idx = first_collision_idx,
8680                        source_path = %conv.source_path.display(),
8681                        "message idx collisions encountered while merging recovered conversation; retaining canonical message variants"
8682                    );
8683                }
8684
8685                if !defer_lexical_updates {
8686                    flush_pending_fts_entries(
8687                        self,
8688                        &tx,
8689                        &mut fts_entries,
8690                        &mut fts_pending_chars,
8691                        &mut _fts_inserted_total,
8692                    )?;
8693                }
8694
8695                let conv_last_ts = conv.messages.iter().filter_map(|m| m.created_at).max();
8696                franken_update_conversation_tail_state(
8697                    &tx,
8698                    existing_id,
8699                    conv_last_ts,
8700                    inserted_last_idx,
8701                    inserted_last_created_at,
8702                )?;
8703                if let Some(lookup_key) = conversation_external_lookup_key_for_conv(agent_id, conv)
8704                {
8705                    franken_update_external_conversation_tail_lookup_key(
8706                        &tx,
8707                        &lookup_key,
8708                        conv_last_ts,
8709                        inserted_last_idx,
8710                        inserted_last_created_at,
8711                    )?;
8712                }
8713
8714                if !defer_analytics_updates && !inserted_indices.is_empty() {
8715                    franken_update_daily_stats_in_tx(
8716                        self,
8717                        &tx,
8718                        &conv.agent_slug,
8719                        &conv.source_id,
8720                        conversation_effective_started_at(conv),
8721                        StatsDelta {
8722                            session_count_delta: 0,
8723                            message_count_delta: inserted_indices.len() as i64,
8724                            total_chars_delta: new_chars,
8725                        },
8726                    )?;
8727                }
8728
8729                tx.commit()?;
8730                return Ok(InsertOutcome {
8731                    conversation_id: existing_id,
8732                    conversation_inserted: false,
8733                    inserted_indices,
8734                });
8735            }
8736        };
8737        let mut fts_entries = Vec::new();
8738        let mut fts_pending_chars = 0usize;
8739        let mut _fts_inserted_total = 0usize;
8740        let mut total_chars: i64 = 0;
8741        let mut inserted_indices = Vec::new();
8742        let mut pending_messages = HashMap::new();
8743        let mut pending_replay_fingerprints = HashSet::new();
8744        let mut idx_collision_count = 0usize;
8745        let mut first_collision_idx: Option<i64> = None;
8746        let mut new_messages = Vec::new();
8747        for msg in &conv.messages {
8748            let incoming_fingerprint = message_merge_fingerprint(msg);
8749            if let Some(existing_fingerprint) = pending_messages.get(&msg.idx) {
8750                if existing_fingerprint != &incoming_fingerprint {
8751                    idx_collision_count = idx_collision_count.saturating_add(1);
8752                    first_collision_idx.get_or_insert(msg.idx);
8753                }
8754                continue;
8755            }
8756            let incoming_replay = message_replay_fingerprint(msg);
8757            if pending_replay_fingerprints.contains(&incoming_replay) {
8758                tracing::debug!(
8759                    conversation_id = conv_id,
8760                    idx = msg.idx,
8761                    source_path = %conv.source_path.display(),
8762                    "skipping replay-equivalent duplicate message within new conversation insert"
8763                );
8764                continue;
8765            }
8766            pending_messages.insert(msg.idx, incoming_fingerprint);
8767            pending_replay_fingerprints.insert(incoming_replay);
8768            new_messages.push(msg);
8769        }
8770        let inserted_message_ids = franken_batch_insert_new_messages(&tx, conv_id, &new_messages)?;
8771        for (msg_id, msg) in inserted_message_ids.into_iter().zip(new_messages) {
8772            franken_insert_snippets(&tx, msg_id, &msg.snippets)?;
8773            if !defer_lexical_updates {
8774                fts_entries.push(FtsEntry::from_message(msg_id, msg, conv));
8775                fts_pending_chars = fts_pending_chars.saturating_add(msg.content.len());
8776                if fts_entries.len() >= FTS_ENTRY_BATCH_MAX_DOCS
8777                    || fts_pending_chars >= FTS_ENTRY_BATCH_MAX_CHARS
8778                {
8779                    flush_pending_fts_entries(
8780                        self,
8781                        &tx,
8782                        &mut fts_entries,
8783                        &mut fts_pending_chars,
8784                        &mut _fts_inserted_total,
8785                    )?;
8786                }
8787            }
8788            total_chars += msg.content.len() as i64;
8789            inserted_indices.push(msg.idx);
8790        }
8791        if idx_collision_count > 0 {
8792            tracing::warn!(
8793                conversation_id = conv_id,
8794                collision_count = idx_collision_count,
8795                first_idx = first_collision_idx,
8796                source_path = %conv.source_path.display(),
8797                "message idx collisions encountered while inserting a new conversation; retaining the first canonical variant per idx"
8798            );
8799        }
8800        if !defer_lexical_updates {
8801            flush_pending_fts_entries(
8802                self,
8803                &tx,
8804                &mut fts_entries,
8805                &mut fts_pending_chars,
8806                &mut _fts_inserted_total,
8807            )?;
8808        }
8809
8810        if !defer_analytics_updates {
8811            franken_update_daily_stats_in_tx(
8812                self,
8813                &tx,
8814                &conv.agent_slug,
8815                &conv.source_id,
8816                conversation_effective_started_at(conv),
8817                StatsDelta {
8818                    session_count_delta: 1,
8819                    message_count_delta: inserted_indices.len() as i64,
8820                    total_chars_delta: total_chars,
8821                },
8822            )?;
8823        }
8824
8825        tx.commit()?;
8826        Ok(InsertOutcome {
8827            conversation_id: conv_id,
8828            conversation_inserted: true,
8829            inserted_indices,
8830        })
8831    }
8832
#[cfg(test)]
/// Test-only variant of the new-conversation insert path that records how
/// long each phase takes in `profile`.
///
/// The helper only supports the "conversation does not exist yet" case: it
/// returns an error if the merge-key lookup finds a conversation, or if the
/// row insert reports `ConversationInsertStatus::Existing`.
///
/// On success every phase duration is added to the matching `profile` field,
/// `profile.invocations` is bumped by one, `profile.messages` grows by the
/// number of messages in the normalized conversation and
/// `profile.inserted_messages` by the number actually inserted.
fn insert_conversation_tree_with_profile(
    &self,
    agent_id: i64,
    workspace_id: Option<i64>,
    conv: &Conversation,
    profile: &mut InsertConversationTreePerfProfile,
) -> Result<InsertOutcome> {
    let overall_timer = Instant::now();
    let normalized = normalized_conversation_for_storage(conv);
    let conv = normalized.as_ref();

    // `timer` is re-bound for each sequential phase; phases never overlap.
    let timer = Instant::now();
    self.ensure_source_for_conversation(conv)?;
    profile.source_duration += timer.elapsed();

    let lexical_deferred = defer_storage_lexical_updates_enabled();
    let analytics_deferred = defer_analytics_updates_enabled();
    let merge_key = conversation_merge_key(agent_id, conv);

    let timer = Instant::now();
    let mut tx = self.conn.transaction()?;
    profile.tx_open_duration += timer.elapsed();

    let timer = Instant::now();
    let existing = franken_find_existing_conversation_by_key(&tx, &merge_key, Some(conv))?;
    profile.existing_lookup_duration += timer.elapsed();
    if let Some(existing_id) = existing {
        return Err(anyhow!(
            "profile helper expects new conversation path, found existing id {existing_id}"
        ));
    }

    let timer = Instant::now();
    let insert_status = franken_insert_conversation_or_get_existing_after_miss(
        &tx,
        agent_id,
        workspace_id,
        conv,
        &merge_key,
    )?;
    let conv_id = match insert_status {
        ConversationInsertStatus::Inserted(new_id) => new_id,
        ConversationInsertStatus::Existing(existing_id) => {
            return Err(anyhow!(
                "profile helper expected inserted conversation row, reused existing id {existing_id}"
            ));
        }
    };
    profile.conversation_row_duration += timer.elapsed();

    // De-duplicate incoming messages: first variant per idx wins, and
    // replay-equivalent duplicates are dropped.
    let mut seen_by_idx = HashMap::new();
    let mut seen_replay = HashSet::new();
    let mut collisions = 0usize;
    let mut first_collision: Option<i64> = None;
    let mut accepted = Vec::new();
    for msg in &conv.messages {
        let merge_fp = message_merge_fingerprint(msg);
        match seen_by_idx.get(&msg.idx) {
            Some(kept_fp) if kept_fp != &merge_fp => {
                collisions = collisions.saturating_add(1);
                first_collision.get_or_insert(msg.idx);
                continue;
            }
            Some(_) => continue,
            None => {}
        }

        let replay_fp = message_replay_fingerprint(msg);
        // `insert` returns false when the fingerprint was already recorded.
        if !seen_replay.insert(replay_fp) {
            tracing::debug!(
                conversation_id = conv_id,
                idx = msg.idx,
                source_path = %conv.source_path.display(),
                "skipping replay-equivalent duplicate message within profiled new conversation insert"
            );
            continue;
        }

        seen_by_idx.insert(msg.idx, merge_fp);
        accepted.push(msg);
    }

    let timer = Instant::now();
    let new_ids = franken_batch_insert_new_messages_with_profile(
        &tx,
        conv_id,
        &accepted,
        &mut profile.message_insert_breakdown,
    )?;
    profile.message_insert_duration += timer.elapsed();

    let mut fts_batch = Vec::new();
    let mut fts_batch_chars = 0usize;
    let mut fts_written = 0usize;
    let mut total_chars: i64 = 0;
    let mut inserted_indices = Vec::new();
    for (msg_id, msg) in new_ids.into_iter().zip(accepted) {
        let timer = Instant::now();
        franken_insert_snippets(&tx, msg_id, &msg.snippets)?;
        profile.snippet_insert_duration += timer.elapsed();

        if !lexical_deferred {
            let timer = Instant::now();
            fts_batch.push(FtsEntry::from_message(msg_id, msg, conv));
            fts_batch_chars = fts_batch_chars.saturating_add(msg.content.len());
            profile.fts_entry_duration += timer.elapsed();

            let batch_full = fts_batch.len() >= FTS_ENTRY_BATCH_MAX_DOCS
                || fts_batch_chars >= FTS_ENTRY_BATCH_MAX_CHARS;
            if batch_full {
                let timer = Instant::now();
                flush_pending_fts_entries(
                    self,
                    &tx,
                    &mut fts_batch,
                    &mut fts_batch_chars,
                    &mut fts_written,
                )?;
                profile.fts_flush_duration += timer.elapsed();
            }
        }

        total_chars += msg.content.len() as i64;
        inserted_indices.push(msg.idx);
    }

    if collisions > 0 {
        tracing::warn!(
            conversation_id = conv_id,
            collision_count = collisions,
            first_idx = first_collision,
            source_path = %conv.source_path.display(),
            "message idx collisions encountered while profiling a new conversation insert; retaining the first canonical variant per idx"
        );
    }

    // Write whatever is still buffered after the loop.
    if !lexical_deferred {
        let timer = Instant::now();
        flush_pending_fts_entries(
            self,
            &tx,
            &mut fts_batch,
            &mut fts_batch_chars,
            &mut fts_written,
        )?;
        profile.fts_flush_duration += timer.elapsed();
    }

    if !analytics_deferred {
        let timer = Instant::now();
        franken_update_daily_stats_in_tx(
            self,
            &tx,
            &conv.agent_slug,
            &conv.source_id,
            conversation_effective_started_at(conv),
            StatsDelta {
                session_count_delta: 1,
                message_count_delta: inserted_indices.len() as i64,
                total_chars_delta: total_chars,
            },
        )?;
        profile.analytics_duration += timer.elapsed();
    }

    let timer = Instant::now();
    tx.commit()?;
    profile.commit_duration += timer.elapsed();
    profile.invocations += 1;
    profile.messages += conv.messages.len();
    profile.inserted_messages += inserted_indices.len();
    profile.total_duration += overall_timer.elapsed();

    Ok(InsertOutcome {
        conversation_id: conv_id,
        conversation_inserted: true,
        inserted_indices,
    })
}
9012
#[cfg(test)]
/// Test-only variant of the append-to-existing-conversation path that records
/// how long each phase takes in `profile`.
///
/// The conversation must already be stored: if the merge-key lookup finds
/// nothing, an error is returned. `_workspace_id` is accepted but not read.
///
/// New messages are chosen in one of two ways:
/// - if the stored tail state is present and
///   `collect_append_only_tail_messages` yields a plan, that plan is used;
/// - otherwise the existing messages are looked up and the incoming ones are
///   filtered through `collect_new_messages_for_existing_conversation`.
///
/// On success every phase duration is added to the matching `profile` field,
/// `profile.invocations` is bumped by one, `profile.messages` grows by the
/// number of messages in the normalized conversation and
/// `profile.inserted_messages` by the number actually inserted.
fn append_existing_conversation_with_profile(
    &self,
    agent_id: i64,
    _workspace_id: Option<i64>,
    conv: &Conversation,
    profile: &mut InsertConversationTreePerfProfile,
) -> Result<InsertOutcome> {
    let overall_timer = Instant::now();
    let normalized = normalized_conversation_for_storage(conv);
    let conv = normalized.as_ref();

    // `timer` is re-bound for each sequential phase; phases never overlap.
    let timer = Instant::now();
    self.ensure_source_for_conversation(conv)?;
    profile.source_duration += timer.elapsed();

    let lexical_deferred = defer_storage_lexical_updates_enabled();
    let analytics_deferred = defer_analytics_updates_enabled();
    let conversation_key = conversation_merge_key(agent_id, conv);

    let timer = Instant::now();
    let mut tx = self.conn.transaction()?;
    profile.tx_open_duration += timer.elapsed();

    let timer = Instant::now();
    let existing = franken_find_existing_conversation_with_tail_by_key(
        &tx,
        &conversation_key,
        Some(conv),
    )?;
    profile.existing_lookup_duration += timer.elapsed();
    let Some(existing) = existing else {
        return Err(anyhow!(
            "append profile helper expects existing conversation for {conversation_key:?}"
        ));
    };
    let existing_id = existing.id;

    // Try the append-only plan derived from the stored tail state first.
    let timer = Instant::now();
    let stored_tail = existing.tail_state;
    let stored_ended_at = stored_tail.and_then(|state| state.ended_at);
    let tail_plan = stored_tail.as_ref().and_then(|state| {
        collect_append_only_tail_messages(
            conv,
            state.last_message_idx,
            state.last_message_created_at,
        )
    });
    let used_tail_plan = tail_plan.is_some();
    profile.existing_idx_lookup_duration += timer.elapsed();

    let timer = Instant::now();
    let selection = match tail_plan {
        Some(plan) => plan,
        // No usable tail plan: compare against the messages already stored.
        None => {
            let lookup = franken_existing_message_lookup(&tx, existing_id, &conv.messages)?;
            let mut known_by_idx = lookup.by_idx;
            let mut known_replay = lookup.replay;
            collect_new_messages_for_existing_conversation(
                existing_id,
                conv,
                &mut known_by_idx,
                &mut known_replay,
                "skipping replay-equivalent profiled append message with shifted idx",
            )
        }
    };
    let ExistingConversationNewMessages {
        messages: fresh_messages,
        new_chars: fresh_chars,
        idx_collision_count: collisions,
        first_collision_idx: first_collision,
    } = selection;
    profile.dedupe_filter_duration += timer.elapsed();

    let (tail_idx, tail_created_at) = borrowed_messages_tail_state(&fresh_messages);

    let timer = Instant::now();
    let new_ids = franken_append_insert_new_messages_with_profile(
        &tx,
        existing_id,
        &fresh_messages,
        &mut profile.message_insert_breakdown,
    )?;
    profile.message_insert_duration += timer.elapsed();

    let mut fts_batch = Vec::new();
    let mut fts_batch_chars = 0usize;
    let mut fts_written = 0usize;
    let mut inserted_indices = Vec::new();
    for (msg_id, msg) in new_ids.into_iter().zip(fresh_messages) {
        let timer = Instant::now();
        franken_insert_snippets(&tx, msg_id, &msg.snippets)?;
        profile.snippet_insert_duration += timer.elapsed();

        if !lexical_deferred {
            let timer = Instant::now();
            fts_batch.push(FtsEntry::from_message(msg_id, msg, conv));
            fts_batch_chars = fts_batch_chars.saturating_add(msg.content.len());
            profile.fts_entry_duration += timer.elapsed();

            let batch_full = fts_batch.len() >= FTS_ENTRY_BATCH_MAX_DOCS
                || fts_batch_chars >= FTS_ENTRY_BATCH_MAX_CHARS;
            if batch_full {
                let timer = Instant::now();
                flush_pending_fts_entries(
                    self,
                    &tx,
                    &mut fts_batch,
                    &mut fts_batch_chars,
                    &mut fts_written,
                )?;
                profile.fts_flush_duration += timer.elapsed();
            }
        }

        inserted_indices.push(msg.idx);
    }

    if collisions > 0 {
        tracing::warn!(
            conversation_id = existing_id,
            collision_count = collisions,
            first_idx = first_collision,
            source_path = %conv.source_path.display(),
            "message idx collisions encountered while profiling append merge; retaining canonical message variants"
        );
    }

    // Write whatever is still buffered after the loop.
    if !lexical_deferred {
        let timer = Instant::now();
        flush_pending_fts_entries(
            self,
            &tx,
            &mut fts_batch,
            &mut fts_batch_chars,
            &mut fts_written,
        )?;
        profile.fts_flush_duration += timer.elapsed();
    }

    let timer = Instant::now();
    let exact_tail_set = match (used_tail_plan, tail_idx, tail_created_at) {
        // Tail plan used and a message with a timestamp was appended.
        (true, Some(last_idx), Some(last_created_at)) => {
            if stored_ended_at.is_none_or(|ended_at| ended_at <= last_created_at) {
                // Stored end time is absent or not later than the new tail.
                franken_set_conversation_tail_state_after_append(
                    &tx,
                    existing_id,
                    last_created_at,
                    last_idx,
                    last_created_at,
                )?;
                true
            } else {
                franken_update_conversation_tail_state(
                    &tx,
                    existing_id,
                    Some(last_created_at),
                    tail_idx,
                    tail_created_at,
                )?;
                false
            }
        }
        // Tail plan used but the tail idx or timestamp is missing: leave the row alone.
        (true, _, _) => false,
        // Fallback path: update using the latest timestamp in the incoming conversation.
        (false, _, _) => {
            let latest_ts = conv.messages.iter().filter_map(|m| m.created_at).max();
            franken_update_conversation_tail_state(
                &tx,
                existing_id,
                latest_ts,
                tail_idx,
                tail_created_at,
            )?;
            false
        }
    };
    franken_update_external_conversation_tail_after_append(
        &tx,
        agent_id,
        conv,
        used_tail_plan,
        exact_tail_set,
        tail_idx,
        tail_created_at,
    )?;
    profile.conversation_row_duration += timer.elapsed();

    // No new session here: only message and character counts change.
    let record_stats = !analytics_deferred && !inserted_indices.is_empty();
    if record_stats {
        let timer = Instant::now();
        franken_update_daily_stats_in_tx(
            self,
            &tx,
            &conv.agent_slug,
            &conv.source_id,
            conversation_effective_started_at(conv),
            StatsDelta {
                session_count_delta: 0,
                message_count_delta: inserted_indices.len() as i64,
                total_chars_delta: fresh_chars,
            },
        )?;
        profile.analytics_duration += timer.elapsed();
    }

    let timer = Instant::now();
    tx.commit()?;
    profile.commit_duration += timer.elapsed();
    profile.invocations += 1;
    profile.messages += conv.messages.len();
    profile.inserted_messages += inserted_indices.len();
    profile.total_duration += overall_timer.elapsed();

    Ok(InsertOutcome {
        conversation_id: existing_id,
        conversation_inserted: false,
        inserted_indices,
    })
}
9228
9229    /// Append new messages to an existing conversation within an active transaction.
9230    #[allow(clippy::too_many_arguments)]
9231    fn franken_append_messages_with_tail_in_tx(
9232        &self,
9233        tx: &FrankenTransaction<'_>,
9234        agent_id: i64,
9235        conversation_id: i64,
9236        conv: &Conversation,
9237        append_tail_state: Option<ExistingConversationTailState>,
9238        defer_lexical_updates: bool,
9239        defer_analytics_updates: bool,
9240    ) -> Result<InsertOutcome> {
9241        let append_tail_ended_at = append_tail_state.and_then(|state| state.ended_at);
9242        let append_plan = append_tail_state.as_ref().and_then(|state| {
9243            collect_append_only_tail_messages(
9244                conv,
9245                state.last_message_idx,
9246                state.last_message_created_at,
9247            )
9248        });
9249        let used_append_tail_plan = append_plan.is_some();
9250        let ExistingConversationNewMessages {
9251            messages: new_messages,
9252            new_chars,
9253            idx_collision_count,
9254            first_collision_idx,
9255        } = if let Some(append_plan) = append_plan {
9256            append_plan
9257        } else {
9258            let ExistingMessageLookup {
9259                by_idx: mut existing_messages,
9260                replay: mut existing_replay_fingerprints,
9261            } = franken_existing_message_lookup(tx, conversation_id, &conv.messages)?;
9262            collect_new_messages_for_existing_conversation(
9263                conversation_id,
9264                conv,
9265                &mut existing_messages,
9266                &mut existing_replay_fingerprints,
9267                "skipping replay-equivalent recovered message with shifted idx",
9268            )
9269        };
9270
9271        let mut inserted_indices = Vec::new();
9272        let mut fts_entries = Vec::new();
9273        let mut fts_pending_chars = 0usize;
9274        let mut _fts_inserted_total = 0usize;
9275        let (inserted_last_idx, inserted_last_created_at) =
9276            borrowed_messages_tail_state(&new_messages);
9277        let inserted_message_ids =
9278            franken_append_insert_new_messages(tx, conversation_id, &new_messages)?;
9279        for (msg_id, msg) in inserted_message_ids.into_iter().zip(new_messages) {
9280            franken_insert_snippets(tx, msg_id, &msg.snippets)?;
9281            if !defer_lexical_updates {
9282                fts_entries.push(FtsEntry::from_message(msg_id, msg, conv));
9283                fts_pending_chars = fts_pending_chars.saturating_add(msg.content.len());
9284                if fts_entries.len() >= FTS_ENTRY_BATCH_MAX_DOCS
9285                    || fts_pending_chars >= FTS_ENTRY_BATCH_MAX_CHARS
9286                {
9287                    flush_pending_fts_entries(
9288                        self,
9289                        tx,
9290                        &mut fts_entries,
9291                        &mut fts_pending_chars,
9292                        &mut _fts_inserted_total,
9293                    )?;
9294                }
9295            }
9296            inserted_indices.push(msg.idx);
9297        }
9298
9299        if idx_collision_count > 0 {
9300            tracing::warn!(
9301                conversation_id,
9302                collision_count = idx_collision_count,
9303                first_idx = first_collision_idx,
9304                source_path = %conv.source_path.display(),
9305                "message idx collisions encountered while appending to an existing conversation; retaining canonical message variants"
9306            );
9307        }
9308
9309        if !defer_lexical_updates {
9310            flush_pending_fts_entries(
9311                self,
9312                tx,
9313                &mut fts_entries,
9314                &mut fts_pending_chars,
9315                &mut _fts_inserted_total,
9316            )?;
9317        }
9318
9319        let mut exact_append_tail_set = false;
9320        if used_append_tail_plan {
9321            if let (Some(last_message_idx), Some(last_message_created_at)) =
9322                (inserted_last_idx, inserted_last_created_at)
9323            {
9324                if append_tail_ended_at.is_none_or(|ended_at| ended_at <= last_message_created_at) {
9325                    franken_set_conversation_tail_state_after_append(
9326                        tx,
9327                        conversation_id,
9328                        last_message_created_at,
9329                        last_message_idx,
9330                        last_message_created_at,
9331                    )?;
9332                    exact_append_tail_set = true;
9333                } else {
9334                    franken_update_conversation_tail_state(
9335                        tx,
9336                        conversation_id,
9337                        Some(last_message_created_at),
9338                        inserted_last_idx,
9339                        inserted_last_created_at,
9340                    )?;
9341                }
9342            }
9343        } else {
9344            let conv_last_ts = conv.messages.iter().filter_map(|m| m.created_at).max();
9345            franken_update_conversation_tail_state(
9346                tx,
9347                conversation_id,
9348                conv_last_ts,
9349                inserted_last_idx,
9350                inserted_last_created_at,
9351            )?;
9352        }
9353        franken_update_external_conversation_tail_after_append(
9354            tx,
9355            agent_id,
9356            conv,
9357            used_append_tail_plan,
9358            exact_append_tail_set,
9359            inserted_last_idx,
9360            inserted_last_created_at,
9361        )?;
9362
9363        if !defer_analytics_updates && !inserted_indices.is_empty() {
9364            let message_count = inserted_indices.len() as i64;
9365            franken_update_daily_stats_in_tx(
9366                self,
9367                tx,
9368                &conv.agent_slug,
9369                &conv.source_id,
9370                conversation_effective_started_at(conv),
9371                StatsDelta {
9372                    session_count_delta: 0,
9373                    message_count_delta: message_count,
9374                    total_chars_delta: new_chars,
9375                },
9376            )?;
9377        }
9378
9379        Ok(InsertOutcome {
9380            conversation_id,
9381            conversation_inserted: false,
9382            inserted_indices,
9383        })
9384    }
9385
9386    /// Rebuild the FTS5 index from scratch (chunked to avoid OOM on large databases, #110).
9387    pub fn rebuild_fts(&self) -> Result<()> {
9388        self.rebuild_fts_via_frankensqlite().map(|_| ())
9389    }
9390
9391    /// Best-effort repair for the derived SQLite FTS fallback index.
9392    ///
9393    /// The canonical archive and Tantivy index remain authoritative, so callers
9394    /// should invoke this from maintenance paths rather than ordinary opens.
9395    pub(crate) fn ensure_search_fallback_fts_consistency(&self) -> Result<FtsConsistencyRepair> {
9396        self.ensure_fts_consistency_via_frankensqlite()
9397    }
9398
9399    pub(crate) fn fallback_fts_is_known_healthy_for_archive_fingerprint(
9400        &self,
9401        archive_fingerprint: &str,
9402    ) -> Result<bool> {
9403        Ok(
9404            self.read_fts_franken_rebuild_generation()? == Some(FTS_FRANKEN_REBUILD_GENERATION)
9405                && self
9406                    .read_fts_franken_rebuild_archive_fingerprint()?
9407                    .as_deref()
9408                    == Some(archive_fingerprint),
9409        )
9410    }
9411
9412    pub(crate) fn record_search_fallback_fts_archive_fingerprint(
9413        &self,
9414        archive_fingerprint: &str,
9415    ) -> Result<()> {
9416        self.conn
9417            .execute_compat(
9418                "INSERT OR REPLACE INTO meta(key, value) VALUES(?1, ?2)",
9419                fparams![
9420                    FTS_FRANKEN_REBUILD_FINGERPRINT_META_KEY,
9421                    archive_fingerprint.to_string()
9422                ],
9423            )
9424            .with_context(|| "recording frankensqlite FTS archive fingerprint")?;
9425        Ok(())
9426    }
9427
9428    pub(crate) fn daily_stats_is_known_healthy_for_archive_fingerprint(
9429        &self,
9430        archive_fingerprint: &str,
9431    ) -> Result<bool> {
9432        Ok(
9433            self.read_daily_stats_health_generation()? == Some(DAILY_STATS_HEALTH_GENERATION)
9434                && self.read_daily_stats_archive_fingerprint()?.as_deref()
9435                    == Some(archive_fingerprint),
9436        )
9437    }
9438
9439    pub(crate) fn record_daily_stats_archive_fingerprint(
9440        &self,
9441        archive_fingerprint: &str,
9442    ) -> Result<()> {
9443        self.conn
9444            .execute_compat(
9445                "INSERT OR REPLACE INTO meta(key, value) VALUES(?1, ?2)",
9446                fparams![
9447                    DAILY_STATS_HEALTH_GENERATION_META_KEY,
9448                    DAILY_STATS_HEALTH_GENERATION.to_string()
9449                ],
9450            )
9451            .with_context(|| "recording daily_stats health generation")?;
9452        self.conn
9453            .execute_compat(
9454                "INSERT OR REPLACE INTO meta(key, value) VALUES(?1, ?2)",
9455                fparams![DAILY_STATS_HEALTH_META_KEY, archive_fingerprint.to_string()],
9456            )
9457            .with_context(|| "recording daily_stats archive fingerprint")?;
9458        Ok(())
9459    }
9460
9461    fn read_fts_franken_rebuild_generation(&self) -> Result<Option<i64>> {
9462        let value: Option<String> = self
9463            .conn
9464            .query_row_map(
9465                "SELECT value FROM meta WHERE key = ?1",
9466                fparams![FTS_FRANKEN_REBUILD_META_KEY],
9467                |row| row.get_typed(0),
9468            )
9469            .optional()?;
9470        Ok(value.and_then(|v| v.parse::<i64>().ok()))
9471    }
9472
9473    fn read_fts_franken_rebuild_archive_fingerprint(&self) -> Result<Option<String>> {
9474        Ok(self
9475            .conn
9476            .query_row_map(
9477                "SELECT value FROM meta WHERE key = ?1",
9478                fparams![FTS_FRANKEN_REBUILD_FINGERPRINT_META_KEY],
9479                |row| row.get_typed(0),
9480            )
9481            .optional()?)
9482    }
9483
9484    fn read_daily_stats_health_generation(&self) -> Result<Option<i64>> {
9485        let value: Option<String> = self
9486            .conn
9487            .query_row_map(
9488                "SELECT value FROM meta WHERE key = ?1",
9489                fparams![DAILY_STATS_HEALTH_GENERATION_META_KEY],
9490                |row| row.get_typed(0),
9491            )
9492            .optional()?;
9493        Ok(value.and_then(|value| value.parse::<i64>().ok()))
9494    }
9495
9496    fn read_daily_stats_archive_fingerprint(&self) -> Result<Option<String>> {
9497        Ok(self
9498            .conn
9499            .query_row_map(
9500                "SELECT value FROM meta WHERE key = ?1",
9501                fparams![DAILY_STATS_HEALTH_META_KEY],
9502                |row| row.get_typed(0),
9503            )
9504            .optional()?)
9505    }
9506
9507    fn record_fts_franken_rebuild_generation(&self) -> Result<()> {
9508        self.conn
9509            .execute_compat(
9510                "INSERT OR REPLACE INTO meta(key, value) VALUES(?1, ?2)",
9511                fparams![
9512                    FTS_FRANKEN_REBUILD_META_KEY,
9513                    FTS_FRANKEN_REBUILD_GENERATION.to_string()
9514                ],
9515            )
9516            .with_context(|| "recording frankensqlite FTS rebuild generation")?;
9517        Ok(())
9518    }
9519
9520    fn ensure_fts_consistency_via_frankensqlite(&self) -> Result<FtsConsistencyRepair> {
9521        if self.read_fts_franken_rebuild_generation()? != Some(FTS_FRANKEN_REBUILD_GENERATION) {
9522            // Before triggering an expensive full rebuild, probe whether
9523            // fts_messages is already populated and consistent.  On large
9524            // databases the rebuild can take hours and OOM — skip it when
9525            // the only thing missing is the generation marker (#184).
9526            let fts_already_healthy = (|| -> Result<bool> {
9527                let fts_exists: i64 = self.conn.query_row_map(
9528                    "SELECT COUNT(*) FROM sqlite_master WHERE name = 'fts_messages'",
9529                    fparams![],
9530                    |row| row.get_typed(0),
9531                )?;
9532                if fts_exists != 1 {
9533                    return Ok(false);
9534                }
9535                let total: i64 = self.conn.query_row_map(
9536                    "SELECT COUNT(*) FROM messages",
9537                    fparams![],
9538                    |row| row.get_typed(0),
9539                )?;
9540                if total == 0 {
9541                    return Ok(false);
9542                }
9543                let indexed: i64 = self.conn.query_row_map(
9544                    "SELECT COUNT(*) FROM fts_messages",
9545                    fparams![],
9546                    |row| row.get_typed(0),
9547                )?;
9548                // Consider healthy if >=90% of messages are indexed
9549                Ok(indexed > 0 && indexed * 100 >= total * 90)
9550            })()
9551            .unwrap_or(false);
9552
9553            if fts_already_healthy {
9554                tracing::info!(
9555                    target: "cass::fts_rebuild",
9556                    "FTS already populated and consistent; setting generation marker without rebuild"
9557                );
9558                self.record_fts_franken_rebuild_generation()?;
9559                self.set_fts_messages_present_cache(true);
9560            } else {
9561                let inserted_rows = self.rebuild_fts_via_frankensqlite()?;
9562                self.record_fts_franken_rebuild_generation()?;
9563                return Ok(FtsConsistencyRepair::Rebuilt { inserted_rows });
9564            }
9565        }
9566
9567        let inspection = (|| -> Result<(i64, bool)> {
9568            let fts_schema_rows = self.conn.query_row_map(
9569                "SELECT COUNT(*) FROM sqlite_master WHERE name = 'fts_messages'",
9570                fparams![],
9571                |row| row.get_typed::<i64>(0),
9572            )?;
9573            let fts_queryable = fts_schema_rows == 1
9574                && self
9575                    .conn
9576                    .query("SELECT rowid FROM fts_messages LIMIT 1")
9577                    .is_ok();
9578            Ok((fts_schema_rows, fts_queryable))
9579        })();
9580
9581        let (fts_schema_rows, fts_queryable) = match inspection {
9582            Ok(result) => result,
9583            Err(err) => {
9584                tracing::warn!(
9585                    error = %err,
9586                    "frankensqlite FTS consistency probe failed; rebuilding authoritative FTS"
9587                );
9588                let inserted_rows = self.rebuild_fts_via_frankensqlite()?;
9589                self.record_fts_franken_rebuild_generation()?;
9590                return Ok(FtsConsistencyRepair::Rebuilt { inserted_rows });
9591            }
9592        };
9593
9594        if fts_schema_rows != 1 || !fts_queryable {
9595            let inserted_rows = self.rebuild_fts_via_frankensqlite()?;
9596            self.record_fts_franken_rebuild_generation()?;
9597            return Ok(FtsConsistencyRepair::Rebuilt { inserted_rows });
9598        }
9599
9600        let total_messages =
9601            self.conn
9602                .query_row_map("SELECT COUNT(*) FROM messages", fparams![], |row| {
9603                    row.get_typed::<i64>(0)
9604                })?;
9605        let indexed_messages =
9606            self.conn
9607                .query_row_map("SELECT COUNT(*) FROM fts_messages", fparams![], |row| {
9608                    row.get_typed::<i64>(0)
9609                })?;
9610
9611        if indexed_messages == total_messages {
9612            self.set_fts_messages_present_cache(true);
9613            return Ok(FtsConsistencyRepair::AlreadyHealthy {
9614                rows: usize::try_from(total_messages.max(0)).unwrap_or(usize::MAX),
9615            });
9616        }
9617
9618        if indexed_messages > total_messages {
9619            let inserted_rows = self.rebuild_fts_via_frankensqlite()?;
9620            self.record_fts_franken_rebuild_generation()?;
9621            return Ok(FtsConsistencyRepair::Rebuilt { inserted_rows });
9622        }
9623
9624        let inserted_rows = self
9625            .stream_fts_rows_via_frankensqlite(true)
9626            .with_context(|| "incrementally repairing missing FTS rows via frankensqlite")?;
9627        let repaired_rows =
9628            self.conn
9629                .query_row_map("SELECT COUNT(*) FROM fts_messages", fparams![], |row| {
9630                    row.get_typed::<i64>(0)
9631                })?;
9632        if repaired_rows == total_messages {
9633            self.set_fts_messages_present_cache(true);
9634            return Ok(FtsConsistencyRepair::IncrementalCatchUp {
9635                inserted_rows,
9636                total_rows: usize::try_from(repaired_rows.max(0)).unwrap_or(usize::MAX),
9637            });
9638        }
9639
9640        // The incremental catch-up found nothing to insert, yet the gap
9641        // between total_messages (all rows, including orphans) and
9642        // indexed_messages (only rows with valid conversation_id, since the
9643        // FTS INSERT inner-joins on conversations) remains.  A full rebuild
9644        // cannot close this gap either — the orphaned messages will be
9645        // excluded again — so falling through to one would just re-do ~5 min
9646        // of work on every startup.  Accept the current state.
9647        if inserted_rows == 0 {
9648            tracing::debug!(
9649                target: "cass::fts_rebuild",
9650                indexed_messages = repaired_rows,
9651                total_messages,
9652                un_indexable_gap = total_messages.saturating_sub(repaired_rows),
9653                "FTS catch-up inserted 0 rows; remaining gap is un-indexable (likely orphaned messages with dangling conversation_id)"
9654            );
9655            self.set_fts_messages_present_cache(true);
9656            return Ok(FtsConsistencyRepair::IncrementalCatchUp {
9657                inserted_rows: 0,
9658                total_rows: usize::try_from(repaired_rows.max(0)).unwrap_or(usize::MAX),
9659            });
9660        }
9661
9662        // Incremental made progress but didn't fully close the gap — something
9663        // is genuinely inconsistent, so do a full rebuild.
9664        let inserted_rows = self.rebuild_fts_via_frankensqlite()?;
9665        self.record_fts_franken_rebuild_generation()?;
9666        Ok(FtsConsistencyRepair::Rebuilt { inserted_rows })
9667    }
9668
9669    pub(crate) fn rebuild_fts_via_frankensqlite(&self) -> Result<usize> {
9670        self.invalidate_fts_messages_present_cache();
9671        self.conn
9672            .execute("DROP TABLE IF EXISTS fts_messages;")
9673            .with_context(|| "dropping derived fts_messages before frankensqlite rebuild")?;
9674        self.conn
9675            .execute_compat(FTS5_REGISTER_SQL, fparams![])
9676            .with_context(|| "creating derived fts_messages via frankensqlite rebuild")?;
9677        self.set_fts_messages_present_cache(true);
9678
9679        self.stream_fts_rows_via_frankensqlite(false)
9680    }
9681
9682    fn stream_fts_rows_via_frankensqlite(&self, missing_only: bool) -> Result<usize> {
9683        let batch_size = fts_rebuild_batch_size().max(1);
9684        let batch_limit = i64::try_from(batch_size).unwrap_or(i64::MAX);
9685        let mut total_inserted: usize = 0;
9686        let mut total_skipped_orphans: usize = 0;
9687        let mut total_skipped_existing: usize = 0;
9688        let mut last_rowid: i64 = 0;
9689        let conversation_by_id = self.load_fts_conversation_projection_map()?;
9690        let agent_slug_by_id = self.load_fts_agent_slug_map()?;
9691        let workspace_path_by_id = self.load_fts_workspace_path_map()?;
9692        let existing_fts_rowids = if missing_only {
9693            Some(self.load_fts_message_rowid_set()?)
9694        } else {
9695            None
9696        };
9697        let mut entries = Vec::new();
9698        let mut pending_chars = 0usize;
9699
9700        loop {
9701            let rows = self.fetch_fts_rebuild_message_rows(last_rowid, batch_limit)?;
9702            let fetched_count = rows.len();
9703            if fetched_count == 0 {
9704                break;
9705            }
9706
9707            let inserted_before_batch = total_inserted;
9708            let skipped_before_batch = total_skipped_orphans;
9709            let existing_before_batch = total_skipped_existing;
9710
9711            for row in rows {
9712                last_rowid = row.rowid;
9713                if existing_fts_rowids
9714                    .as_ref()
9715                    .is_some_and(|rowids| rowids.contains(&row.message_id))
9716                {
9717                    total_skipped_existing = total_skipped_existing.saturating_add(1);
9718                    continue;
9719                }
9720                let Some(conversation) = conversation_by_id.get(&row.conversation_id) else {
9721                    total_skipped_orphans = total_skipped_orphans.saturating_add(1);
9722                    continue;
9723                };
9724                let agent = conversation
9725                    .agent_id
9726                    .and_then(|agent_id| agent_slug_by_id.get(&agent_id))
9727                    .filter(|slug| !slug.is_empty())
9728                    .cloned()
9729                    .unwrap_or_else(|| "unknown".to_string());
9730                let workspace = conversation
9731                    .workspace_id
9732                    .and_then(|workspace_id| workspace_path_by_id.get(&workspace_id))
9733                    .cloned()
9734                    .unwrap_or_default();
9735                pending_chars = pending_chars.saturating_add(row.content.len());
9736                entries.push(FtsEntry {
9737                    content: row.content,
9738                    title: conversation.title.clone(),
9739                    agent,
9740                    workspace,
9741                    source_path: conversation.source_path.clone(),
9742                    created_at: row.created_at,
9743                    message_id: row.message_id,
9744                });
9745                if entries.len() >= FTS_ENTRY_BATCH_MAX_DOCS
9746                    || pending_chars >= FTS_ENTRY_BATCH_MAX_CHARS
9747                {
9748                    total_inserted = total_inserted.saturating_add(
9749                        franken_batch_insert_fts_on_connection(&self.conn, &entries)?,
9750                    );
9751                    entries.clear();
9752                    pending_chars = 0;
9753                }
9754            }
9755
9756            if !entries.is_empty() {
9757                total_inserted = total_inserted.saturating_add(
9758                    franken_batch_insert_fts_on_connection(&self.conn, &entries)?,
9759                );
9760                entries.clear();
9761                pending_chars = 0;
9762            }
9763
9764            tracing::debug!(
9765                target: "cass::fts_rebuild",
9766                batch_rows = fetched_count,
9767                batch_inserted = total_inserted.saturating_sub(inserted_before_batch),
9768                batch_skipped_orphans = total_skipped_orphans.saturating_sub(skipped_before_batch),
9769                batch_skipped_existing = total_skipped_existing.saturating_sub(existing_before_batch),
9770                total_inserted,
9771                total_skipped_orphans,
9772                total_skipped_existing,
9773                last_rowid,
9774                missing_only,
9775                "FTS streaming maintenance batch complete"
9776            );
9777
9778            if fetched_count < batch_size {
9779                break;
9780            }
9781        }
9782
9783        Ok(total_inserted)
9784    }
9785
9786    fn fetch_fts_rebuild_message_rows(
9787        &self,
9788        last_rowid: i64,
9789        batch_limit: i64,
9790    ) -> Result<Vec<FtsRebuildMessageRow>> {
9791        self.conn
9792            .query_map_collect(
9793                "SELECT m.rowid, m.id, m.conversation_id, m.content, m.created_at
9794                 FROM messages m
9795                 WHERE m.rowid > ?1
9796                 ORDER BY m.rowid
9797                 LIMIT ?2",
9798                fparams![last_rowid, batch_limit],
9799                |row| {
9800                    Ok(FtsRebuildMessageRow {
9801                        rowid: row.get_typed(0)?,
9802                        message_id: row.get_typed(1)?,
9803                        conversation_id: row.get_typed(2)?,
9804                        content: row.get_typed::<Option<String>>(3)?.unwrap_or_default(),
9805                        created_at: row.get_typed(4)?,
9806                    })
9807                },
9808            )
9809            .with_context(|| format!("fetching FTS maintenance messages after rowid {last_rowid}"))
9810    }
9811
9812    fn load_fts_message_rowid_set(&self) -> Result<HashSet<i64>> {
9813        let rows: Vec<i64> = self
9814            .conn
9815            .query_map_collect("SELECT rowid FROM fts_messages", fparams![], |row| {
9816                row.get_typed(0)
9817            })
9818            .with_context(|| "loading existing FTS message rowids")?;
9819        Ok(rows.into_iter().collect())
9820    }
9821
9822    fn load_fts_conversation_projection_map(
9823        &self,
9824    ) -> Result<HashMap<i64, FtsConversationProjection>> {
9825        let rows: Vec<(i64, FtsConversationProjection)> = self
9826            .conn
9827            .query_map_collect(
9828                "SELECT id, title, agent_id, workspace_id, source_path
9829                 FROM conversations",
9830                fparams![],
9831                |row| {
9832                    Ok((
9833                        row.get_typed(0)?,
9834                        FtsConversationProjection {
9835                            title: row.get_typed::<Option<String>>(1)?.unwrap_or_default(),
9836                            agent_id: row.get_typed(2)?,
9837                            workspace_id: row.get_typed(3)?,
9838                            source_path: row.get_typed::<Option<String>>(4)?.unwrap_or_default(),
9839                        },
9840                    ))
9841                },
9842            )
9843            .with_context(|| "loading FTS conversation projection map")?;
9844        Ok(rows.into_iter().collect())
9845    }
9846
9847    fn load_fts_agent_slug_map(&self) -> Result<HashMap<i64, String>> {
9848        let rows: Vec<(i64, String)> = self
9849            .conn
9850            .query_map_collect("SELECT id, slug FROM agents", fparams![], |row| {
9851                Ok((
9852                    row.get_typed(0)?,
9853                    row.get_typed::<Option<String>>(1)?
9854                        .unwrap_or_else(|| "unknown".to_string()),
9855                ))
9856            })
9857            .with_context(|| "loading FTS agent slug map")?;
9858        Ok(rows.into_iter().collect())
9859    }
9860
9861    fn load_fts_workspace_path_map(&self) -> Result<HashMap<i64, String>> {
9862        let rows: Vec<(i64, String)> = self
9863            .conn
9864            .query_map_collect("SELECT id, path FROM workspaces", fparams![], |row| {
9865                Ok((
9866                    row.get_typed(0)?,
9867                    row.get_typed::<Option<String>>(1)?.unwrap_or_default(),
9868                ))
9869            })
9870            .with_context(|| "loading FTS workspace path map")?;
9871        Ok(rows.into_iter().collect())
9872    }
9873
9874    /// Fetch all messages for embedding generation.
9875    pub fn fetch_messages_for_embedding(&self) -> Result<Vec<MessageForEmbedding>> {
9876        // COALESCE(c.agent_id, 0) so legacy V1 conversations with NULL
9877        // agent_id don't cause a runtime row-decode failure (agent_id in
9878        // MessageForEmbedding is i64).  saturating_u32_from_i64 downstream
9879        // turns 0 into the "unknown agent" sentinel for doc-id hashing.
9880        self.conn
9881            .query_map_collect(
9882                "SELECT m.id, m.created_at, COALESCE(c.agent_id, 0), c.workspace_id, c.source_id, m.role, m.content
9883                 FROM messages m
9884                 JOIN conversations c ON m.conversation_id = c.id
9885                 ORDER BY m.id",
9886                fparams![],
9887                |row| {
9888                    let source_id: String = row.get_typed::<Option<String>>(4)?
9889                        .unwrap_or_else(|| "local".to_string());
9890                    Ok(MessageForEmbedding {
9891                        message_id: row.get_typed(0)?,
9892                        created_at: row.get_typed(1)?,
9893                        agent_id: row.get_typed(2)?,
9894                        workspace_id: row.get_typed(3)?,
9895                        source_id_hash: crc32fast::hash(source_id.as_bytes()),
9896                        role: row.get_typed(5)?,
9897                        content: row.get_typed(6)?,
9898                    })
9899                },
9900            )
9901            .with_context(|| "fetching messages for embedding")
9902    }
9903
9904    /// Get the watermark for incremental semantic embedding.
9905    pub fn get_last_embedded_message_id(&self) -> Result<Option<i64>> {
9906        let result: Result<String, _> = self.conn.query_row_map(
9907            "SELECT value FROM meta WHERE key = 'last_embedded_message_id'",
9908            fparams![],
9909            |row| row.get_typed(0),
9910        );
9911        match result.optional() {
9912            Ok(Some(s)) => Ok(s.parse().ok()),
9913            Ok(None) => Ok(None),
9914            Err(e) => Err(e.into()),
9915        }
9916    }
9917
9918    /// Set the watermark for incremental semantic embedding.
9919    pub fn set_last_embedded_message_id(&self, id: i64) -> Result<()> {
9920        self.conn.execute_compat(
9921            "INSERT OR REPLACE INTO meta(key, value) VALUES('last_embedded_message_id', ?1)",
9922            fparams![id.to_string()],
9923        )?;
9924        Ok(())
9925    }
9926
9927    /// Get embedding jobs for a database path.
9928    pub fn get_embedding_jobs(&self, db_path: &str) -> Result<Vec<EmbeddingJobRow>> {
9929        self.conn
9930            .query_map_collect(
9931                "SELECT id, db_path, model_id, status, total_docs, completed_docs, error_message, created_at, started_at, completed_at
9932                 FROM embedding_jobs WHERE db_path = ?1 ORDER BY id DESC",
9933                fparams![db_path],
9934                |row| {
9935                    Ok(EmbeddingJobRow {
9936                        id: row.get_typed(0)?,
9937                        db_path: row.get_typed(1)?,
9938                        model_id: row.get_typed(2)?,
9939                        status: row.get_typed(3)?,
9940                        total_docs: row.get_typed(4)?,
9941                        completed_docs: row.get_typed(5)?,
9942                        error_message: row.get_typed(6)?,
9943                        created_at: row.get_typed(7)?,
9944                        started_at: row.get_typed(8)?,
9945                        completed_at: row.get_typed(9)?,
9946                    })
9947                },
9948            )
9949            .with_context(|| format!("fetching embedding jobs for {db_path}"))
9950    }
9951
9952    /// Create or update an embedding job.
9953    pub fn upsert_embedding_job(
9954        &self,
9955        db_path: &str,
9956        model_id: &str,
9957        total_docs: i64,
9958    ) -> Result<i64> {
9959        let updated = self.conn.execute_compat(
9960            "UPDATE embedding_jobs
9961             SET total_docs = ?3
9962             WHERE db_path = ?1 AND model_id = ?2 AND status IN ('pending', 'running')",
9963            fparams![db_path, model_id, total_docs],
9964        )?;
9965        if updated == 0 {
9966            let insert_result = self.conn.execute_compat(
9967                "INSERT INTO embedding_jobs(db_path, model_id, total_docs) VALUES(?1,?2,?3)",
9968                fparams![db_path, model_id, total_docs],
9969            );
9970            if let Err(err) = insert_result {
9971                if !matches!(err, frankensqlite::FrankenError::UniqueViolation { .. }) {
9972                    return Err(err.into());
9973                }
9974                self.conn.execute_compat(
9975                    "UPDATE embedding_jobs
9976                     SET total_docs = ?3
9977                     WHERE db_path = ?1 AND model_id = ?2 AND status IN ('pending', 'running')",
9978                    fparams![db_path, model_id, total_docs],
9979                )?;
9980            }
9981        }
9982        self.conn
9983            .query_row_map(
9984                "SELECT id FROM embedding_jobs
9985                 WHERE db_path = ?1 AND model_id = ?2 AND status IN ('pending', 'running')
9986                 ORDER BY id DESC
9987                 LIMIT 1",
9988                fparams![db_path, model_id],
9989                |row| row.get_typed(0),
9990            )
9991            .with_context(|| "resolving embedding job id after upsert")
9992    }
9993
9994    /// Mark an embedding job as started.
9995    pub fn start_embedding_job(&self, job_id: i64) -> Result<()> {
9996        self.conn.execute_compat(
9997            "UPDATE embedding_jobs SET status = 'running', started_at = datetime('now') WHERE id = ?1",
9998            fparams![job_id],
9999        )?;
10000        Ok(())
10001    }
10002
10003    /// Mark an embedding job as completed.
10004    pub fn complete_embedding_job(&self, job_id: i64) -> Result<()> {
10005        self.conn.execute_compat(
10006            "UPDATE embedding_jobs SET status = 'completed', completed_at = datetime('now') WHERE id = ?1",
10007            fparams![job_id],
10008        )?;
10009        Ok(())
10010    }
10011
10012    /// Mark an embedding job as failed.
10013    pub fn fail_embedding_job(&self, job_id: i64, error: &str) -> Result<()> {
10014        self.conn.execute_compat(
10015            "UPDATE embedding_jobs SET status = 'failed', error_message = ?2, completed_at = datetime('now') WHERE id = ?1",
10016            fparams![job_id, error],
10017        )?;
10018        Ok(())
10019    }
10020
10021    /// Cancel embedding jobs for a database path.
10022    pub fn cancel_embedding_jobs(&self, db_path: &str, model_id: Option<&str>) -> Result<usize> {
10023        if let Some(mid) = model_id {
10024            Ok(self.conn.execute_compat(
10025                "UPDATE embedding_jobs SET status = 'cancelled' WHERE db_path = ?1 AND model_id = ?2 AND status IN ('pending', 'running')",
10026                fparams![db_path, mid],
10027            )?)
10028        } else {
10029            Ok(self.conn.execute_compat(
10030                "UPDATE embedding_jobs SET status = 'cancelled' WHERE db_path = ?1 AND status IN ('pending', 'running')",
10031                fparams![db_path],
10032            )?)
10033        }
10034    }
10035
10036    /// Update embedding job progress.
10037    pub fn update_job_progress(&self, job_id: i64, completed_docs: i64) -> Result<()> {
10038        self.conn.execute_compat(
10039            "UPDATE embedding_jobs SET completed_docs = ?2 WHERE id = ?1",
10040            fparams![job_id, completed_docs],
10041        )?;
10042        Ok(())
10043    }
10044
10045    // =====================================================================
10046    // Analytics query methods
10047    // =====================================================================
10048
10049    /// Get session count for a date range using materialized stats.
10050    /// Returns (count, is_from_cache) where is_from_cache is true if from daily_stats.
10051    ///
10052    /// Falls back to COUNT(*) query when daily_stats table is empty or stale.
10053    pub fn count_sessions_in_range(
10054        &self,
10055        start_ts_ms: Option<i64>,
10056        end_ts_ms: Option<i64>,
10057        agent_slug: Option<&str>,
10058        source_id: Option<&str>,
10059    ) -> Result<(i64, bool)> {
10060        let agent = agent_slug.unwrap_or("all");
10061        let source = source_id.unwrap_or("all");
10062
10063        // Check if we have materialized stats
10064        let stats_count: i64 = self
10065            .conn
10066            .query_row_map("SELECT COUNT(*) FROM daily_stats", fparams![], |row| {
10067                row.get_typed(0)
10068            })
10069            .unwrap_or(0);
10070
10071        if stats_count == 0 {
10072            return self.count_sessions_direct(start_ts_ms, end_ts_ms, agent_slug, source_id);
10073        }
10074
10075        // Use materialized stats
10076        let start_day = start_ts_ms.map(Self::day_id_from_millis);
10077        let end_day = end_ts_ms.map(Self::day_id_from_millis);
10078
10079        let count: i64 = match (start_day, end_day) {
10080            (Some(start), Some(end)) => self.conn.query_row_map(
10081                "SELECT COALESCE(SUM(session_count), 0) FROM daily_stats
10082                 WHERE day_id BETWEEN ?1 AND ?2 AND agent_slug = ?3 AND source_id = ?4",
10083                fparams![start, end, agent, source],
10084                |row| row.get_typed(0),
10085            )?,
10086            (Some(start), None) => self.conn.query_row_map(
10087                "SELECT COALESCE(SUM(session_count), 0) FROM daily_stats
10088                 WHERE day_id >= ?1 AND agent_slug = ?2 AND source_id = ?3",
10089                fparams![start, agent, source],
10090                |row| row.get_typed(0),
10091            )?,
10092            (None, Some(end)) => self.conn.query_row_map(
10093                "SELECT COALESCE(SUM(session_count), 0) FROM daily_stats
10094                 WHERE day_id <= ?1 AND agent_slug = ?2 AND source_id = ?3",
10095                fparams![end, agent, source],
10096                |row| row.get_typed(0),
10097            )?,
10098            (None, None) => self.conn.query_row_map(
10099                "SELECT COALESCE(SUM(session_count), 0) FROM daily_stats
10100                 WHERE agent_slug = ?1 AND source_id = ?2",
10101                fparams![agent, source],
10102                |row| row.get_typed(0),
10103            )?,
10104        };
10105
10106        Ok((count, true))
10107    }
10108
10109    /// Direct COUNT(*) query as fallback when daily_stats is empty.
10110    fn count_sessions_direct(
10111        &self,
10112        start_ts_ms: Option<i64>,
10113        end_ts_ms: Option<i64>,
10114        agent_slug: Option<&str>,
10115        source_id: Option<&str>,
10116    ) -> Result<(i64, bool)> {
10117        // Build dynamic SQL with positional params.  Single-table scan of
10118        // conversations; filter on agent slug via an EXISTS subquery only
10119        // when that filter is actually requested.  This avoids the unneeded
10120        // 2-table JOIN (which also silently dropped legacy conversations
10121        // with NULL agent_id) and sidesteps frankensqlite's materialization
10122        // fallback entirely.
10123        let mut sql = "SELECT COUNT(*) FROM conversations c WHERE 1=1".to_string();
10124        let mut param_values: Vec<ParamValue> = Vec::new();
10125        let mut idx = 1;
10126
10127        if let Some(start) = start_ts_ms {
10128            sql.push_str(&format!(" AND c.started_at >= ?{idx}"));
10129            param_values.push(ParamValue::from(start));
10130            idx += 1;
10131        }
10132        if let Some(end) = end_ts_ms {
10133            sql.push_str(&format!(" AND c.started_at <= ?{idx}"));
10134            param_values.push(ParamValue::from(end));
10135            idx += 1;
10136        }
10137        if let Some(agent) = agent_slug
10138            && agent != "all"
10139        {
10140            sql.push_str(&format!(
10141                " AND EXISTS (SELECT 1 FROM agents a WHERE a.id = c.agent_id AND a.slug = ?{idx})"
10142            ));
10143            param_values.push(ParamValue::from(agent));
10144            idx += 1;
10145        }
10146        if let Some(source) = source_id
10147            && source != "all"
10148        {
10149            sql.push_str(&format!(" AND c.source_id = ?{idx}"));
10150            param_values.push(ParamValue::from(source));
10151            let _ = idx; // suppress unused warning
10152        }
10153
10154        let count: i64 = self
10155            .conn
10156            .query_row_map(&sql, &param_values, |row| row.get_typed(0))?;
10157        Ok((count, false))
10158    }
10159
10160    /// Get daily histogram data for a date range.
10161    pub fn get_daily_histogram(
10162        &self,
10163        start_ts_ms: i64,
10164        end_ts_ms: i64,
10165        agent_slug: Option<&str>,
10166        source_id: Option<&str>,
10167    ) -> Result<Vec<DailyCount>> {
10168        let start_day = Self::day_id_from_millis(start_ts_ms);
10169        let end_day = Self::day_id_from_millis(end_ts_ms);
10170        let agent = agent_slug.unwrap_or("all");
10171        let source = source_id.unwrap_or("all");
10172
10173        let rows = self.conn.query_map_collect(
10174            "SELECT day_id, session_count, message_count, total_chars
10175             FROM daily_stats
10176             WHERE day_id BETWEEN ?1 AND ?2 AND agent_slug = ?3 AND source_id = ?4
10177             ORDER BY day_id",
10178            fparams![start_day, end_day, agent, source],
10179            |row| {
10180                Ok(DailyCount {
10181                    day_id: row.get_typed(0)?,
10182                    sessions: row.get_typed(1)?,
10183                    messages: row.get_typed(2)?,
10184                    chars: row.get_typed(3)?,
10185                })
10186            },
10187        )?;
10188
10189        Ok(rows)
10190    }
10191
10192    /// Check health of daily stats table.
10193    pub fn daily_stats_health(&self) -> Result<DailyStatsHealth> {
10194        let row_count: i64 =
10195            self.conn
10196                .query_row_map("SELECT COUNT(*) FROM daily_stats", fparams![], |row| {
10197                    row.get_typed(0)
10198                })?;
10199
10200        let oldest_update: Option<i64> = self.conn.query_row_map(
10201            "SELECT MIN(last_updated) FROM daily_stats",
10202            fparams![],
10203            |row| row.get_typed(0),
10204        )?;
10205
10206        let conversation_count: i64 =
10207            self.conn
10208                .query_row_map("SELECT COUNT(*) FROM conversations", fparams![], |row| {
10209                    row.get_typed(0)
10210                })?;
10211
10212        let materialized_total: i64 = self.conn.query_row_map(
10213            "SELECT COALESCE(SUM(session_count), 0) FROM daily_stats
10214                 WHERE agent_slug = 'all' AND source_id = 'all'",
10215            fparams![],
10216            |row| row.get_typed(0),
10217        )?;
10218
10219        Ok(DailyStatsHealth {
10220            populated: row_count > 0,
10221            row_count,
10222            oldest_update_ms: oldest_update,
10223            conversation_count,
10224            materialized_total,
10225            drift: (conversation_count - materialized_total).abs(),
10226        })
10227    }
10228
10229    /// Batch insert multiple conversations with full analytics (token usage,
10230    /// message metrics, rollups).  Frankensqlite equivalent of
10231    /// `SqliteStorage::insert_conversations_batched`.
10232    pub fn insert_conversations_batched(
10233        &self,
10234        conversations: &[(i64, Option<i64>, &Conversation)],
10235    ) -> Result<Vec<InsertOutcome>> {
10236        if conversations.is_empty() {
10237            return Ok(Vec::new());
10238        }
10239
10240        self.ensure_sources_for_batch(conversations)?;
10241
10242        let defer_lexical_updates = defer_storage_lexical_updates_enabled();
10243        let defer_analytics_updates = defer_analytics_updates_enabled();
10244
10245        let pricing_table = PricingTable::franken_load(&self.conn).unwrap_or_else(|e| {
10246            tracing::warn!(target: "cass::analytics::pricing", error = %e, "failed to load pricing table");
10247            PricingTable { entries: Vec::new() }
10248        });
10249        let mut pricing_diag = PricingDiagnostics::default();
10250
10251        let mut tx = self.conn.transaction()?;
10252
10253        // Bug #167: Ensure all referenced agents, workspaces, and sources
10254        // exist inside the transaction so FK checks pass.  The caller resolves
10255        // IDs via ensure_agent / ensure_workspace / ensure_sources_for_batch
10256        // outside the transaction, but those autocommit writes may not be
10257        // visible inside the transaction snapshot in frankensqlite.  Re-verify
10258        // (and insert if missing) within the tx.
10259        ensure_agents_in_tx(&tx, conversations)?;
10260        ensure_workspaces_in_tx(&tx, conversations)?;
10261        ensure_sources_in_tx(&tx, conversations)?;
10262
10263        let mut outcomes = Vec::with_capacity(conversations.len());
10264        let mut fts_entries = Vec::new();
10265        let mut fts_pending_chars = 0usize;
10266        let mut fts_inserted_total = 0usize;
10267        let mut fts_count_total = 0usize;
10268        let mut stats = StatsAggregator::new();
10269        let mut token_stats = TokenStatsAggregator::new();
10270        let mut token_entries: Vec<TokenUsageEntry> = Vec::new();
10271        let mut metrics_entries: Vec<MessageMetricsEntry> = Vec::new();
10272        let mut rollup_agg = AnalyticsRollupAggregator::new();
10273        let mut conv_ids_to_summarize: Vec<i64> = Vec::new();
10274        let mut pending_conversation_ids: HashMap<PendingConversationKey, i64> = HashMap::new();
10275        let mut pending_message_fingerprints: HashMap<i64, HashMap<i64, MessageMergeFingerprint>> =
10276            HashMap::new();
10277        let mut pending_message_replay_fingerprints: HashMap<
10278            i64,
10279            HashSet<MessageReplayFingerprint>,
10280        > = HashMap::new();
10281
10282        for &(agent_id, workspace_id, raw_conv) in conversations {
10283            let normalized_conv = normalized_conversation_for_storage(raw_conv);
10284            let conv = normalized_conv.as_ref();
10285            let mut total_chars: i64 = 0;
10286            let mut inserted_indices = Vec::with_capacity(conv.messages.len());
10287            let mut inserted_messages: Vec<(i64, &Message)> =
10288                Vec::with_capacity(conv.messages.len());
10289            let mut session_count_delta = 1_i64;
10290            let conversation_key = conversation_merge_key(agent_id, conv);
10291
10292            let existing_conv_id = if let Some(existing_id) =
10293                pending_conversation_ids.get(&conversation_key)
10294            {
10295                Some(*existing_id)
10296            } else {
10297                let existing_id =
10298                    franken_find_existing_conversation_by_key(&tx, &conversation_key, Some(conv))?;
10299                if let Some(existing_id) = existing_id {
10300                    pending_conversation_ids.insert(conversation_key.clone(), existing_id);
10301                }
10302                existing_id
10303            };
10304
10305            let conv_id = if let Some(existing_id) = existing_conv_id {
10306                session_count_delta = 0;
10307                let ExistingMessageLookup {
10308                    by_idx: mut existing_messages,
10309                    replay: mut existing_replay_fingerprints,
10310                } = franken_existing_message_lookup_with_pending(
10311                    &tx,
10312                    existing_id,
10313                    &conv.messages,
10314                    &mut pending_message_fingerprints,
10315                    &mut pending_message_replay_fingerprints,
10316                )?;
10317                let ExistingConversationNewMessages {
10318                    messages: new_messages,
10319                    new_chars,
10320                    idx_collision_count,
10321                    first_collision_idx,
10322                } = collect_new_messages_for_existing_conversation(
10323                    existing_id,
10324                    conv,
10325                    &mut existing_messages,
10326                    &mut existing_replay_fingerprints,
10327                    "skipping replay-equivalent recovered message with shifted idx during batched merge",
10328                );
10329                let (inserted_last_idx, inserted_last_created_at) =
10330                    borrowed_messages_tail_state(&new_messages);
10331                let inserted_message_ids =
10332                    franken_append_insert_new_messages(&tx, existing_id, &new_messages)?;
10333                total_chars += new_chars;
10334                for (msg_id, msg) in inserted_message_ids.into_iter().zip(new_messages) {
10335                    franken_insert_snippets(&tx, msg_id, &msg.snippets)?;
10336                    if !defer_lexical_updates {
10337                        fts_entries.push(FtsEntry::from_message(msg_id, msg, conv));
10338                        fts_count_total += 1;
10339                        fts_pending_chars = fts_pending_chars.saturating_add(msg.content.len());
10340                        if fts_entries.len() >= FTS_ENTRY_BATCH_MAX_DOCS
10341                            || fts_pending_chars >= FTS_ENTRY_BATCH_MAX_CHARS
10342                        {
10343                            flush_pending_fts_entries(
10344                                self,
10345                                &tx,
10346                                &mut fts_entries,
10347                                &mut fts_pending_chars,
10348                                &mut fts_inserted_total,
10349                            )?;
10350                        }
10351                    }
10352                    inserted_indices.push(msg.idx);
10353                    inserted_messages.push((msg_id, msg));
10354                }
10355
10356                if idx_collision_count > 0 {
10357                    tracing::warn!(
10358                        conversation_id = existing_id,
10359                        collision_count = idx_collision_count,
10360                        first_idx = first_collision_idx,
10361                        source_path = %conv.source_path.display(),
10362                        "message idx collisions encountered during batched conversation merge; retaining canonical message variants"
10363                    );
10364                }
10365
10366                let conv_last_ts = conv.messages.iter().filter_map(|m| m.created_at).max();
10367                franken_update_conversation_tail_state(
10368                    &tx,
10369                    existing_id,
10370                    conv_last_ts,
10371                    inserted_last_idx,
10372                    inserted_last_created_at,
10373                )?;
10374                if let Some(lookup_key) = conversation_external_lookup_key_for_conv(agent_id, conv)
10375                {
10376                    franken_update_external_conversation_tail_lookup_key(
10377                        &tx,
10378                        &lookup_key,
10379                        conv_last_ts,
10380                        inserted_last_idx,
10381                        inserted_last_created_at,
10382                    )?;
10383                }
10384
10385                pending_message_fingerprints.insert(existing_id, existing_messages);
10386                pending_message_replay_fingerprints
10387                    .insert(existing_id, existing_replay_fingerprints);
10388
10389                existing_id
10390            } else {
10391                match franken_insert_conversation_or_get_existing(
10392                    &tx,
10393                    agent_id,
10394                    workspace_id,
10395                    conv,
10396                )? {
10397                    ConversationInsertStatus::Inserted(new_conv_id) => {
10398                        pending_conversation_ids.insert(conversation_key.clone(), new_conv_id);
10399                        let pending_messages =
10400                            pending_message_fingerprints.entry(new_conv_id).or_default();
10401                        let pending_replay_fingerprints = pending_message_replay_fingerprints
10402                            .entry(new_conv_id)
10403                            .or_default();
10404                        let mut new_messages = Vec::new();
10405                        for msg in &conv.messages {
10406                            let incoming_replay = message_replay_fingerprint(msg);
10407                            if pending_messages.contains_key(&msg.idx)
10408                                || pending_replay_fingerprints.contains(&incoming_replay)
10409                            {
10410                                continue;
10411                            }
10412                            pending_messages.insert(msg.idx, message_merge_fingerprint(msg));
10413                            pending_replay_fingerprints.insert(incoming_replay);
10414                            new_messages.push(msg);
10415                        }
10416                        let inserted_message_ids =
10417                            franken_batch_insert_new_messages(&tx, new_conv_id, &new_messages)?;
10418                        for (msg_id, msg) in inserted_message_ids.into_iter().zip(new_messages) {
10419                            franken_insert_snippets(&tx, msg_id, &msg.snippets)?;
10420                            if !defer_lexical_updates {
10421                                fts_entries.push(FtsEntry::from_message(msg_id, msg, conv));
10422                                fts_count_total += 1;
10423                                fts_pending_chars =
10424                                    fts_pending_chars.saturating_add(msg.content.len());
10425                                if fts_entries.len() >= FTS_ENTRY_BATCH_MAX_DOCS
10426                                    || fts_pending_chars >= FTS_ENTRY_BATCH_MAX_CHARS
10427                                {
10428                                    flush_pending_fts_entries(
10429                                        self,
10430                                        &tx,
10431                                        &mut fts_entries,
10432                                        &mut fts_pending_chars,
10433                                        &mut fts_inserted_total,
10434                                    )?;
10435                                }
10436                            }
10437                            total_chars += msg.content.len() as i64;
10438                            inserted_indices.push(msg.idx);
10439                            inserted_messages.push((msg_id, msg));
10440                        }
10441                        new_conv_id
10442                    }
10443                    ConversationInsertStatus::Existing(existing_id) => {
10444                        session_count_delta = 0;
10445                        pending_conversation_ids.insert(conversation_key.clone(), existing_id);
10446                        let ExistingMessageLookup {
10447                            by_idx: mut existing_messages,
10448                            replay: mut existing_replay_fingerprints,
10449                        } = franken_existing_message_lookup_with_pending(
10450                            &tx,
10451                            existing_id,
10452                            &conv.messages,
10453                            &mut pending_message_fingerprints,
10454                            &mut pending_message_replay_fingerprints,
10455                        )?;
10456                        let ExistingConversationNewMessages {
10457                            messages: new_messages,
10458                            new_chars,
10459                            idx_collision_count,
10460                            first_collision_idx,
10461                        } = collect_new_messages_for_existing_conversation(
10462                            existing_id,
10463                            conv,
10464                            &mut existing_messages,
10465                            &mut existing_replay_fingerprints,
10466                            "skipping replay-equivalent recovered message with shifted idx after duplicate conversation recovery",
10467                        );
10468                        let (inserted_last_idx, inserted_last_created_at) =
10469                            borrowed_messages_tail_state(&new_messages);
10470                        let inserted_message_ids =
10471                            franken_append_insert_new_messages(&tx, existing_id, &new_messages)?;
10472                        total_chars += new_chars;
10473                        for (msg_id, msg) in inserted_message_ids.into_iter().zip(new_messages) {
10474                            franken_insert_snippets(&tx, msg_id, &msg.snippets)?;
10475                            if !defer_lexical_updates {
10476                                fts_entries.push(FtsEntry::from_message(msg_id, msg, conv));
10477                                fts_count_total += 1;
10478                                fts_pending_chars =
10479                                    fts_pending_chars.saturating_add(msg.content.len());
10480                                if fts_entries.len() >= FTS_ENTRY_BATCH_MAX_DOCS
10481                                    || fts_pending_chars >= FTS_ENTRY_BATCH_MAX_CHARS
10482                                {
10483                                    flush_pending_fts_entries(
10484                                        self,
10485                                        &tx,
10486                                        &mut fts_entries,
10487                                        &mut fts_pending_chars,
10488                                        &mut fts_inserted_total,
10489                                    )?;
10490                                }
10491                            }
10492                            inserted_indices.push(msg.idx);
10493                            inserted_messages.push((msg_id, msg));
10494                        }
10495
10496                        if idx_collision_count > 0 {
10497                            tracing::warn!(
10498                                conversation_id = existing_id,
10499                                collision_count = idx_collision_count,
10500                                first_idx = first_collision_idx,
10501                                source_path = %conv.source_path.display(),
10502                                "message idx collisions encountered after duplicate conversation recovery; retaining canonical message variants"
10503                            );
10504                        }
10505
10506                        let conv_last_ts = conv.messages.iter().filter_map(|m| m.created_at).max();
10507                        franken_update_conversation_tail_state(
10508                            &tx,
10509                            existing_id,
10510                            conv_last_ts,
10511                            inserted_last_idx,
10512                            inserted_last_created_at,
10513                        )?;
10514                        if let Some(lookup_key) =
10515                            conversation_external_lookup_key_for_conv(agent_id, conv)
10516                        {
10517                            franken_update_external_conversation_tail_lookup_key(
10518                                &tx,
10519                                &lookup_key,
10520                                conv_last_ts,
10521                                inserted_last_idx,
10522                                inserted_last_created_at,
10523                            )?;
10524                        }
10525
10526                        pending_message_fingerprints.insert(existing_id, existing_messages);
10527                        pending_message_replay_fingerprints
10528                            .insert(existing_id, existing_replay_fingerprints);
10529
10530                        existing_id
10531                    }
10532                }
10533            };
10534
10535            if !defer_analytics_updates {
10536                let delta = StatsDelta {
10537                    session_count_delta,
10538                    message_count_delta: inserted_messages.len() as i64,
10539                    total_chars_delta: total_chars,
10540                };
10541
10542                let effective_started_at = conversation_effective_started_at(conv);
10543                let day_id = effective_started_at
10544                    .map(FrankenStorage::day_id_from_millis)
10545                    .unwrap_or(0);
10546                stats.record_delta(
10547                    &conv.agent_slug,
10548                    &conv.source_id,
10549                    day_id,
10550                    delta.session_count_delta,
10551                    delta.message_count_delta,
10552                    delta.total_chars_delta,
10553                );
10554
10555                let conv_day_id = day_id;
10556                let mut session_model_family = String::from("unknown");
10557                let mut has_any_tokens = false;
10558
10559                for &(message_id, msg) in &inserted_messages {
10560                    let role_s = role_str(&msg.role);
10561                    let usage = if historical_raw_json(&msg.extra_json).is_some() {
10562                        crate::connectors::extract_tokens_for_agent(
10563                            &conv.agent_slug,
10564                            &serde_json::Value::Null,
10565                            &msg.content,
10566                            &role_s,
10567                        )
10568                    } else {
10569                        crate::connectors::extract_tokens_for_agent(
10570                            &conv.agent_slug,
10571                            &msg.extra_json,
10572                            &msg.content,
10573                            &role_s,
10574                        )
10575                    };
10576
10577                    let msg_ts = msg
10578                        .created_at
10579                        .or(conversation_effective_started_at(conv))
10580                        .unwrap_or(0);
10581                    let msg_day_id = if msg_ts > 0 {
10582                        FrankenStorage::day_id_from_millis(msg_ts)
10583                    } else {
10584                        conv_day_id
10585                    };
10586
10587                    let model_info = usage
10588                        .model_name
10589                        .as_deref()
10590                        .map(crate::connectors::normalize_model);
10591
10592                    let model_family = model_info
10593                        .as_ref()
10594                        .map(|i| i.family.clone())
10595                        .unwrap_or_else(|| "unknown".into());
10596                    let model_tier = model_info
10597                        .as_ref()
10598                        .map(|i| i.tier.clone())
10599                        .unwrap_or_else(|| "unknown".into());
10600                    let provider = usage
10601                        .provider
10602                        .clone()
10603                        .or_else(|| model_info.as_ref().map(|i| i.provider.clone()))
10604                        .unwrap_or_else(|| "unknown".into());
10605
10606                    if model_family != "unknown" {
10607                        session_model_family = model_family.clone();
10608                    }
10609
10610                    let estimated_cost = pricing_table.compute_cost(
10611                        usage.model_name.as_deref(),
10612                        msg_day_id,
10613                        usage.input_tokens,
10614                        usage.output_tokens,
10615                        usage.cache_read_tokens,
10616                        usage.cache_creation_tokens,
10617                    );
10618                    if estimated_cost.is_some() {
10619                        pricing_diag.record_priced();
10620                    } else if usage.has_token_data() {
10621                        pricing_diag.record_unpriced(usage.model_name.as_deref());
10622                    }
10623
10624                    token_stats.record(
10625                        &conv.agent_slug,
10626                        &conv.source_id,
10627                        msg_day_id,
10628                        &model_family,
10629                        &role_s,
10630                        &usage,
10631                        msg.content.len() as i64,
10632                        estimated_cost.unwrap_or(0.0),
10633                    );
10634
10635                    if usage.has_token_data() {
10636                        has_any_tokens = true;
10637                    }
10638
10639                    let content_chars = msg.content.len() as i64;
10640                    let content_tokens_est = content_chars / 4;
10641                    let msg_hour_id = FrankenStorage::hour_id_from_millis(msg_ts);
10642                    let has_plan = has_plan_for_role(&role_s, &msg.content);
10643
10644                    token_entries.push(TokenUsageEntry {
10645                        message_id,
10646                        conversation_id: conv_id,
10647                        agent_id,
10648                        workspace_id,
10649                        source_id: conv.source_id.clone(),
10650                        timestamp_ms: msg_ts,
10651                        day_id: msg_day_id,
10652                        model_name: usage.model_name.clone(),
10653                        model_family: Some(model_family.clone()),
10654                        model_tier: Some(model_tier.clone()),
10655                        service_tier: usage.service_tier.clone(),
10656                        provider: Some(provider.clone()),
10657                        input_tokens: usage.input_tokens,
10658                        output_tokens: usage.output_tokens,
10659                        cache_read_tokens: usage.cache_read_tokens,
10660                        cache_creation_tokens: usage.cache_creation_tokens,
10661                        thinking_tokens: usage.thinking_tokens,
10662                        total_tokens: usage.total_tokens(),
10663                        estimated_cost_usd: estimated_cost,
10664                        role: role_s.to_string(),
10665                        content_chars,
10666                        has_tool_calls: usage.has_tool_calls,
10667                        tool_call_count: usage.tool_call_count,
10668                        data_source: usage.data_source.as_str().to_string(),
10669                    });
10670
10671                    let mm = MessageMetricsEntry {
10672                        message_id,
10673                        created_at_ms: msg_ts,
10674                        hour_id: msg_hour_id,
10675                        day_id: msg_day_id,
10676                        agent_slug: conv.agent_slug.clone(),
10677                        workspace_id: workspace_id.unwrap_or(0),
10678                        source_id: conv.source_id.clone(),
10679                        role: role_s.to_string(),
10680                        content_chars,
10681                        content_tokens_est,
10682                        model_name: usage.model_name.clone(),
10683                        model_family: model_family.clone(),
10684                        model_tier: model_tier.clone(),
10685                        provider,
10686                        api_input_tokens: usage.input_tokens,
10687                        api_output_tokens: usage.output_tokens,
10688                        api_cache_read_tokens: usage.cache_read_tokens,
10689                        api_cache_creation_tokens: usage.cache_creation_tokens,
10690                        api_thinking_tokens: usage.thinking_tokens,
10691                        api_service_tier: usage.service_tier.clone(),
10692                        api_data_source: usage.data_source.as_str().to_string(),
10693                        tool_call_count: usage.tool_call_count as i64,
10694                        has_tool_calls: usage.has_tool_calls,
10695                        has_plan,
10696                    };
10697                    rollup_agg.record(&mm);
10698                    metrics_entries.push(mm);
10699                }
10700
10701                if session_count_delta > 0 {
10702                    token_stats.record_session(
10703                        &conv.agent_slug,
10704                        &conv.source_id,
10705                        conv_day_id,
10706                        &session_model_family,
10707                    );
10708                }
10709
10710                if has_any_tokens {
10711                    conv_ids_to_summarize.push(conv_id);
10712                }
10713            }
10714
10715            outcomes.push(InsertOutcome {
10716                conversation_id: conv_id,
10717                conversation_inserted: session_count_delta > 0,
10718                inserted_indices,
10719            });
10720        }
10721
10722        // Batch insert all FTS entries at once
10723        if !defer_lexical_updates {
10724            flush_pending_fts_entries(
10725                self,
10726                &tx,
10727                &mut fts_entries,
10728                &mut fts_pending_chars,
10729                &mut fts_inserted_total,
10730            )?;
10731        }
10732        if !defer_lexical_updates && fts_count_total > 0 {
10733            tracing::debug!(
10734                target: "cass::perf::fts5",
10735                total = fts_count_total,
10736                inserted = fts_inserted_total,
10737                conversations = conversations.len(),
10738                "franken_batch_fts_insert_complete"
10739            );
10740        }
10741
10742        // Batched daily_stats update
10743        if !defer_analytics_updates && !stats.is_empty() {
10744            let entries = stats.expand();
10745            let affected = franken_update_daily_stats_batched_in_tx(&tx, &entries)?;
10746            tracing::debug!(
10747                target: "cass::perf::daily_stats",
10748                raw = stats.raw_entry_count(),
10749                expanded = entries.len(),
10750                affected = affected,
10751                "franken_batched_stats_update_complete"
10752            );
10753        }
10754
10755        // Batch insert token_usage rows
10756        if !defer_analytics_updates && !token_entries.is_empty() {
10757            let token_count = token_entries.len();
10758            let inserted = franken_insert_token_usage_batched_in_tx(&tx, &token_entries)?;
10759            tracing::debug!(
10760                target: "cass::perf::token_usage",
10761                total = token_count,
10762                inserted = inserted,
10763                "franken_batch_token_usage_insert_complete"
10764            );
10765        }
10766
10767        // Batched token_daily_stats update
10768        if !defer_analytics_updates && !token_stats.is_empty() {
10769            let entries = token_stats.expand();
10770            let affected = franken_update_token_daily_stats_batched_in_tx(&tx, &entries)?;
10771            tracing::debug!(
10772                target: "cass::perf::token_daily_stats",
10773                raw = token_stats.raw_entry_count(),
10774                expanded = entries.len(),
10775                affected = affected,
10776                "franken_batched_token_stats_update_complete"
10777            );
10778        }
10779
10780        // Batch insert message_metrics rows
10781        if !defer_analytics_updates && !metrics_entries.is_empty() {
10782            let mm_count = metrics_entries.len();
10783            let inserted = franken_insert_message_metrics_batched_in_tx(&tx, &metrics_entries)?;
10784            tracing::debug!(
10785                target: "cass::perf::message_metrics",
10786                total = mm_count,
10787                inserted = inserted,
10788                "franken_batch_message_metrics_insert_complete"
10789            );
10790        }
10791
10792        // Flush usage_hourly + usage_daily rollups
10793        if !defer_analytics_updates && !rollup_agg.is_empty() {
10794            let (hourly, daily, models_daily) =
10795                franken_flush_analytics_rollups_in_tx(&tx, &rollup_agg)?;
10796            tracing::debug!(
10797                target: "cass::perf::usage_rollups",
10798                hourly_buckets = rollup_agg.hourly_entry_count(),
10799                daily_buckets = rollup_agg.daily_entry_count(),
10800                models_daily_buckets = rollup_agg.models_daily_entry_count(),
10801                hourly_affected = hourly,
10802                daily_affected = daily,
10803                models_daily_affected = models_daily,
10804                "franken_batched_usage_rollups_complete"
10805            );
10806        }
10807
10808        // Update conversation-level token summaries
10809        if !defer_analytics_updates {
10810            for conv_id in &conv_ids_to_summarize {
10811                franken_update_conversation_token_summaries_in_tx(&tx, *conv_id)?;
10812            }
10813        }
10814
10815        tx.commit()?;
10816
10817        pricing_diag.log_summary();
10818
10819        Ok(outcomes)
10820    }
10821}
10822
10823fn normalized_storage_source_parts(
10824    source_id: Option<&str>,
10825    origin_kind: Option<&str>,
10826    origin_host: Option<&str>,
10827) -> (String, SourceKind, Option<String>) {
10828    let host_label = crate::search::tantivy::normalized_index_origin_host(origin_host);
10829    let source_id = crate::search::tantivy::normalized_index_source_id(
10830        source_id,
10831        origin_kind,
10832        host_label.as_deref(),
10833    );
10834
10835    if source_id == LOCAL_SOURCE_ID {
10836        (source_id, SourceKind::Local, None)
10837    } else {
10838        (source_id, SourceKind::Ssh, host_label)
10839    }
10840}
10841
10842fn normalized_source_for_conversation(conv: &Conversation) -> Source {
10843    let (id, kind, host_label) = normalized_storage_source_parts(
10844        Some(conv.source_id.as_str()),
10845        None,
10846        conv.origin_host.as_deref(),
10847    );
10848    Source {
10849        id,
10850        kind,
10851        host_label,
10852        machine_id: None,
10853        platform: None,
10854        config_json: None,
10855        created_at: None,
10856        updated_at: None,
10857    }
10858}
10859
10860fn is_bootstrap_local_source(source: &Source) -> bool {
10861    source.id == LOCAL_SOURCE_ID
10862        && matches!(source.kind, SourceKind::Local)
10863        && source.host_label.is_none()
10864        && source.machine_id.is_none()
10865        && source.platform.is_none()
10866        && source.config_json.is_none()
10867        && source.created_at.is_none()
10868        && source.updated_at.is_none()
10869}
10870
10871fn normalized_conversation_for_storage<'a>(conv: &'a Conversation) -> Cow<'a, Conversation> {
10872    let normalized_source = normalized_source_for_conversation(conv);
10873    if normalized_source.id == conv.source_id && normalized_source.host_label == conv.origin_host {
10874        Cow::Borrowed(conv)
10875    } else {
10876        let mut normalized = conv.clone();
10877        normalized.source_id = normalized_source.id;
10878        normalized.origin_host = normalized_source.host_label;
10879        Cow::Owned(normalized)
10880    }
10881}
10882
10883impl FrankenStorage {
10884    fn ensure_source_for_conversation(&self, conv: &Conversation) -> Result<()> {
10885        let source = normalized_source_for_conversation(conv);
10886        if is_bootstrap_local_source(&source) {
10887            // `open()` and schema repair always seed the canonical local source row.
10888            // Avoid an autocommit UPDATE on every local conversation insert.
10889            return Ok(());
10890        }
10891        let cache_key = EnsuredConversationSourceKey::from_source(&source);
10892        if self.conversation_source_already_ensured(&cache_key) {
10893            return Ok(());
10894        }
10895        self.upsert_source(&source)?;
10896        self.mark_conversation_source_ensured(cache_key);
10897        Ok(())
10898    }
10899
10900    fn ensure_sources_for_batch(
10901        &self,
10902        conversations: &[(i64, Option<i64>, &Conversation)],
10903    ) -> Result<()> {
10904        let mut seen = HashSet::with_capacity(conversations.len());
10905        for &(_, _, conv) in conversations {
10906            let source = normalized_source_for_conversation(conv);
10907            if seen.insert(source.id.clone()) {
10908                if is_bootstrap_local_source(&source) {
10909                    continue;
10910                }
10911                self.upsert_source(&source)?;
10912                self.mark_conversation_source_ensured(EnsuredConversationSourceKey::from_source(
10913                    &source,
10914                ));
10915            }
10916        }
10917        Ok(())
10918    }
10919}
10920
10921// =========================================================================
10922// FrankenStorage transaction helper functions
10923// =========================================================================
10924
10925/// Get last_insert_rowid from a frankensqlite transaction.
10926fn franken_last_rowid(tx: &FrankenTransaction<'_>) -> Result<i64> {
10927    tx.last_insert_rowid()
10928        .ok()
10929        .filter(|&id| id > 0)
10930        .with_context(|| "last_insert_rowid() returned NULL or 0 after INSERT")
10931}
10932
10933/// Bug #167: Ensure all agents referenced by a batch exist within the
10934/// transaction.  The caller already resolved `agent_id` values via
10935/// `ensure_agent` outside the transaction, but those autocommit writes may
10936/// not be visible inside a frankensqlite transaction snapshot.  This function
10937/// checks each unique agent_id and creates a stub row if it's missing.
10938fn ensure_agents_in_tx(
10939    tx: &FrankenTransaction<'_>,
10940    conversations: &[(i64, Option<i64>, &Conversation)],
10941) -> Result<()> {
10942    let mut seen = HashSet::new();
10943    let now = FrankenStorage::now_millis();
10944    for &(agent_id, _, conv) in conversations {
10945        if !seen.insert(agent_id) {
10946            continue;
10947        }
10948        let exists: i64 = tx.query_row_map(
10949            "SELECT COUNT(*) FROM agents WHERE id = ?1",
10950            fparams![agent_id],
10951            |row| row.get_typed(0),
10952        )?;
10953        if exists == 0 {
10954            tracing::debug!(
10955                target: "cass::fk_guard",
10956                agent_id,
10957                slug = %conv.agent_slug,
10958                "inserting agent row inside transaction to satisfy FK constraint"
10959            );
10960            // INSERT OR IGNORE: the slug might already exist with a different
10961            // id from a concurrent writer.  If the slug row exists, the FK
10962            // constraint is already satisfied (the caller just got a stale id).
10963            tx.execute_compat(
10964                "INSERT OR IGNORE INTO agents(id, slug, name, kind, created_at, updated_at)
10965                 VALUES(?1, ?2, ?3, 'cli', ?4, ?5)",
10966                fparams![
10967                    agent_id,
10968                    conv.agent_slug.as_str(),
10969                    conv.agent_slug.as_str(),
10970                    now,
10971                    now
10972                ],
10973            )?;
10974        }
10975    }
10976    Ok(())
10977}
10978
10979/// Bug #167: Ensure all workspaces referenced by a batch exist within the
10980/// transaction.  Same rationale as `ensure_agents_in_tx`.
10981fn ensure_workspaces_in_tx(
10982    tx: &FrankenTransaction<'_>,
10983    conversations: &[(i64, Option<i64>, &Conversation)],
10984) -> Result<()> {
10985    let mut seen = HashSet::new();
10986    for &(_, workspace_id, conv) in conversations {
10987        let ws_id = match workspace_id {
10988            Some(id) => id,
10989            None => continue,
10990        };
10991        if !seen.insert(ws_id) {
10992            continue;
10993        }
10994        let exists: i64 = tx.query_row_map(
10995            "SELECT COUNT(*) FROM workspaces WHERE id = ?1",
10996            fparams![ws_id],
10997            |row| row.get_typed(0),
10998        )?;
10999        if exists == 0 {
11000            let path_str = conv
11001                .workspace
11002                .as_ref()
11003                .map(|p| p.to_string_lossy().to_string())
11004                .unwrap_or_default();
11005            tracing::debug!(
11006                target: "cass::fk_guard",
11007                workspace_id = ws_id,
11008                path = %path_str,
11009                "inserting workspace row inside transaction to satisfy FK constraint"
11010            );
11011            tx.execute_compat(
11012                "INSERT OR IGNORE INTO workspaces(id, path) VALUES(?1, ?2)",
11013                fparams![ws_id, path_str.as_str()],
11014            )?;
11015        }
11016    }
11017    Ok(())
11018}
11019
11020/// Bug #167: Ensure all sources referenced by a batch exist within the
11021/// transaction.  Same rationale as `ensure_agents_in_tx` — source_id is a
11022/// TEXT FK on the conversations table.
11023fn ensure_sources_in_tx(
11024    tx: &FrankenTransaction<'_>,
11025    conversations: &[(i64, Option<i64>, &Conversation)],
11026) -> Result<()> {
11027    let mut seen = HashSet::new();
11028    for &(_, _, conv) in conversations {
11029        let (source_id, source_kind, host_label) = normalized_storage_source_parts(
11030            Some(conv.source_id.as_str()),
11031            None,
11032            conv.origin_host.as_deref(),
11033        );
11034        if !seen.insert(source_id.clone()) {
11035            continue;
11036        }
11037        let exists: i64 = tx.query_row_map(
11038            "SELECT COUNT(*) FROM sources WHERE id = ?1",
11039            fparams![source_id.as_str()],
11040            |row| row.get_typed(0),
11041        )?;
11042        if exists == 0 {
11043            let kind_str = source_kind.to_string();
11044            let now = FrankenStorage::now_millis();
11045            tracing::debug!(
11046                target: "cass::fk_guard",
11047                source_id = %source_id,
11048                kind = kind_str.as_str(),
11049                "inserting source row inside transaction to satisfy FK constraint"
11050            );
11051            tx.execute_compat(
11052                "INSERT OR IGNORE INTO sources(id, kind, host_label, created_at, updated_at)
11053                 VALUES(?1, ?2, ?3, ?4, ?5)",
11054                fparams![
11055                    source_id.as_str(),
11056                    kind_str.as_str(),
11057                    host_label.as_deref(),
11058                    now,
11059                    now
11060                ],
11061            )?;
11062        }
11063    }
11064    Ok(())
11065}
11066
11067fn env_flag_enabled(name: &str) -> bool {
11068    dotenvy::var(name)
11069        .map(|v| !(v == "0" || v.eq_ignore_ascii_case("false")))
11070        .unwrap_or(false)
11071}
11072
11073fn defer_storage_lexical_updates_enabled() -> bool {
11074    env_flag_enabled("CASS_DEFER_LEXICAL_UPDATES")
11075}
11076
11077fn defer_analytics_updates_enabled() -> bool {
11078    env_flag_enabled("CASS_DEFER_ANALYTICS_UPDATES")
11079}
11080
/// Outcome of trying to store a conversation: either a new row was created
/// or a matching row was already present.
enum ConversationInsertStatus {
    /// A new conversation row was inserted; carries the new row's id.
    Inserted(i64),
    /// A matching conversation already existed; carries that row's id.
    Existing(i64),
}
11085
11086fn franken_find_external_conversation_tail_lookup(
11087    tx: &FrankenTransaction<'_>,
11088    lookup_key: &str,
11089) -> Result<Option<ExistingConversationWithTail>> {
11090    let params = [SqliteValue::from(lookup_key)];
11091    let row = tx
11092        .query_row_with_params(
11093            "SELECT conversation_id, ended_at, last_message_idx, last_message_created_at
11094             FROM conversation_external_tail_lookup
11095             WHERE lookup_key = ?1",
11096            &params,
11097        )
11098        .optional()?;
11099    let Some(row) = row else {
11100        return Ok(None);
11101    };
11102    let id = row.get_typed(0)?;
11103    let ended_at = row.get_typed(1)?;
11104    let last_message_idx = row.get_typed(2)?;
11105    let last_message_created_at = row.get_typed(3)?;
11106    Ok(Some(ExistingConversationWithTail {
11107        id,
11108        tail_state: existing_conversation_tail_state_from_cached(
11109            last_message_idx,
11110            last_message_created_at,
11111            ended_at,
11112        ),
11113    }))
11114}
11115
11116fn franken_find_external_conversation_lookup(
11117    tx: &FrankenTransaction<'_>,
11118    lookup_key: &str,
11119) -> Result<Option<i64>> {
11120    Ok(franken_find_external_conversation_tail_lookup(tx, lookup_key)?.map(|existing| existing.id))
11121}
11122
11123fn franken_insert_external_conversation_tail_lookup_key(
11124    tx: &FrankenTransaction<'_>,
11125    lookup_key: &str,
11126    conversation_id: i64,
11127    ended_at: Option<i64>,
11128    last_message_idx: Option<i64>,
11129    last_message_created_at: Option<i64>,
11130) -> Result<()> {
11131    let params = [
11132        SqliteValue::from(lookup_key),
11133        SqliteValue::from(conversation_id),
11134        SqliteValue::from(ended_at),
11135        SqliteValue::from(last_message_idx),
11136        SqliteValue::from(last_message_created_at),
11137    ];
11138    tx.execute_with_params(
11139        "INSERT OR REPLACE INTO conversation_external_tail_lookup(
11140             lookup_key, conversation_id, ended_at, last_message_idx, last_message_created_at
11141         ) VALUES(?1, ?2, ?3, ?4, ?5)",
11142        &params,
11143    )?;
11144    Ok(())
11145}
11146
11147fn franken_insert_external_conversation_tail_lookup(
11148    tx: &FrankenTransaction<'_>,
11149    source_id: &str,
11150    agent_id: i64,
11151    external_id: &str,
11152    existing: ExistingConversationWithTail,
11153) -> Result<()> {
11154    let lookup_key = conversation_external_lookup_key(source_id, agent_id, external_id);
11155    let ended_at = existing.tail_state.and_then(|state| state.ended_at);
11156    let last_message_idx = existing.tail_state.map(|state| state.last_message_idx);
11157    let last_message_created_at = existing
11158        .tail_state
11159        .map(|state| state.last_message_created_at);
11160    franken_insert_external_conversation_tail_lookup_key(
11161        tx,
11162        &lookup_key,
11163        existing.id,
11164        ended_at,
11165        last_message_idx,
11166        last_message_created_at,
11167    )
11168}
11169
11170fn franken_update_external_conversation_tail_lookup_key(
11171    tx: &FrankenTransaction<'_>,
11172    lookup_key: &str,
11173    ended_at_candidate: Option<i64>,
11174    last_message_idx_candidate: Option<i64>,
11175    last_message_created_at_candidate: Option<i64>,
11176) -> Result<()> {
11177    if ended_at_candidate.is_none()
11178        && last_message_idx_candidate.is_none()
11179        && last_message_created_at_candidate.is_none()
11180    {
11181        return Ok(());
11182    }
11183    tx.execute_compat(
11184        "UPDATE conversation_external_tail_lookup
11185         SET ended_at = CASE
11186                 WHEN ?1 IS NULL THEN ended_at
11187                 ELSE MAX(IFNULL(ended_at, 0), ?1)
11188             END,
11189             last_message_idx = CASE
11190                 WHEN ?2 IS NULL THEN last_message_idx
11191                 WHEN last_message_idx IS NULL OR last_message_idx < ?2 THEN ?2
11192                 ELSE last_message_idx
11193             END,
11194             last_message_created_at = CASE
11195                 WHEN ?3 IS NULL THEN last_message_created_at
11196                 WHEN last_message_created_at IS NULL OR last_message_created_at < ?3 THEN ?3
11197                 ELSE last_message_created_at
11198             END
11199         WHERE lookup_key = ?4",
11200        fparams![
11201            ended_at_candidate,
11202            last_message_idx_candidate,
11203            last_message_created_at_candidate,
11204            lookup_key
11205        ],
11206    )?;
11207    Ok(())
11208}
11209
11210fn franken_set_external_conversation_tail_lookup_after_append(
11211    tx: &FrankenTransaction<'_>,
11212    lookup_key: &str,
11213    ended_at: i64,
11214    last_message_idx: i64,
11215    last_message_created_at: i64,
11216) -> Result<()> {
11217    tx.execute_compat(
11218        "UPDATE conversation_external_tail_lookup
11219         SET ended_at = ?1,
11220             last_message_idx = ?2,
11221             last_message_created_at = ?3
11222         WHERE lookup_key = ?4",
11223        fparams![
11224            ended_at,
11225            last_message_idx,
11226            last_message_created_at,
11227            lookup_key
11228        ],
11229    )?;
11230    Ok(())
11231}
11232
11233fn franken_update_external_conversation_tail_after_append(
11234    tx: &FrankenTransaction<'_>,
11235    agent_id: i64,
11236    conv: &Conversation,
11237    used_append_tail_plan: bool,
11238    exact_append_set: bool,
11239    inserted_last_idx: Option<i64>,
11240    inserted_last_created_at: Option<i64>,
11241) -> Result<()> {
11242    let Some(lookup_key) = conversation_external_lookup_key_for_conv(agent_id, conv) else {
11243        return Ok(());
11244    };
11245
11246    if exact_append_set
11247        && let (Some(last_message_idx), Some(last_message_created_at)) =
11248            (inserted_last_idx, inserted_last_created_at)
11249    {
11250        return franken_set_external_conversation_tail_lookup_after_append(
11251            tx,
11252            &lookup_key,
11253            last_message_created_at,
11254            last_message_idx,
11255            last_message_created_at,
11256        );
11257    }
11258
11259    let ended_at_candidate = if used_append_tail_plan {
11260        inserted_last_created_at
11261    } else {
11262        conv.messages.iter().filter_map(|m| m.created_at).max()
11263    };
11264    franken_update_external_conversation_tail_lookup_key(
11265        tx,
11266        &lookup_key,
11267        ended_at_candidate,
11268        inserted_last_idx,
11269        inserted_last_created_at,
11270    )
11271}
11272
11273fn franken_find_existing_conversation_by_key(
11274    tx: &FrankenTransaction<'_>,
11275    key: &PendingConversationKey,
11276    conv: Option<&Conversation>,
11277) -> Result<Option<i64>> {
11278    franken_find_existing_conversation_by_key_impl(tx, key, conv, false)
11279}
11280
11281fn franken_find_existing_conversation_by_key_after_conflict(
11282    tx: &FrankenTransaction<'_>,
11283    key: &PendingConversationKey,
11284    conv: Option<&Conversation>,
11285) -> Result<Option<i64>> {
11286    franken_find_existing_conversation_by_key_impl(tx, key, conv, true)
11287}
11288
11289fn franken_find_existing_conversation_by_key_impl(
11290    tx: &FrankenTransaction<'_>,
11291    key: &PendingConversationKey,
11292    conv: Option<&Conversation>,
11293    allow_legacy_external_scan: bool,
11294) -> Result<Option<i64>> {
11295    match key {
11296        PendingConversationKey::External {
11297            source_id,
11298            agent_id,
11299            external_id,
11300        } => {
11301            let lookup_key = conversation_external_lookup_key(source_id, *agent_id, external_id);
11302            if let Some(existing_id) = franken_find_external_conversation_lookup(tx, &lookup_key)? {
11303                return Ok(Some(existing_id));
11304            }
11305            if !allow_legacy_external_scan {
11306                return Ok(None);
11307            }
11308
11309            let existing_id = tx
11310                .query_row_map(
11311                    "SELECT id
11312                 FROM conversations
11313                 WHERE source_id = ?1 AND agent_id = ?2 AND external_id = ?3",
11314                    fparams![source_id.as_str(), *agent_id, external_id.as_str()],
11315                    |row| row.get_typed(0),
11316                )
11317                .optional()?;
11318            if let Some(existing_id) = existing_id {
11319                let tail_state = franken_existing_conversation_append_tail_state(tx, existing_id)?;
11320                franken_insert_external_conversation_tail_lookup_key(
11321                    tx,
11322                    &lookup_key,
11323                    existing_id,
11324                    tail_state.and_then(|state| state.ended_at),
11325                    tail_state.map(|state| state.last_message_idx),
11326                    tail_state.map(|state| state.last_message_created_at),
11327                )?;
11328                Ok(Some(existing_id))
11329            } else {
11330                Ok(None)
11331            }
11332        }
11333        PendingConversationKey::SourcePath {
11334            source_id,
11335            agent_id,
11336            source_path,
11337            started_at,
11338        } => {
11339            let exact_match = tx
11340                .query_row_map(
11341                    "SELECT c.id
11342                     FROM conversations c
11343                     WHERE c.source_id = ?1
11344                       AND c.agent_id = ?2
11345                       AND c.source_path = ?3
11346                       AND ((
11347                            COALESCE(
11348                                c.started_at,
11349                                (SELECT MIN(created_at)
11350                                 FROM messages
11351                                 WHERE conversation_id = c.id
11352                                   AND created_at IS NOT NULL)
11353                            ) IS NULL
11354                            AND ?4 IS NULL
11355                       ) OR COALESCE(
11356                            c.started_at,
11357                            (SELECT MIN(created_at)
11358                             FROM messages
11359                             WHERE conversation_id = c.id
11360                               AND created_at IS NOT NULL)
11361                       ) = ?4)
11362                     ORDER BY c.id
11363                     LIMIT 1",
11364                    fparams![
11365                        source_id.as_str(),
11366                        *agent_id,
11367                        source_path.as_str(),
11368                        *started_at
11369                    ],
11370                    |row| row.get_typed(0),
11371                )
11372                .optional()?;
11373            if exact_match.is_some() {
11374                return Ok(exact_match);
11375            }
11376
11377            let Some(conv) = conv else {
11378                return Ok(None);
11379            };
11380            let incoming_fingerprints = conversation_message_fingerprints(conv);
11381            if incoming_fingerprints.is_empty() {
11382                return Ok(None);
11383            }
11384            let incoming_replay_fingerprints = conversation_message_replay_fingerprints(conv);
11385
11386            let candidates: Vec<(i64, Option<i64>)> = tx.query_map_collect(
11387                "SELECT
11388                     c.id,
11389                     COALESCE(
11390                         c.started_at,
11391                         (SELECT MIN(created_at)
11392                          FROM messages
11393                          WHERE conversation_id = c.id
11394                            AND created_at IS NOT NULL)
11395                     ) AS effective_started_at
11396                 FROM conversations c
11397                 WHERE c.source_id = ?1
11398                   AND c.agent_id = ?2
11399                   AND c.source_path = ?3
11400                 ORDER BY c.id",
11401                fparams![source_id.as_str(), *agent_id, source_path.as_str()],
11402                |row| Ok((row.get_typed(0)?, row.get_typed(1)?)),
11403            )?;
11404
11405            let mut best_candidate: Option<(i64, ConversationMergeEvidence)> = None;
11406            for (candidate_id, candidate_started_at) in candidates {
11407                let existing_fingerprints =
11408                    franken_existing_message_fingerprints(tx, candidate_id)?;
11409                let existing_replay_fingerprints =
11410                    replay_fingerprints_from_merge_set(&existing_fingerprints);
11411                let Some(evidence) = conversation_merge_evidence(
11412                    &incoming_fingerprints,
11413                    &incoming_replay_fingerprints,
11414                    &existing_fingerprints,
11415                    &existing_replay_fingerprints,
11416                    *started_at,
11417                    candidate_started_at,
11418                ) else {
11419                    continue;
11420                };
11421
11422                let candidate_key = (
11423                    evidence.exact_overlap,
11424                    evidence.replay_overlap,
11425                    evidence.started_close,
11426                    evidence.smaller_replay_set,
11427                    std::cmp::Reverse(evidence.start_distance_ms),
11428                );
11429                let should_replace = best_candidate
11430                    .as_ref()
11431                    .map(|(_, best_evidence)| {
11432                        candidate_key
11433                            > (
11434                                best_evidence.exact_overlap,
11435                                best_evidence.replay_overlap,
11436                                best_evidence.started_close,
11437                                best_evidence.smaller_replay_set,
11438                                std::cmp::Reverse(best_evidence.start_distance_ms),
11439                            )
11440                    })
11441                    .unwrap_or(true);
11442
11443                if should_replace {
11444                    best_candidate = Some((candidate_id, evidence));
11445                }
11446            }
11447
11448            Ok(best_candidate.map(|(candidate_id, _)| candidate_id))
11449        }
11450    }
11451}
11452
11453fn franken_insert_conversation_or_get_existing(
11454    tx: &FrankenTransaction<'_>,
11455    agent_id: i64,
11456    workspace_id: Option<i64>,
11457    conv: &Conversation,
11458) -> Result<ConversationInsertStatus> {
11459    let conversation_key = conversation_merge_key(agent_id, conv);
11460    if let Some(existing_id) =
11461        franken_find_existing_conversation_by_key(tx, &conversation_key, Some(conv))?
11462    {
11463        return Ok(ConversationInsertStatus::Existing(existing_id));
11464    }
11465
11466    franken_insert_conversation_or_get_existing_after_miss(
11467        tx,
11468        agent_id,
11469        workspace_id,
11470        conv,
11471        &conversation_key,
11472    )
11473}
11474
11475fn franken_insert_conversation_or_get_existing_after_miss(
11476    tx: &FrankenTransaction<'_>,
11477    agent_id: i64,
11478    workspace_id: Option<i64>,
11479    conv: &Conversation,
11480    conversation_key: &PendingConversationKey,
11481) -> Result<ConversationInsertStatus> {
11482    match franken_insert_conversation(tx, agent_id, workspace_id, conv) {
11483        Ok(Some(conv_id)) => Ok(ConversationInsertStatus::Inserted(conv_id)),
11484        Ok(None) => {
11485            // A concurrent writer won the unique-provenance race. Resolve the
11486            // canonical row so callers can merge messages into it.
11487            let existing_id =
11488                franken_find_existing_conversation_by_key_after_conflict(
11489                    tx,
11490                    conversation_key,
11491                    Some(conv),
11492                )?
11493                    .with_context(|| {
11494                        format!(
11495                            "conversation INSERT produced a duplicate conflict but existing row was not found for source_id={} agent_id={} external_id={:?} source_path={}",
11496                            conv.source_id,
11497                            agent_id,
11498                            conv.external_id,
11499                            conv.source_path.display()
11500                        )
11501                    })?;
11502            tracing::warn!(
11503                source_id = %conv.source_id,
11504                agent_id,
11505                external_id = ?conv.external_id,
11506                existing_id,
11507                source_path = %conv.source_path.display(),
11508                "conversation INSERT: duplicate gracefully recovered, reusing existing row"
11509            );
11510            Ok(ConversationInsertStatus::Existing(existing_id))
11511        }
11512        Err(error) => {
11513            tracing::error!(
11514                source_id = %conv.source_id,
11515                agent_id,
11516                external_id = ?conv.external_id,
11517                error = %error,
11518                source_path = %conv.source_path.display(),
11519                "franken_insert_conversation failed"
11520            );
11521            Err(error)
11522        }
11523    }
11524}
11525
11526/// Insert a conversation into the DB within a frankensqlite transaction.
11527///
11528/// Uses a plain `INSERT` so the common miss path stays on the slim direct
11529/// insert lane. Duplicate provenance conflicts are converted into `Ok(None)`
11530/// so callers can recover the canonical row and merge messages into it.
11531fn franken_insert_conversation(
11532    tx: &FrankenTransaction<'_>,
11533    agent_id: i64,
11534    workspace_id: Option<i64>,
11535    conv: &Conversation,
11536) -> Result<Option<i64>> {
11537    let (metadata_json_str, metadata_bin) = franken_metadata_insert_payload(&conv.metadata_json)?;
11538    let (last_message_idx, last_message_created_at) = conversation_tail_state(conv);
11539    let metadata_bin_bytes = metadata_bin.as_deref();
11540
11541    match tx.execute_compat(
11542        "INSERT INTO conversations(
11543            agent_id, workspace_id, source_id, external_id, title, source_path,
11544            started_at, ended_at, approx_tokens, metadata_json, origin_host, metadata_bin,
11545            last_message_idx, last_message_created_at
11546        ) VALUES(?1,?2,?3,?4,?5,?6,?7,?8,?9,?10,?11,?12,?13,?14)",
11547        fparams![
11548            agent_id,
11549            workspace_id,
11550            conv.source_id.as_str(),
11551            conv.external_id.as_deref(),
11552            conv.title.as_deref(),
11553            path_to_string(&conv.source_path),
11554            conv.started_at,
11555            conv.ended_at,
11556            conv.approx_tokens,
11557            metadata_json_str.as_deref(),
11558            conv.origin_host.as_deref(),
11559            metadata_bin_bytes,
11560            last_message_idx,
11561            last_message_created_at
11562        ],
11563    ) {
11564        Ok(_) => {
11565            let conv_id = franken_last_rowid(tx)?;
11566            franken_insert_conversation_tail_state(
11567                tx,
11568                conv_id,
11569                conv.ended_at,
11570                last_message_idx,
11571                last_message_created_at,
11572            )?;
11573            if let Some(external_id) = conv.external_id.as_deref() {
11574                franken_insert_external_conversation_tail_lookup(
11575                    tx,
11576                    conv.source_id.as_str(),
11577                    agent_id,
11578                    external_id,
11579                    ExistingConversationWithTail {
11580                        id: conv_id,
11581                        tail_state: existing_conversation_tail_state_from_cached(
11582                            last_message_idx,
11583                            last_message_created_at,
11584                            conv.ended_at,
11585                        ),
11586                    },
11587                )?;
11588            }
11589            Ok(Some(conv_id))
11590        }
11591        Err(frankensqlite::FrankenError::UniqueViolation { .. }) => {
11592            tracing::debug!(
11593                source_id = %conv.source_id,
11594                agent_id,
11595                external_id = ?conv.external_id,
11596                source_path = %conv.source_path.display(),
11597                "conversation INSERT: duplicate provenance conflict"
11598            );
11599            Ok(None)
11600        }
11601        Err(error) => Err(error.into()),
11602    }
11603}
11604
11605type MetadataInsertPayload<'a> = (Option<Cow<'a, str>>, Option<Vec<u8>>);
11606
11607fn franken_metadata_insert_payload(value: &serde_json::Value) -> Result<MetadataInsertPayload<'_>> {
11608    if let Some(raw) = historical_raw_json(value) {
11609        Ok((Some(Cow::Borrowed(raw)), None))
11610    } else if value.is_null() {
11611        Ok((Some(Cow::Borrowed("null")), None))
11612    } else if value.as_object().is_some_and(|object| object.is_empty()) {
11613        Ok((None, None))
11614    } else if let Some(metadata_bin) = serialize_json_to_msgpack(value) {
11615        Ok((None, Some(metadata_bin)))
11616    } else {
11617        Ok((Some(Cow::Owned(serde_json::to_string(value)?)), None))
11618    }
11619}
11620
11621fn franken_insert_new_message(
11622    tx: &FrankenTransaction<'_>,
11623    conversation_id: i64,
11624    msg: &Message,
11625) -> Result<i64> {
11626    let (extra_json_str, extra_bin) = franken_message_insert_payload(msg)?;
11627    let extra_bin_bytes = extra_bin.as_deref();
11628
11629    tx.execute_compat(
11630        "INSERT INTO messages(conversation_id, idx, role, author, created_at, content, extra_json, extra_bin)
11631         VALUES(?1,?2,?3,?4,?5,?6,?7,?8)",
11632            fparams![
11633                conversation_id,
11634                msg.idx,
11635                role_as_str(&msg.role),
11636                msg.author.as_deref(),
11637                msg.created_at,
11638                msg.content.as_str(),
11639                extra_json_str.as_deref(),
11640                extra_bin_bytes
11641        ],
11642    )?;
11643    franken_last_rowid(tx)
11644}
11645
11646type MessageInsertPayload<'a> = (Option<Cow<'a, str>>, Option<Vec<u8>>);
11647
11648fn franken_message_insert_payload(msg: &Message) -> Result<MessageInsertPayload<'_>> {
11649    if let Some(raw) = historical_raw_json(&msg.extra_json) {
11650        Ok((Some(Cow::Borrowed(raw)), None))
11651    } else if msg.extra_json.is_null() {
11652        Ok((None, None))
11653    } else {
11654        let extra_bin = serialize_json_to_msgpack(&msg.extra_json);
11655        if extra_bin.is_some() {
11656            Ok((None, extra_bin))
11657        } else {
11658            Ok((
11659                Some(Cow::Owned(serde_json::to_string(&msg.extra_json)?)),
11660                None,
11661            ))
11662        }
11663    }
11664}
11665
/// Rows per statement for proven-new message inserts.
///
/// Every row binds 8 values, so a full batch binds 800 parameters. That stays
/// below SQLite's historical default `SQLITE_MAX_VARIABLE_NUMBER` of 999
/// while still amortizing statement parse cost.
const MESSAGE_INSERT_BATCH_SIZE: usize = 100;

/// Rows per statement for append workloads.
///
/// After the tail-state hot table removed conversation-row rewrites from the
/// append path, 50-row chunks beat the old 20-row setting on the append-merge
/// profile. 100-row chunks slightly regress the 20-message workload.
const APPEND_MESSAGE_INSERT_BATCH_SIZE: usize = 50;

/// Return the cached multi-row `INSERT INTO messages` statement for `row_count` rows.
///
/// The statements for every row count up to the larger of the two batch-size
/// constants are built once, on first use. Entry 0 is the empty string.
///
/// # Panics
/// Panics when `row_count` exceeds the largest cached batch size.
fn message_insert_batch_sql(row_count: usize) -> &'static str {
    const VALUES_PER_ROW: usize = 8;
    const INSERT_PREFIX: &str = "INSERT INTO messages(conversation_id, idx, role, author, created_at, content, extra_json, extra_bin) VALUES ";
    static SQL_TABLE: std::sync::OnceLock<Vec<String>> = std::sync::OnceLock::new();

    let table = SQL_TABLE.get_or_init(|| {
        let largest_batch = MESSAGE_INSERT_BATCH_SIZE.max(APPEND_MESSAGE_INSERT_BATCH_SIZE);
        let mut table = Vec::with_capacity(largest_batch + 1);
        table.push(String::new());

        // The statement for n rows is a prefix of the statement for n + 1 rows,
        // so one growing buffer is snapshotted after each appended row group.
        let mut statement = String::from(INSERT_PREFIX);
        for row in 0..largest_batch {
            if row > 0 {
                statement.push(',');
            }
            statement.push('(');
            for slot in 1..=VALUES_PER_ROW {
                if slot > 1 {
                    statement.push(',');
                }
                statement.push('?');
                statement.push_str(&(row * VALUES_PER_ROW + slot).to_string());
            }
            statement.push(')');
            table.push(statement.clone());
        }
        table
    });

    table
        .get(row_count)
        .expect("message insert batch size must be covered by the cached SQL table")
        .as_str()
}
11716
11717fn franken_batch_insert_new_messages(
11718    tx: &FrankenTransaction<'_>,
11719    conversation_id: i64,
11720    messages: &[&Message],
11721) -> Result<Vec<i64>> {
11722    franken_batch_insert_new_messages_with_batch_size(
11723        tx,
11724        conversation_id,
11725        messages,
11726        MESSAGE_INSERT_BATCH_SIZE,
11727    )
11728}
11729
11730fn franken_append_insert_new_messages(
11731    tx: &FrankenTransaction<'_>,
11732    conversation_id: i64,
11733    messages: &[&Message],
11734) -> Result<Vec<i64>> {
11735    franken_batch_insert_new_messages_with_batch_size(
11736        tx,
11737        conversation_id,
11738        messages,
11739        APPEND_MESSAGE_INSERT_BATCH_SIZE,
11740    )
11741}
11742
11743fn franken_batch_insert_new_messages_with_batch_size(
11744    tx: &FrankenTransaction<'_>,
11745    conversation_id: i64,
11746    messages: &[&Message],
11747    batch_size: usize,
11748) -> Result<Vec<i64>> {
11749    let batch_size = batch_size.max(1);
11750    let mut inserted_ids = Vec::with_capacity(messages.len());
11751    for chunk in messages.chunks(batch_size) {
11752        if chunk.len() == 1 {
11753            inserted_ids.push(franken_insert_new_message(tx, conversation_id, chunk[0])?);
11754            continue;
11755        }
11756        let sql = message_insert_batch_sql(chunk.len());
11757
11758        let mut param_values: Vec<SqliteValue> = Vec::with_capacity(chunk.len() * 8);
11759        for msg in chunk {
11760            let (extra_json_str, extra_bin) = franken_message_insert_payload(msg)?;
11761            param_values.push(SqliteValue::from(conversation_id));
11762            param_values.push(SqliteValue::from(msg.idx));
11763            param_values.push(SqliteValue::from(role_as_str(&msg.role)));
11764            param_values.push(SqliteValue::from(msg.author.as_deref()));
11765            param_values.push(SqliteValue::from(msg.created_at));
11766            param_values.push(SqliteValue::from(msg.content.as_str()));
11767            param_values.push(SqliteValue::from(extra_json_str.as_deref()));
11768            param_values.push(SqliteValue::from(extra_bin.as_deref()));
11769        }
11770
11771        tx.execute_with_params(sql, &param_values)?;
11772
11773        let last_id = franken_last_rowid(tx)?;
11774        let first_id = last_id
11775            .checked_sub((chunk.len() - 1) as i64)
11776            .with_context(|| {
11777                format!(
11778                    "inferring rowid range for {}-row message batch ending at {last_id}",
11779                    chunk.len()
11780                )
11781            })?;
11782        inserted_ids.extend((0..chunk.len()).map(|offset| first_id + offset as i64));
11783    }
11784
11785    Ok(inserted_ids)
11786}
11787
/// Test-only variant of the single-row message insert that records timings.
///
/// Increments `single_row_calls` and `batch_rows` on `profile`, then adds the
/// elapsed time of payload encoding, statement execution and rowid retrieval
/// to the matching duration fields. Returns the new message rowid.
#[cfg(test)]
fn franken_insert_new_message_with_profile(
    tx: &FrankenTransaction<'_>,
    conversation_id: i64,
    msg: &Message,
    profile: &mut MessageInsertSubstageProfile,
) -> Result<i64> {
    profile.single_row_calls += 1;
    profile.batch_rows += 1;

    let payload_timer = Instant::now();
    let (json_payload, bin_payload) = franken_message_insert_payload(msg)?;
    profile.payload_duration += payload_timer.elapsed();
    let bin_payload_bytes = bin_payload.as_deref();

    let execute_timer = Instant::now();
    tx.execute_compat(
        "INSERT INTO messages(conversation_id, idx, role, author, created_at, content, extra_json, extra_bin)
         VALUES(?1,?2,?3,?4,?5,?6,?7,?8)",
        fparams![
            conversation_id,
            msg.idx,
            role_as_str(&msg.role),
            msg.author.as_deref(),
            msg.created_at,
            msg.content.as_str(),
            json_payload.as_deref(),
            bin_payload_bytes
        ],
    )?;
    profile.execute_duration += execute_timer.elapsed();

    let rowid_timer = Instant::now();
    let new_rowid = franken_last_rowid(tx)?;
    profile.rowid_duration += rowid_timer.elapsed();

    Ok(new_rowid)
}
11825
/// Test-only profiled insert using chunks of `MESSAGE_INSERT_BATCH_SIZE`.
///
/// Timings and counters are accumulated into `profile`; the inserted rowids
/// are returned in input order.
#[cfg(test)]
fn franken_batch_insert_new_messages_with_profile(
    tx: &FrankenTransaction<'_>,
    conversation_id: i64,
    messages: &[&Message],
    profile: &mut MessageInsertSubstageProfile,
) -> Result<Vec<i64>> {
    let rows_per_statement = MESSAGE_INSERT_BATCH_SIZE;
    franken_batch_insert_new_messages_with_profile_batch_size(
        tx,
        conversation_id,
        messages,
        profile,
        rows_per_statement,
    )
}
11841
/// Test-only profiled insert using chunks of `APPEND_MESSAGE_INSERT_BATCH_SIZE`.
///
/// Timings and counters are accumulated into `profile`; the inserted rowids
/// are returned in input order.
#[cfg(test)]
fn franken_append_insert_new_messages_with_profile(
    tx: &FrankenTransaction<'_>,
    conversation_id: i64,
    messages: &[&Message],
    profile: &mut MessageInsertSubstageProfile,
) -> Result<Vec<i64>> {
    let rows_per_statement = APPEND_MESSAGE_INSERT_BATCH_SIZE;
    franken_batch_insert_new_messages_with_profile_batch_size(
        tx,
        conversation_id,
        messages,
        profile,
        rows_per_statement,
    )
}
11857
/// Test-only profiled counterpart of the chunked message insert.
///
/// Behaves like the unprofiled chunked insert: a `batch_size` of 0 is treated
/// as 1, single-message chunks use the profiled single-row insert, and larger
/// chunks use the cached multi-row statement. For each multi-row chunk the
/// function bumps `batch_calls` / `batch_rows` and adds the time spent on SQL
/// lookup, payload encoding, parameter building, execution and rowid
/// inference to the matching fields of `profile`.
#[cfg(test)]
fn franken_batch_insert_new_messages_with_profile_batch_size(
    tx: &FrankenTransaction<'_>,
    conversation_id: i64,
    messages: &[&Message],
    profile: &mut MessageInsertSubstageProfile,
    batch_size: usize,
) -> Result<Vec<i64>> {
    let rows_per_statement = batch_size.max(1);
    let mut row_ids = Vec::with_capacity(messages.len());

    for group in messages.chunks(rows_per_statement) {
        if let [only] = group {
            let row_id =
                franken_insert_new_message_with_profile(tx, conversation_id, *only, profile)?;
            row_ids.push(row_id);
            continue;
        }

        profile.batch_calls += 1;
        profile.batch_rows += group.len();

        let sql_timer = Instant::now();
        let statement = message_insert_batch_sql(group.len());
        profile.sql_build_duration += sql_timer.elapsed();

        let mut bound: Vec<SqliteValue> = Vec::with_capacity(group.len() * 8);
        for msg in group {
            let payload_timer = Instant::now();
            let (json_payload, bin_payload) = franken_message_insert_payload(msg)?;
            profile.payload_duration += payload_timer.elapsed();

            let param_timer = Instant::now();
            bound.extend([
                SqliteValue::from(conversation_id),
                SqliteValue::from(msg.idx),
                SqliteValue::from(role_as_str(&msg.role)),
                SqliteValue::from(msg.author.as_deref()),
                SqliteValue::from(msg.created_at),
                SqliteValue::from(msg.content.as_str()),
                SqliteValue::from(json_payload.as_deref()),
                SqliteValue::from(bin_payload.as_deref()),
            ]);
            profile.param_build_duration += param_timer.elapsed();
        }

        let execute_timer = Instant::now();
        tx.execute_with_params(statement, &bound)?;
        profile.execute_duration += execute_timer.elapsed();

        // Rowid timing covers both the last-rowid read and the range expansion.
        let rowid_timer = Instant::now();
        let last_id = franken_last_rowid(tx)?;
        let preceding_rows = (group.len() - 1) as i64;
        let first_id = last_id.checked_sub(preceding_rows).with_context(|| {
            format!(
                "inferring rowid range for {}-row message batch ending at {last_id}",
                group.len()
            )
        })?;
        row_ids.extend(first_id..=last_id);
        profile.rowid_duration += rowid_timer.elapsed();
    }

    Ok(row_ids)
}
11924
11925/// Insert snippets within a frankensqlite transaction.
11926fn franken_insert_snippets(
11927    tx: &FrankenTransaction<'_>,
11928    message_id: i64,
11929    snippets: &[Snippet],
11930) -> Result<()> {
11931    for snip in snippets {
11932        let file_path_str = snip.file_path.as_ref().map(path_to_string);
11933        tx.execute_compat(
11934            "INSERT INTO snippets(message_id, file_path, start_line, end_line, language, snippet_text)
11935             VALUES(?1,?2,?3,?4,?5,?6)",
11936            fparams![
11937                message_id,
11938                file_path_str.as_deref(),
11939                snip.start_line,
11940                snip.end_line,
11941                snip.language.as_deref(),
11942                snip.snippet_text.as_deref()
11943            ],
11944        )?;
11945    }
11946    Ok(())
11947}
11948
11949fn franken_existing_message_fingerprints(
11950    tx: &FrankenTransaction<'_>,
11951    conversation_id: i64,
11952) -> Result<HashSet<MessageMergeFingerprint>> {
11953    let rows = tx.query_params(
11954        "SELECT idx, role, author, created_at, content
11955         FROM messages
11956         WHERE conversation_id = ?1",
11957        fparams![conversation_id],
11958    )?;
11959    let mut fingerprints = HashSet::with_capacity(rows.len());
11960    for row in rows {
11961        let role: String = row.get_typed(1)?;
11962        let content: String = row.get_typed(4)?;
11963        fingerprints.insert(MessageMergeFingerprint {
11964            idx: row.get_typed(0)?,
11965            created_at: row.get_typed(3)?,
11966            role: role_from_str(&role),
11967            author: row.get_typed(2)?,
11968            content_hash: *blake3::hash(content.as_bytes()).as_bytes(),
11969        });
11970    }
11971    Ok(fingerprints)
11972}
11973
/// Fingerprints of already-stored messages, gathered for merging incoming messages.
struct ExistingMessageLookup {
    /// Merge fingerprint of the stored message at each `idx`.
    by_idx: HashMap<i64, MessageMergeFingerprint>,
    /// Replay fingerprints (no `idx` component) of stored messages.
    replay: HashSet<MessageReplayFingerprint>,
}
11978
/// Load fingerprints of the messages stored for `conversation_id` that are
/// relevant to `incoming_messages`.
///
/// Strategy, in order:
/// 1. Empty input returns an empty lookup without touching the database.
/// 2. Fast path: probe the stored row at each incoming `idx`. If every probe
///    finds a row whose merge fingerprint equals the incoming message's, only
///    those rows are returned.
/// 3. Fallback: read every stored message of the conversation when any
///    incoming message lacks `created_at`; otherwise read the rows inside the
///    incoming `idx` range plus the rows inside the incoming `created_at`
///    range.
///
/// In the fallback, `by_idx` only keeps rows whose `idx` lies inside the
/// incoming `idx` range. `replay` keeps every row on a full read, and only
/// rows whose `created_at` lies inside the incoming range otherwise.
fn franken_existing_message_lookup(
    tx: &FrankenTransaction<'_>,
    conversation_id: i64,
    incoming_messages: &[Message],
) -> Result<ExistingMessageLookup> {
    if incoming_messages.is_empty() {
        return Ok(ExistingMessageLookup {
            by_idx: HashMap::new(),
            replay: HashSet::new(),
        });
    }

    // Inclusive idx range covered by the incoming messages.
    let min_idx = incoming_messages
        .iter()
        .map(|msg| msg.idx)
        .min()
        .unwrap_or(0);
    let max_idx = incoming_messages
        .iter()
        .map(|msg| msg.idx)
        .max()
        .unwrap_or(min_idx);
    // A message without a timestamp cannot be bounded by created_at, so the
    // fallback has to read the whole conversation.
    let requires_full_scan = incoming_messages.iter().any(|msg| msg.created_at.is_none());
    // Inclusive (min, max) created_at over the incoming messages that have one.
    let created_bounds = incoming_messages
        .iter()
        .filter_map(|msg| msg.created_at)
        .fold(None, |bounds: Option<(i64, i64)>, created_at| {
            Some(match bounds {
                Some((min_created_at, max_created_at)) => (
                    min_created_at.min(created_at),
                    max_created_at.max(created_at),
                ),
                None => (created_at, created_at),
            })
        });

    // Fast path: one point query per incoming message, abandoned at the first
    // missing or mismatching row.
    let mut indexed_by_idx = HashMap::with_capacity(incoming_messages.len());
    let mut indexed_replay = HashSet::with_capacity(incoming_messages.len());
    let mut exact_idx_match = true;
    for msg in incoming_messages {
        // NOTE(review): presumably a metrics/profiling counter — confirm.
        record_message_lookup_exact_idx_probe();
        // The query pins the index by name via INDEXED BY.
        // NOTE(review): presumably the autoindex of a UNIQUE(conversation_id, idx)
        // constraint — confirm against the schema.
        let Some((role, author, created_at, content)) = tx
            .query_row_map(
                "SELECT role, author, created_at, content
                 FROM messages INDEXED BY sqlite_autoindex_messages_1
                 WHERE conversation_id = ?1 AND idx = ?2
                 LIMIT 1",
                fparams![conversation_id, msg.idx],
                |row| {
                    Ok((
                        row.get_typed::<String>(0)?,
                        row.get_typed::<Option<String>>(1)?,
                        row.get_typed::<Option<i64>>(2)?,
                        row.get_typed::<String>(3)?,
                    ))
                },
            )
            .optional()?
        else {
            // No stored row at this idx: the fast path cannot answer.
            exact_idx_match = false;
            break;
        };
        let role = role_from_str(&role);
        let content_hash = *blake3::hash(content.as_bytes()).as_bytes();
        let fingerprint = MessageMergeFingerprint {
            idx: msg.idx,
            created_at,
            role: role.clone(),
            author: author.clone(),
            content_hash,
        };
        // A stored row that differs from the incoming message at the same idx
        // also forces the fallback.
        if fingerprint != message_merge_fingerprint(msg) {
            exact_idx_match = false;
            break;
        }
        indexed_by_idx.insert(msg.idx, fingerprint);
        indexed_replay.insert(MessageReplayFingerprint {
            created_at,
            role,
            author,
            content_hash,
        });
    }

    if exact_idx_match {
        return Ok(ExistingMessageLookup {
            by_idx: indexed_by_idx,
            replay: indexed_replay,
        });
    }

    // Fallback: `replay_full_scan` records whether `rows` holds every stored
    // message of the conversation (true) or only the bounded subsets (false).
    let (rows, replay_full_scan) = if requires_full_scan {
        let rows = tx.query_params(
            "SELECT idx, role, author, created_at, content
             FROM messages INDEXED BY sqlite_autoindex_messages_1
             WHERE conversation_id = ?1",
            fparams![conversation_id],
        )?;
        record_message_lookup_full_scan_query(rows.len());
        (rows, true)
    } else if let Some((min_created_at, max_created_at)) = created_bounds {
        // Two bounded reads concatenated. A row can appear in both results;
        // the map/set inserts below make the duplicates harmless.
        let mut rows = tx.query_params(
            "SELECT idx, role, author, created_at, content
             FROM messages INDEXED BY sqlite_autoindex_messages_1
             WHERE conversation_id = ?1
               AND idx >= ?2
               AND idx <= ?3",
            fparams![conversation_id, min_idx, max_idx],
        )?;
        rows.extend(tx.query_params(
            "SELECT idx, role, author, created_at, content
             FROM messages INDEXED BY sqlite_autoindex_messages_1
             WHERE conversation_id = ?1
               AND created_at IS NOT NULL
               AND created_at >= ?2
               AND created_at <= ?3",
            fparams![conversation_id, min_created_at, max_created_at],
        )?);
        record_message_lookup_bounded_queries(2, rows.len());
        (rows, false)
    } else {
        // NOTE(review): appears unreachable — with non-empty input and no
        // message lacking created_at, `created_bounds` is always `Some`.
        let rows = tx.query_params(
            "SELECT idx, role, author, created_at, content
             FROM messages INDEXED BY sqlite_autoindex_messages_1
             WHERE conversation_id = ?1",
            fparams![conversation_id],
        )?;
        record_message_lookup_full_scan_query(rows.len());
        (rows, true)
    };

    let mut by_idx = HashMap::with_capacity(rows.len());
    let mut replay = HashSet::with_capacity(rows.len());
    for row in rows {
        let idx: i64 = row.get_typed(0)?;
        let role: String = row.get_typed(1)?;
        let author: Option<String> = row.get_typed(2)?;
        let created_at: Option<i64> = row.get_typed(3)?;
        let content: String = row.get_typed(4)?;
        let role = role_from_str(&role);
        let content_hash = *blake3::hash(content.as_bytes()).as_bytes();

        // Only rows inside the incoming idx range are indexed by idx.
        if idx >= min_idx && idx <= max_idx {
            by_idx.insert(
                idx,
                MessageMergeFingerprint {
                    idx,
                    created_at,
                    role: role.clone(),
                    author: author.clone(),
                    content_hash,
                },
            );
        }

        // On a bounded read, rows fetched only because of their idx are kept
        // out of the replay set unless their timestamp is inside the bounds.
        let replay_matches = if replay_full_scan {
            true
        } else if let Some((min_created_at, max_created_at)) = created_bounds {
            created_at.is_some_and(|ts| ts >= min_created_at && ts <= max_created_at)
        } else {
            true
        };
        if replay_matches {
            replay.insert(MessageReplayFingerprint {
                created_at,
                role,
                author,
                content_hash,
            });
        }
    }

    Ok(ExistingMessageLookup { by_idx, replay })
}
12153
12154fn franken_existing_message_lookup_with_pending(
12155    tx: &FrankenTransaction<'_>,
12156    conversation_id: i64,
12157    incoming_messages: &[Message],
12158    pending_message_fingerprints: &mut HashMap<i64, HashMap<i64, MessageMergeFingerprint>>,
12159    pending_message_replay_fingerprints: &mut HashMap<i64, HashSet<MessageReplayFingerprint>>,
12160) -> Result<ExistingMessageLookup> {
12161    if let (Some(by_idx), Some(replay)) = (
12162        pending_message_fingerprints.get(&conversation_id),
12163        pending_message_replay_fingerprints.get(&conversation_id),
12164    ) {
12165        if incoming_messages.iter().all(|msg| {
12166            by_idx.contains_key(&msg.idx) || replay.contains(&message_replay_fingerprint(msg))
12167        }) {
12168            return Ok(ExistingMessageLookup {
12169                by_idx: by_idx.clone(),
12170                replay: replay.clone(),
12171            });
12172        }
12173
12174        let fresh = franken_existing_message_lookup(tx, conversation_id, incoming_messages)?;
12175        let mut merged_by_idx = by_idx.clone();
12176        let mut merged_replay = replay.clone();
12177        merged_by_idx.extend(fresh.by_idx);
12178        merged_replay.extend(fresh.replay);
12179        pending_message_fingerprints.insert(conversation_id, merged_by_idx.clone());
12180        pending_message_replay_fingerprints.insert(conversation_id, merged_replay.clone());
12181        return Ok(ExistingMessageLookup {
12182            by_idx: merged_by_idx,
12183            replay: merged_replay,
12184        });
12185    }
12186
12187    let lookup = franken_existing_message_lookup(tx, conversation_id, incoming_messages)?;
12188    pending_message_fingerprints.insert(conversation_id, lookup.by_idx.clone());
12189    pending_message_replay_fingerprints.insert(conversation_id, lookup.replay.clone());
12190    Ok(lookup)
12191}
12192
12193/// Batch insert FTS5 entries within a frankensqlite transaction.
12194fn franken_batch_insert_fts(tx: &FrankenTransaction<'_>, entries: &[FtsEntry]) -> Result<usize> {
12195    if entries.is_empty() {
12196        return Ok(0);
12197    }
12198
12199    let mut inserted = 0;
12200
12201    for chunk in entries.chunks(FTS5_BATCH_SIZE) {
12202        let placeholders: String = chunk
12203            .iter()
12204            .enumerate()
12205            .map(|(i, _)| {
12206                let base = i * 7 + 1; // +1 for 1-indexed params
12207                format!(
12208                    "(?{},?{},?{},?{},?{},?{},?{})",
12209                    base,
12210                    base + 1,
12211                    base + 2,
12212                    base + 3,
12213                    base + 4,
12214                    base + 5,
12215                    base + 6
12216                )
12217            })
12218            .collect::<Vec<_>>()
12219            .join(",");
12220
12221        let sql = format!(
12222            "INSERT INTO fts_messages(rowid, content, title, agent, workspace, source_path, created_at) VALUES {placeholders}"
12223        );
12224
12225        let mut param_values: Vec<SqliteValue> = Vec::with_capacity(chunk.len() * 7);
12226        for entry in chunk {
12227            param_values.push(SqliteValue::from(entry.message_id));
12228            param_values.push(SqliteValue::from(entry.content.as_str()));
12229            param_values.push(SqliteValue::from(entry.title.as_str()));
12230            param_values.push(SqliteValue::from(entry.agent.as_str()));
12231            param_values.push(SqliteValue::from(entry.workspace.as_str()));
12232            param_values.push(SqliteValue::from(entry.source_path.as_str()));
12233            param_values.push(SqliteValue::from(entry.created_at));
12234        }
12235
12236        match tx.execute_with_params(&sql, &param_values) {
12237            Ok(_) => {
12238                inserted += chunk.len();
12239            }
12240            Err(err) => {
12241                tracing::warn!(
12242                    error = %err,
12243                    chunk_docs = chunk.len(),
12244                    "frankensqlite FTS batch insert failed; skipping db-resident FTS maintenance because Tantivy is authoritative"
12245                );
12246                return Ok(inserted);
12247            }
12248        }
12249    }
12250
12251    Ok(inserted)
12252}
12253
12254fn franken_batch_insert_fts_on_connection(
12255    conn: &FrankenConnection,
12256    entries: &[FtsEntry],
12257) -> Result<usize> {
12258    if entries.is_empty() {
12259        return Ok(0);
12260    }
12261
12262    let mut inserted = 0;
12263
12264    for chunk in entries.chunks(FTS5_BATCH_SIZE) {
12265        let placeholders: String = chunk
12266            .iter()
12267            .enumerate()
12268            .map(|(i, _)| {
12269                let base = i * 7 + 1;
12270                format!(
12271                    "(?{},?{},?{},?{},?{},?{},?{})",
12272                    base,
12273                    base + 1,
12274                    base + 2,
12275                    base + 3,
12276                    base + 4,
12277                    base + 5,
12278                    base + 6
12279                )
12280            })
12281            .collect::<Vec<_>>()
12282            .join(",");
12283
12284        let sql = format!(
12285            "INSERT INTO fts_messages(rowid, content, title, agent, workspace, source_path, created_at) VALUES {placeholders}"
12286        );
12287
12288        let mut param_values: Vec<SqliteValue> = Vec::with_capacity(chunk.len() * 7);
12289        for entry in chunk {
12290            param_values.push(SqliteValue::from(entry.message_id));
12291            param_values.push(SqliteValue::from(entry.content.as_str()));
12292            param_values.push(SqliteValue::from(entry.title.as_str()));
12293            param_values.push(SqliteValue::from(entry.agent.as_str()));
12294            param_values.push(SqliteValue::from(entry.workspace.as_str()));
12295            param_values.push(SqliteValue::from(entry.source_path.as_str()));
12296            param_values.push(SqliteValue::from(entry.created_at));
12297        }
12298
12299        conn.execute_with_params(&sql, &param_values)
12300            .with_context(|| {
12301                format!(
12302                    "inserting {} rows into fts_messages during streaming FTS maintenance",
12303                    chunk.len()
12304                )
12305            })?;
12306        inserted += chunk.len();
12307    }
12308
12309    Ok(inserted)
12310}
12311
12312/// Update daily stats within a frankensqlite transaction.
12313fn franken_update_daily_stats_in_tx(
12314    storage: &FrankenStorage,
12315    tx: &FrankenTransaction<'_>,
12316    agent_slug: &str,
12317    source_id: &str,
12318    started_at: Option<i64>,
12319    delta: StatsDelta,
12320) -> Result<()> {
12321    let day_id = started_at
12322        .map(FrankenStorage::day_id_from_millis)
12323        .unwrap_or(0);
12324    let now = FrankenStorage::now_millis();
12325
12326    let targets = [
12327        DailyStatsTarget {
12328            day_id,
12329            agent_slug,
12330            source_id,
12331        },
12332        DailyStatsTarget {
12333            day_id,
12334            agent_slug: "all",
12335            source_id,
12336        },
12337        DailyStatsTarget {
12338            day_id,
12339            agent_slug,
12340            source_id: "all",
12341        },
12342        DailyStatsTarget {
12343            day_id,
12344            agent_slug: "all",
12345            source_id: "all",
12346        },
12347    ];
12348
12349    if agent_slug != "all"
12350        && source_id != "all"
12351        && franken_update_ensured_daily_stats_targets_in_tx(storage, tx, &targets, now, delta)?
12352    {
12353        return Ok(());
12354    }
12355
12356    for target in targets {
12357        franken_apply_daily_stats_delta_in_tx(storage, tx, target, now, delta)?;
12358    }
12359
12360    Ok(())
12361}
12362
/// Borrowed key of one `daily_stats` row: (`day_id`, `agent_slug`, `source_id`).
///
/// Used by the daily-stats helpers to name the row a delta should be applied to.
#[derive(Clone, Copy)]
struct DailyStatsTarget<'a> {
    /// Day bucket of the row.
    day_id: i64,
    /// Agent slug, or the literal `"all"` for the cross-agent rollup row.
    agent_slug: &'a str,
    /// Source id, or the literal `"all"` for the cross-source rollup row.
    source_id: &'a str,
}
12369
12370fn franken_update_ensured_daily_stats_targets_in_tx(
12371    storage: &FrankenStorage,
12372    tx: &FrankenTransaction<'_>,
12373    targets: &[DailyStatsTarget<'_>; 4],
12374    now: i64,
12375    delta: StatsDelta,
12376) -> Result<bool> {
12377    let cache_keys = targets.map(|target| {
12378        EnsuredDailyStatsKey::new(target.day_id, target.agent_slug, target.source_id)
12379    });
12380    if !storage.daily_stats_keys_already_ensured(&cache_keys) {
12381        return Ok(false);
12382    }
12383
12384    let primary = targets[0];
12385    let rows_changed = tx.execute_compat(
12386        "UPDATE daily_stats
12387         SET session_count = session_count + ?4,
12388             message_count = message_count + ?5,
12389             total_chars = total_chars + ?6,
12390             last_updated = ?7
12391         WHERE day_id = ?1
12392           AND ((agent_slug = ?2 AND source_id = ?3)
12393                OR (agent_slug = 'all' AND source_id = ?3)
12394                OR (agent_slug = ?2 AND source_id = 'all')
12395                OR (agent_slug = 'all' AND source_id = 'all'))",
12396        fparams![
12397            primary.day_id,
12398            primary.agent_slug,
12399            primary.source_id,
12400            delta.session_count_delta,
12401            delta.message_count_delta,
12402            delta.total_chars_delta,
12403            now
12404        ],
12405    )?;
12406    if rows_changed == targets.len() {
12407        return Ok(true);
12408    }
12409
12410    for (target, cache_key) in targets.iter().copied().zip(cache_keys) {
12411        let exists = tx
12412            .query_row_map(
12413                "SELECT 1 FROM daily_stats
12414                 WHERE day_id = ?1 AND agent_slug = ?2 AND source_id = ?3
12415                 LIMIT 1",
12416                fparams![target.day_id, target.agent_slug, target.source_id],
12417                |row| row.get_typed::<i64>(0),
12418            )
12419            .optional()?
12420            .is_some();
12421        if exists {
12422            continue;
12423        }
12424
12425        tx.execute_compat(
12426            "INSERT INTO daily_stats(day_id, agent_slug, source_id, session_count, message_count, total_chars, last_updated)
12427             VALUES(?1,?2,?3,?4,?5,?6,?7)",
12428            fparams![
12429                target.day_id,
12430                target.agent_slug,
12431                target.source_id,
12432                delta.session_count_delta,
12433                delta.message_count_delta,
12434                delta.total_chars_delta,
12435                now
12436            ],
12437        )?;
12438        storage.mark_daily_stats_key_ensured(cache_key);
12439    }
12440
12441    Ok(true)
12442}
12443
12444fn franken_apply_daily_stats_delta_in_tx(
12445    storage: &FrankenStorage,
12446    tx: &FrankenTransaction<'_>,
12447    target: DailyStatsTarget<'_>,
12448    now: i64,
12449    delta: StatsDelta,
12450) -> Result<()> {
12451    let cache_key = EnsuredDailyStatsKey::new(target.day_id, target.agent_slug, target.source_id);
12452    if storage.daily_stats_key_already_ensured(&cache_key) {
12453        let rows_changed = tx.execute_compat(
12454            "UPDATE daily_stats
12455             SET session_count = session_count + ?4,
12456                 message_count = message_count + ?5,
12457                 total_chars = total_chars + ?6,
12458                 last_updated = ?7
12459             WHERE day_id = ?1 AND agent_slug = ?2 AND source_id = ?3",
12460            fparams![
12461                target.day_id,
12462                target.agent_slug,
12463                target.source_id,
12464                delta.session_count_delta,
12465                delta.message_count_delta,
12466                delta.total_chars_delta,
12467                now
12468            ],
12469        )?;
12470        if rows_changed > 0 {
12471            return Ok(());
12472        }
12473    }
12474
12475    tx.execute_compat(
12476        "INSERT INTO daily_stats(day_id, agent_slug, source_id, session_count, message_count, total_chars, last_updated)
12477         VALUES(?1,?2,?3,?4,?5,?6,?7)
12478         ON CONFLICT(day_id, agent_slug, source_id) DO UPDATE SET
12479            session_count = session_count + excluded.session_count,
12480            message_count = message_count + excluded.message_count,
12481            total_chars = total_chars + excluded.total_chars,
12482            last_updated = excluded.last_updated",
12483        fparams![
12484            target.day_id,
12485            target.agent_slug,
12486            target.source_id,
12487            delta.session_count_delta,
12488            delta.message_count_delta,
12489            delta.total_chars_delta,
12490            now
12491        ],
12492    )?;
12493    storage.mark_daily_stats_key_ensured(cache_key);
12494    Ok(())
12495}
12496
12497// -------------------------------------------------------------------------
12498// Frankensqlite batch helpers
12499// -------------------------------------------------------------------------
12500
12501/// Batch upsert daily_stats within a frankensqlite transaction.
12502fn franken_update_daily_stats_batched_in_tx(
12503    tx: &FrankenTransaction<'_>,
12504    entries: &[(i64, String, String, StatsDelta)],
12505) -> Result<usize> {
12506    if entries.is_empty() {
12507        return Ok(0);
12508    }
12509
12510    let now = FrankenStorage::now_millis();
12511    let mut total_affected = 0;
12512
12513    // Keep frankensqlite UPSERTs row-wise inside the transaction. The
12514    // multi-row VALUES ... ON CONFLICT form still falls back through
12515    // INSERT...SELECT in fsqlite-core, which rejects UPSERT/RETURNING during
12516    // real cass indexing.
12517    for (day_id, agent, source, delta) in entries {
12518        total_affected += tx.execute_compat(
12519            "INSERT INTO daily_stats (day_id, agent_slug, source_id, session_count, message_count, total_chars, last_updated)
12520             VALUES(?1,?2,?3,?4,?5,?6,?7)
12521             ON CONFLICT(day_id, agent_slug, source_id) DO UPDATE SET
12522                 session_count = session_count + excluded.session_count,
12523                 message_count = message_count + excluded.message_count,
12524                 total_chars = total_chars + excluded.total_chars,
12525                 last_updated = excluded.last_updated",
12526            fparams![
12527                *day_id,
12528                agent.as_str(),
12529                source.as_str(),
12530                delta.session_count_delta,
12531                delta.message_count_delta,
12532                delta.total_chars_delta,
12533                now
12534            ],
12535        )?;
12536    }
12537
12538    Ok(total_affected)
12539}
12540
12541/// Batch insert token_usage rows within a frankensqlite transaction.
12542///
12543/// Uses row-wise INSERT OR IGNORE to avoid the frankensqlite limitation where
12544/// multi-row VALUES lists fall through to INSERT...SELECT, which rejects
12545/// UPSERT/OR IGNORE conflict clauses.
12546fn franken_insert_token_usage_batched_in_tx(
12547    tx: &FrankenTransaction<'_>,
12548    entries: &[TokenUsageEntry],
12549) -> Result<usize> {
12550    if entries.is_empty() {
12551        return Ok(0);
12552    }
12553
12554    let mut total_inserted = 0;
12555
12556    for e in entries {
12557        let params_vec: Vec<ParamValue> = vec![
12558            ParamValue::from(e.message_id),
12559            ParamValue::from(e.conversation_id),
12560            ParamValue::from(e.agent_id),
12561            ParamValue::from(e.workspace_id),
12562            ParamValue::from(e.source_id.clone()),
12563            ParamValue::from(e.timestamp_ms),
12564            ParamValue::from(e.day_id),
12565            ParamValue::from(e.model_name.clone()),
12566            ParamValue::from(e.model_family.clone()),
12567            ParamValue::from(e.model_tier.clone()),
12568            ParamValue::from(e.service_tier.clone()),
12569            ParamValue::from(e.provider.clone()),
12570            ParamValue::from(e.input_tokens),
12571            ParamValue::from(e.output_tokens),
12572            ParamValue::from(e.cache_read_tokens),
12573            ParamValue::from(e.cache_creation_tokens),
12574            ParamValue::from(e.thinking_tokens),
12575            ParamValue::from(e.total_tokens),
12576            ParamValue::from(e.estimated_cost_usd),
12577            ParamValue::from(e.role.clone()),
12578            ParamValue::from(e.content_chars),
12579            ParamValue::from(e.has_tool_calls as i64),
12580            ParamValue::from(e.tool_call_count as i64),
12581            ParamValue::from(e.data_source.clone()),
12582        ];
12583
12584        let values = param_slice_to_values(&params_vec);
12585        total_inserted += tx.execute_with_params(
12586            "INSERT OR IGNORE INTO token_usage (
12587                message_id, conversation_id, agent_id, workspace_id, source_id,
12588                timestamp_ms, day_id,
12589                model_name, model_family, model_tier, service_tier, provider,
12590                input_tokens, output_tokens, cache_read_tokens, cache_creation_tokens,
12591                thinking_tokens, total_tokens, estimated_cost_usd,
12592                role, content_chars, has_tool_calls, tool_call_count, data_source
12593            )
12594            VALUES(?1,?2,?3,?4,?5,?6,?7,?8,?9,?10,?11,?12,?13,?14,?15,?16,?17,?18,?19,?20,?21,?22,?23,?24)",
12595            &values,
12596        )?;
12597    }
12598
12599    Ok(total_inserted)
12600}
12601
12602/// Batch upsert token_daily_stats within a frankensqlite transaction.
12603fn franken_update_token_daily_stats_batched_in_tx(
12604    tx: &FrankenTransaction<'_>,
12605    entries: &[(i64, String, String, String, TokenStatsDelta)],
12606) -> Result<usize> {
12607    if entries.is_empty() {
12608        return Ok(0);
12609    }
12610
12611    let now = FrankenStorage::now_millis();
12612    let mut total_affected = 0;
12613
12614    for (day_id, agent, source, model, delta) in entries {
12615        total_affected += tx.execute_compat(
12616            "INSERT INTO token_daily_stats (
12617                day_id, agent_slug, source_id, model_family,
12618                api_call_count, user_message_count, assistant_message_count, tool_message_count,
12619                total_input_tokens, total_output_tokens, total_cache_read_tokens,
12620                total_cache_creation_tokens, total_thinking_tokens, grand_total_tokens,
12621                total_content_chars, total_tool_calls, estimated_cost_usd, session_count,
12622                last_updated
12623            )
12624            VALUES(?1,?2,?3,?4,?5,?6,?7,?8,?9,?10,?11,?12,?13,?14,?15,?16,?17,?18,?19)
12625            ON CONFLICT(day_id, agent_slug, source_id, model_family) DO UPDATE SET
12626                api_call_count = api_call_count + excluded.api_call_count,
12627                user_message_count = user_message_count + excluded.user_message_count,
12628                assistant_message_count = assistant_message_count + excluded.assistant_message_count,
12629                tool_message_count = tool_message_count + excluded.tool_message_count,
12630                total_input_tokens = total_input_tokens + excluded.total_input_tokens,
12631                total_output_tokens = total_output_tokens + excluded.total_output_tokens,
12632                total_cache_read_tokens = total_cache_read_tokens + excluded.total_cache_read_tokens,
12633                total_cache_creation_tokens = total_cache_creation_tokens + excluded.total_cache_creation_tokens,
12634                total_thinking_tokens = total_thinking_tokens + excluded.total_thinking_tokens,
12635                grand_total_tokens = grand_total_tokens + excluded.grand_total_tokens,
12636                total_content_chars = total_content_chars + excluded.total_content_chars,
12637                total_tool_calls = total_tool_calls + excluded.total_tool_calls,
12638                estimated_cost_usd = estimated_cost_usd + excluded.estimated_cost_usd,
12639                session_count = session_count + excluded.session_count,
12640                last_updated = excluded.last_updated",
12641            fparams![
12642                *day_id,
12643                agent.as_str(),
12644                source.as_str(),
12645                model.as_str(),
12646                delta.api_call_count,
12647                delta.user_message_count,
12648                delta.assistant_message_count,
12649                delta.tool_message_count,
12650                delta.total_input_tokens,
12651                delta.total_output_tokens,
12652                delta.total_cache_read_tokens,
12653                delta.total_cache_creation_tokens,
12654                delta.total_thinking_tokens,
12655                delta.grand_total_tokens,
12656                delta.total_content_chars,
12657                delta.total_tool_calls,
12658                delta.estimated_cost_usd,
12659                delta.session_count,
12660                now
12661            ],
12662        )?;
12663    }
12664
12665    Ok(total_affected)
12666}
12667
12668/// Batch insert message_metrics rows within a frankensqlite transaction.
12669///
12670/// Uses row-wise INSERT OR IGNORE to avoid the frankensqlite limitation where
12671/// multi-row VALUES lists fall through to INSERT...SELECT, which rejects
12672/// UPSERT/OR IGNORE conflict clauses.
12673fn franken_insert_message_metrics_batched_in_tx(
12674    tx: &FrankenTransaction<'_>,
12675    entries: &[MessageMetricsEntry],
12676) -> Result<usize> {
12677    if entries.is_empty() {
12678        return Ok(0);
12679    }
12680
12681    let mut total_inserted = 0;
12682
12683    for e in entries {
12684        let params_vec: Vec<ParamValue> = vec![
12685            ParamValue::from(e.message_id),
12686            ParamValue::from(e.created_at_ms),
12687            ParamValue::from(e.hour_id),
12688            ParamValue::from(e.day_id),
12689            ParamValue::from(e.agent_slug.clone()),
12690            ParamValue::from(e.workspace_id),
12691            ParamValue::from(e.source_id.clone()),
12692            ParamValue::from(e.role.clone()),
12693            ParamValue::from(e.content_chars),
12694            ParamValue::from(e.content_tokens_est),
12695            ParamValue::from(e.model_name.clone()),
12696            ParamValue::from(e.model_family.clone()),
12697            ParamValue::from(e.model_tier.clone()),
12698            ParamValue::from(e.provider.clone()),
12699            ParamValue::from(e.api_input_tokens),
12700            ParamValue::from(e.api_output_tokens),
12701            ParamValue::from(e.api_cache_read_tokens),
12702            ParamValue::from(e.api_cache_creation_tokens),
12703            ParamValue::from(e.api_thinking_tokens),
12704            ParamValue::from(e.api_service_tier.clone()),
12705            ParamValue::from(e.api_data_source.clone()),
12706            ParamValue::from(e.tool_call_count),
12707            ParamValue::from(e.has_tool_calls as i64),
12708            ParamValue::from(e.has_plan as i64),
12709        ];
12710
12711        let values = param_slice_to_values(&params_vec);
12712        total_inserted += tx.execute_with_params(
12713            "INSERT OR IGNORE INTO message_metrics (
12714                message_id, created_at_ms, hour_id, day_id,
12715                agent_slug, workspace_id, source_id, role,
12716                content_chars, content_tokens_est,
12717                model_name, model_family, model_tier, provider,
12718                api_input_tokens, api_output_tokens, api_cache_read_tokens,
12719                api_cache_creation_tokens, api_thinking_tokens,
12720                api_service_tier, api_data_source,
12721                tool_call_count, has_tool_calls, has_plan
12722            )
12723            VALUES(?1,?2,?3,?4,?5,?6,?7,?8,?9,?10,?11,?12,?13,?14,?15,?16,?17,?18,?19,?20,?21,?22,?23,?24)",
12724            &values,
12725        )?;
12726    }
12727
12728    Ok(total_inserted)
12729}
12730
12731/// Flush one rollup table (shared logic for hourly + daily) within a frankensqlite transaction.
12732fn franken_flush_rollup_table(
12733    tx: &FrankenTransaction<'_>,
12734    table: &str,
12735    bucket_col: &str,
12736    deltas: &HashMap<(i64, String, i64, String), UsageRollupDelta>,
12737    now: i64,
12738) -> Result<usize> {
12739    if deltas.is_empty() {
12740        return Ok(0);
12741    }
12742
12743    let mut total_affected = 0;
12744
12745    for ((bucket_id, agent, workspace_id, source), d) in deltas {
12746        let sql = format!(
12747            "INSERT INTO {table} (
12748                {bucket_col}, agent_slug, workspace_id, source_id,
12749                message_count, user_message_count, assistant_message_count,
12750                tool_call_count, plan_message_count, plan_content_tokens_est_total,
12751                plan_api_tokens_total, api_coverage_message_count,
12752                content_tokens_est_total, content_tokens_est_user, content_tokens_est_assistant,
12753                api_tokens_total, api_input_tokens_total, api_output_tokens_total,
12754                api_cache_read_tokens_total, api_cache_creation_tokens_total,
12755                api_thinking_tokens_total, last_updated
12756            )
12757            VALUES(?1,?2,?3,?4,?5,?6,?7,?8,?9,?10,?11,?12,?13,?14,?15,?16,?17,?18,?19,?20,?21,?22)
12758            ON CONFLICT({bucket_col}, agent_slug, workspace_id, source_id) DO UPDATE SET
12759                message_count = message_count + excluded.message_count,
12760                user_message_count = user_message_count + excluded.user_message_count,
12761                assistant_message_count = assistant_message_count + excluded.assistant_message_count,
12762                tool_call_count = tool_call_count + excluded.tool_call_count,
12763                plan_message_count = plan_message_count + excluded.plan_message_count,
12764                plan_content_tokens_est_total = plan_content_tokens_est_total + excluded.plan_content_tokens_est_total,
12765                plan_api_tokens_total = plan_api_tokens_total + excluded.plan_api_tokens_total,
12766                api_coverage_message_count = api_coverage_message_count + excluded.api_coverage_message_count,
12767                content_tokens_est_total = content_tokens_est_total + excluded.content_tokens_est_total,
12768                content_tokens_est_user = content_tokens_est_user + excluded.content_tokens_est_user,
12769                content_tokens_est_assistant = content_tokens_est_assistant + excluded.content_tokens_est_assistant,
12770                api_tokens_total = api_tokens_total + excluded.api_tokens_total,
12771                api_input_tokens_total = api_input_tokens_total + excluded.api_input_tokens_total,
12772                api_output_tokens_total = api_output_tokens_total + excluded.api_output_tokens_total,
12773                api_cache_read_tokens_total = api_cache_read_tokens_total + excluded.api_cache_read_tokens_total,
12774                api_cache_creation_tokens_total = api_cache_creation_tokens_total + excluded.api_cache_creation_tokens_total,
12775                api_thinking_tokens_total = api_thinking_tokens_total + excluded.api_thinking_tokens_total,
12776                last_updated = excluded.last_updated"
12777        );
12778
12779        total_affected += tx.execute_compat(
12780            &sql,
12781            fparams![
12782                *bucket_id,
12783                agent.as_str(),
12784                *workspace_id,
12785                source.as_str(),
12786                d.message_count,
12787                d.user_message_count,
12788                d.assistant_message_count,
12789                d.tool_call_count,
12790                d.plan_message_count,
12791                d.plan_content_tokens_est_total,
12792                d.plan_api_tokens_total,
12793                d.api_coverage_message_count,
12794                d.content_tokens_est_total,
12795                d.content_tokens_est_user,
12796                d.content_tokens_est_assistant,
12797                d.api_tokens_total,
12798                d.api_input_tokens_total,
12799                d.api_output_tokens_total,
12800                d.api_cache_read_tokens_total,
12801                d.api_cache_creation_tokens_total,
12802                d.api_thinking_tokens_total,
12803                now
12804            ],
12805        )?;
12806    }
12807
12808    Ok(total_affected)
12809}
12810
12811/// Flush usage_models_daily rollup within a frankensqlite transaction.
12812fn franken_flush_model_daily_rollup_table(
12813    tx: &FrankenTransaction<'_>,
12814    deltas: &HashMap<(i64, String, i64, String, String, String), UsageRollupDelta>,
12815    now: i64,
12816) -> Result<usize> {
12817    if deltas.is_empty() {
12818        return Ok(0);
12819    }
12820
12821    let mut total_affected = 0;
12822
12823    for ((day_id, agent, workspace_id, source, model_family, model_tier), d) in deltas {
12824        total_affected += tx.execute_compat(
12825            "INSERT INTO usage_models_daily (
12826                day_id, agent_slug, workspace_id, source_id, model_family, model_tier,
12827                message_count, user_message_count, assistant_message_count,
12828                tool_call_count, plan_message_count, api_coverage_message_count,
12829                content_tokens_est_total, content_tokens_est_user, content_tokens_est_assistant,
12830                api_tokens_total, api_input_tokens_total, api_output_tokens_total,
12831                api_cache_read_tokens_total, api_cache_creation_tokens_total,
12832                api_thinking_tokens_total, last_updated
12833            )
12834            VALUES(?1,?2,?3,?4,?5,?6,?7,?8,?9,?10,?11,?12,?13,?14,?15,?16,?17,?18,?19,?20,?21,?22)
12835            ON CONFLICT(day_id, agent_slug, workspace_id, source_id, model_family, model_tier) DO UPDATE SET
12836                message_count = message_count + excluded.message_count,
12837                user_message_count = user_message_count + excluded.user_message_count,
12838                assistant_message_count = assistant_message_count + excluded.assistant_message_count,
12839                tool_call_count = tool_call_count + excluded.tool_call_count,
12840                plan_message_count = plan_message_count + excluded.plan_message_count,
12841                api_coverage_message_count = api_coverage_message_count + excluded.api_coverage_message_count,
12842                content_tokens_est_total = content_tokens_est_total + excluded.content_tokens_est_total,
12843                content_tokens_est_user = content_tokens_est_user + excluded.content_tokens_est_user,
12844                content_tokens_est_assistant = content_tokens_est_assistant + excluded.content_tokens_est_assistant,
12845                api_tokens_total = api_tokens_total + excluded.api_tokens_total,
12846                api_input_tokens_total = api_input_tokens_total + excluded.api_input_tokens_total,
12847                api_output_tokens_total = api_output_tokens_total + excluded.api_output_tokens_total,
12848                api_cache_read_tokens_total = api_cache_read_tokens_total + excluded.api_cache_read_tokens_total,
12849                api_cache_creation_tokens_total = api_cache_creation_tokens_total + excluded.api_cache_creation_tokens_total,
12850                api_thinking_tokens_total = api_thinking_tokens_total + excluded.api_thinking_tokens_total,
12851                last_updated = excluded.last_updated",
12852            fparams![
12853                *day_id,
12854                agent.as_str(),
12855                *workspace_id,
12856                source.as_str(),
12857                model_family.as_str(),
12858                model_tier.as_str(),
12859                d.message_count,
12860                d.user_message_count,
12861                d.assistant_message_count,
12862                d.tool_call_count,
12863                d.plan_message_count,
12864                d.api_coverage_message_count,
12865                d.content_tokens_est_total,
12866                d.content_tokens_est_user,
12867                d.content_tokens_est_assistant,
12868                d.api_tokens_total,
12869                d.api_input_tokens_total,
12870                d.api_output_tokens_total,
12871                d.api_cache_read_tokens_total,
12872                d.api_cache_creation_tokens_total,
12873                d.api_thinking_tokens_total,
12874                now
12875            ],
12876        )?;
12877    }
12878
12879    Ok(total_affected)
12880}
12881
12882/// Flush AnalyticsRollupAggregator deltas via frankensqlite transaction.
12883fn franken_flush_analytics_rollups_in_tx(
12884    tx: &FrankenTransaction<'_>,
12885    agg: &AnalyticsRollupAggregator,
12886) -> Result<(usize, usize, usize)> {
12887    let now = FrankenStorage::now_millis();
12888
12889    let hourly_affected =
12890        franken_flush_rollup_table(tx, "usage_hourly", "hour_id", &agg.hourly, now)?;
12891    let daily_affected = franken_flush_rollup_table(tx, "usage_daily", "day_id", &agg.daily, now)?;
12892    let models_daily_affected = franken_flush_model_daily_rollup_table(tx, &agg.models_daily, now)?;
12893
12894    Ok((hourly_affected, daily_affected, models_daily_affected))
12895}
12896
12897/// Update conversation-level token summary columns via frankensqlite transaction.
12898fn franken_update_conversation_token_summaries_in_tx(
12899    tx: &FrankenTransaction<'_>,
12900    conversation_id: i64,
12901) -> Result<()> {
12902    tx.execute_compat(
12903        "UPDATE conversations SET
12904            total_input_tokens = (SELECT SUM(input_tokens) FROM token_usage WHERE conversation_id = ?1),
12905            total_output_tokens = (SELECT SUM(output_tokens) FROM token_usage WHERE conversation_id = ?1),
12906            total_cache_read_tokens = (SELECT SUM(cache_read_tokens) FROM token_usage WHERE conversation_id = ?1),
12907            total_cache_creation_tokens = (SELECT SUM(cache_creation_tokens) FROM token_usage WHERE conversation_id = ?1),
12908            grand_total_tokens = (SELECT SUM(total_tokens) FROM token_usage WHERE conversation_id = ?1),
12909            estimated_cost_usd = (SELECT SUM(estimated_cost_usd) FROM token_usage WHERE conversation_id = ?1),
12910            primary_model = (SELECT model_name FROM token_usage WHERE conversation_id = ?1
12911                             AND model_name IS NOT NULL
12912                             GROUP BY model_name ORDER BY COUNT(*) DESC LIMIT 1),
12913            api_call_count = (SELECT COUNT(*) FROM token_usage WHERE conversation_id = ?1
12914                              AND data_source = 'api'),
12915            tool_call_count = (SELECT SUM(tool_call_count) FROM token_usage WHERE conversation_id = ?1),
12916            user_message_count = (SELECT COUNT(*) FROM token_usage WHERE conversation_id = ?1
12917                                  AND role = 'user'),
12918            assistant_message_count = (SELECT COUNT(*) FROM token_usage WHERE conversation_id = ?1
12919                                       AND role IN ('assistant', 'agent'))
12920         WHERE id = ?1",
12921        fparams![conversation_id],
12922    )?;
12923    Ok(())
12924}
12925
12926impl FrankenStorage {
12927    /// Rebuild token_daily_stats from the token_usage ledger.
12928    pub fn rebuild_token_daily_stats(&self) -> Result<usize> {
12929        const CONVERSATION_BATCH_SIZE: usize = 1_000;
12930        const TOKEN_USAGE_BATCH_SIZE: usize = 10_000;
12931
12932        let total_usage_rows: i64 =
12933            self.conn
12934                .query_row_map("SELECT COUNT(*) FROM token_usage", fparams![], |row| {
12935                    row.get_typed(0)
12936                })?;
12937        tracing::info!(
12938            target: "cass::analytics",
12939            total_usage_rows,
12940            "token_daily_stats_rebuild_start"
12941        );
12942
12943        let mut tx = self.conn.transaction()?;
12944        tx.execute("DELETE FROM token_daily_stats")?;
12945
12946        let mut last_conversation_id = 0_i64;
12947        let mut rows_created = 0_usize;
12948
12949        loop {
12950            let conversation_rows = tx.query_map_collect(
12951                "SELECT c.id, c.started_at, c.source_id,
12952                        COALESCE((SELECT a.slug FROM agents a WHERE a.id = c.agent_id), 'unknown')
12953                 FROM conversations c
12954                 WHERE c.id > ?1
12955                 ORDER BY c.id
12956                 LIMIT ?2",
12957                fparams![last_conversation_id, CONVERSATION_BATCH_SIZE as i64],
12958                |row| {
12959                    Ok((
12960                        row.get_typed::<i64>(0)?,
12961                        row.get_typed::<Option<i64>>(1)?,
12962                        row.get_typed::<String>(2)?,
12963                        row.get_typed::<String>(3)?,
12964                    ))
12965                },
12966            )?;
12967            if conversation_rows.is_empty() {
12968                break;
12969            }
12970
12971            let mut aggregate = TokenStatsAggregator::new();
12972
12973            for (conversation_id, started_at, source_id, agent_slug) in conversation_rows {
12974                last_conversation_id = conversation_id;
12975                let conversation_day_id = started_at.map(Self::day_id_from_millis).unwrap_or(0);
12976                let mut last_token_usage_id = 0_i64;
12977                let mut session_model_family = String::from("unknown");
12978
12979                loop {
12980                    let usage_rows = tx.query_map_collect(
12981                        "SELECT id, day_id, role,
12982                                COALESCE(model_family, 'unknown'),
12983                                input_tokens, output_tokens, cache_read_tokens,
12984                                cache_creation_tokens, thinking_tokens,
12985                                has_tool_calls, tool_call_count,
12986                                content_chars, estimated_cost_usd
12987                         FROM token_usage
12988                         WHERE conversation_id = ?1
12989                           AND id > ?2
12990                         ORDER BY id
12991                         LIMIT ?3",
12992                        fparams![
12993                            conversation_id,
12994                            last_token_usage_id,
12995                            TOKEN_USAGE_BATCH_SIZE as i64
12996                        ],
12997                        |row| {
12998                            Ok((
12999                                row.get_typed::<i64>(0)?,
13000                                row.get_typed::<i64>(1)?,
13001                                row.get_typed::<String>(2)?,
13002                                row.get_typed::<String>(3)?,
13003                                row.get_typed::<Option<i64>>(4)?,
13004                                row.get_typed::<Option<i64>>(5)?,
13005                                row.get_typed::<Option<i64>>(6)?,
13006                                row.get_typed::<Option<i64>>(7)?,
13007                                row.get_typed::<Option<i64>>(8)?,
13008                                row.get_typed::<i64>(9)?,
13009                                row.get_typed::<i64>(10)?,
13010                                row.get_typed::<i64>(11)?,
13011                                row.get_typed::<Option<f64>>(12)?,
13012                            ))
13013                        },
13014                    )?;
13015                    if usage_rows.is_empty() {
13016                        break;
13017                    }
13018
13019                    for (
13020                        token_usage_id,
13021                        day_id,
13022                        role,
13023                        model_family,
13024                        input_tokens,
13025                        output_tokens,
13026                        cache_read_tokens,
13027                        cache_creation_tokens,
13028                        thinking_tokens,
13029                        has_tool_calls,
13030                        tool_call_count,
13031                        content_chars,
13032                        estimated_cost_usd,
13033                    ) in usage_rows
13034                    {
13035                        last_token_usage_id = token_usage_id;
13036                        if model_family != "unknown" {
13037                            session_model_family = model_family.clone();
13038                        }
13039                        let usage = crate::connectors::ExtractedTokenUsage {
13040                            model_name: None,
13041                            provider: None,
13042                            input_tokens,
13043                            output_tokens,
13044                            cache_read_tokens,
13045                            cache_creation_tokens,
13046                            thinking_tokens,
13047                            service_tier: None,
13048                            has_tool_calls: has_tool_calls != 0,
13049                            tool_call_count: u32::try_from(tool_call_count.max(0)).unwrap_or(0),
13050                            data_source: franken_agent_detection::TokenDataSource::Api,
13051                        };
13052                        aggregate.record(
13053                            &agent_slug,
13054                            &source_id,
13055                            day_id,
13056                            &model_family,
13057                            &role,
13058                            &usage,
13059                            content_chars,
13060                            estimated_cost_usd.unwrap_or(0.0),
13061                        );
13062                    }
13063                }
13064
13065                aggregate.record_session(
13066                    &agent_slug,
13067                    &source_id,
13068                    conversation_day_id,
13069                    &session_model_family,
13070                );
13071            }
13072
13073            let entries = aggregate.expand();
13074            rows_created = rows_created.saturating_add(entries.len());
13075            franken_update_token_daily_stats_batched_in_tx(&tx, &entries)?;
13076        }
13077
13078        tx.commit()?;
13079
13080        tracing::info!(
13081            target: "cass::analytics",
13082            rows_created,
13083            "token_daily_stats_rebuild_complete"
13084        );
13085
13086        Ok(rows_created)
13087    }
13088
13089    /// Rebuild analytics tables (message_metrics + rollups) from existing
13090    /// messages in the database. Does NOT re-parse raw agent session files.
13091    pub fn rebuild_analytics(&self) -> Result<AnalyticsRebuildResult> {
13092        let start = Instant::now();
13093
13094        let total_messages: i64 =
13095            self.conn
13096                .query_row_map("SELECT COUNT(*) FROM messages", fparams![], |row| {
13097                    row.get_typed(0)
13098                })?;
13099        tracing::info!(
13100            target: "cass::analytics",
13101            total_messages,
13102            "analytics_rebuild_start"
13103        );
13104
13105        let mut tx = self.conn.transaction()?;
13106
13107        tx.execute("DELETE FROM message_metrics")?;
13108        tx.execute("DELETE FROM usage_hourly")?;
13109        tx.execute("DELETE FROM usage_daily")?;
13110        tx.execute("DELETE FROM usage_models_daily")?;
13111
13112        const CHUNK_SIZE: i64 = 10_000;
13113        let mut offset: i64 = 0;
13114        let mut total_inserted: usize = 0;
13115        let mut usage_hourly_rows: usize = 0;
13116        let mut usage_daily_rows: usize = 0;
13117        let mut usage_models_daily_rows: usize = 0;
13118
13119        loop {
13120            #[allow(clippy::type_complexity)]
13121            let rows: Vec<(
13122                i64,
13123                String,
13124                String,
13125                Option<serde_json::Value>,
13126                Option<i64>,
13127                Option<i64>,
13128                String,
13129                Option<i64>,
13130                String,
13131            )> = tx.query_map_collect(
13132                // Avoid the 3-table JOIN with LIMIT/OFFSET that triggers
13133                // frankensqlite's materialization fallback (see 860acb12).
13134                // Inline the agent slug lookup as a correlated subquery and
13135                // fall back to 'unknown' for NULL agent_id, matching the
13136                // FTS / lexical rebuild paths.
13137                "SELECT m.id, m.idx, m.role, m.content, m.extra_json, m.extra_bin,
13138                        m.created_at,
13139                        c.id AS conv_id, c.started_at AS conv_started_at,
13140                        c.source_id, c.workspace_id,
13141                        COALESCE((SELECT a.slug FROM agents a WHERE a.id = c.agent_id), 'unknown') AS agent_slug
13142                 FROM messages m
13143                 JOIN conversations c ON m.conversation_id = c.id
13144                 ORDER BY m.id
13145                 LIMIT ?1 OFFSET ?2",
13146                fparams![CHUNK_SIZE, offset],
13147                |row| {
13148                    let msg_id: i64 = row.get_typed(0)?;
13149                    let role: String = row.get_typed(2)?;
13150                    let content: String = row.get_typed(3)?;
13151                    let extra_json = row
13152                        .get_typed::<Option<String>>(4)?
13153                        .and_then(|s| serde_json::from_str(&s).ok())
13154                        .or_else(|| {
13155                            row.get_typed::<Option<Vec<u8>>>(5)
13156                                .ok()
13157                                .flatten()
13158                                .and_then(|b| rmp_serde::from_slice(&b).ok())
13159                        });
13160                    let msg_ts: Option<i64> = row.get_typed(6)?;
13161                    let conv_started_at: Option<i64> = row.get_typed(8)?;
13162                    let source_id: String = row.get_typed(9)?;
13163                    let workspace_id: Option<i64> = row.get_typed(10)?;
13164                    let agent_slug: String = row.get_typed(11)?;
13165                    let effective_ts = msg_ts.or(conv_started_at).unwrap_or(0);
13166
13167                    Ok((
13168                        msg_id,
13169                        role,
13170                        content,
13171                        extra_json,
13172                        Some(effective_ts),
13173                        workspace_id,
13174                        source_id,
13175                        conv_started_at,
13176                        agent_slug,
13177                    ))
13178                },
13179            )?;
13180
13181            if rows.is_empty() {
13182                break;
13183            }
13184
13185            let chunk_len = rows.len();
13186            let mut entries = Vec::with_capacity(chunk_len);
13187            let mut rollup_agg = AnalyticsRollupAggregator::new();
13188
13189            for (
13190                msg_id,
13191                role,
13192                content,
13193                extra_json,
13194                effective_ts,
13195                workspace_id,
13196                source_id,
13197                _conv_started_at,
13198                agent_slug,
13199            ) in &rows
13200            {
13201                let ts = effective_ts.unwrap_or(0);
13202                let day_id = Self::day_id_from_millis(ts);
13203                let hour_id = Self::hour_id_from_millis(ts);
13204                let content_chars = content.len() as i64;
13205                let content_tokens_est = content_chars / 4;
13206                let extra = extra_json
13207                    .as_ref()
13208                    .cloned()
13209                    .unwrap_or(serde_json::Value::Null);
13210                let usage =
13211                    crate::connectors::extract_tokens_for_agent(agent_slug, &extra, content, role);
13212                let model_info = usage
13213                    .model_name
13214                    .as_deref()
13215                    .map(crate::connectors::normalize_model);
13216                let model_family = model_info
13217                    .as_ref()
13218                    .map(|i| i.family.clone())
13219                    .unwrap_or_else(|| "unknown".into());
13220                let model_tier = model_info
13221                    .as_ref()
13222                    .map(|i| i.tier.clone())
13223                    .unwrap_or_else(|| "unknown".into());
13224                let provider = usage
13225                    .provider
13226                    .clone()
13227                    .or_else(|| model_info.as_ref().map(|i| i.provider.clone()))
13228                    .unwrap_or_else(|| "unknown".into());
13229
13230                let entry = MessageMetricsEntry {
13231                    message_id: *msg_id,
13232                    created_at_ms: ts,
13233                    hour_id,
13234                    day_id,
13235                    agent_slug: agent_slug.clone(),
13236                    workspace_id: workspace_id.unwrap_or(0),
13237                    source_id: source_id.clone(),
13238                    role: role.clone(),
13239                    content_chars,
13240                    content_tokens_est,
13241                    model_name: usage.model_name.clone(),
13242                    model_family,
13243                    model_tier,
13244                    provider,
13245                    api_input_tokens: usage.input_tokens,
13246                    api_output_tokens: usage.output_tokens,
13247                    api_cache_read_tokens: usage.cache_read_tokens,
13248                    api_cache_creation_tokens: usage.cache_creation_tokens,
13249                    api_thinking_tokens: usage.thinking_tokens,
13250                    api_service_tier: usage.service_tier,
13251                    api_data_source: usage.data_source.as_str().to_string(),
13252                    tool_call_count: usage.tool_call_count as i64,
13253                    has_tool_calls: usage.has_tool_calls,
13254                    has_plan: has_plan_for_role(role, content),
13255                };
13256                rollup_agg.record(&entry);
13257                entries.push(entry);
13258            }
13259
13260            total_inserted += franken_insert_message_metrics_batched_in_tx(&tx, &entries)?;
13261            let (hourly, daily, models_daily) =
13262                franken_flush_analytics_rollups_in_tx(&tx, &rollup_agg)?;
13263            usage_hourly_rows += hourly;
13264            usage_daily_rows += daily;
13265            usage_models_daily_rows += models_daily;
13266            offset += chunk_len as i64;
13267
13268            tracing::debug!(
13269                target: "cass::analytics",
13270                offset,
13271                chunk = chunk_len,
13272                inserted = entries.len(),
13273                total = total_inserted,
13274                "analytics_rebuild_chunk"
13275            );
13276
13277            if (chunk_len as i64) < CHUNK_SIZE {
13278                break;
13279            }
13280        }
13281
13282        tx.commit()?;
13283
13284        let elapsed = start.elapsed();
13285        let elapsed_ms = elapsed.as_millis() as u64;
13286        let msgs_per_sec = if elapsed_ms > 0 {
13287            (total_inserted as f64) / (elapsed_ms as f64 / 1000.0)
13288        } else {
13289            0.0
13290        };
13291
13292        tracing::info!(
13293            target: "cass::analytics",
13294            message_metrics_rows = total_inserted,
13295            usage_hourly_rows,
13296            usage_daily_rows,
13297            usage_models_daily_rows,
13298            elapsed_ms,
13299            messages_per_sec = format!("{:.0}", msgs_per_sec),
13300            "analytics_rebuild_complete"
13301        );
13302
13303        Ok(AnalyticsRebuildResult {
13304            message_metrics_rows: total_inserted,
13305            usage_hourly_rows,
13306            usage_daily_rows,
13307            usage_models_daily_rows,
13308            elapsed_ms,
13309            messages_per_sec: msgs_per_sec,
13310        })
13311    }
13312
    /// Rebuild all daily stats from scratch.
    ///
    /// Empties `daily_stats` inside a transaction and repopulates it in two
    /// interleaved passes per conversation batch:
    ///
    /// 1. every conversation contributes one session delta, keyed by agent
    ///    slug, source id, and the day of its `started_at` (day 0 if absent);
    /// 2. every message of those conversations contributes one message delta
    ///    plus its content length, attributed to the conversation's day.
    ///
    /// Content lengths come from `message_metrics.content_chars` when that
    /// table has exactly one row per message, otherwise from the byte length
    /// of `messages.content`.
    ///
    /// Both scans halve their batch size and retry when a query fails with an
    /// out-of-memory error, down to a batch size of 1.
    ///
    /// # Returns
    /// A `DailyStatsRebuildResult` with the final `daily_stats` row count and
    /// the summed `session_count` of the `'all'` / `'all'` rows.
    ///
    /// # Errors
    /// Propagates any query, flush, or commit failure (including an
    /// out-of-memory error once the affected batch size is already 1).
    pub fn rebuild_daily_stats(&self) -> Result<DailyStatsRebuildResult> {
        const DAILY_STATS_REBUILD_CONVERSATION_BATCH_SIZE: usize = 1_000;
        const DAILY_STATS_REBUILD_MESSAGE_BATCH_SIZE: usize = 10_000;

        // Batch sizes are mutable because the OOM retry paths shrink them.
        // NOTE(review): `rebuild_batch_size_env` presumably lets the named
        // environment variable override the default — confirm at its definition.
        let mut conversation_batch_size = rebuild_batch_size_env(
            "CASS_DAILY_STATS_REBUILD_CONVERSATION_BATCH_SIZE",
            DAILY_STATS_REBUILD_CONVERSATION_BATCH_SIZE,
        );
        let mut message_batch_size = rebuild_batch_size_env(
            "CASS_DAILY_STATS_REBUILD_MESSAGE_BATCH_SIZE",
            DAILY_STATS_REBUILD_MESSAGE_BATCH_SIZE,
        );

        let total_messages: i64 =
            self.conn
                .query_row_map("SELECT COUNT(*) FROM messages", fparams![], |row| {
                    row.get_typed(0)
                })?;
        let message_metrics_rows: i64 =
            self.conn
                .query_row_map("SELECT COUNT(*) FROM message_metrics", fparams![], |row| {
                    row.get_typed(0)
                })?;
        // Only trust message_metrics when it is non-empty and its row count
        // matches the messages table exactly.
        let use_message_metrics = total_messages > 0 && total_messages == message_metrics_rows;

        tracing::info!(
            target: "cass::perf::daily_stats",
            total_messages,
            message_metrics_rows,
            use_message_metrics,
            "daily_stats rebuild selected message source"
        );

        let mut tx = self.conn.transaction()?;
        tx.execute("DELETE FROM daily_stats")?;

        // Keyset cursor over conversations.id plus progress counters that are
        // only used for logging.
        let mut last_conversation_id = 0_i64;
        let mut conversation_batch_count = 0_usize;
        let mut conversations_processed = 0_usize;
        let mut messages_processed = 0_usize;
        let mut message_batch_count = 0_usize;
        let mut raw_entries_flushed = 0_usize;
        let mut expanded_entries_flushed = 0_usize;
        // Both variants page one conversation's messages by `idx` and yield
        // (idx, content length); they differ only in where the length comes from.
        let message_scan_sql = if use_message_metrics {
            "SELECT m.idx, mm.content_chars
             FROM messages m
             JOIN message_metrics mm ON mm.message_id = m.id
             WHERE m.conversation_id = ?1
               AND m.idx > ?2
             ORDER BY m.conversation_id, m.idx
             LIMIT ?3"
        } else {
            "SELECT m.idx, COALESCE(LENGTH(CAST(m.content AS BLOB)), 0)
             FROM messages m
             WHERE m.conversation_id = ?1
               AND m.idx > ?2
             ORDER BY m.conversation_id, m.idx
             LIMIT ?3"
        };

        loop {
            // Avoid the 2-table JOIN with LIMIT that triggers frankensqlite's
            // materialization fallback (which is what the OOM retry below is
            // defending against — see 860acb12).  Inline agent slug via
            // correlated subquery and degrade NULL agent_id to 'unknown' for
            // consistency with the lexical/FTS rebuild paths.
            //
            // Note that the scans below are issued on `self.conn`, while the
            // writes go through `tx`.
            let conversation_rows = match self.conn.query_with_params(
                "SELECT c.id, c.started_at,
                        COALESCE((SELECT a.slug FROM agents a WHERE a.id = c.agent_id), 'unknown'),
                        c.source_id
                 FROM conversations c
                 WHERE c.id > ?1
                 ORDER BY c.id
                 LIMIT ?2",
                &params_from_iter([
                    ParamValue::from(last_conversation_id),
                    ParamValue::from(conversation_batch_size as i64),
                ]),
            ) {
                Ok(rows) => rows,
                // Retry the same cursor position with half the batch size; the
                // cursor has not advanced, so no conversation is skipped.
                Err(err) if is_out_of_memory_error(&err) && conversation_batch_size > 1 => {
                    let previous_batch_size = conversation_batch_size;
                    conversation_batch_size = (conversation_batch_size / 2).max(1);
                    tracing::warn!(
                        previous_batch_size,
                        conversation_batch_size,
                        last_conversation_id,
                        "daily_stats conversation scan ran out of memory; retrying with smaller batch"
                    );
                    continue;
                }
                Err(err) => return Err(err.into()),
            };
            if conversation_rows.is_empty() {
                break;
            }

            // Pass 1: one session delta per conversation. The per-conversation
            // metadata is kept so the message pass can attribute messages to
            // the same (agent, source, day) bucket.
            let mut aggregate = StatsAggregator::new();
            let mut conversation_batch_meta: Vec<(i64, i64, String, String)> =
                Vec::with_capacity(conversation_rows.len());
            for row in &conversation_rows {
                let conversation_id: i64 = row.get_typed(0)?;
                let started_at: Option<i64> = row.get_typed(1)?;
                let agent_slug: String = row.get_typed(2)?;
                let source_id: String = row.get_typed(3)?;
                last_conversation_id = conversation_id;
                // Conversations without a start time land on day 0.
                let day_id = started_at.map(Self::day_id_from_millis).unwrap_or(0);
                aggregate.record_delta(&agent_slug, &source_id, day_id, 1, 0, 0);
                conversation_batch_meta.push((conversation_id, day_id, agent_slug, source_id));
                conversations_processed += 1;
            }

            conversation_batch_count += 1;
            raw_entries_flushed += aggregate.raw_entry_count();
            let entries = aggregate.expand();
            expanded_entries_flushed += entries.len();
            if !entries.is_empty() {
                franken_update_daily_stats_batched_in_tx(&tx, &entries)?;
            }
            // Progress log every 25 conversation batches.
            if conversation_batch_count.is_multiple_of(25) {
                tracing::info!(
                    target: "cass::perf::daily_stats",
                    conversations_processed,
                    batches = conversation_batch_count,
                    batch_size = conversation_batch_size,
                    last_conversation_id,
                    "daily_stats rebuild conversation scan progress"
                );
            }
            // Defensive only: every row of a non-empty batch pushes one entry
            // above (or returns early with an error), so this cannot trigger.
            if conversation_batch_meta.is_empty() {
                continue;
            }

            // Pass 2: page through each conversation's messages by `idx`.
            for (conversation_id, day_id, agent_slug, source_id) in conversation_batch_meta {
                // Start below the smallest expected idx; the scan uses `idx > cursor`.
                let mut cursor_message_idx = -1_i64;
                loop {
                    let message_rows = match self.conn.query_with_params(
                        message_scan_sql,
                        &params_from_iter([
                            ParamValue::from(conversation_id),
                            ParamValue::from(cursor_message_idx),
                            ParamValue::from(message_batch_size as i64),
                        ]),
                    ) {
                        Ok(rows) => rows,
                        // Same halving retry as the conversation scan; the reduced
                        // size also applies to all later conversations.
                        Err(err) if is_out_of_memory_error(&err) && message_batch_size > 1 => {
                            let previous_batch_size = message_batch_size;
                            message_batch_size = (message_batch_size / 2).max(1);
                            tracing::warn!(
                                previous_batch_size,
                                message_batch_size,
                                conversation_id,
                                cursor_message_idx,
                                "daily_stats message scan ran out of memory; retrying with smaller batch"
                            );
                            continue;
                        }
                        Err(err) => return Err(err.into()),
                    };
                    if message_rows.is_empty() {
                        break;
                    }

                    // One aggregator per message batch, flushed immediately.
                    let mut aggregate = StatsAggregator::new();
                    for row in &message_rows {
                        let message_idx: i64 = row.get_typed(0)?;
                        let content_len: i64 = row.get_typed(1)?;
                        cursor_message_idx = message_idx;
                        aggregate.record_delta(&agent_slug, &source_id, day_id, 0, 1, content_len);
                        messages_processed += 1;
                    }

                    message_batch_count += 1;
                    raw_entries_flushed += aggregate.raw_entry_count();
                    let entries = aggregate.expand();
                    expanded_entries_flushed += entries.len();
                    if !entries.is_empty() {
                        franken_update_daily_stats_batched_in_tx(&tx, &entries)?;
                    }
                    // Progress log every 50 message batches.
                    if message_batch_count.is_multiple_of(50) {
                        tracing::info!(
                            target: "cass::perf::daily_stats",
                            messages_processed,
                            batches = message_batch_count,
                            batch_size = message_batch_size,
                            source = if use_message_metrics {
                                "message_metrics"
                            } else {
                                "messages"
                            },
                            conversation_id,
                            cursor_message_idx,
                            "daily_stats rebuild message scan progress"
                        );
                    }
                }
            }
        }

        // Final figures are read back through the transaction before commit,
        // so they reflect the rebuilt table rather than the number of flushes.
        let rows_created: i64 =
            tx.query_row_map("SELECT COUNT(*) FROM daily_stats", fparams![], |row| {
                row.get_typed(0)
            })?;
        let total_sessions: i64 = tx.query_row_map(
            "SELECT COALESCE(SUM(session_count), 0) FROM daily_stats WHERE agent_slug = 'all' AND source_id = 'all'",
            fparams![],
            |row| row.get_typed(0),
        )?;

        tx.commit()?;

        tracing::info!(
            target: "cass::perf::daily_stats",
            rows_created,
            total_sessions,
            conversations_processed,
            conversation_batches = conversation_batch_count,
            conversation_batch_size,
            message_batches = message_batch_count,
            message_batch_size,
            messages_processed,
            use_message_metrics,
            raw_entries_flushed,
            expanded_entries_flushed,
            "Daily stats rebuilt from conversations"
        );

        Ok(DailyStatsRebuildResult {
            rows_created,
            total_sessions,
        })
    }
13546}
13547
13548// SqliteStorage impl block removed: SqliteStorage is now a type alias for FrankenStorage.
13549// All methods are available through FrankenStorage.
13550
13551// -------------------------------------------------------------------------
13552// IndexingCache (Opt 7.2) - N+1 Prevention for Agent/Workspace IDs
13553// -------------------------------------------------------------------------
13554
/// Cache for agent and workspace IDs during batch indexing.
///
/// Prevents N+1 database queries by caching the results of ensure_agent
/// and ensure_workspace calls within a batch. This is per-batch and
/// single-threaded, so no synchronization is needed.
///
/// # Usage
/// ```ignore
/// let mut cache = IndexingCache::new();
/// for conv in conversations {
///     let agent_id = cache.get_or_insert_agent(storage, &agent)?;
///     let workspace_id = cache.get_or_insert_workspace(storage, workspace)?;
///     // ... use agent_id and workspace_id
/// }
/// ```
///
/// # Rollback
/// Set environment variable `CASS_SQLITE_CACHE=0` to bypass caching
/// and use direct DB calls (useful for debugging).
/// [`IndexingCache::is_enabled`] also treats `false` (in any letter case) as
/// disabled. The `get_or_insert_*` methods do not consult the flag
/// themselves; callers are expected to check it.
#[derive(Debug, Default)]
pub struct IndexingCache {
    /// Agent id per agent slug, as returned by the storage on first lookup.
    agent_ids: HashMap<String, i64>,
    /// Workspace id per workspace path, as returned by the storage on first lookup.
    workspace_ids: HashMap<PathBuf, i64>,
    /// Lookups answered from the maps above.
    hits: u64,
    /// Lookups that had to go to the storage.
    misses: u64,
}
13581
/// Storage operations that [`IndexingCache`] falls back to on a cache miss.
///
/// Abstracting them behind a trait lets the cache work with any backend
/// (or a stand-in) that can resolve agents and workspaces to ids.
pub trait IndexingCacheStorage {
    /// Resolve `agent` to its id.
    fn ensure_indexing_agent(&self, agent: &Agent) -> Result<i64>;
    /// Resolve the workspace at `path` (with an optional display name) to its id.
    fn ensure_indexing_workspace(&self, path: &Path, display_name: Option<&str>) -> Result<i64>;
}
13586
/// Backs the indexing cache with the real database by delegating to the
/// storage's own `ensure_agent` / `ensure_workspace` methods.
impl IndexingCacheStorage for FrankenStorage {
    /// Delegates to `ensure_agent`.
    fn ensure_indexing_agent(&self, agent: &Agent) -> Result<i64> {
        self.ensure_agent(agent)
    }

    /// Delegates to `ensure_workspace`.
    fn ensure_indexing_workspace(&self, path: &Path, display_name: Option<&str>) -> Result<i64> {
        self.ensure_workspace(path, display_name)
    }
}
13596
13597// IndexingCacheStorage for SqliteStorage removed: SqliteStorage is a type alias for FrankenStorage.
13598
13599impl IndexingCache {
13600    /// Create a new empty cache.
13601    pub fn new() -> Self {
13602        Self {
13603            agent_ids: HashMap::new(),
13604            workspace_ids: HashMap::new(),
13605            hits: 0,
13606            misses: 0,
13607        }
13608    }
13609
13610    /// Check if caching is enabled via environment variable.
13611    /// Returns true unless CASS_SQLITE_CACHE is set to "0" or "false".
13612    pub fn is_enabled() -> bool {
13613        dotenvy::var("CASS_SQLITE_CACHE")
13614            .map(|v| v != "0" && v.to_lowercase() != "false")
13615            .unwrap_or(true)
13616    }
13617
13618    /// Get or insert an agent ID, using cache if available.
13619    ///
13620    /// Returns the cached ID if present, otherwise calls ensure_agent
13621    /// and caches the result.
13622    pub fn get_or_insert_agent<S>(&mut self, storage: &S, agent: &Agent) -> Result<i64>
13623    where
13624        S: IndexingCacheStorage + ?Sized,
13625    {
13626        if let Some(&cached) = self.agent_ids.get(&agent.slug) {
13627            self.hits += 1;
13628            return Ok(cached);
13629        }
13630
13631        self.misses += 1;
13632        let id = storage.ensure_indexing_agent(agent)?;
13633        self.agent_ids.insert(agent.slug.clone(), id);
13634        Ok(id)
13635    }
13636
13637    /// Get or insert a workspace ID, using cache if available.
13638    ///
13639    /// Returns the cached ID if present, otherwise calls ensure_workspace
13640    /// and caches the result.
13641    pub fn get_or_insert_workspace(
13642        &mut self,
13643        storage: &(impl IndexingCacheStorage + ?Sized),
13644        path: &Path,
13645        display_name: Option<&str>,
13646    ) -> Result<i64> {
13647        if let Some(&cached) = self.workspace_ids.get(path) {
13648            self.hits += 1;
13649            return Ok(cached);
13650        }
13651
13652        self.misses += 1;
13653        let id = storage.ensure_indexing_workspace(path, display_name)?;
13654        self.workspace_ids.insert(path.to_path_buf(), id);
13655        Ok(id)
13656    }
13657
13658    /// Get cache statistics: (hits, misses, hit_rate).
13659    pub fn stats(&self) -> (u64, u64, f64) {
13660        let total = self.hits + self.misses;
13661        let hit_rate = if total > 0 {
13662            self.hits as f64 / total as f64
13663        } else {
13664            0.0
13665        };
13666        (self.hits, self.misses, hit_rate)
13667    }
13668
13669    /// Clear the cache, resetting all state.
13670    pub fn clear(&mut self) {
13671        self.agent_ids.clear();
13672        self.workspace_ids.clear();
13673        self.hits = 0;
13674        self.misses = 0;
13675    }
13676
13677    /// Number of cached agents.
13678    pub fn agent_count(&self) -> usize {
13679        self.agent_ids.len()
13680    }
13681
13682    /// Number of cached workspaces.
13683    pub fn workspace_count(&self) -> usize {
13684        self.workspace_ids.len()
13685    }
13686}
13687
13688// -------------------------------------------------------------------------
13689// StatsAggregator (kzxu) - Batched Daily Stats Updates
13690// -------------------------------------------------------------------------
13691// Aggregates daily stats in memory during batch ingestion, then flushes
13692// to the database in a single batched INSERT...ON CONFLICT operation.
13693// This prevents N×4 database writes (4 permutations per conversation).
13694
/// Running totals contributed to one (day_id, agent, source) stats row.
#[derive(Clone, Copy, Debug, Default)]
pub struct StatsDelta {
    /// Change in the number of sessions.
    pub session_count_delta: i64,
    /// Change in the number of messages.
    pub message_count_delta: i64,
    /// Change in the total character count.
    pub total_chars_delta: i64,
}

/// Collects daily-stats changes in memory so they can be written in one batch.
///
/// While ingesting a batch, every conversation adds its delta under a
/// (day_id, agent, source) key. Once the batch is done, `expand()` produces
/// the rows to write (each raw key fans out into up to 4 rollup keys), which
/// are then flushed via `SqliteStorage::update_daily_stats_batched`.
///
/// # Example
/// ```ignore
/// let mut agg = StatsAggregator::new();
/// for conv in conversations {
///     agg.record(&conv.agent_slug, source_id, day_id, msg_count, char_count);
/// }
/// let entries = agg.expand();
/// storage.update_daily_stats_batched(&entries)?;
/// ```
#[derive(Debug, Default)]
pub struct StatsAggregator {
    /// Raw deltas keyed by (day_id, agent_slug, source_id), exactly as recorded.
    /// The "all" rollups are only materialized by `expand()`.
    deltas: HashMap<(i64, String, String), StatsDelta>,
}

impl StatsAggregator {
    /// Start with nothing recorded.
    pub fn new() -> Self {
        Self::default()
    }

    /// Record one whole conversation: one session plus its messages and chars.
    ///
    /// # Arguments
    /// * `agent_slug` - The specific agent slug (not "all")
    /// * `source_id` - The specific source ID (not "all")
    /// * `day_id` - Days since 2020-01-01 (from `SqliteStorage::day_id_from_millis`)
    /// * `message_count` - Number of messages in the conversation
    /// * `total_chars` - Total character count across all messages
    pub fn record(
        &mut self,
        agent_slug: &str,
        source_id: &str,
        day_id: i64,
        message_count: i64,
        total_chars: i64,
    ) {
        // A full conversation always counts as exactly one session.
        self.record_delta(agent_slug, source_id, day_id, 1, message_count, total_chars);
    }

    /// Record an arbitrary delta, e.g. an append-only update that adds
    /// messages/chars to an existing session (`session_count_delta == 0`).
    ///
    /// A delta whose three components are all zero is dropped without
    /// creating a key.
    pub fn record_delta(
        &mut self,
        agent_slug: &str,
        source_id: &str,
        day_id: i64,
        session_count_delta: i64,
        message_count_delta: i64,
        total_chars_delta: i64,
    ) {
        let nothing_to_add = [session_count_delta, message_count_delta, total_chars_delta]
            .iter()
            .all(|&component| component == 0);
        if nothing_to_add {
            return;
        }

        let bucket = self
            .deltas
            .entry((day_id, agent_slug.to_owned(), source_id.to_owned()))
            .or_default();
        bucket.session_count_delta += session_count_delta;
        bucket.message_count_delta += message_count_delta;
        bucket.total_chars_delta += total_chars_delta;
    }

    /// Fan each raw delta out to its rollup keys:
    /// - (agent, source) - specific both
    /// - ("all", source) - all agents, specific source
    /// - (agent, "all") - specific agent, all sources
    /// - ("all", "all") - totals
    ///
    /// Returns entries sorted by (day_id, agent_slug, source_id) for deterministic batching.
    pub fn expand(&self) -> Vec<(i64, String, String, StatsDelta)> {
        let mut rollup: HashMap<(i64, String, String), StatsDelta> = HashMap::new();

        for ((day_id, agent, source), delta) in &self.deltas {
            let combos = [
                (agent.as_str(), source.as_str()),
                ("all", source.as_str()),
                (agent.as_str(), "all"),
                ("all", "all"),
            ];

            for (idx, combo) in combos.iter().enumerate() {
                // When the raw agent/source is itself "all", several combos
                // collapse to the same key; apply the delta to it only once.
                let already_applied = combos[..idx].iter().any(|earlier| earlier == combo);
                if already_applied {
                    continue;
                }
                let slot = rollup
                    .entry((*day_id, combo.0.to_owned(), combo.1.to_owned()))
                    .or_default();
                slot.session_count_delta += delta.session_count_delta;
                slot.message_count_delta += delta.message_count_delta;
                slot.total_chars_delta += delta.total_chars_delta;
            }
        }

        let mut rows: Vec<(i64, String, String, StatsDelta)> = rollup
            .into_iter()
            .map(|((day, agent, source), delta)| (day, agent, source, delta))
            .collect();
        rows.sort_by(|lhs, rhs| (lhs.0, &lhs.1, &lhs.2).cmp(&(rhs.0, &rhs.1, &rhs.2)));
        rows
    }

    /// `true` while no non-zero delta has been recorded.
    pub fn is_empty(&self) -> bool {
        self.deltas.is_empty()
    }

    /// Number of distinct raw (day, agent, source) keys recorded so far.
    pub fn raw_entry_count(&self) -> usize {
        self.deltas.len()
    }
}
13827
13828// -------------------------------------------------------------------------
13829// TokenStatsAggregator — Batched Token Analytics Daily Stats
13830// -------------------------------------------------------------------------
13831// Mirrors StatsAggregator pattern for token-level metrics.
13832// Aggregates token usage in memory during batch ingestion, then flushes
13833// to token_daily_stats in a single batched INSERT...ON CONFLICT operation.
13834
/// Accumulated token statistics delta for a single (day_id, agent, source, model_family) combination.
///
/// Every field is a running sum; `TokenStatsAggregator::record` and
/// `TokenStatsAggregator::record_session` are what add to them.
#[derive(Clone, Debug, Default)]
pub struct TokenStatsDelta {
    /// Incremented once per `TokenStatsAggregator::record` call.
    pub api_call_count: i64,
    /// Recorded messages whose role is `"user"`.
    pub user_message_count: i64,
    /// Recorded messages whose role is `"assistant"` or `"agent"`.
    pub assistant_message_count: i64,
    /// Recorded messages whose role is `"tool"`.
    pub tool_message_count: i64,
    /// Sum of reported input tokens (a missing value counts as 0).
    pub total_input_tokens: i64,
    /// Sum of reported output tokens (a missing value counts as 0).
    pub total_output_tokens: i64,
    /// Sum of reported cache-read tokens (a missing value counts as 0).
    pub total_cache_read_tokens: i64,
    /// Sum of reported cache-creation tokens (a missing value counts as 0).
    pub total_cache_creation_tokens: i64,
    /// Sum of reported thinking tokens (a missing value counts as 0).
    pub total_thinking_tokens: i64,
    /// Sum of each usage record's `total_tokens()` (a missing value counts as 0).
    pub grand_total_tokens: i64,
    /// Sum of the `content_chars` passed to `record`.
    pub total_content_chars: i64,
    /// Sum of each usage record's `tool_call_count`.
    pub total_tool_calls: i64,
    /// Sum of the `estimated_cost_usd` passed to `record`.
    pub estimated_cost_usd: f64,
    /// Incremented once per `TokenStatsAggregator::record_session` call.
    pub session_count: i64,
}
13853
/// In-memory aggregator for batched token daily stats updates.
///
/// During batch ingestion, accumulate token deltas per (day_id, agent, source, model_family) key.
/// After processing, call `expand()` to generate the 5 permutation keys, then flush via
/// `update_token_daily_stats_batched_in_tx`.
#[derive(Debug, Default)]
pub struct TokenStatsAggregator {
    /// Raw deltas keyed by (day_id, agent_slug, source_id, model_family).
    /// Holds the keys exactly as recorded; the "all" rollups are produced by `expand()`.
    deltas: HashMap<(i64, String, String, String), TokenStatsDelta>,
}
13864
13865impl TokenStatsAggregator {
13866    pub fn new() -> Self {
13867        Self {
13868            deltas: HashMap::new(),
13869        }
13870    }
13871
13872    /// Record a single message's token contribution.
13873    #[allow(clippy::too_many_arguments)]
13874    pub fn record(
13875        &mut self,
13876        agent_slug: &str,
13877        source_id: &str,
13878        day_id: i64,
13879        model_family: &str,
13880        role: &str,
13881        usage: &crate::connectors::ExtractedTokenUsage,
13882        content_chars: i64,
13883        estimated_cost_usd: f64,
13884    ) {
13885        let key = (
13886            day_id,
13887            agent_slug.to_owned(),
13888            source_id.to_owned(),
13889            model_family.to_owned(),
13890        );
13891        let delta = self.deltas.entry(key).or_default();
13892
13893        delta.api_call_count += 1;
13894        match role {
13895            "user" => delta.user_message_count += 1,
13896            "assistant" | "agent" => delta.assistant_message_count += 1,
13897            "tool" => delta.tool_message_count += 1,
13898            _ => {}
13899        }
13900
13901        delta.total_input_tokens += usage.input_tokens.unwrap_or(0);
13902        delta.total_output_tokens += usage.output_tokens.unwrap_or(0);
13903        delta.total_cache_read_tokens += usage.cache_read_tokens.unwrap_or(0);
13904        delta.total_cache_creation_tokens += usage.cache_creation_tokens.unwrap_or(0);
13905        delta.total_thinking_tokens += usage.thinking_tokens.unwrap_or(0);
13906        delta.grand_total_tokens += usage.total_tokens().unwrap_or(0);
13907        delta.total_content_chars += content_chars;
13908        delta.total_tool_calls += usage.tool_call_count as i64;
13909        delta.estimated_cost_usd += estimated_cost_usd;
13910    }
13911
13912    /// Record a session count bump for a given day/agent/source/model.
13913    pub fn record_session(
13914        &mut self,
13915        agent_slug: &str,
13916        source_id: &str,
13917        day_id: i64,
13918        model_family: &str,
13919    ) {
13920        let key = (
13921            day_id,
13922            agent_slug.to_owned(),
13923            source_id.to_owned(),
13924            model_family.to_owned(),
13925        );
13926        self.deltas.entry(key).or_default().session_count += 1;
13927    }
13928
13929    /// Expand raw deltas into 5 permutation keys for the 4-dimensional composite PK:
13930    /// - (agent, source, model)  — specific all three
13931    /// - ("all", source, model)  — all agents
13932    /// - (agent, "all", model)   — all sources
13933    /// - (agent, source, "all")  — all models
13934    /// - ("all", "all", "all")   — global total
13935    pub fn expand(&self) -> Vec<(i64, String, String, String, TokenStatsDelta)> {
13936        let mut expanded: HashMap<(i64, String, String, String), TokenStatsDelta> = HashMap::new();
13937
13938        for ((day_id, agent, source, model), delta) in &self.deltas {
13939            let permutations = [
13940                (agent.as_str(), source.as_str(), model.as_str()),
13941                ("all", source.as_str(), model.as_str()),
13942                (agent.as_str(), "all", model.as_str()),
13943                (agent.as_str(), source.as_str(), "all"),
13944                ("all", "all", "all"),
13945            ];
13946
13947            for idx in 0..permutations.len() {
13948                let (a, s, m) = permutations[idx];
13949                // Deduplicate if agent/source/model is already "all"
13950                if permutations[..idx].contains(&(a, s, m)) {
13951                    continue;
13952                }
13953                let key = (*day_id, a.to_owned(), s.to_owned(), m.to_owned());
13954                let entry = expanded.entry(key).or_default();
13955                entry.api_call_count += delta.api_call_count;
13956                entry.user_message_count += delta.user_message_count;
13957                entry.assistant_message_count += delta.assistant_message_count;
13958                entry.tool_message_count += delta.tool_message_count;
13959                entry.total_input_tokens += delta.total_input_tokens;
13960                entry.total_output_tokens += delta.total_output_tokens;
13961                entry.total_cache_read_tokens += delta.total_cache_read_tokens;
13962                entry.total_cache_creation_tokens += delta.total_cache_creation_tokens;
13963                entry.total_thinking_tokens += delta.total_thinking_tokens;
13964                entry.grand_total_tokens += delta.grand_total_tokens;
13965                entry.total_content_chars += delta.total_content_chars;
13966                entry.total_tool_calls += delta.total_tool_calls;
13967                entry.estimated_cost_usd += delta.estimated_cost_usd;
13968                entry.session_count += delta.session_count;
13969            }
13970        }
13971
13972        let mut out: Vec<(i64, String, String, String, TokenStatsDelta)> = expanded
13973            .into_iter()
13974            .map(|((d, a, s, m), delta)| (d, a, s, m, delta))
13975            .collect();
13976        out.sort_by(|(d1, a1, s1, m1, _), (d2, a2, s2, m2, _)| {
13977            d1.cmp(d2)
13978                .then_with(|| a1.cmp(a2))
13979                .then_with(|| s1.cmp(s2))
13980                .then_with(|| m1.cmp(m2))
13981        });
13982        out
13983    }
13984
13985    pub fn is_empty(&self) -> bool {
13986        self.deltas.is_empty()
13987    }
13988
13989    pub fn raw_entry_count(&self) -> usize {
13990        self.deltas.len()
13991    }
13992}
13993
13994// -------------------------------------------------------------------------
13995// AnalyticsRollupAggregator — Batched usage_hourly + usage_daily Updates
13996// -------------------------------------------------------------------------
13997// Accumulates per-message deltas in memory, then flushes to both
13998// usage_hourly and usage_daily in a single batched operation.
13999
/// Delta for a single (bucket, agent_slug, workspace_id, source_id) rollup key.
///
/// All fields are running sums built up by `AnalyticsRollupAggregator::record`.
#[derive(Clone, Debug, Default)]
pub struct UsageRollupDelta {
    /// Messages recorded into this bucket.
    pub message_count: i64,
    /// Messages whose role is exactly `"user"`.
    pub user_message_count: i64,
    /// Messages whose role is exactly `"assistant"` or `"agent"`.
    pub assistant_message_count: i64,
    /// Sum of the messages' `tool_call_count`.
    pub tool_call_count: i64,
    /// Messages flagged `has_plan`.
    pub plan_message_count: i64,
    /// Estimated content tokens of the `has_plan` messages.
    pub plan_content_tokens_est_total: i64,
    /// API token total of the `has_plan` messages whose data source is `"api"`.
    pub plan_api_tokens_total: i64,
    /// Messages whose `api_data_source` is `"api"`.
    pub api_coverage_message_count: i64,
    /// Estimated content tokens of all messages.
    pub content_tokens_est_total: i64,
    /// Estimated content tokens of the user messages.
    pub content_tokens_est_user: i64,
    /// Estimated content tokens of the assistant/agent messages.
    pub content_tokens_est_assistant: i64,
    /// Sum of all five API token kinds, counted for `"api"` messages only.
    pub api_tokens_total: i64,
    /// API input tokens, counted for `"api"` messages only.
    pub api_input_tokens_total: i64,
    /// API output tokens, counted for `"api"` messages only.
    pub api_output_tokens_total: i64,
    /// API cache-read tokens, counted for `"api"` messages only.
    pub api_cache_read_tokens_total: i64,
    /// API cache-creation tokens, counted for `"api"` messages only.
    pub api_cache_creation_tokens_total: i64,
    /// API thinking tokens, counted for `"api"` messages only.
    pub api_thinking_tokens_total: i64,
}

/// Pending message_metrics row for batch insertion.
///
/// Also the input to `AnalyticsRollupAggregator::record`, which reads the
/// bucket IDs, the agent/workspace/source/model keys, the role, the token
/// figures, `tool_call_count` and `has_plan`.
#[derive(Debug, Clone)]
pub struct MessageMetricsEntry {
    pub message_id: i64,
    pub created_at_ms: i64,
    /// Bucket ID used for the hourly rollup.
    pub hour_id: i64,
    /// Bucket ID used for the daily and per-model daily rollups.
    pub day_id: i64,
    pub agent_slug: String,
    pub workspace_id: i64,
    pub source_id: String,
    /// Rollups compare this exactly against `"user"`, `"assistant"` and `"agent"`.
    pub role: String,
    pub content_chars: i64,
    /// Estimated token count of the message content.
    pub content_tokens_est: i64,
    pub model_name: Option<String>,
    pub model_family: String,
    pub model_tier: String,
    pub provider: String,
    pub api_input_tokens: Option<i64>,
    pub api_output_tokens: Option<i64>,
    pub api_cache_read_tokens: Option<i64>,
    pub api_cache_creation_tokens: Option<i64>,
    pub api_thinking_tokens: Option<i64>,
    pub api_service_tier: Option<String>,
    /// API token figures are only rolled up when this is exactly `"api"`.
    pub api_data_source: String,
    pub tool_call_count: i64,
    pub has_tool_calls: bool,
    pub has_plan: bool,
}

/// In-memory aggregator for batched usage_hourly and usage_daily rollup updates.
///
/// Keyed by (bucket_id, agent_slug, workspace_id, source_id).
/// Maintains separate hourly and daily delta maps, plus a daily map that is
/// additionally keyed by (model_family, model_tier).
#[derive(Debug, Default)]
pub struct AnalyticsRollupAggregator {
    /// Keyed by (hour_id, agent_slug, workspace_id, source_id).
    hourly: HashMap<(i64, String, i64, String), UsageRollupDelta>,
    /// Keyed by (day_id, agent_slug, workspace_id, source_id).
    daily: HashMap<(i64, String, i64, String), UsageRollupDelta>,
    /// Keyed by (day_id, agent_slug, workspace_id, source_id, model_family, model_tier).
    models_daily: HashMap<(i64, String, i64, String, String, String), UsageRollupDelta>,
}

impl AnalyticsRollupAggregator {
    /// Start with all three rollup maps empty.
    pub fn new() -> Self {
        Self::default()
    }

    /// Record a single message's contribution to the hourly, daily and
    /// per-model daily rollups.
    ///
    /// The same contribution is added to one bucket in each of the three
    /// maps; only the bucket key differs.
    pub fn record(&mut self, entry: &MessageMetricsEntry) {
        for (map, bucket_id) in [
            (&mut self.hourly, entry.hour_id),
            (&mut self.daily, entry.day_id),
        ] {
            let key = (
                bucket_id,
                entry.agent_slug.clone(),
                entry.workspace_id,
                entry.source_id.clone(),
            );
            Self::accumulate_entry(map.entry(key).or_default(), entry);
        }

        let model_key = (
            entry.day_id,
            entry.agent_slug.clone(),
            entry.workspace_id,
            entry.source_id.clone(),
            entry.model_family.clone(),
            entry.model_tier.clone(),
        );
        Self::accumulate_entry(self.models_daily.entry(model_key).or_default(), entry);
    }

    /// Add one message's contribution to a single rollup bucket.
    ///
    /// Shared by all three maps so their accounting rules cannot drift apart.
    fn accumulate_entry(delta: &mut UsageRollupDelta, entry: &MessageMetricsEntry) {
        let content_est = entry.content_tokens_est;
        let api_total = entry.api_input_tokens.unwrap_or(0)
            + entry.api_output_tokens.unwrap_or(0)
            + entry.api_cache_read_tokens.unwrap_or(0)
            + entry.api_cache_creation_tokens.unwrap_or(0)
            + entry.api_thinking_tokens.unwrap_or(0);
        // API-reported figures are trusted only when the row says they came from the API.
        let is_api = entry.api_data_source == "api";
        let is_user = entry.role == "user";
        let is_assistant = entry.role == "assistant" || entry.role == "agent";

        delta.message_count += 1;
        if is_user {
            delta.user_message_count += 1;
            delta.content_tokens_est_user += content_est;
        }
        if is_assistant {
            delta.assistant_message_count += 1;
            delta.content_tokens_est_assistant += content_est;
        }
        delta.tool_call_count += entry.tool_call_count;
        if entry.has_plan {
            delta.plan_message_count += 1;
            delta.plan_content_tokens_est_total += content_est;
            if is_api {
                delta.plan_api_tokens_total += api_total;
            }
        }
        if is_api {
            delta.api_coverage_message_count += 1;
            delta.api_tokens_total += api_total;
            delta.api_input_tokens_total += entry.api_input_tokens.unwrap_or(0);
            delta.api_output_tokens_total += entry.api_output_tokens.unwrap_or(0);
            delta.api_cache_read_tokens_total += entry.api_cache_read_tokens.unwrap_or(0);
            delta.api_cache_creation_tokens_total += entry.api_cache_creation_tokens.unwrap_or(0);
            delta.api_thinking_tokens_total += entry.api_thinking_tokens.unwrap_or(0);
        }
        delta.content_tokens_est_total += content_est;
    }

    /// `true` while no message has been recorded.
    pub fn is_empty(&self) -> bool {
        self.hourly.is_empty() && self.daily.is_empty() && self.models_daily.is_empty()
    }

    /// Number of distinct hourly buckets.
    pub fn hourly_entry_count(&self) -> usize {
        self.hourly.len()
    }

    /// Number of distinct daily buckets.
    pub fn daily_entry_count(&self) -> usize {
        self.daily.len()
    }

    /// Number of distinct per-model daily buckets.
    pub fn models_daily_entry_count(&self) -> usize {
        self.models_daily.len()
    }
}
14174
14175/// Whether the current role should be considered for plan attribution.
14176///
14177/// Plan attribution v2 defaults to assistant/agent messages only.
14178fn has_plan_for_role(role: &str, content: &str) -> bool {
14179    let role = role.trim();
14180    (role.eq_ignore_ascii_case("assistant") || role.eq_ignore_ascii_case("agent"))
14181        && has_plan_heuristic(content)
14182}
14183
/// Heuristic "does this message contain a plan?" check.
///
/// v2 behavior:
/// - The message must be at least 24 bytes long.
/// - A plan marker must appear near the top: either a header line
///   (`## plan`, `# plan`, `plan:`, `implementation plan`, `next steps:`,
///   `action plan:`) among the first 7 considered lines, or the text `plan:`
///   anywhere in the first 8 considered lines.
/// - Messages that look like tool output are accepted only with a real
///   header line, not with a mere inline `plan:`.
/// - There must be structured steps: 2+ numbered lines, or 1 numbered line
///   plus 1 bullet, or 3+ bullets.
///
/// Matching is case-insensitive. "Considered lines" are the first 60
/// non-empty lines outside fenced code blocks.
fn has_plan_heuristic(content: &str) -> bool {
    const MIN_CONTENT_BYTES: usize = 24;
    const MAX_SCANNED_LINES: usize = 60;
    const TOOL_BLOB_MARKERS: [&str; 5] = ["```", "\"tool\"", "stdout:", "stderr:", "exit code:"];
    const PLAN_HEADERS: [&str; 6] = [
        "## plan",
        "# plan",
        "plan:",
        "implementation plan",
        "next steps:",
        "action plan:",
    ];
    // "- [ ] " and "- [x] " checkboxes also start with "- ", so they are covered.
    const BULLET_PREFIXES: [&str; 3] = ["- ", "* ", "+ "];

    if content.len() < MIN_CONTENT_BYTES {
        return false;
    }

    let lowered = content.to_lowercase();

    // Tool-output-like blobs need a strong plan header to qualify (checked below).
    let looks_like_tool_blob = TOOL_BLOB_MARKERS
        .iter()
        .any(|marker| lowered.contains(*marker));

    // Keep the first non-empty lines that sit outside fenced code blocks.
    let mut inside_fence = false;
    let scanned: Vec<&str> = lowered
        .lines()
        .map(str::trim)
        .filter(|line| {
            if line.starts_with("```") {
                inside_fence = !inside_fence;
                return false;
            }
            !(inside_fence || line.is_empty())
        })
        .take(MAX_SCANNED_LINES)
        .collect();

    let header_idx = scanned.iter().position(|line| {
        PLAN_HEADERS
            .iter()
            .any(|header| line.starts_with(*header))
    });
    let inline_marker_near_top = scanned.iter().take(8).any(|line| line.contains("plan:"));
    let header_near_top = header_idx.is_some_and(|idx| idx <= 6) || inline_marker_near_top;

    if !header_near_top || (looks_like_tool_blob && header_idx.is_none()) {
        return false;
    }

    let mut numbered_steps = 0usize;
    let mut bullet_steps = 0usize;
    for line in &scanned {
        if is_numbered_step_line(line) {
            numbered_steps += 1;
        }
        if BULLET_PREFIXES
            .iter()
            .any(|prefix| line.starts_with(*prefix))
        {
            bullet_steps += 1;
        }
    }

    numbered_steps >= 2 || (numbered_steps >= 1 && bullet_steps >= 1) || bullet_steps >= 3
}

/// `true` for lines shaped like `1. step` or `12) step`: optional leading
/// whitespace, one to three ASCII digits, then `". "` or `") "`.
fn is_numbered_step_line(line: &str) -> bool {
    let candidate = line.trim_start();
    let after_digits = candidate.trim_start_matches(|c: char| c.is_ascii_digit());
    // ASCII digits are one byte each, so the byte difference is the digit count.
    let digit_len = candidate.len() - after_digits.len();
    (1..=3).contains(&digit_len)
        && (after_digits.starts_with(". ") || after_digits.starts_with(") "))
}
14266
/// Pending token_usage row to be batch-inserted.
///
/// Plain data carrier: one value per column, with `Option` for values that
/// may be absent. NOTE(review): the exact column mapping lives in the insert
/// code, which is not part of this section — confirm there.
#[derive(Debug, Clone)]
pub struct TokenUsageEntry {
    pub message_id: i64,
    pub conversation_id: i64,
    pub agent_id: i64,
    pub workspace_id: Option<i64>,
    pub source_id: String,
    pub timestamp_ms: i64,
    // Presumably days since 2020-01-01 like the other `day_id` values in this file — confirm.
    pub day_id: i64,
    pub model_name: Option<String>,
    pub model_family: Option<String>,
    pub model_tier: Option<String>,
    pub service_tier: Option<String>,
    pub provider: Option<String>,
    pub input_tokens: Option<i64>,
    pub output_tokens: Option<i64>,
    pub cache_read_tokens: Option<i64>,
    pub cache_creation_tokens: Option<i64>,
    pub thinking_tokens: Option<i64>,
    pub total_tokens: Option<i64>,
    pub estimated_cost_usd: Option<f64>,
    pub role: String,
    pub content_chars: i64,
    pub has_tool_calls: bool,
    pub tool_call_count: u32,
    pub data_source: String,
}
14295
14296// -------------------------------------------------------------------------
14297// PricingTable — In-memory cache for model_pricing lookups (bead z9fse.10)
14298// -------------------------------------------------------------------------
14299
/// One pricing row loaded from the `model_pricing` table.
#[derive(Debug, Clone)]
pub struct PricingEntry {
    /// Pattern the model name is matched against (SQL `LIKE` semantics via `sql_like_match`).
    pub model_pattern: String,
    /// Value of the `provider` column.
    pub provider: String,
    /// Input-token price; "mtok" presumably means per million tokens — confirm.
    pub input_cost_per_mtok: f64,
    /// Output-token price, same unit as `input_cost_per_mtok`.
    pub output_cost_per_mtok: f64,
    /// Cache-read price, if the row defines one.
    pub cache_read_cost_per_mtok: Option<f64>,
    /// Cache-creation price, if the row defines one.
    pub cache_creation_cost_per_mtok: Option<f64>,
    /// Effective date as day_id (days since 2020-01-01).
    /// Converted from the `effective_date` column with `date_str_to_day_id`.
    pub effective_day_id: i64,
}
14312
/// Diagnostics for pricing coverage during a batch operation.
#[derive(Debug, Clone, Default)]
pub struct PricingDiagnostics {
    /// Number of `record_priced` calls.
    pub priced_count: u64,
    /// Number of `record_unpriced` calls.
    pub unpriced_count: u64,
    /// Unpriced model name → occurrence count; a missing name is stored as `"(none)"`.
    /// Every unpriced name is kept here; `log_summary` reports only the five most frequent.
    pub unknown_models: HashMap<String, u64>,
}
14321
14322impl PricingDiagnostics {
14323    fn record_priced(&mut self) {
14324        self.priced_count += 1;
14325    }
14326
14327    fn record_unpriced(&mut self, model_name: Option<&str>) {
14328        self.unpriced_count += 1;
14329        let key = model_name.unwrap_or("(none)").to_string();
14330        *self.unknown_models.entry(key).or_insert(0) += 1;
14331    }
14332
14333    /// Log a summary of pricing coverage.
14334    pub fn log_summary(&self) {
14335        let total = self.priced_count + self.unpriced_count;
14336        if total == 0 {
14337            return;
14338        }
14339        let pct = (self.priced_count as f64 / total as f64) * 100.0;
14340        tracing::info!(
14341            target: "cass::analytics::pricing",
14342            priced = self.priced_count,
14343            unpriced = self.unpriced_count,
14344            total = total,
14345            coverage_pct = format!("{pct:.1}%"),
14346            "pricing coverage"
14347        );
14348        if !self.unknown_models.is_empty() {
14349            let mut sorted: Vec<_> = self.unknown_models.iter().collect();
14350            sorted.sort_by(|a, b| b.1.cmp(a.1));
14351            for (model, count) in sorted.iter().take(5) {
14352                tracing::debug!(
14353                    target: "cass::analytics::pricing",
14354                    model = model.as_str(),
14355                    count = count,
14356                    "unknown model (no pricing)"
14357                );
14358            }
14359        }
14360    }
14361}
14362
/// In-memory pricing table loaded from `model_pricing` for fast lookups.
#[derive(Debug, Clone)]
pub struct PricingTable {
    /// Pricing rows in the order the load query returns them
    /// (`ORDER BY effective_date DESC`).
    entries: Vec<PricingEntry>,
}
14368
14369impl PricingTable {
    /// Load all pricing entries from the database.
    ///
    /// Thin alias for `franken_load`: same argument, same result, same errors.
    pub fn load(conn: &FrankenConnection) -> Result<Self> {
        Self::franken_load(conn)
    }
14374
14375    /// Load all pricing entries from a frankensqlite connection.
14376    pub fn franken_load(conn: &FrankenConnection) -> Result<Self> {
14377        let rows = conn.query(
14378            "SELECT model_pattern, provider, input_cost_per_mtok, output_cost_per_mtok,
14379                    cache_read_cost_per_mtok, cache_creation_cost_per_mtok, effective_date
14380             FROM model_pricing
14381             ORDER BY effective_date DESC",
14382        )?;
14383        let mut entries = Vec::with_capacity(rows.len());
14384        for row in &rows {
14385            let effective_date: String = row.get_typed(6)?;
14386            let effective_day_id = date_str_to_day_id(&effective_date)?;
14387            entries.push(PricingEntry {
14388                model_pattern: row.get_typed(0)?,
14389                provider: row.get_typed(1)?,
14390                input_cost_per_mtok: row.get_typed(2)?,
14391                output_cost_per_mtok: row.get_typed(3)?,
14392                cache_read_cost_per_mtok: row.get_typed(4)?,
14393                cache_creation_cost_per_mtok: row.get_typed(5)?,
14394                effective_day_id,
14395            });
14396        }
14397        Ok(Self { entries })
14398    }
14399
14400    /// Look up the best pricing entry for a given model name and date.
14401    ///
14402    /// Selection rules:
14403    /// 1. Pattern must match model_name (SQL LIKE semantics).
14404    /// 2. effective_day_id must be <= message_day_id.
14405    /// 3. Among matches, prefer the most recent effective_date.
14406    /// 4. Tie-break by pattern specificity (longest pattern wins).
14407    pub fn lookup(&self, model_name: &str, message_day_id: i64) -> Option<&PricingEntry> {
14408        let mut best: Option<&PricingEntry> = None;
14409
14410        for entry in &self.entries {
14411            if entry.effective_day_id > message_day_id {
14412                continue;
14413            }
14414            if !sql_like_match(model_name, &entry.model_pattern) {
14415                continue;
14416            }
14417
14418            match best {
14419                None => best = Some(entry),
14420                Some(current) => {
14421                    if entry.effective_day_id > current.effective_day_id
14422                        || (entry.effective_day_id == current.effective_day_id
14423                            && entry.model_pattern.len() > current.model_pattern.len())
14424                    {
14425                        best = Some(entry);
14426                    }
14427                }
14428            }
14429        }
14430
14431        best
14432    }
14433
14434    /// Compute estimated cost in USD for a set of token counts.
14435    ///
14436    /// Returns `None` if no pricing entry matches or if no token counts are available.
14437    pub fn compute_cost(
14438        &self,
14439        model_name: Option<&str>,
14440        message_day_id: i64,
14441        input_tokens: Option<i64>,
14442        output_tokens: Option<i64>,
14443        cache_read_tokens: Option<i64>,
14444        cache_creation_tokens: Option<i64>,
14445    ) -> Option<f64> {
14446        let model = model_name?;
14447        let pricing = self.lookup(model, message_day_id)?;
14448
14449        if input_tokens.is_none() && output_tokens.is_none() {
14450            return None;
14451        }
14452
14453        let mut cost = 0.0;
14454        let cache_read = cache_read_tokens.unwrap_or(0);
14455        let cache_creation = cache_creation_tokens.unwrap_or(0);
14456        // input_tokens includes cache tokens as a subset; subtract them
14457        // so we don't charge at both the full input rate AND the cache rate.
14458        let non_cache_input = input_tokens
14459            .unwrap_or(0)
14460            .saturating_sub(cache_read)
14461            .saturating_sub(cache_creation)
14462            .max(0);
14463        cost += non_cache_input as f64 * pricing.input_cost_per_mtok / 1_000_000.0;
14464        cost += output_tokens.unwrap_or(0) as f64 * pricing.output_cost_per_mtok / 1_000_000.0;
14465
14466        if let Some(cache_price) = pricing.cache_read_cost_per_mtok {
14467            cost += cache_read as f64 * cache_price / 1_000_000.0;
14468        }
14469        if let Some(cache_price) = pricing.cache_creation_cost_per_mtok {
14470            cost += cache_creation as f64 * cache_price / 1_000_000.0;
14471        }
14472
14473        Some(cost)
14474    }
14475
14476    /// Whether the pricing table has any entries.
14477    pub fn is_empty(&self) -> bool {
14478        self.entries.is_empty()
14479    }
14480}
14481
14482/// Convert "YYYY-MM-DD" date string to day_id (days since 2020-01-01),
14483/// matching the format produced by `day_id_from_millis`.
14484fn date_str_to_day_id(s: &str) -> Result<i64> {
14485    use chrono::NaiveDate;
14486    const EPOCH_2020: NaiveDate = match NaiveDate::from_ymd_opt(2020, 1, 1) {
14487        Some(d) => d,
14488        None => unreachable!(),
14489    };
14490    NaiveDate::parse_from_str(s, "%Y-%m-%d")
14491        .map(|d| (d - EPOCH_2020).num_days())
14492        .with_context(|| format!("invalid effective_date '{s}'"))
14493}
14494
14495/// SQL LIKE pattern matcher (case-insensitive). `%` = any sequence, `_` = any single char.
14496fn sql_like_match(value: &str, pattern: &str) -> bool {
14497    sql_like_match_bytes(
14498        value.to_ascii_lowercase().as_bytes(),
14499        pattern.to_ascii_lowercase().as_bytes(),
14500    )
14501}
14502
/// Determine the byte length of the UTF-8 character starting at `b`.
///
/// `b` is expected to be the first byte of an encoded character. The length
/// is derived from the byte value alone, so a continuation byte
/// (`0x80..=0xBF`) is reported as starting a 2-byte character.
fn utf8_char_len(b: u8) -> usize {
    match b {
        0x00..=0x7F => 1,
        0x80..=0xDF => 2,
        0xE0..=0xEF => 3,
        0xF0..=0xFF => 4,
    }
}

/// Case-sensitive SQL LIKE match of `val` against `pat`, both given as bytes.
///
/// * `%` matches any (possibly empty) sequence of whole characters.
/// * `_` matches exactly one UTF-8 character (not one byte).
/// * Every other pattern byte must equal the corresponding value byte.
///
/// There is no escape syntax. Both slices are expected to hold valid UTF-8
/// (the visible caller passes the bytes of lowercased `&str` values).
///
/// The match is iterative and remembers only the most recent `%`. On a
/// mismatch that `%` absorbs one more character and matching resumes right
/// after it. Earlier `%` never need revisiting, because matching each
/// segment as far left as possible leaves the most text for later segments.
/// Running time is bounded by `val.len() * pat.len()`, with no recursion, so
/// patterns with many `%` cannot trigger exponential backtracking.
fn sql_like_match_bytes(val: &[u8], pat: &[u8]) -> bool {
    let mut v = 0; // next unmatched byte of `val`
    let mut p = 0; // next unmatched byte of `pat`
    // For the most recent `%` run: (pattern index just past the run,
    // value index up to which the run currently absorbs input).
    let mut resume: Option<(usize, usize)> = None;

    loop {
        match pat.get(p).copied() {
            // Pattern and value are both exhausted: full match.
            None if v == val.len() => return true,
            // Pattern exhausted but value is not: fall through to backtrack.
            None => {}
            Some(b'%') => {
                // Consecutive `%` behave like a single one.
                while pat.get(p) == Some(&b'%') {
                    p += 1;
                }
                resume = Some((p, v));
                continue;
            }
            Some(b'_') => {
                // Consume one whole character, provided it is fully present.
                if let Some(&lead) = val.get(v) {
                    let char_len = utf8_char_len(lead);
                    if val.len() - v >= char_len {
                        v += char_len;
                        p += 1;
                        continue;
                    }
                }
            }
            Some(literal) => {
                if val.get(v) == Some(&literal) {
                    v += 1;
                    p += 1;
                    continue;
                }
            }
        }

        // Mismatch: let the latest `%` swallow one more character and retry
        // the remainder of the pattern from there.
        match resume {
            Some((after_percent, absorbed_to)) if absorbed_to < val.len() => {
                let next = absorbed_to + utf8_char_len(val[absorbed_to]);
                if next > val.len() {
                    // Truncated trailing character: no further boundary to try.
                    return false;
                }
                resume = Some((after_percent, next));
                p = after_percent;
                v = next;
            }
            _ => return false,
        }
    }
}
14552
14553fn rebuild_batch_size_env(var: &str, default: usize) -> usize {
14554    dotenvy::var(var)
14555        .ok()
14556        .and_then(|raw| raw.parse::<usize>().ok())
14557        .filter(|value| *value > 0)
14558        .unwrap_or(default)
14559}
14560
/// Whether the rendered message of `err` contains the phrase
/// "out of memory", compared ASCII case-insensitively.
fn is_out_of_memory_error(err: &impl std::fmt::Display) -> bool {
    let mut rendered = err.to_string();
    rendered.make_ascii_lowercase();
    rendered.contains("out of memory")
}
14566
14567// Second SqliteStorage impl block removed: SqliteStorage is now a type alias for FrankenStorage.
14568// All methods (insert_conversation_tree, list_agents, list_conversations, etc.) are
14569// available through FrankenStorage.
14570
/// Daily count data for histogram display.
///
/// NOTE(review): the field notes below are read off the field names; the
/// code that fills this struct is not visible here — confirm against it.
#[derive(Debug, Clone)]
pub struct DailyCount {
    /// Identifier of the day this row describes.
    pub day_id: i64,
    /// Session count for the day (presumably conversations — confirm).
    pub sessions: i64,
    /// Message count for the day.
    pub messages: i64,
    /// Character count for the day (presumably summed content length — confirm).
    pub chars: i64,
}
14579
/// Result of an analytics rebuild operation.
///
/// NOTE(review): the field notes below are read off the field names; the
/// rebuild code that fills this struct is not visible here — confirm.
#[derive(Debug, Clone)]
pub struct AnalyticsRebuildResult {
    /// Row count reported for `message_metrics`.
    pub message_metrics_rows: usize,
    /// Row count reported for the hourly usage rollup.
    pub usage_hourly_rows: usize,
    /// Row count reported for the daily usage rollup.
    pub usage_daily_rows: usize,
    /// Row count reported for the per-model daily usage rollup.
    pub usage_models_daily_rows: usize,
    /// Elapsed time of the rebuild, in milliseconds.
    pub elapsed_ms: u64,
    /// Throughput of the rebuild, in messages per second.
    pub messages_per_sec: f64,
}
14590
/// Result of rebuilding daily stats.
///
/// NOTE(review): the field notes below are read off the field names; the
/// rebuild code that fills this struct is not visible here — confirm.
#[derive(Debug, Clone)]
pub struct DailyStatsRebuildResult {
    /// Number of daily-stats rows created by the rebuild.
    pub rows_created: i64,
    /// Total session count covered by the rebuilt rows.
    pub total_sessions: i64,
}
14597
/// Result of purging archived data for a single agent.
///
/// Derives `Default`, so a freshly constructed value reports zero deletions.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub struct AgentArchivePurgeResult {
    /// Number of conversations deleted by the purge.
    pub conversations_deleted: usize,
    /// Number of messages deleted by the purge.
    pub messages_deleted: usize,
}
14604
/// Health status of daily stats table.
///
/// NOTE(review): the field notes below are read off the field names; the
/// health check that fills this struct is not visible here — confirm.
#[derive(Debug, Clone)]
pub struct DailyStatsHealth {
    /// Whether the daily stats table holds any data.
    pub populated: bool,
    /// Number of rows in the daily stats table.
    pub row_count: i64,
    /// Oldest update timestamp found, in milliseconds; `None` when absent.
    pub oldest_update_ms: Option<i64>,
    /// Number of conversations in the source data.
    pub conversation_count: i64,
    /// Total recorded in the materialized daily stats.
    pub materialized_total: i64,
    /// Discrepancy between the two totals above (presumably their
    /// difference — sign convention to be confirmed).
    pub drift: i64,
}
14615
14616// -------------------------------------------------------------------------
14617// FTS5 Batch Insert (P2 Opt 2.1)
14618// -------------------------------------------------------------------------
14619
/// Batch size for FTS5 inserts. With 7 columns per row (rowid + 6 cols) and
/// SQLite's SQLITE_MAX_VARIABLE_NUMBER default of 999, max batch is ~142 rows
/// (999 / 7, rounded down).
/// Using 100 for safety margin and memory efficiency.
///
/// NOTE(review): 999 is stock SQLite's default before 3.32.0 (later releases
/// default to 32766); the bind-parameter limit enforced by frankensqlite is
/// not visible here — confirm before raising this value.
const FTS5_BATCH_SIZE: usize = 100;
14624
/// One message row as carried through the FTS rebuild.
///
/// NOTE(review): the code that reads and consumes these rows is not visible
/// here; the field notes are read off the names — confirm, in particular how
/// `rowid` and `message_id` differ.
#[derive(Debug, Clone)]
struct FtsRebuildMessageRow {
    /// Row id associated with the message (presumably the FTS rowid — confirm).
    rowid: i64,
    /// Id of the message.
    message_id: i64,
    /// Id of the conversation the message belongs to.
    conversation_id: i64,
    /// Message text.
    content: String,
    /// Message timestamp, when one is recorded.
    created_at: Option<i64>,
}
14633
/// Conversation-level columns carried alongside messages for FTS indexing.
///
/// NOTE(review): the code that builds this projection is not visible here;
/// the field notes are read off the names — confirm.
#[derive(Debug, Clone)]
struct FtsConversationProjection {
    /// Conversation title.
    title: String,
    /// Id of the owning agent, when known.
    agent_id: Option<i64>,
    /// Id of the workspace, when the conversation has one.
    workspace_id: Option<i64>,
    /// Source path of the conversation, as a string.
    source_path: String,
}
14641
/// Entry for pending FTS5 insert.
///
/// The field notes describe how [`FtsEntry::from_message`] fills each field.
#[derive(Debug, Clone)]
pub struct FtsEntry {
    /// Message text (`Message::content`).
    pub content: String,
    /// Conversation title; empty when the conversation has none.
    pub title: String,
    /// Agent slug of the conversation.
    pub agent: String,
    /// Workspace path rendered lossily as a string; empty when there is none.
    pub workspace: String,
    /// Conversation source path rendered lossily as a string.
    pub source_path: String,
    /// Message timestamp, falling back to the conversation start time.
    pub created_at: Option<i64>,
    /// Id of the message this entry indexes.
    pub message_id: i64,
}
14653
14654impl FtsEntry {
14655    /// Create an FTS entry from a message and conversation.
14656    pub fn from_message(message_id: i64, msg: &Message, conv: &Conversation) -> Self {
14657        FtsEntry {
14658            content: msg.content.clone(),
14659            title: conv.title.clone().unwrap_or_default(),
14660            agent: conv.agent_slug.clone(),
14661            workspace: conv
14662                .workspace
14663                .as_ref()
14664                .map(|p| p.to_string_lossy().into_owned())
14665                .unwrap_or_default(),
14666            source_path: path_to_string(&conv.source_path),
14667            created_at: msg.created_at.or(conv.started_at),
14668            message_id,
14669        }
14670    }
14671}
14672
// NOTE(review): the code that uses these two limits is not visible in this
// part of the file; presumably pending `FtsEntry` batches are flushed once
// either threshold is reached — confirm at the call sites.

/// Upper bound on the number of entries in one pending FTS batch.
const FTS_ENTRY_BATCH_MAX_DOCS: usize = 512;
/// Upper bound on the accumulated size of one pending FTS batch (1024 * 1024).
const FTS_ENTRY_BATCH_MAX_CHARS: usize = 1024 * 1024;
14675
14676/// Default batch size for the FTS rebuild INSERT (Bug #168).  When
14677/// `fts_messages` is empty but `messages` has 100K+ rows, a single unbounded
14678/// INSERT-SELECT OOMs.  This constant caps each batch so peak memory stays
14679/// bounded.  Override via `CASS_FTS_REBUILD_BATCH_SIZE` for tuning.
14680const FTS_REBUILD_BATCH_SIZE_DEFAULT: usize = 5_000;
14681
14682/// Read the FTS rebuild batch size from the environment, falling back to the
14683/// compiled-in default.
14684fn fts_rebuild_batch_size() -> usize {
14685    dotenvy::var("CASS_FTS_REBUILD_BATCH_SIZE")
14686        .ok()
14687        .and_then(|v| v.parse::<usize>().ok())
14688        .filter(|&n| n > 0)
14689        .unwrap_or(FTS_REBUILD_BATCH_SIZE_DEFAULT)
14690}
14691
14692fn flush_pending_fts_entries(
14693    storage: &FrankenStorage,
14694    tx: &FrankenTransaction<'_>,
14695    entries: &mut Vec<FtsEntry>,
14696    pending_chars: &mut usize,
14697    inserted_total: &mut usize,
14698) -> Result<()> {
14699    if entries.is_empty() {
14700        return Ok(());
14701    }
14702
14703    if storage.fts_messages_present_cached(tx) {
14704        *inserted_total += franken_batch_insert_fts(tx, entries)?;
14705    }
14706    entries.clear();
14707    *pending_chars = 0;
14708    Ok(())
14709}
14710
/// Render a path as an owned `String`, replacing any non-UTF-8 sequences
/// lossily.
fn path_to_string<P: AsRef<Path>>(p: P) -> String {
    let path: &Path = p.as_ref();
    String::from(path.to_string_lossy())
}
14714
14715fn role_str(role: &MessageRole) -> String {
14716    role_as_str(role).to_owned()
14717}
14718
14719fn role_as_str(role: &MessageRole) -> &str {
14720    match role {
14721        MessageRole::User => "user",
14722        MessageRole::Agent => "agent",
14723        MessageRole::Tool => "tool",
14724        MessageRole::System => "system",
14725        MessageRole::Other(v) => v.as_str(),
14726    }
14727}
14728
14729fn agent_kind_str(kind: AgentKind) -> String {
14730    match kind {
14731        AgentKind::Cli => "cli".into(),
14732        AgentKind::VsCode => "vscode".into(),
14733        AgentKind::Hybrid => "hybrid".into(),
14734    }
14735}
14736
14737// =============================================================================
14738// Tests (bead yln.4)
14739// =============================================================================
14740
14741#[cfg(test)]
14742mod tests {
14743    use super::*;
14744    use serial_test::serial;
14745    use tempfile::TempDir;
14746
    /// Test helper that puts an environment variable back to its earlier
    /// state when dropped.
    struct EnvGuard {
        /// Name of the environment variable to restore on drop.
        key: &'static str,
        /// Value to restore on drop; `None` means the variable is removed.
        previous: Option<String>,
    }
14751
14752    impl Drop for EnvGuard {
14753        fn drop(&mut self) {
14754            if let Some(value) = &self.previous {
14755                // SAFETY: test helper restores prior process env for isolation.
14756                unsafe {
14757                    std::env::set_var(self.key, value);
14758                }
14759            } else {
14760                // SAFETY: test helper restores prior process env for isolation.
14761                unsafe {
14762                    std::env::remove_var(self.key);
14763                }
14764            }
14765        }
14766    }
14767
14768    fn set_env_var(key: &'static str, value: impl AsRef<str>) -> EnvGuard {
14769        let previous = dotenvy::var(key).ok();
14770        // SAFETY: test helper toggles a process-local env var for isolation.
14771        unsafe {
14772            std::env::set_var(key, value.as_ref());
14773        }
14774        EnvGuard { key, previous }
14775    }
14776
14777    #[test]
14778    fn doctor_mutation_open_guard_only_targets_canonical_archive_db() {
14779        let dir = TempDir::new().unwrap();
14780        let canonical = dir.path().join("agent_search.db");
14781        let scratch = dir.path().join("scratch.db");
14782
14783        assert_eq!(
14784            doctor_mutation_lock_path_for_db_open(&canonical),
14785            Some(dir.path().join("doctor/locks/doctor-repair.lock"))
14786        );
14787        assert_eq!(doctor_mutation_lock_path_for_db_open(&scratch), None);
14788    }
14789
14790    #[test]
14791    fn doctor_lock_metadata_pid_detection_is_exact() {
14792        let current = std::process::id();
14793
14794        assert!(doctor_lock_metadata_pid_is_current_process(&format!(
14795            "schema_version=1\npid={current}\nmode=safe_auto_run\n"
14796        )));
14797        assert!(!doctor_lock_metadata_pid_is_current_process(
14798            "schema_version=1\npid=not-a-pid\n"
14799        ));
14800        assert!(!doctor_lock_metadata_pid_is_current_process(&format!(
14801            "pid={}\n",
14802            current.saturating_add(1)
14803        )));
14804    }
14805
    /// A read-only open of the canonical DB must fail while the doctor
    /// mutation lock is held and its metadata names a different process
    /// (`pid=1`).
    #[test]
    fn doctor_storage_open_refuses_active_doctor_mutation_lock_from_other_process() {
        use std::io::Write as _;

        let dir = TempDir::new().unwrap();
        let db_path = dir.path().join("agent_search.db");
        // Open and close the storage once before the lock is taken.
        {
            let storage = FrankenStorage::open(&db_path).unwrap();
            storage.close().unwrap();
        }

        let lock_path = doctor_mutation_lock_path_for_db_open(&db_path).unwrap();
        // `truncate(false)`: the file is emptied explicitly below, only after
        // the exclusive lock has been acquired.
        let mut lock_file = fs::OpenOptions::new()
            .create(true)
            .truncate(false)
            .read(true)
            .write(true)
            .open(&lock_path)
            .unwrap();
        fs2::FileExt::try_lock_exclusive(&lock_file).unwrap();
        lock_file.set_len(0).unwrap();
        // Metadata names pid 1, i.e. not this test process.
        lock_file.write_all(b"schema_version=1\npid=1\n").unwrap();
        lock_file.sync_all().unwrap();

        // Short timeout so the refused open fails quickly.
        let err =
            open_franken_raw_readonly_connection_with_timeout(&db_path, Duration::from_millis(25))
                .expect_err("active doctor mutation lock must block canonical DB opens");
        let message = err.to_string();
        assert!(
            message.contains("doctor mutation lock") && message.contains("active"),
            "error should identify the active doctor mutation lock: {message}"
        );

        fs2::FileExt::unlock(&lock_file).unwrap();
    }
14841
    /// A read-only open of the canonical DB must succeed while the doctor
    /// mutation lock is held, provided its metadata names this very process.
    #[test]
    fn doctor_storage_open_allows_current_doctor_process_probe() {
        use std::io::Write as _;

        let dir = TempDir::new().unwrap();
        let db_path = dir.path().join("agent_search.db");
        // Open and close the storage once before the lock is taken.
        {
            let storage = FrankenStorage::open(&db_path).unwrap();
            storage.close().unwrap();
        }

        let lock_path = doctor_mutation_lock_path_for_db_open(&db_path).unwrap();
        // `truncate(false)`: the file is emptied explicitly below, only after
        // the exclusive lock has been acquired.
        let mut lock_file = fs::OpenOptions::new()
            .create(true)
            .truncate(false)
            .read(true)
            .write(true)
            .open(&lock_path)
            .unwrap();
        fs2::FileExt::try_lock_exclusive(&lock_file).unwrap();
        lock_file.set_len(0).unwrap();
        // Metadata names the current process id.
        write!(lock_file, "schema_version=1\npid={}\n", std::process::id()).unwrap();
        lock_file.sync_all().unwrap();

        let conn =
            open_franken_raw_readonly_connection_with_timeout(&db_path, Duration::from_millis(25))
                .expect(
                    "doctor process must be able to run post-repair read probes under its own lock",
                );
        drop(conn);

        fs2::FileExt::unlock(&lock_file).unwrap();
    }
14875
14876    #[test]
14877    fn autocommit_retain_disable_tries_compat_then_canonical_pragma() {
14878        let mut attempts = Vec::new();
14879
14880        let selected = disable_autocommit_retain(|pragma| {
14881            attempts.push(pragma);
14882            if pragma == "PRAGMA fsqlite.autocommit_retain = OFF;" {
14883                Err("compat namespace unavailable")
14884            } else {
14885                Ok(())
14886            }
14887        })
14888        .expect("canonical pragma should disable autocommit retain");
14889
14890        assert_eq!(selected, "PRAGMA autocommit_retain = OFF;");
14891        assert_eq!(attempts, AUTOCOMMIT_RETAIN_OFF_PRAGMAS);
14892    }
14893
14894    #[test]
14895    fn autocommit_retain_disable_fails_closed_when_no_pragma_works() {
14896        let mut attempts = Vec::new();
14897
14898        let err = disable_autocommit_retain(|pragma| {
14899            attempts.push(pragma);
14900            Err("unsupported pragma")
14901        })
14902        .expect_err("unsupported autocommit retain controls should fail closed");
14903
14904        assert_eq!(attempts, AUTOCOMMIT_RETAIN_OFF_PRAGMAS);
14905        let message = err.to_string();
14906        assert!(
14907            message.contains("refusing to keep a long-lived MVCC connection"),
14908            "error should force callers away from unbounded snapshot retention: {message}"
14909        );
14910        assert!(
14911            message.contains("PRAGMA fsqlite.autocommit_retain = OFF;")
14912                && message.contains("PRAGMA autocommit_retain = OFF;"),
14913            "error should preserve attempted PRAGMAs for diagnostics: {message}"
14914        );
14915    }
14916
14917    /// Open a rusqlite connection on `db_path` for the narrow purpose of
14918    /// injecting (or inspecting the raw projection of) sqlite_master
14919    /// corruption patterns in test fixtures. Frankensqlite intentionally does
14920    /// not support `PRAGMA writable_schema` writes or raw inserts to
14921    /// sqlite_master (see AGENTS.md: "PRAGMA writable_schema: Not supported for
14922    /// write operations"), so these fixtures retain rusqlite as the standard-
14923    /// SQLite interop layer. All callers are in this test module and run under
14924    /// #[cfg(test)]; no production code path touches rusqlite here.
14925    fn rusqlite_test_fixture_conn(db_path: &Path) -> rusqlite::Connection {
14926        rusqlite::Connection::open(db_path).expect("open rusqlite test fixture connection")
14927    }
14928
    /// Seed a database at `db_path` with `conversations` using raw INSERTs.
    ///
    /// Creates the parent directory if needed, applies
    /// `HISTORICAL_RECOVERY_CORE_SCHEMA`, inserts one fixed agent row
    /// (id 1, slug "codex"), then writes each conversation and its messages.
    /// Conversation ids are assigned 1, 2, ... in slice order and message ids
    /// run 1, 2, ... across all conversations. Panics on any failure.
    fn seed_historical_db_direct(
        db_path: &Path,
        conversations: &[crate::model::types::Conversation],
    ) {
        if let Some(parent) = db_path.parent() {
            fs::create_dir_all(parent).unwrap();
        }

        let conn = FrankenConnection::open(db_path.to_string_lossy().into_owned()).unwrap();
        conn.execute_batch(HISTORICAL_RECOVERY_CORE_SCHEMA).unwrap();
        // Single agent row that every seeded conversation references.
        conn.execute_compat(
            "INSERT INTO agents(id, slug, name, version, kind, created_at, updated_at)
             VALUES(?1, ?2, ?3, ?4, ?5, ?6, ?7)",
            fparams![1_i64, "codex", "Codex", "0.2.3", "cli", 0_i64, 0_i64],
        )
        .unwrap();

        // Message ids are global across conversations, not per conversation.
        let mut next_message_id = 1_i64;
        for (conv_index, conv) in conversations.iter().enumerate() {
            let conversation_id = i64::try_from(conv_index + 1).unwrap();
            // A conversation with a workspace gets its own workspace row whose
            // id equals the conversation id; otherwise `workspace_id` is None.
            let workspace_id = conv.workspace.as_ref().map(|workspace| {
                let workspace_id = conversation_id;
                let workspace_path = workspace.to_string_lossy().into_owned();
                conn.execute_compat(
                    "INSERT INTO workspaces(id, path, display_name) VALUES(?1, ?2, ?3)",
                    fparams![
                        workspace_id,
                        workspace_path.as_str(),
                        workspace_path.as_str()
                    ],
                )
                .unwrap();
                workspace_id
            });
            let source_path = conv.source_path.to_string_lossy().into_owned();
            let metadata_json = conv.metadata_json.to_string();
            // Parameters are positional: their order must match ?1..?12.
            conn.execute_compat(
                "INSERT INTO conversations (
                    id, agent_id, workspace_id, source_id, external_id, title, source_path,
                    started_at, ended_at, approx_tokens, metadata_json, origin_host
                 ) VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9, ?10, ?11, ?12)",
                fparams![
                    conversation_id,
                    1_i64,
                    workspace_id,
                    conv.source_id.as_str(),
                    conv.external_id.as_deref(),
                    conv.title.as_deref(),
                    source_path.as_str(),
                    conv.started_at,
                    conv.ended_at,
                    conv.approx_tokens,
                    metadata_json.as_str(),
                    conv.origin_host.as_deref()
                ],
            )
            .unwrap();

            for msg in &conv.messages {
                let extra_json = msg.extra_json.to_string();
                let role = role_str(&msg.role);
                // Parameters are positional: their order must match ?1..?8.
                conn.execute_compat(
                    "INSERT INTO messages(
                        id, conversation_id, idx, role, author, created_at, content, extra_json
                     ) VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8)",
                    fparams![
                        next_message_id,
                        conversation_id,
                        msg.idx,
                        role.as_str(),
                        msg.author.as_deref(),
                        msg.created_at,
                        msg.content.as_str(),
                        extra_json.as_str()
                    ],
                )
                .unwrap();
                next_message_id += 1;
            }
        }
    }
15010
15011    // =========================================================================
15012    // User data file protection tests (bead yln.4)
15013    // =========================================================================
15014
15015    #[test]
15016    fn is_user_data_file_detects_bookmarks() {
15017        assert!(is_user_data_file(Path::new("/data/bookmarks.db")));
15018        assert!(is_user_data_file(Path::new("bookmarks.db")));
15019    }
15020
15021    #[test]
15022    fn is_user_data_file_detects_tui_state() {
15023        assert!(is_user_data_file(Path::new("/data/tui_state.json")));
15024    }
15025
15026    #[test]
15027    fn is_user_data_file_detects_sources_toml() {
15028        assert!(is_user_data_file(Path::new("/config/sources.toml")));
15029    }
15030
15031    #[test]
15032    fn is_user_data_file_detects_env() {
15033        assert!(is_user_data_file(Path::new(".env")));
15034    }
15035
15036    #[test]
15037    fn is_user_data_file_rejects_other_files() {
15038        assert!(!is_user_data_file(Path::new("index.db")));
15039        assert!(!is_user_data_file(Path::new("conversations.db")));
15040        assert!(!is_user_data_file(Path::new("random.txt")));
15041    }
15042
15043    // =========================================================================
15044    // Backup creation tests (bead yln.4)
15045    // =========================================================================
15046
15047    #[test]
15048    fn create_backup_returns_none_for_nonexistent() {
15049        let dir = TempDir::new().unwrap();
15050        let db_path = dir.path().join("nonexistent.db");
15051        let result = create_backup(&db_path).unwrap();
15052        assert!(result.is_none());
15053    }
15054
15055    #[test]
15056    fn create_backup_creates_named_file() {
15057        let dir = TempDir::new().unwrap();
15058        let db_path = dir.path().join("test.db");
15059        std::fs::write(&db_path, b"test data").unwrap();
15060
15061        let backup_path = create_backup(&db_path).unwrap();
15062        assert!(backup_path.is_some());
15063        let backup = backup_path.unwrap();
15064        assert!(backup.exists());
15065        assert!(
15066            backup
15067                .file_name()
15068                .unwrap()
15069                .to_str()
15070                .unwrap()
15071                .contains("backup")
15072        );
15073    }
15074
15075    #[test]
15076    fn create_backup_paths_are_unique() {
15077        let dir = TempDir::new().unwrap();
15078        let db_path = dir.path().join("test.db");
15079        std::fs::write(&db_path, b"test data").unwrap();
15080
15081        let first = create_backup(&db_path).unwrap().unwrap();
15082        let second = create_backup(&db_path).unwrap().unwrap();
15083
15084        assert_ne!(first, second);
15085        assert!(first.exists());
15086        assert!(second.exists());
15087    }
15088
    /// Checks the query plan of the per-conversation message fetch.
    ///
    /// Inserts one conversation with two messages, runs `EXPLAIN` on
    /// `SELECT ... FROM messages WHERE conversation_id = ?1 ORDER BY idx`, and
    /// asserts that the plan contains a `SeekGE` opcode and no `SorterOpen`
    /// opcode.
    #[test]
    fn lexical_rebuild_messages_query_uses_conversation_idx_access_path() {
        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
        use std::path::PathBuf;

        let dir = TempDir::new().unwrap();
        let db_path = dir.path().join("agent_search.db");
        let storage = SqliteStorage::open(&db_path).unwrap();

        let agent = Agent {
            id: None,
            slug: "claude_code".into(),
            name: "Claude Code".into(),
            version: None,
            kind: AgentKind::Cli,
        };
        let agent_id = storage.ensure_agent(&agent).unwrap();
        let conversation = Conversation {
            id: None,
            agent_slug: "claude_code".into(),
            workspace: Some(PathBuf::from("/tmp/workspace")),
            external_id: Some("conv-1".into()),
            title: Some("Lexical rebuild".into()),
            source_path: PathBuf::from("/tmp/conv-1.jsonl"),
            started_at: Some(1_700_000_000_000),
            ended_at: Some(1_700_000_000_100),
            approx_tokens: None,
            metadata_json: serde_json::Value::Null,
            messages: vec![
                Message {
                    id: None,
                    idx: 0,
                    role: MessageRole::User,
                    author: Some("user".into()),
                    created_at: Some(1_700_000_000_010),
                    content: "first".into(),
                    extra_json: serde_json::Value::Null,
                    snippets: Vec::new(),
                },
                Message {
                    id: None,
                    idx: 1,
                    role: MessageRole::Agent,
                    author: Some("assistant".into()),
                    created_at: Some(1_700_000_000_020),
                    content: "second".into(),
                    extra_json: serde_json::Value::Null,
                    snippets: Vec::new(),
                },
            ],
            source_id: LOCAL_SOURCE_ID.into(),
            origin_host: None,
        };
        storage
            .insert_conversation_tree(agent_id, None, &conversation)
            .unwrap();
        // Look the row id up again through its external id.
        let conversation_id = storage
            .conn
            .query_row_map(
                "SELECT id FROM conversations WHERE external_id = ?1",
                fparams!["conv-1"],
                |row| row.get_typed::<i64>(0),
            )
            .unwrap();

        // Column 1 of each EXPLAIN row is collected as the opcode name.
        let opcodes: Vec<String> = storage
            .conn
            .query_map_collect(
                "EXPLAIN \
                 SELECT id, idx, role, author, created_at, content \
                 FROM messages \
                 WHERE conversation_id = ?1 ORDER BY idx",
                fparams![conversation_id],
                |row| row.get_typed(1),
            )
            .unwrap();

        assert!(
            opcodes.iter().any(|opcode| opcode == "SeekGE"),
            "expected lexical rebuild message fetch to seek into the conversation_id/idx access path, got {opcodes:?}"
        );
        assert!(
            !opcodes.iter().any(|opcode| opcode == "SorterOpen"),
            "expected lexical rebuild message fetch to avoid sorter temp b-trees, got {opcodes:?}"
        );
    }
15175
15176    #[test]
15177    fn schema_check_rebuild_classification_ignores_transient_errors() {
15178        assert!(!schema_check_error_requires_rebuild(
15179            &frankensqlite::FrankenError::Busy
15180        ));
15181        assert!(!schema_check_error_requires_rebuild(
15182            &frankensqlite::FrankenError::DatabaseLocked {
15183                path: PathBuf::from("/tmp/test.db"),
15184            }
15185        ));
15186        assert!(!schema_check_error_requires_rebuild(
15187            &frankensqlite::FrankenError::CannotOpen {
15188                path: PathBuf::from("/tmp/test.db"),
15189            }
15190        ));
15191        assert!(!schema_check_error_requires_rebuild(
15192            &frankensqlite::FrankenError::Io(std::io::Error::other("disk hiccup"))
15193        ));
15194    }
15195
15196    #[test]
15197    fn schema_check_rebuild_classification_keeps_corruption_errors() {
15198        assert!(schema_check_error_requires_rebuild(
15199            &frankensqlite::FrankenError::DatabaseCorrupt {
15200                detail: "bad header".to_string(),
15201            }
15202        ));
15203        assert!(schema_check_error_requires_rebuild(
15204            &frankensqlite::FrankenError::WalCorrupt {
15205                detail: "bad wal".to_string(),
15206            }
15207        ));
15208        assert!(schema_check_error_requires_rebuild(
15209            &frankensqlite::FrankenError::NotADatabase {
15210                path: PathBuf::from("/tmp/test.db"),
15211            }
15212        ));
15213        assert!(schema_check_error_requires_rebuild(
15214            &frankensqlite::FrankenError::ShortRead {
15215                expected: 4096,
15216                actual: 64,
15217            }
15218        ));
15219    }
15220
15221    #[test]
15222    fn create_backup_refuses_raw_copy_after_retryable_vacuum_errors() {
15223        let retryable_errors = [
15224            frankensqlite::FrankenError::Busy,
15225            frankensqlite::FrankenError::BusyRecovery,
15226            frankensqlite::FrankenError::BusySnapshot {
15227                conflicting_pages: "1,2".to_string(),
15228            },
15229            frankensqlite::FrankenError::DatabaseLocked {
15230                path: PathBuf::from("/tmp/test.db"),
15231            },
15232            frankensqlite::FrankenError::LockFailed {
15233                detail: "fcntl lock still held".to_string(),
15234            },
15235            frankensqlite::FrankenError::WriteConflict { page: 7, holder: 9 },
15236            frankensqlite::FrankenError::SerializationFailure { page: 11 },
15237            frankensqlite::FrankenError::Internal("database is locked".to_string()),
15238        ];
15239
15240        for err in retryable_errors {
15241            assert!(
15242                backup_vacuum_error_requires_consistent_retry(&err),
15243                "retryable VACUUM failure must not fall back to raw bundle copy: {err}"
15244            );
15245        }
15246
15247        assert!(!backup_vacuum_error_requires_consistent_retry(
15248            &frankensqlite::FrankenError::NotADatabase {
15249                path: PathBuf::from("/tmp/test.db")
15250            }
15251        ));
15252        assert!(!backup_vacuum_error_requires_consistent_retry(
15253            &frankensqlite::FrankenError::DatabaseCorrupt {
15254                detail: "bad header".to_string()
15255            }
15256        ));
15257    }
15258
15259    #[test]
15260    fn create_backup_uses_hidden_vacuum_stage_path() {
15261        let backup_path = PathBuf::from("/tmp/test.db.backup.123.456.0");
15262        let stage_path = vacuum_stage_backup_path(&backup_path);
15263        let stage_name = stage_path
15264            .file_name()
15265            .and_then(|name| name.to_str())
15266            .unwrap_or_default();
15267
15268        assert!(stage_name.starts_with('.'));
15269        assert!(stage_name.ends_with(".vacuum-in-progress"));
15270        assert!(
15271            !is_backup_root_name(stage_name, "test.db.backup."),
15272            "incomplete VACUUM output must not be discoverable as a backup root"
15273        );
15274    }
15275
15276    #[test]
15277    fn create_backup_preserves_content() {
15278        let dir = TempDir::new().unwrap();
15279        let db_path = dir.path().join("test.db");
15280        let original_content = b"test database content 12345";
15281        std::fs::write(&db_path, original_content).unwrap();
15282
15283        let backup_path = create_backup(&db_path).unwrap().unwrap();
15284        let backup_content = std::fs::read(&backup_path).unwrap();
15285        assert_eq!(backup_content, original_content);
15286    }
15287
15288    #[test]
15289    fn create_backup_copies_sidecars_when_present() {
15290        let dir = TempDir::new().unwrap();
15291        let db_path = dir.path().join("test.db");
15292        std::fs::write(&db_path, b"db").unwrap();
15293        std::fs::write(database_sidecar_path(&db_path, "-wal"), b"wal").unwrap();
15294        std::fs::write(database_sidecar_path(&db_path, "-shm"), b"shm").unwrap();
15295
15296        let backup_path = create_backup(&db_path).unwrap().unwrap();
15297
15298        assert_eq!(
15299            std::fs::read(database_sidecar_path(&backup_path, "-wal")).unwrap(),
15300            b"wal"
15301        );
15302        assert_eq!(
15303            std::fs::read(database_sidecar_path(&backup_path, "-shm")).unwrap(),
15304            b"shm"
15305        );
15306    }
15307
15308    #[test]
15309    #[cfg(unix)]
15310    fn create_backup_rejects_symlink_root_during_raw_fallback() {
15311        use std::os::unix::fs::symlink;
15312
15313        let dir = TempDir::new().unwrap();
15314        let outside_db = dir.path().join("outside.db");
15315        let db_path = dir.path().join("test.db");
15316        std::fs::write(&outside_db, b"not sqlite").unwrap();
15317        symlink(&outside_db, &db_path).unwrap();
15318
15319        let err = create_backup(&db_path).unwrap_err();
15320
15321        assert!(
15322            err.to_string().contains("bundle symlink"),
15323            "unexpected error: {err:#}"
15324        );
15325        assert_eq!(std::fs::read(&outside_db).unwrap(), b"not sqlite");
15326        let backup_roots: Vec<_> = std::fs::read_dir(dir.path())
15327            .unwrap()
15328            .filter_map(|entry| entry.ok())
15329            .map(|entry| entry.file_name().to_string_lossy().into_owned())
15330            .filter(|name| name.starts_with("test.db.backup."))
15331            .collect();
15332        assert!(
15333            backup_roots.is_empty(),
15334            "symlinked backup source must not publish backup roots: {backup_roots:?}"
15335        );
15336    }
15337
15338    #[test]
15339    #[cfg(unix)]
15340    fn create_backup_rejects_symlink_sidecar_without_partial_backup() {
15341        use std::os::unix::fs::symlink;
15342
15343        let dir = TempDir::new().unwrap();
15344        let db_path = dir.path().join("test.db");
15345        let outside_wal = dir.path().join("outside.wal");
15346        let wal_path = database_sidecar_path(&db_path, "-wal");
15347        std::fs::write(&db_path, b"not sqlite").unwrap();
15348        std::fs::write(&outside_wal, b"outside wal").unwrap();
15349        symlink(&outside_wal, &wal_path).unwrap();
15350
15351        let err = create_backup(&db_path).unwrap_err();
15352
15353        assert!(
15354            err.to_string().contains("bundle symlink"),
15355            "unexpected error: {err:#}"
15356        );
15357        assert_eq!(std::fs::read(&outside_wal).unwrap(), b"outside wal");
15358        let backup_roots: Vec<_> = std::fs::read_dir(dir.path())
15359            .unwrap()
15360            .filter_map(|entry| entry.ok())
15361            .map(|entry| entry.file_name().to_string_lossy().into_owned())
15362            .filter(|name| name.starts_with("test.db.backup."))
15363            .collect();
15364        assert!(
15365            backup_roots.is_empty(),
15366            "sidecar preflight failure must not leave a partial backup root: {backup_roots:?}"
15367        );
15368    }
15369
15370    // =========================================================================
15371    // Backup cleanup tests (bead yln.4)
15372    // =========================================================================
15373
15374    #[test]
15375    fn cleanup_old_backups_keeps_recent() {
15376        let dir = TempDir::new().unwrap();
15377        let db_path = dir.path().join("test.db");
15378
15379        // Create 5 backup files with different timestamps
15380        for i in 0..5 {
15381            let backup_name = format!("test.db.backup.{}", 1000 + i);
15382            std::fs::write(dir.path().join(&backup_name), format!("backup {i}")).unwrap();
15383        }
15384
15385        cleanup_old_backups(&db_path, 3).unwrap();
15386
15387        // Count remaining backup files
15388        let backups: Vec<_> = std::fs::read_dir(dir.path())
15389            .unwrap()
15390            .filter_map(|e| e.ok())
15391            .filter(|e| e.file_name().to_str().unwrap_or("").contains("backup"))
15392            .collect();
15393
15394        assert_eq!(backups.len(), 3);
15395    }
15396
15397    #[test]
15398    fn cleanup_old_backups_ignores_wal_and_shm_sidecars() {
15399        let dir = TempDir::new().unwrap();
15400        let db_path = dir.path().join("test.db");
15401
15402        for i in 0..3 {
15403            let backup_name = format!("test.db.backup.{}", 1000 + i);
15404            let backup_path = dir.path().join(&backup_name);
15405            std::fs::write(&backup_path, format!("backup {i}")).unwrap();
15406            std::fs::write(format!("{}-wal", backup_path.display()), b"wal").unwrap();
15407            std::fs::write(format!("{}-shm", backup_path.display()), b"shm").unwrap();
15408            std::thread::sleep(std::time::Duration::from_millis(20));
15409        }
15410
15411        cleanup_old_backups(&db_path, 2).unwrap();
15412
15413        let mut roots = Vec::new();
15414        let mut wals = Vec::new();
15415        let mut shms = Vec::new();
15416        for entry in std::fs::read_dir(dir.path())
15417            .unwrap()
15418            .filter_map(|e| e.ok())
15419        {
15420            let name = entry.file_name().to_string_lossy().into_owned();
15421            if name.ends_with("-wal") {
15422                wals.push(name);
15423            } else if name.ends_with("-shm") {
15424                shms.push(name);
15425            } else if name.contains("backup") {
15426                roots.push(name);
15427            }
15428        }
15429
15430        assert_eq!(roots.len(), 2, "should keep two backup roots");
15431        assert_eq!(
15432            wals.len(),
15433            2,
15434            "should keep WAL sidecars only for retained backups"
15435        );
15436        assert_eq!(
15437            shms.len(),
15438            2,
15439            "should keep SHM sidecars only for retained backups"
15440        );
15441    }
15442
15443    #[test]
15444    fn move_database_bundle_moves_database_and_sidecars() {
15445        let dir = TempDir::new().unwrap();
15446        let db_path = dir.path().join("test.db");
15447        let backup_path = dir.path().join("test.db.corrupt");
15448
15449        std::fs::write(&db_path, b"db").unwrap();
15450        std::fs::write(database_sidecar_path(&db_path, "-wal"), b"wal").unwrap();
15451        std::fs::write(database_sidecar_path(&db_path, "-shm"), b"shm").unwrap();
15452
15453        let moved = move_database_bundle(&db_path, &backup_path).unwrap();
15454        assert_eq!(
15455            moved,
15456            DatabaseBundleMoveResult {
15457                database: true,
15458                wal: true,
15459                shm: true
15460            }
15461        );
15462        assert!(moved.moved_any());
15463
15464        assert!(!db_path.exists());
15465        assert!(!database_sidecar_path(&db_path, "-wal").exists());
15466        assert!(!database_sidecar_path(&db_path, "-shm").exists());
15467
15468        assert_eq!(std::fs::read(&backup_path).unwrap(), b"db");
15469        assert_eq!(
15470            std::fs::read(database_sidecar_path(&backup_path, "-wal")).unwrap(),
15471            b"wal"
15472        );
15473        assert_eq!(
15474            std::fs::read(database_sidecar_path(&backup_path, "-shm")).unwrap(),
15475            b"shm"
15476        );
15477    }
15478
15479    #[test]
15480    fn move_database_bundle_preserves_orphan_sidecars_without_main_db() {
15481        let dir = TempDir::new().unwrap();
15482        let db_path = dir.path().join("test.db");
15483        let backup_path = dir.path().join("test.db.corrupt");
15484
15485        std::fs::write(database_sidecar_path(&db_path, "-wal"), b"wal").unwrap();
15486        std::fs::write(database_sidecar_path(&db_path, "-shm"), b"shm").unwrap();
15487
15488        let moved = move_database_bundle(&db_path, &backup_path).unwrap();
15489        assert_eq!(
15490            moved,
15491            DatabaseBundleMoveResult {
15492                database: false,
15493                wal: true,
15494                shm: true
15495            }
15496        );
15497        assert!(moved.moved_any());
15498        assert!(!db_path.exists());
15499        assert!(!database_sidecar_path(&db_path, "-wal").exists());
15500        assert!(!database_sidecar_path(&db_path, "-shm").exists());
15501        assert_eq!(
15502            std::fs::read(database_sidecar_path(&backup_path, "-wal")).unwrap(),
15503            b"wal"
15504        );
15505        assert_eq!(
15506            std::fs::read(database_sidecar_path(&backup_path, "-shm")).unwrap(),
15507            b"shm"
15508        );
15509    }
15510
15511    #[test]
15512    #[cfg(unix)]
15513    fn move_database_bundle_moves_dangling_symlink_database_root() {
15514        use std::os::unix::fs::symlink;
15515
15516        let dir = TempDir::new().unwrap();
15517        let db_path = dir.path().join("test.db");
15518        let backup_path = dir.path().join("test.db.corrupt");
15519        let missing_target = dir.path().join("missing-target.db");
15520
15521        symlink(&missing_target, &db_path).unwrap();
15522
15523        let moved = move_database_bundle(&db_path, &backup_path).unwrap();
15524
15525        assert_eq!(
15526            moved,
15527            DatabaseBundleMoveResult {
15528                database: true,
15529                wal: false,
15530                shm: false
15531            }
15532        );
15533        assert!(std::fs::symlink_metadata(&db_path).is_err());
15534        assert!(
15535            std::fs::symlink_metadata(&backup_path)
15536                .unwrap()
15537                .file_type()
15538                .is_symlink()
15539        );
15540        assert!(!missing_target.exists());
15541    }
15542
15543    #[test]
15544    #[cfg(unix)]
15545    fn move_database_bundle_moves_dangling_symlink_sidecars_without_main_db() {
15546        use std::os::unix::fs::symlink;
15547
15548        let dir = TempDir::new().unwrap();
15549        let db_path = dir.path().join("test.db");
15550        let backup_path = dir.path().join("test.db.corrupt");
15551        let missing_wal_target = dir.path().join("missing-wal");
15552        let missing_shm_target = dir.path().join("missing-shm");
15553        let wal_path = database_sidecar_path(&db_path, "-wal");
15554        let shm_path = database_sidecar_path(&db_path, "-shm");
15555
15556        symlink(&missing_wal_target, &wal_path).unwrap();
15557        symlink(&missing_shm_target, &shm_path).unwrap();
15558
15559        let moved = move_database_bundle(&db_path, &backup_path).unwrap();
15560
15561        assert_eq!(
15562            moved,
15563            DatabaseBundleMoveResult {
15564                database: false,
15565                wal: true,
15566                shm: true
15567            }
15568        );
15569        assert!(std::fs::symlink_metadata(&wal_path).is_err());
15570        assert!(std::fs::symlink_metadata(&shm_path).is_err());
15571        assert!(
15572            std::fs::symlink_metadata(database_sidecar_path(&backup_path, "-wal"))
15573                .unwrap()
15574                .file_type()
15575                .is_symlink()
15576        );
15577        assert!(
15578            std::fs::symlink_metadata(database_sidecar_path(&backup_path, "-shm"))
15579                .unwrap()
15580                .file_type()
15581                .is_symlink()
15582        );
15583        assert!(!missing_wal_target.exists());
15584        assert!(!missing_shm_target.exists());
15585    }
15586
15587    #[test]
15588    fn copy_database_bundle_copies_database_and_sidecars() {
15589        let dir = TempDir::new().unwrap();
15590        let db_path = dir.path().join("test.db");
15591        let copied_path = dir.path().join("copy.db");
15592
15593        std::fs::write(&db_path, b"db").unwrap();
15594        std::fs::write(database_sidecar_path(&db_path, "-wal"), b"wal").unwrap();
15595        std::fs::write(database_sidecar_path(&db_path, "-shm"), b"shm").unwrap();
15596
15597        copy_database_bundle(&db_path, &copied_path).unwrap();
15598
15599        assert_eq!(std::fs::read(&copied_path).unwrap(), b"db");
15600        assert_eq!(
15601            std::fs::read(database_sidecar_path(&copied_path, "-wal")).unwrap(),
15602            b"wal"
15603        );
15604        assert_eq!(
15605            std::fs::read(database_sidecar_path(&copied_path, "-shm")).unwrap(),
15606            b"shm"
15607        );
15608        assert_eq!(std::fs::read(&db_path).unwrap(), b"db");
15609    }
15610
15611    #[test]
15612    fn copy_database_bundle_creates_destination_parent() {
15613        let dir = TempDir::new().unwrap();
15614        let db_path = dir.path().join("test.db");
15615        let copied_path = dir.path().join("nested/copies/copy.db");
15616
15617        std::fs::write(&db_path, b"db").unwrap();
15618        std::fs::write(database_sidecar_path(&db_path, "-wal"), b"wal").unwrap();
15619
15620        copy_database_bundle(&db_path, &copied_path).unwrap();
15621
15622        assert!(copied_path.parent().unwrap().is_dir());
15623        assert_eq!(std::fs::read(&copied_path).unwrap(), b"db");
15624        assert_eq!(
15625            std::fs::read(database_sidecar_path(&copied_path, "-wal")).unwrap(),
15626            b"wal"
15627        );
15628    }
15629
15630    #[test]
15631    #[cfg(unix)]
15632    fn copy_database_bundle_rejects_symlink_source_root() {
15633        use std::os::unix::fs::symlink;
15634
15635        let dir = TempDir::new().unwrap();
15636        let outside_db = dir.path().join("outside.db");
15637        let db_path = dir.path().join("test.db");
15638        let copied_path = dir.path().join("copy.db");
15639
15640        std::fs::write(&outside_db, b"outside").unwrap();
15641        symlink(&outside_db, &db_path).unwrap();
15642
15643        let err = copy_database_bundle(&db_path, &copied_path).unwrap_err();
15644
15645        assert!(
15646            err.to_string().contains("bundle symlink"),
15647            "unexpected error: {err:#}"
15648        );
15649        assert!(!copied_path.exists());
15650        assert_eq!(std::fs::read(&outside_db).unwrap(), b"outside");
15651    }
15652
15653    #[test]
15654    #[cfg(unix)]
15655    fn copy_database_bundle_rejects_symlink_sidecar() {
15656        use std::os::unix::fs::symlink;
15657
15658        let dir = TempDir::new().unwrap();
15659        let db_path = dir.path().join("test.db");
15660        let copied_path = dir.path().join("copy.db");
15661        let outside_wal = dir.path().join("outside.wal");
15662        let wal_path = database_sidecar_path(&db_path, "-wal");
15663
15664        std::fs::write(&db_path, b"db").unwrap();
15665        std::fs::write(&outside_wal, b"outside wal").unwrap();
15666        symlink(&outside_wal, &wal_path).unwrap();
15667
15668        let err = copy_database_bundle(&db_path, &copied_path).unwrap_err();
15669
15670        assert!(
15671            err.to_string().contains("bundle symlink"),
15672            "unexpected error: {err:#}"
15673        );
15674        assert_eq!(std::fs::read(&outside_wal).unwrap(), b"outside wal");
15675        assert!(!copied_path.exists());
15676        assert!(!database_sidecar_path(&copied_path, "-wal").exists());
15677    }
15678
15679    #[test]
15680    fn move_database_bundle_creates_destination_parent_and_moves_sidecars() {
15681        let dir = TempDir::new().unwrap();
15682        let db_path = dir.path().join("test.db");
15683        let backup_path = dir.path().join("nested/backups/test.db.corrupt");
15684
15685        std::fs::write(&db_path, b"db").unwrap();
15686        std::fs::write(database_sidecar_path(&db_path, "-wal"), b"wal").unwrap();
15687        std::fs::write(database_sidecar_path(&db_path, "-shm"), b"shm").unwrap();
15688
15689        let moved = move_database_bundle(&db_path, &backup_path).unwrap();
15690        assert_eq!(
15691            moved,
15692            DatabaseBundleMoveResult {
15693                database: true,
15694                wal: true,
15695                shm: true
15696            }
15697        );
15698        assert!(backup_path.parent().unwrap().is_dir());
15699        assert_eq!(std::fs::read(&backup_path).unwrap(), b"db");
15700        assert_eq!(
15701            std::fs::read(database_sidecar_path(&backup_path, "-wal")).unwrap(),
15702            b"wal"
15703        );
15704        assert_eq!(
15705            std::fs::read(database_sidecar_path(&backup_path, "-shm")).unwrap(),
15706            b"shm"
15707        );
15708    }
15709
15710    #[test]
15711    fn remove_database_files_removes_orphan_sidecars_without_main_db() {
15712        let dir = TempDir::new().unwrap();
15713        let db_path = dir.path().join("test.db");
15714
15715        std::fs::write(database_sidecar_path(&db_path, "-wal"), b"wal").unwrap();
15716        std::fs::write(database_sidecar_path(&db_path, "-shm"), b"shm").unwrap();
15717
15718        remove_database_files(&db_path).unwrap();
15719
15720        assert!(!db_path.exists());
15721        assert!(!database_sidecar_path(&db_path, "-wal").exists());
15722        assert!(!database_sidecar_path(&db_path, "-shm").exists());
15723    }
15724
15725    #[test]
15726    fn cleanup_old_backups_ignores_backup_named_directories() {
15727        let dir = TempDir::new().unwrap();
15728        let db_path = dir.path().join("test.db");
15729
15730        for i in 0..3 {
15731            let backup_name = format!("test.db.backup.{}", 1000 + i);
15732            std::fs::write(dir.path().join(&backup_name), format!("backup {i}")).unwrap();
15733        }
15734        std::fs::create_dir(dir.path().join("test.db.backup.directory")).unwrap();
15735
15736        cleanup_old_backups(&db_path, 2).unwrap();
15737
15738        let mut backup_files = Vec::new();
15739        let mut backup_dirs = Vec::new();
15740        for entry in std::fs::read_dir(dir.path())
15741            .unwrap()
15742            .filter_map(|e| e.ok())
15743        {
15744            let name = entry.file_name().to_string_lossy().into_owned();
15745            if !name.starts_with("test.db.backup.") {
15746                continue;
15747            }
15748            if entry.path().is_dir() {
15749                backup_dirs.push(name);
15750            } else {
15751                backup_files.push(name);
15752            }
15753        }
15754
15755        assert_eq!(
15756            backup_files.len(),
15757            2,
15758            "only real backup files count toward retention"
15759        );
15760        assert_eq!(
15761            backup_dirs.len(),
15762            1,
15763            "backup-named directories should be ignored"
15764        );
15765    }
15766
15767    // =========================================================================
15768    // Storage open/create tests (bead yln.4)
15769    // =========================================================================
15770
15771    #[test]
15772    fn open_creates_new_database() {
15773        let dir = TempDir::new().unwrap();
15774        let db_path = dir.path().join("new.db");
15775        assert!(!db_path.exists());
15776
15777        let storage = SqliteStorage::open(&db_path).unwrap();
15778        assert!(db_path.exists());
15779        storage.close().unwrap();
15780    }
15781
15782    #[test]
15783    fn open_readonly_fails_for_nonexistent() {
15784        let dir = TempDir::new().unwrap();
15785        let db_path = dir.path().join("nonexistent.db");
15786        let result = SqliteStorage::open_readonly(&db_path);
15787        assert!(result.is_err());
15788    }
15789
15790    #[test]
15791    fn open_readonly_succeeds_for_existing() {
15792        let dir = TempDir::new().unwrap();
15793        let db_path = dir.path().join("existing.db");
15794
15795        // Create first
15796        let _storage = SqliteStorage::open(&db_path).unwrap();
15797        drop(_storage);
15798
15799        // Now open readonly
15800        let storage = SqliteStorage::open_readonly(&db_path).unwrap();
15801        assert!(storage.schema_version().is_ok());
15802    }
15803
15804    #[test]
15805    fn reopen_existing_current_schema_is_idempotent() {
15806        let dir = TempDir::new().unwrap();
15807        let db_path = dir.path().join("existing.db");
15808
15809        // First open creates and migrates to current schema.
15810        {
15811            let storage = SqliteStorage::open(&db_path).unwrap();
15812            assert_eq!(storage.schema_version().unwrap(), CURRENT_SCHEMA_VERSION);
15813        }
15814
15815        // Re-open should not fail on current schema.
15816        let reopened = SqliteStorage::open(&db_path).unwrap();
15817        assert_eq!(
15818            reopened.schema_version().unwrap(),
15819            CURRENT_SCHEMA_VERSION,
15820            "reopening current schema DB should be idempotent"
15821        );
15822    }
15823
15824    #[test]
15825    fn open_or_rebuild_current_schema_does_not_trigger_rebuild() {
15826        let dir = TempDir::new().unwrap();
15827        let db_path = dir.path().join("existing.db");
15828
15829        // Create DB at current schema.
15830        {
15831            let storage = SqliteStorage::open(&db_path).unwrap();
15832            assert_eq!(storage.schema_version().unwrap(), CURRENT_SCHEMA_VERSION);
15833        }
15834
15835        // Should open normally, not require rebuild.
15836        let reopened = SqliteStorage::open_or_rebuild(&db_path)
15837            .expect("current schema DB should open without rebuild");
15838        assert_eq!(reopened.schema_version().unwrap(), CURRENT_SCHEMA_VERSION);
15839    }
15840
15841    #[test]
15842    fn open_or_rebuild_does_not_treat_non_database_paths_as_corruption() {
15843        let dir = TempDir::new().unwrap();
15844        let db_path = dir.path().join("db_dir");
15845        std::fs::create_dir(&db_path).unwrap();
15846
15847        let result = SqliteStorage::open_or_rebuild(&db_path);
15848
15849        match result {
15850            Err(MigrationError::Database(_)) | Err(MigrationError::Io(_)) => {}
15851            Err(MigrationError::RebuildRequired { reason, .. }) => {
15852                panic!("should not rebuild non-database path: {reason}")
15853            }
15854            Err(MigrationError::Other(msg)) => {
15855                panic!("should preserve underlying open error, got Other: {msg}")
15856            }
15857            Ok(_) => panic!("directory path must not open as a database"),
15858        }
15859
15860        assert!(
15861            db_path.is_dir(),
15862            "non-database directory must be left in place"
15863        );
15864    }
15865
15866    // =========================================================================
15867    // Schema version tests (bead yln.4)
15868    // =========================================================================
15869
15870    #[test]
15871    fn schema_version_returns_current() {
15872        let dir = TempDir::new().unwrap();
15873        let db_path = dir.path().join("test.db");
15874        let storage = SqliteStorage::open(&db_path).unwrap();
15875        let version = storage.schema_version().unwrap();
15876        assert!(version >= 5, "Schema version should be at least 5");
15877    }
15878
15879    // =========================================================================
15880    // Current analytics/schema smoke test (bead z9fse.11)
15881    // =========================================================================
15882
15883    #[test]
15884    fn migration_v13_creates_analytics_tables() {
15885        let dir = TempDir::new().unwrap();
15886        let db_path = dir.path().join("test.db");
15887        let storage = SqliteStorage::open(&db_path).unwrap();
15888
15889        // Schema version should be current.
15890        let version = storage.schema_version().unwrap();
15891        assert_eq!(
15892            version, CURRENT_SCHEMA_VERSION,
15893            "Schema version must match CURRENT_SCHEMA_VERSION after migration"
15894        );
15895
15896        let conn = storage.raw();
15897
15898        // Helper: collect column names from PRAGMA table_info
15899        fn col_names(conn: &FrankenConnection, table: &str) -> Vec<String> {
15900            conn.query_map_collect(
15901                &format!("PRAGMA table_info({})", table),
15902                fparams![],
15903                |row: &FrankenRow| row.get_typed(1),
15904            )
15905            .unwrap()
15906        }
15907
15908        // Helper: collect index names from PRAGMA index_list
15909        fn idx_names(conn: &FrankenConnection, table: &str) -> Vec<String> {
15910            conn.query_map_collect(
15911                &format!("PRAGMA index_list({})", table),
15912                fparams![],
15913                |row: &FrankenRow| row.get_typed(1),
15914            )
15915            .unwrap()
15916        }
15917
15918        // Verify message_metrics table exists with expected columns
15919        let mm_cols = col_names(conn, "message_metrics");
15920        for expected in &[
15921            "message_id",
15922            "hour_id",
15923            "day_id",
15924            "content_tokens_est",
15925            "model_name",
15926            "model_family",
15927            "model_tier",
15928            "provider",
15929            "api_input_tokens",
15930            "has_plan",
15931            "agent_slug",
15932            "role",
15933            "api_data_source",
15934        ] {
15935            assert!(
15936                mm_cols.contains(&expected.to_string()),
15937                "message_metrics missing column: {expected}"
15938            );
15939        }
15940
15941        // Verify usage_hourly table
15942        let uh_cols = col_names(conn, "usage_hourly");
15943        for expected in &[
15944            "hour_id",
15945            "plan_message_count",
15946            "plan_content_tokens_est_total",
15947            "plan_api_tokens_total",
15948            "api_coverage_message_count",
15949            "content_tokens_est_user",
15950            "api_thinking_tokens_total",
15951        ] {
15952            assert!(
15953                uh_cols.contains(&expected.to_string()),
15954                "usage_hourly missing column: {expected}"
15955            );
15956        }
15957
15958        // Verify usage_daily table
15959        let ud_cols = col_names(conn, "usage_daily");
15960        for expected in &[
15961            "day_id",
15962            "plan_content_tokens_est_total",
15963            "plan_api_tokens_total",
15964            "api_thinking_tokens_total",
15965            "content_tokens_est_assistant",
15966            "message_count",
15967        ] {
15968            assert!(
15969                ud_cols.contains(&expected.to_string()),
15970                "usage_daily missing column: {expected}"
15971            );
15972        }
15973
15974        // Verify usage_models_daily table
15975        let umd_cols = col_names(conn, "usage_models_daily");
15976        for expected in &[
15977            "day_id",
15978            "model_family",
15979            "model_tier",
15980            "message_count",
15981            "api_tokens_total",
15982            "api_coverage_message_count",
15983        ] {
15984            assert!(
15985                umd_cols.contains(&expected.to_string()),
15986                "usage_models_daily missing column: {expected}"
15987            );
15988        }
15989
15990        // Verify indexes on message_metrics
15991        let mm_idxs = idx_names(conn, "message_metrics");
15992        assert!(
15993            mm_idxs.iter().any(|n| n.contains("idx_mm_hour")),
15994            "message_metrics must have hour index"
15995        );
15996        assert!(
15997            mm_idxs.iter().any(|n| n.contains("idx_mm_agent_day")),
15998            "message_metrics must have agent+day index"
15999        );
16000        assert!(
16001            mm_idxs
16002                .iter()
16003                .any(|n| n.contains("idx_mm_model_family_day")),
16004            "message_metrics must have model_family+day index"
16005        );
16006
16007        // Verify indexes on usage_hourly
16008        let uh_idxs = idx_names(conn, "usage_hourly");
16009        assert!(
16010            uh_idxs.iter().any(|n| n.contains("idx_uh_agent")),
16011            "usage_hourly must have agent index"
16012        );
16013
16014        // Verify indexes on usage_daily
16015        let ud_idxs = idx_names(conn, "usage_daily");
16016        assert!(
16017            ud_idxs.iter().any(|n| n.contains("idx_ud_agent")),
16018            "usage_daily must have agent index"
16019        );
16020
16021        // Verify indexes on usage_models_daily
16022        let umd_idxs = idx_names(conn, "usage_models_daily");
16023        assert!(
16024            umd_idxs.iter().any(|n| n.contains("idx_umd_model_day")),
16025            "usage_models_daily must have model+day index"
16026        );
16027
16028        let conversation_cols = col_names(conn, "conversations");
16029        assert!(
16030            conversation_cols.contains(&"last_message_idx".to_string())
16031                && conversation_cols.contains(&"last_message_created_at".to_string()),
16032            "fresh schema must include V15 tail columns without ALTER TABLE on conversations"
16033        );
16034        let fts_schema_rows: i64 = conn
16035            .query_row_map(
16036                "SELECT COUNT(*) FROM sqlite_master WHERE name = 'fts_messages'",
16037                fparams![],
16038                |row: &FrankenRow| row.get_typed(0),
16039            )
16040            .unwrap();
16041        assert_eq!(
16042            fts_schema_rows, 0,
16043            "fresh schema should not create and immediately drop derived fts_messages"
16044        );
16045        let integrity: Vec<String> = conn
16046            .query_map_collect("PRAGMA integrity_check;", fparams![], |row: &FrankenRow| {
16047                row.get_typed(0)
16048            })
16049            .unwrap();
16050        assert_eq!(
16051            integrity,
16052            vec!["ok".to_string()],
16053            "fresh schema must pass SQLite integrity_check"
16054        );
16055    }
16056
16057    #[test]
16058    fn hour_id_round_trip() {
16059        // 2026-02-06 12:00:00 UTC
16060        let ts_ms = 1_770_508_800_000_i64;
16061        let hour_id = SqliteStorage::hour_id_from_millis(ts_ms);
16062        let day_id = SqliteStorage::day_id_from_millis(ts_ms);
16063
16064        // hour_id should be 24x day_id (approximately)
16065        assert_eq!(hour_id / 24, day_id, "hour_id/24 should equal day_id");
16066
16067        // Round-trip: millis_from_hour_id should give start of that hour
16068        let back = SqliteStorage::millis_from_hour_id(hour_id);
16069        assert!(
16070            back <= ts_ms && ts_ms - back < 3_600_000,
16071            "Round-trip should land within the same hour"
16072        );
16073    }
16074
16075    #[test]
16076    fn day_and_hour_ids_floor_negative_millis() {
16077        // One millisecond before the Unix epoch should still floor into the
16078        // previous second/hour/day rather than truncating toward zero.
16079        let ts_ms = -1_i64;
16080        let expected_secs = -1_i64;
16081        let epoch_2020_secs = 1_577_836_800_i64;
16082
16083        assert_eq!(
16084            SqliteStorage::day_id_from_millis(ts_ms),
16085            (expected_secs - epoch_2020_secs).div_euclid(86_400)
16086        );
16087        assert_eq!(
16088            SqliteStorage::hour_id_from_millis(ts_ms),
16089            (expected_secs - epoch_2020_secs).div_euclid(3_600)
16090        );
16091    }
16092
16093    #[test]
16094    fn migration_v13_from_v10() {
16095        let dir = TempDir::new().unwrap();
16096        let db_path = dir.path().join("test.db");
16097
16098        // Open at v10 first by faking it
16099        {
16100            let conn = FrankenConnection::open(db_path.to_string_lossy().into_owned()).unwrap();
16101            conn.execute_batch("PRAGMA journal_mode=WAL;").unwrap();
16102            conn.execute_batch(
16103                "CREATE TABLE IF NOT EXISTS meta (key TEXT PRIMARY KEY, value TEXT);",
16104            )
16105            .unwrap();
16106            conn.execute("INSERT OR REPLACE INTO meta(key, value) VALUES('schema_version', '10')")
16107                .unwrap();
16108            // Apply V1-V10 so schema is correct
16109            let mut tx = conn.transaction().unwrap();
16110            tx.execute_batch(MIGRATION_V1).unwrap();
16111            tx.execute_batch(MIGRATION_V2).unwrap();
16112            tx.execute_batch(MIGRATION_V4).unwrap();
16113            tx.execute_batch(MIGRATION_V5).unwrap();
16114            tx.execute_batch(MIGRATION_V6).unwrap();
16115            tx.execute_batch(MIGRATION_V7).unwrap();
16116            tx.execute_batch(MIGRATION_V8).unwrap();
16117            tx.execute_batch(MIGRATION_V9).unwrap();
16118            tx.execute_batch(MIGRATION_V10).unwrap();
16119            tx.execute("UPDATE meta SET value = '10' WHERE key = 'schema_version'")
16120                .unwrap();
16121            tx.commit().unwrap();
16122        }
16123        materialize_fresh_fts_schema_via_rusqlite(&db_path).unwrap();
16124
16125        // Now open with SqliteStorage — should auto-migrate to current schema
16126        let storage = SqliteStorage::open(&db_path).unwrap();
16127        let version = storage.schema_version().unwrap();
16128        assert_eq!(
16129            version, CURRENT_SCHEMA_VERSION,
16130            "Should have migrated from v10 to the current schema"
16131        );
16132
16133        // Verify new tables exist
16134        let count: i64 = storage
16135            .raw()
16136            .query_row_map(
16137                "SELECT COUNT(*) FROM sqlite_master WHERE type='table' AND name IN ('message_metrics', 'usage_hourly', 'usage_daily', 'usage_models_daily')",
16138                &[],
16139                |row: &FrankenRow| row.get_typed::<i64>(0),
16140            )
16141            .unwrap();
16142        assert_eq!(count, 4, "All 4 analytics tables should exist");
16143    }
16144
16145    // =========================================================================
16146    // Analytics ingest integration test (bead z9fse.2)
16147    // =========================================================================
16148
16149    #[test]
16150    fn analytics_ingest_populates_metrics_and_rollups() {
16151        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
16152        use std::path::PathBuf;
16153
16154        let dir = TempDir::new().unwrap();
16155        let db_path = dir.path().join("test.db");
16156        let storage = SqliteStorage::open(&db_path).unwrap();
16157
16158        // Register agent + workspace
16159        let agent = Agent {
16160            id: None,
16161            slug: "claude_code".into(),
16162            name: "Claude Code".into(),
16163            version: Some("1.0".into()),
16164            kind: AgentKind::Cli,
16165        };
16166        let agent_id = storage.ensure_agent(&agent).unwrap();
16167
16168        // Create a synthetic conversation with 3 messages at a known timestamp
16169        // 2026-02-06 10:30:00 UTC → day_id = 2228, hour_id = 53472
16170        let ts_ms = 1_770_551_400_000_i64;
16171        let expected_day = SqliteStorage::day_id_from_millis(ts_ms);
16172        let expected_hour = SqliteStorage::hour_id_from_millis(ts_ms);
16173
16174        // Include a JSON usage block on the assistant message (like Claude Code data)
16175        let usage_json = serde_json::json!({
16176            "message": {
16177                "model": "claude-opus-4-6",
16178                "usage": {
16179                    "input_tokens": 100,
16180                    "output_tokens": 50,
16181                    "cache_read_input_tokens": 200,
16182                    "cache_creation_input_tokens": 30,
16183                    "service_tier": "standard"
16184                }
16185            }
16186        });
16187
16188        let conv = Conversation {
16189            id: None,
16190            agent_slug: "claude_code".into(),
16191            workspace: None,
16192            external_id: Some("test-conv-1".into()),
16193            title: Some("Test conversation".into()),
16194            source_path: PathBuf::from("/tmp/test.jsonl"),
16195            started_at: Some(ts_ms),
16196            ended_at: Some(ts_ms + 60_000),
16197            approx_tokens: None,
16198            metadata_json: serde_json::Value::Null,
16199            messages: vec![
16200                Message {
16201                    id: None,
16202                    idx: 0,
16203                    role: MessageRole::User,
16204                    author: None,
16205                    created_at: Some(ts_ms),
16206                    content: "Hello, can you help me with a plan?".into(),
16207                    extra_json: serde_json::Value::Null,
16208                    snippets: vec![],
16209                },
16210                Message {
16211                    id: None,
16212                    idx: 1,
16213                    role: MessageRole::Agent,
16214                    author: None,
16215                    created_at: Some(ts_ms + 30_000),
16216                    content: "## Plan\n\n1. First step\n2. Second step\n3. Third step".into(),
16217                    extra_json: usage_json,
16218                    snippets: vec![],
16219                },
16220                Message {
16221                    id: None,
16222                    idx: 2,
16223                    role: MessageRole::User,
16224                    author: None,
16225                    created_at: Some(ts_ms + 60_000),
16226                    content: "Great, let's proceed!".into(),
16227                    extra_json: serde_json::Value::Null,
16228                    snippets: vec![],
16229                },
16230            ],
16231            source_id: "local".into(),
16232            origin_host: None,
16233        };
16234
16235        let outcomes = storage
16236            .insert_conversations_batched(&[(agent_id, None, &conv)])
16237            .unwrap();
16238        assert_eq!(outcomes.len(), 1);
16239        assert_eq!(outcomes[0].inserted_indices.len(), 3);
16240
16241        let conn = storage.raw();
16242
16243        // Verify message_metrics rows
16244        let mm_count: i64 = conn
16245            .query_row_map("SELECT COUNT(*) FROM message_metrics", &[], |row| {
16246                row.get_typed::<i64>(0)
16247            })
16248            .unwrap();
16249        assert_eq!(mm_count, 3, "Should have 3 message_metrics rows");
16250
16251        // Verify hour_id and day_id are correct
16252        #[allow(clippy::type_complexity)]
16253        let rows: Vec<(i64, i64, String, i64, i64, String, String, String, String)> = conn
16254            .query_map_collect(
16255                "SELECT hour_id, day_id, role, content_tokens_est, has_plan, api_data_source, model_family, model_tier, provider FROM message_metrics ORDER BY message_id",
16256                fparams![],
16257                |row: &FrankenRow| {
16258                    Ok((
16259                        row.get_typed(0)?,
16260                        row.get_typed(1)?,
16261                        row.get_typed(2)?,
16262                        row.get_typed(3)?,
16263                        row.get_typed(4)?,
16264                        row.get_typed(5)?,
16265                        row.get_typed(6)?,
16266                        row.get_typed(7)?,
16267                        row.get_typed(8)?,
16268                    ))
16269                },
16270            )
16271            .unwrap();
16272
16273        assert_eq!(rows.len(), 3);
16274        // All messages in the same hour/day
16275        assert_eq!(rows[0].0, expected_hour);
16276        assert_eq!(rows[0].1, expected_day);
16277        // First message is user
16278        assert_eq!(rows[0].2, "user");
16279        // Second message (assistant) should have has_plan=1 (contains "## Plan" + numbered steps)
16280        assert_eq!(
16281            rows[1].4, 1,
16282            "Assistant message with plan should have has_plan=1"
16283        );
16284        // Second message should have api data source
16285        assert_eq!(
16286            rows[1].5, "api",
16287            "Claude Code assistant message should have api data source"
16288        );
16289        // First and third (user) messages should be estimated
16290        assert_eq!(rows[0].5, "estimated");
16291        assert_eq!(rows[2].5, "estimated");
16292        assert_eq!(rows[1].6, "claude");
16293        assert_eq!(rows[1].7, "opus");
16294        assert_eq!(rows[1].8, "anthropic");
16295        assert_eq!(rows[0].6, "unknown");
16296        // content_tokens_est = chars / 4
16297        let user_chars = "Hello, can you help me with a plan?".len() as i64;
16298        assert_eq!(rows[0].3, user_chars / 4);
16299
16300        // Verify usage_hourly rollup
16301        let (uh_msg, uh_user, uh_asst, uh_plan, uh_plan_content, uh_plan_api, uh_api_cov): (
16302            i64,
16303            i64,
16304            i64,
16305            i64,
16306            i64,
16307            i64,
16308            i64,
16309        ) = conn
16310            .query_row_map(
16311                "SELECT message_count, user_message_count, assistant_message_count, plan_message_count,
16312                        plan_content_tokens_est_total, plan_api_tokens_total, api_coverage_message_count
16313                 FROM usage_hourly WHERE hour_id = ?",
16314                fparams![expected_hour],
16315                |row: &FrankenRow| {
16316                    Ok((
16317                        row.get_typed(0)?,
16318                        row.get_typed(1)?,
16319                        row.get_typed(2)?,
16320                        row.get_typed(3)?,
16321                        row.get_typed(4)?,
16322                        row.get_typed(5)?,
16323                        row.get_typed(6)?,
16324                    ))
16325                },
16326            )
16327            .unwrap();
16328        assert_eq!(uh_msg, 3, "Hourly rollup should have 3 messages");
16329        assert_eq!(uh_user, 2, "Hourly rollup should have 2 user messages");
16330        assert_eq!(uh_asst, 1, "Hourly rollup should have 1 assistant message");
16331        assert_eq!(uh_plan, 1, "Hourly rollup should have 1 plan message");
16332        assert!(
16333            uh_plan_content > 0,
16334            "Hourly rollup should include plan content tokens"
16335        );
16336        assert!(
16337            uh_plan_api > 0,
16338            "Hourly rollup should include plan API tokens"
16339        );
16340        assert_eq!(
16341            uh_api_cov, 1,
16342            "Hourly rollup should have 1 API-covered message"
16343        );
16344
16345        // Verify usage_daily rollup matches hourly (same day)
16346        let (ud_msg, ud_api_cov): (i64, i64) = conn
16347            .query_row_map(
16348                "SELECT message_count, api_coverage_message_count FROM usage_daily WHERE day_id = ?",
16349                fparams![expected_day],
16350                |row: &FrankenRow| Ok((row.get_typed(0)?, row.get_typed(1)?)),
16351            )
16352            .unwrap();
16353        assert_eq!(ud_msg, 3, "Daily rollup should match hourly");
16354        assert_eq!(
16355            ud_api_cov, 1,
16356            "Daily api_coverage should be 1 (only assistant msg has real API data)"
16357        );
16358
16359        // Verify the API input tokens from message_metrics (only API-sourced)
16360        let api_only_input: i64 = conn
16361            .query_row_map(
16362                "SELECT COALESCE(SUM(api_input_tokens), 0) FROM message_metrics WHERE day_id = ? AND api_data_source = 'api'",
16363                fparams![expected_day],
16364                |row: &FrankenRow| row.get_typed::<i64>(0),
16365            )
16366            .unwrap();
16367        assert_eq!(
16368            api_only_input, 100,
16369            "Only API-sourced input tokens should be 100"
16370        );
16371
16372        // Verify rollups match summed message_metrics
16373        let mm_total_content_est: i64 = conn
16374            .query_row_map(
16375                "SELECT SUM(content_tokens_est) FROM message_metrics WHERE day_id = ?",
16376                fparams![expected_day],
16377                |row| row.get_typed::<i64>(0),
16378            )
16379            .unwrap();
16380        let mm_plan_content_est: i64 = conn
16381            .query_row_map(
16382                "SELECT COALESCE(SUM(content_tokens_est), 0) FROM message_metrics WHERE day_id = ? AND has_plan = 1",
16383                fparams![expected_day],
16384                |row: &FrankenRow| row.get_typed::<i64>(0),
16385            )
16386            .unwrap();
16387        let mm_plan_api_total: i64 = conn
16388            .query_row_map(
16389                "SELECT COALESCE(SUM(COALESCE(api_input_tokens, 0) + COALESCE(api_output_tokens, 0) + COALESCE(api_cache_read_tokens, 0) + COALESCE(api_cache_creation_tokens, 0) + COALESCE(api_thinking_tokens, 0)), 0)
16390                 FROM message_metrics WHERE day_id = ? AND has_plan = 1 AND api_data_source = 'api'",
16391                fparams![expected_day],
16392                |row: &FrankenRow| row.get_typed::<i64>(0),
16393            )
16394            .unwrap();
16395        let ud_content_est: i64 = conn
16396            .query_row_map(
16397                "SELECT content_tokens_est_total FROM usage_daily WHERE day_id = ?",
16398                fparams![expected_day],
16399                |row| row.get_typed::<i64>(0),
16400            )
16401            .unwrap();
16402        let (ud_plan_content_est, ud_plan_api_total): (i64, i64) = conn
16403            .query_row_map(
16404                "SELECT plan_content_tokens_est_total, plan_api_tokens_total FROM usage_daily WHERE day_id = ?",
16405                fparams![expected_day],
16406                |row: &FrankenRow| Ok((row.get_typed(0)?, row.get_typed(1)?)),
16407            )
16408            .unwrap();
16409        assert_eq!(
16410            mm_total_content_est, ud_content_est,
16411            "Daily rollup content_tokens_est_total must equal SUM of message_metrics"
16412        );
16413        assert_eq!(
16414            mm_plan_content_est, ud_plan_content_est,
16415            "Daily rollup plan_content_tokens_est_total must equal planned message_metrics content sum"
16416        );
16417        assert_eq!(
16418            mm_plan_api_total, ud_plan_api_total,
16419            "Daily rollup plan_api_tokens_total must equal planned message_metrics API token sum"
16420        );
16421
16422        // Verify model rollup rows
16423        let (claude_msg, claude_user, claude_asst, claude_api_total, claude_api_cov): (
16424            i64,
16425            i64,
16426            i64,
16427            i64,
16428            i64,
16429        ) = conn
16430            .query_row_map(
16431                "SELECT message_count, user_message_count, assistant_message_count, api_tokens_total, api_coverage_message_count
16432                 FROM usage_models_daily
16433                 WHERE day_id = ? AND model_family = 'claude' AND model_tier = 'opus'",
16434                fparams![expected_day],
16435                |row: &FrankenRow| Ok((row.get_typed(0)?, row.get_typed(1)?, row.get_typed(2)?, row.get_typed(3)?, row.get_typed(4)?)),
16436            )
16437            .unwrap();
16438        assert_eq!(claude_msg, 1);
16439        assert_eq!(claude_user, 0);
16440        assert_eq!(claude_asst, 1);
16441        assert_eq!(claude_api_total, 380);
16442        assert_eq!(claude_api_cov, 1);
16443
16444        let unknown_msg: i64 = conn
16445            .query_row_map(
16446                "SELECT message_count FROM usage_models_daily
16447                 WHERE day_id = ? AND model_family = 'unknown' AND model_tier = 'unknown'",
16448                fparams![expected_day],
16449                |row| row.get_typed(0),
16450            )
16451            .unwrap();
16452        assert_eq!(
16453            unknown_msg, 2,
16454            "user messages should land in unknown model bucket"
16455        );
16456    }
16457
16458    #[test]
16459    fn has_plan_heuristic_detects_plans() {
16460        assert!(has_plan_heuristic(
16461            "## Plan\n\n1. First step\n2. Second step"
16462        ));
16463        assert!(has_plan_heuristic(
16464            "# Plan\nHere is what we will do:\n1. Step one\n2. Step two"
16465        ));
16466        assert!(has_plan_heuristic(
16467            "Plan:\n- Gather baseline\n- Implement changes\n- Validate with tests"
16468        ));
16469        assert!(has_plan_heuristic(
16470            "Next steps:\n1. Update schema\n2. Rebuild rollups"
16471        ));
16472        assert!(!has_plan_heuristic("Hello world"));
16473        assert!(!has_plan_heuristic("Short"));
16474        assert!(!has_plan_heuristic(
16475            "This is a regular message without plans"
16476        ));
16477        assert!(!has_plan_heuristic(
16478            "```json\n{\"tool\":\"shell\",\"stdout\":\"1. install\\n2. run\"}\n```"
16479        ));
16480    }
16481
16482    #[test]
16483    fn has_plan_for_role_only_counts_assistant_messages() {
16484        let plan_text = "## Plan\n1. First\n2. Second";
16485        assert!(has_plan_for_role("assistant", plan_text));
16486        assert!(has_plan_for_role("agent", plan_text));
16487        assert!(has_plan_for_role("Assistant", plan_text));
16488        assert!(!has_plan_for_role("user", plan_text));
16489        assert!(!has_plan_for_role("tool", plan_text));
16490    }
16491
16492    #[test]
16493    fn api_rollups_require_api_data_source() {
16494        let mut agg = AnalyticsRollupAggregator::new();
16495
16496        let estimated_plan = MessageMetricsEntry {
16497            message_id: 1,
16498            created_at_ms: 0,
16499            hour_id: 1,
16500            day_id: 1,
16501            agent_slug: "codex".into(),
16502            workspace_id: 0,
16503            source_id: "local".into(),
16504            role: "assistant".into(),
16505            content_chars: 120,
16506            content_tokens_est: 30,
16507            model_name: None,
16508            model_family: "unknown".into(),
16509            model_tier: "unknown".into(),
16510            provider: "unknown".into(),
16511            api_input_tokens: Some(100),
16512            api_output_tokens: Some(50),
16513            api_cache_read_tokens: Some(0),
16514            api_cache_creation_tokens: Some(0),
16515            api_thinking_tokens: Some(0),
16516            api_service_tier: None,
16517            api_data_source: "estimated".into(),
16518            tool_call_count: 0,
16519            has_tool_calls: false,
16520            has_plan: true,
16521        };
16522        agg.record(&estimated_plan);
16523
16524        let api_plan = MessageMetricsEntry {
16525            message_id: 2,
16526            created_at_ms: 0,
16527            hour_id: 1,
16528            day_id: 1,
16529            agent_slug: "codex".into(),
16530            workspace_id: 0,
16531            source_id: "local".into(),
16532            role: "assistant".into(),
16533            content_chars: 80,
16534            content_tokens_est: 20,
16535            model_name: None,
16536            model_family: "unknown".into(),
16537            model_tier: "unknown".into(),
16538            provider: "unknown".into(),
16539            api_input_tokens: Some(40),
16540            api_output_tokens: Some(10),
16541            api_cache_read_tokens: Some(0),
16542            api_cache_creation_tokens: Some(0),
16543            api_thinking_tokens: Some(0),
16544            api_service_tier: None,
16545            api_data_source: "api".into(),
16546            tool_call_count: 0,
16547            has_tool_calls: false,
16548            has_plan: true,
16549        };
16550        agg.record(&api_plan);
16551
16552        let key = (1_i64, "codex".to_string(), 0_i64, "local".to_string());
16553        let hourly = agg.hourly.get(&key).expect("hourly rollup key must exist");
16554        let daily = agg.daily.get(&key).expect("daily rollup key must exist");
16555        let model_key = (
16556            1_i64,
16557            "codex".to_string(),
16558            0_i64,
16559            "local".to_string(),
16560            "unknown".to_string(),
16561            "unknown".to_string(),
16562        );
16563        let models_daily = agg
16564            .models_daily
16565            .get(&model_key)
16566            .expect("model rollup key must exist");
16567
16568        // Content rollup includes both plan messages.
16569        assert_eq!(hourly.plan_message_count, 2);
16570        assert_eq!(hourly.plan_content_tokens_est_total, 50);
16571        // API plan tokens must include only api_data_source='api' rows.
16572        assert_eq!(hourly.plan_api_tokens_total, 50);
16573        assert_eq!(daily.plan_api_tokens_total, 50);
16574        assert_eq!(models_daily.plan_api_tokens_total, 50);
16575        // Overall API totals must also exclude estimated rows.
16576        assert_eq!(hourly.api_tokens_total, 50);
16577        assert_eq!(hourly.api_input_tokens_total, 40);
16578        assert_eq!(hourly.api_output_tokens_total, 10);
16579        assert_eq!(hourly.api_coverage_message_count, 1);
16580        assert_eq!(daily.api_tokens_total, 50);
16581        assert_eq!(models_daily.api_tokens_total, 50);
16582    }
16583
16584    #[test]
16585    fn has_plan_heuristic_curated_corpus_thresholds() {
16586        // Cross-agent-style positives.
16587        let positives = [
16588            "## Plan\n1. Inspect current schema\n2. Add migration\n3. Verify rebuild",
16589            "Plan:\n1) Reproduce\n2) Patch\n3) Add tests",
16590            "Implementation plan:\n- Parse inputs\n- Update rollups\n- Run checks",
16591            "Next steps:\n1. Reserve file\n2. Implement\n3. Report status",
16592            "# Plan\n1. Gather requirements\n2. Ship changes",
16593            "Action plan:\n- Identify root cause\n- Fix it\n- Validate",
16594        ];
16595
16596        // Typical false positives we want to avoid.
16597        let negatives = [
16598            "The plan is to move fast and fix things later.",
16599            "```json\n{\"tool\":\"shell\",\"stdout\":\"1. ls\\n2. cat\"}\n```",
16600            "stdout:\n1. Build started\n2. Build finished\nexit code: 0",
16601            "I can help with that request. Let me know if you want details.",
16602            "Here is a list:\n- apples\n- oranges",
16603            "Status update: completed tasks and blockers below.",
16604        ];
16605
16606        let tp = positives
16607            .iter()
16608            .filter(|msg| has_plan_heuristic(msg))
16609            .count();
16610        let fp = negatives
16611            .iter()
16612            .filter(|msg| has_plan_heuristic(msg))
16613            .count();
16614
16615        let recall = tp as f64 / positives.len() as f64;
16616        let false_positive_rate = fp as f64 / negatives.len() as f64;
16617
16618        assert!(
16619            recall >= 0.80,
16620            "plan heuristic recall too low: got {recall:.2}"
16621        );
16622        assert!(
16623            false_positive_rate <= 0.20,
16624            "plan heuristic false-positive rate too high: got {false_positive_rate:.2}"
16625        );
16626    }
16627
16628    #[test]
16629    fn rebuild_analytics_repopulates_from_messages() {
16630        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
16631        use std::path::PathBuf;
16632
16633        let dir = TempDir::new().unwrap();
16634        let db_path = dir.path().join("test.db");
16635        let storage = SqliteStorage::open(&db_path).unwrap();
16636
16637        // Register agent
16638        let agent = Agent {
16639            id: None,
16640            slug: "claude_code".into(),
16641            name: "Claude Code".into(),
16642            version: Some("1.0".into()),
16643            kind: AgentKind::Cli,
16644        };
16645        let agent_id = storage.ensure_agent(&agent).unwrap();
16646
16647        // 2026-02-06 10:30:00 UTC
16648        let ts_ms = 1_770_551_400_000_i64;
16649        let expected_day = SqliteStorage::day_id_from_millis(ts_ms);
16650        let expected_hour = SqliteStorage::hour_id_from_millis(ts_ms);
16651
16652        let usage_json = serde_json::json!({
16653            "message": {
16654                "model": "claude-opus-4-6",
16655                "usage": {
16656                    "input_tokens": 100,
16657                    "output_tokens": 50,
16658                    "cache_read_input_tokens": 200,
16659                    "cache_creation_input_tokens": 30,
16660                    "service_tier": "standard"
16661                }
16662            }
16663        });
16664
16665        let conv = Conversation {
16666            id: None,
16667            agent_slug: "claude_code".into(),
16668            workspace: None,
16669            external_id: Some("test-rebuild-1".into()),
16670            title: Some("Test conversation".into()),
16671            source_path: PathBuf::from("/tmp/test.jsonl"),
16672            started_at: Some(ts_ms),
16673            ended_at: Some(ts_ms + 60_000),
16674            approx_tokens: None,
16675            metadata_json: serde_json::Value::Null,
16676            messages: vec![
16677                Message {
16678                    id: None,
16679                    idx: 0,
16680                    role: MessageRole::User,
16681                    author: None,
16682                    created_at: Some(ts_ms),
16683                    content: "Hello, can you help me with a plan?".into(),
16684                    extra_json: serde_json::Value::Null,
16685                    snippets: vec![],
16686                },
16687                Message {
16688                    id: None,
16689                    idx: 1,
16690                    role: MessageRole::Agent,
16691                    author: None,
16692                    created_at: Some(ts_ms + 30_000),
16693                    content: "## Plan\n\n1. First step\n2. Second step\n3. Third step".into(),
16694                    extra_json: usage_json,
16695                    snippets: vec![],
16696                },
16697                Message {
16698                    id: None,
16699                    idx: 2,
16700                    role: MessageRole::User,
16701                    author: None,
16702                    created_at: Some(ts_ms + 60_000),
16703                    content: "Great, let's proceed!".into(),
16704                    extra_json: serde_json::Value::Null,
16705                    snippets: vec![],
16706                },
16707            ],
16708            source_id: "local".into(),
16709            origin_host: None,
16710        };
16711
16712        storage
16713            .insert_conversations_batched(&[(agent_id, None, &conv)])
16714            .unwrap();
16715
16716        // Save original analytics state
16717        let conn = storage.raw();
16718        let orig_mm: i64 = conn
16719            .query_row_map("SELECT COUNT(*) FROM message_metrics", &[], |row| {
16720                row.get_typed(0)
16721            })
16722            .unwrap();
16723        let orig_hourly: i64 = conn
16724            .query_row_map("SELECT COUNT(*) FROM usage_hourly", &[], |row| {
16725                row.get_typed(0)
16726            })
16727            .unwrap();
16728        let orig_daily: i64 = conn
16729            .query_row_map("SELECT COUNT(*) FROM usage_daily", &[], |row| {
16730                row.get_typed(0)
16731            })
16732            .unwrap();
16733        let orig_models_daily: i64 = conn
16734            .query_row_map("SELECT COUNT(*) FROM usage_models_daily", &[], |row| {
16735                row.get_typed(0)
16736            })
16737            .unwrap();
16738        let orig_api_input: i64 = conn
16739            .query_row_map(
16740                "SELECT COALESCE(SUM(api_input_tokens), 0) FROM message_metrics WHERE api_data_source = 'api'",
16741                &[],
16742                |row: &FrankenRow| row.get_typed(0),
16743            )
16744            .unwrap();
16745
16746        assert_eq!(orig_mm, 3);
16747        assert!(orig_hourly > 0);
16748        assert!(orig_daily > 0);
16749        assert!(orig_models_daily > 0);
16750
16751        // Destroy analytics tables (simulate corruption)
16752        conn.execute("DELETE FROM message_metrics").unwrap();
16753        conn.execute("DELETE FROM usage_hourly").unwrap();
16754        conn.execute("DELETE FROM usage_daily").unwrap();
16755        conn.execute("DELETE FROM usage_models_daily").unwrap();
16756
16757        // Verify they're empty
16758        let zero: i64 = conn
16759            .query_row_map("SELECT COUNT(*) FROM message_metrics", &[], |row| {
16760                row.get_typed(0)
16761            })
16762            .unwrap();
16763        assert_eq!(zero, 0);
16764
16765        // Rebuild analytics
16766        let result = storage.rebuild_analytics().unwrap();
16767
16768        assert_eq!(result.message_metrics_rows, 3);
16769        assert!(result.usage_hourly_rows > 0);
16770        assert!(result.usage_daily_rows > 0);
16771        assert!(result.usage_models_daily_rows > 0);
16772        assert!(
16773            result.elapsed_ms < 10_000,
16774            "Rebuild should be fast for 3 msgs"
16775        );
16776
16777        // Verify rebuilt data matches
16778        let conn = storage.raw();
16779        let rebuilt_mm: i64 = conn
16780            .query_row_map("SELECT COUNT(*) FROM message_metrics", &[], |row| {
16781                row.get_typed(0)
16782            })
16783            .unwrap();
16784        assert_eq!(
16785            rebuilt_mm, orig_mm,
16786            "Rebuilt message_metrics count should match"
16787        );
16788
16789        let rebuilt_hourly: i64 = conn
16790            .query_row_map("SELECT COUNT(*) FROM usage_hourly", &[], |row| {
16791                row.get_typed(0)
16792            })
16793            .unwrap();
16794        assert_eq!(
16795            rebuilt_hourly, orig_hourly,
16796            "Rebuilt hourly rows should match"
16797        );
16798
16799        let rebuilt_daily: i64 = conn
16800            .query_row_map("SELECT COUNT(*) FROM usage_daily", &[], |row| {
16801                row.get_typed(0)
16802            })
16803            .unwrap();
16804        assert_eq!(rebuilt_daily, orig_daily, "Rebuilt daily rows should match");
16805
16806        let rebuilt_models_daily: i64 = conn
16807            .query_row_map("SELECT COUNT(*) FROM usage_models_daily", &[], |row| {
16808                row.get_typed(0)
16809            })
16810            .unwrap();
16811        assert_eq!(
16812            rebuilt_models_daily, orig_models_daily,
16813            "Rebuilt model rollup rows should match"
16814        );
16815
16816        // Verify API token data preserved through rebuild
16817        let rebuilt_api_input: i64 = conn
16818            .query_row_map(
16819                "SELECT COALESCE(SUM(api_input_tokens), 0) FROM message_metrics WHERE api_data_source = 'api'",
16820                &[],
16821                |row: &FrankenRow| row.get_typed(0),
16822            )
16823            .unwrap();
16824        assert_eq!(
16825            rebuilt_api_input, orig_api_input,
16826            "Rebuilt API input tokens should match original"
16827        );
16828
16829        // Verify rollups have correct data
16830        let (uh_msg, uh_user, uh_asst, uh_plan, uh_plan_content, uh_plan_api): (
16831            i64,
16832            i64,
16833            i64,
16834            i64,
16835            i64,
16836            i64,
16837        ) = conn
16838            .query_row_map(
16839                "SELECT message_count, user_message_count, assistant_message_count, plan_message_count,
16840                        plan_content_tokens_est_total, plan_api_tokens_total
16841                 FROM usage_hourly WHERE hour_id = ?",
16842                fparams![expected_hour],
16843                |row: &FrankenRow| {
16844                    Ok((
16845                        row.get_typed(0)?,
16846                        row.get_typed(1)?,
16847                        row.get_typed(2)?,
16848                        row.get_typed(3)?,
16849                        row.get_typed(4)?,
16850                        row.get_typed(5)?,
16851                    ))
16852                },
16853            )
16854            .unwrap();
16855        assert_eq!(uh_msg, 3);
16856        assert_eq!(uh_user, 2);
16857        assert_eq!(uh_asst, 1);
16858        assert_eq!(uh_plan, 1);
16859        assert!(uh_plan_content > 0);
16860        assert!(uh_plan_api > 0);
16861
16862        let ud_msg: i64 = conn
16863            .query_row_map(
16864                "SELECT message_count FROM usage_daily WHERE day_id = ?",
16865                fparams![expected_day],
16866                |row| row.get_typed(0),
16867            )
16868            .unwrap();
16869        assert_eq!(ud_msg, 3);
16870    }
16871
16872    #[test]
16873    fn insert_conversations_batched_flushes_large_fts_batches() {
16874        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
16875        use std::path::PathBuf;
16876
16877        let dir = TempDir::new().unwrap();
16878        let db_path = dir.path().join("test.db");
16879        let storage = SqliteStorage::open(&db_path).unwrap();
16880        // V14 drops fts_messages during migration; cass normally recreates it
16881        // during startup via `ensure_search_fallback_fts_consistency`. Tests
16882        // that inspect fts_messages directly need to run the same repair pass
16883        // to exercise the "insert flushes FTS" contract.
16884        storage
16885            .ensure_search_fallback_fts_consistency()
16886            .expect("ensure FTS consistency before insert");
16887
16888        let agent = Agent {
16889            id: None,
16890            slug: "codex".into(),
16891            name: "Codex".into(),
16892            version: Some("0.2.3".into()),
16893            kind: AgentKind::Cli,
16894        };
16895        let agent_id = storage.ensure_agent(&agent).unwrap();
16896
16897        let content = "y".repeat((FTS_ENTRY_BATCH_MAX_CHARS / 2) + 1);
16898        let messages: Vec<_> = (0_i64..2)
16899            .map(|i| Message {
16900                id: None,
16901                idx: i,
16902                role: MessageRole::Agent,
16903                author: None,
16904                created_at: Some(1_700_000_000_000 + i),
16905                content: format!("{i}-{content}"),
16906                extra_json: serde_json::Value::Null,
16907                snippets: Vec::new(),
16908            })
16909            .collect();
16910        let conv = Conversation {
16911            id: None,
16912            agent_slug: "codex".into(),
16913            workspace: Some(PathBuf::from("/tmp/workspace")),
16914            external_id: Some("fts-large-batch".into()),
16915            title: Some("FTS Large Batch".into()),
16916            source_path: PathBuf::from("/tmp/rollout.jsonl"),
16917            started_at: Some(1_700_000_000_000),
16918            ended_at: Some(1_700_000_000_999),
16919            approx_tokens: None,
16920            metadata_json: serde_json::Value::Null,
16921            messages,
16922            source_id: "local".into(),
16923            origin_host: None,
16924        };
16925
16926        let outcomes = storage
16927            .insert_conversations_batched(&[(agent_id, None, &conv)])
16928            .unwrap();
16929        assert_eq!(outcomes.len(), 1);
16930        assert_eq!(outcomes[0].inserted_indices.len(), conv.messages.len());
16931
16932        let message_count: i64 = storage
16933            .conn
16934            .query_row_map("SELECT COUNT(*) FROM messages", fparams![], |row| {
16935                row.get_typed(0)
16936            })
16937            .unwrap();
16938        let fts_count: i64 = storage
16939            .conn
16940            .query_row_map("SELECT COUNT(*) FROM fts_messages", fparams![], |row| {
16941                row.get_typed(0)
16942            })
16943            .unwrap();
16944
16945        assert_eq!(message_count, conv.messages.len() as i64);
16946        assert_eq!(fts_count, conv.messages.len() as i64);
16947    }
16948
16949    fn make_profiled_storage_remote_conversation(
16950        external_id: i64,
16951        msg_count: usize,
16952    ) -> Conversation {
16953        Conversation {
16954            id: None,
16955            agent_slug: "codex".into(),
16956            workspace: Some(PathBuf::from("/ws/profiled-storage-remote")),
16957            external_id: Some(format!("profiled-storage-remote-{external_id}")),
16958            title: Some(format!(
16959                "Profiled storage remote conversation {external_id}"
16960            )),
16961            source_path: PathBuf::from(format!("/log/profiled-storage-remote-{external_id}.jsonl")),
16962            started_at: Some(10_000 + external_id * 100),
16963            ended_at: Some(10_000 + external_id * 100 + msg_count as i64),
16964            approx_tokens: Some(msg_count as i64 * 32),
16965            metadata_json: serde_json::json!({ "bench": true }),
16966            messages: (0..msg_count)
16967                .map(|idx| Message {
16968                    id: None,
16969                    idx: idx as i64,
16970                    role: if idx % 2 == 0 {
16971                        MessageRole::User
16972                    } else {
16973                        MessageRole::Agent
16974                    },
16975                    author: Some("tester".into()),
16976                    created_at: Some(20_000 + external_id * 100 + idx as i64),
16977                    content: format!(
16978                        "profiled storage remote content ext={external_id} idx={idx} {}",
16979                        "x".repeat(64)
16980                    ),
16981                    extra_json: serde_json::json!({ "idx": idx }),
16982                    snippets: Vec::new(),
16983                })
16984                .collect(),
16985            source_id: "profiled-storage-remote-source".into(),
16986            origin_host: Some("builder-profile".into()),
16987        }
16988    }
16989
16990    fn make_profiled_append_remote_merge_conversation(
16991        external_id: i64,
16992        msg_count: usize,
16993    ) -> Conversation {
16994        let base_ts = 100_000 + external_id * 1_000;
16995        Conversation {
16996            id: None,
16997            agent_slug: "codex".into(),
16998            workspace: Some(PathBuf::from("/ws/profiled-append-remote")),
16999            external_id: Some(format!("profiled-append-remote-{external_id}")),
17000            title: Some(format!("Profiled append remote conversation {external_id}")),
17001            source_path: PathBuf::from(format!("/log/profiled-append-remote-{external_id}.jsonl")),
17002            started_at: Some(base_ts),
17003            ended_at: Some(base_ts + msg_count as i64),
17004            approx_tokens: Some(msg_count as i64 * 50),
17005            metadata_json: serde_json::json!({ "bench": true }),
17006            messages: (0..msg_count)
17007                .map(|idx| Message {
17008                    id: None,
17009                    idx: idx as i64,
17010                    role: if idx % 2 == 0 {
17011                        MessageRole::User
17012                    } else {
17013                        MessageRole::Agent
17014                    },
17015                    author: Some(format!("model-{}", external_id % 5)),
17016                    created_at: Some(base_ts + idx as i64),
17017                    content: format!(
17018                        "Profiled append remote conversation {} message {}: Lorem ipsum dolor sit amet, consectetur adipiscing elit.",
17019                        external_id, idx
17020                    ),
17021                    extra_json: serde_json::json!({ "bench": true }),
17022                    snippets: Vec::new(),
17023                })
17024                .collect(),
17025            source_id: "profiled-append-remote-source".into(),
17026            origin_host: Some("builder-profile".into()),
17027        }
17028    }
17029
17030    #[test]
17031    fn insert_conversation_tree_batched_new_message_ids_match_snippet_rows() {
17032        let dir = TempDir::new().unwrap();
17033        let db_path = dir.path().join("batched-message-ids.db");
17034        let storage = SqliteStorage::open(&db_path).unwrap();
17035        let agent_id = storage
17036            .ensure_agent(&Agent {
17037                id: None,
17038                slug: "codex".into(),
17039                name: "Codex".into(),
17040                version: None,
17041                kind: AgentKind::Cli,
17042            })
17043            .unwrap();
17044        let workspace_id = storage
17045            .ensure_workspace(&PathBuf::from("/ws/profiled-storage-remote"), None)
17046            .unwrap();
17047        let mut conv = make_profiled_storage_remote_conversation(42, 5);
17048        for (idx, msg) in conv.messages.iter_mut().enumerate() {
17049            msg.snippets.push(Snippet {
17050                id: None,
17051                file_path: Some(PathBuf::from(format!("src/file_{idx}.rs"))),
17052                start_line: Some((idx + 1) as i64),
17053                end_line: Some((idx + 2) as i64),
17054                language: Some("rust".into()),
17055                snippet_text: Some(format!("fn snippet_{idx}() {{}}")),
17056            });
17057        }
17058        let outcome = storage
17059            .insert_conversation_tree(agent_id, Some(workspace_id), &conv)
17060            .unwrap();
17061
17062        let message_count: i64 = storage
17063            .conn
17064            .query_row_map(
17065                "SELECT COUNT(*) FROM messages WHERE conversation_id = ?1",
17066                fparams![outcome.conversation_id],
17067                |row| row.get_typed(0),
17068            )
17069            .unwrap();
17070        let joined_snippet_count: i64 = storage
17071            .conn
17072            .query_row_map(
17073                "SELECT COUNT(*)
17074                 FROM snippets s
17075                 JOIN messages m ON s.message_id = m.id
17076                 WHERE m.conversation_id = ?1",
17077                fparams![outcome.conversation_id],
17078                |row| row.get_typed(0),
17079            )
17080            .unwrap();
17081
17082        assert_eq!(message_count, conv.messages.len() as i64);
17083        assert_eq!(joined_snippet_count, conv.messages.len() as i64);
17084    }
17085
17086    #[test]
17087    fn insert_conversation_tree_batched_appended_message_ids_match_snippet_rows() {
17088        let dir = TempDir::new().unwrap();
17089        let db_path = dir.path().join("batched-append-message-ids.db");
17090        let storage = SqliteStorage::open(&db_path).unwrap();
17091        let agent_id = storage
17092            .ensure_agent(&Agent {
17093                id: None,
17094                slug: "codex".into(),
17095                name: "Codex".into(),
17096                version: None,
17097                kind: AgentKind::Cli,
17098            })
17099            .unwrap();
17100        let workspace_id = storage
17101            .ensure_workspace(&PathBuf::from("/ws/profiled-storage-remote"), None)
17102            .unwrap();
17103
17104        let mut initial = make_profiled_storage_remote_conversation(77, 2);
17105        for (idx, msg) in initial.messages.iter_mut().enumerate() {
17106            msg.snippets.push(Snippet {
17107                id: None,
17108                file_path: Some(PathBuf::from(format!("src/append_initial_{idx}.rs"))),
17109                start_line: Some((idx + 1) as i64),
17110                end_line: Some((idx + 2) as i64),
17111                language: Some("rust".into()),
17112                snippet_text: Some(format!("fn append_initial_{idx}() {{}}")),
17113            });
17114        }
17115        let first = storage
17116            .insert_conversation_tree(agent_id, Some(workspace_id), &initial)
17117            .unwrap();
17118        assert_eq!(first.inserted_indices, vec![0, 1]);
17119
17120        let mut appended = make_profiled_storage_remote_conversation(77, 5);
17121        for (idx, msg) in appended.messages.iter_mut().enumerate() {
17122            msg.snippets.push(Snippet {
17123                id: None,
17124                file_path: Some(PathBuf::from(format!("src/append_full_{idx}.rs"))),
17125                start_line: Some((idx + 10) as i64),
17126                end_line: Some((idx + 11) as i64),
17127                language: Some("rust".into()),
17128                snippet_text: Some(format!("fn append_full_{idx}() {{}}")),
17129            });
17130        }
17131        let second = storage
17132            .insert_conversation_tree(agent_id, Some(workspace_id), &appended)
17133            .unwrap();
17134        assert_eq!(second.conversation_id, first.conversation_id);
17135        assert_eq!(second.inserted_indices, vec![2, 3, 4]);
17136
17137        let message_count: i64 = storage
17138            .conn
17139            .query_row_map(
17140                "SELECT COUNT(*) FROM messages WHERE conversation_id = ?1",
17141                fparams![first.conversation_id],
17142                |row| row.get_typed(0),
17143            )
17144            .unwrap();
17145        let joined_snippets: Vec<(i64, String)> = storage
17146            .conn
17147            .query_map_collect(
17148                "SELECT m.idx, s.file_path
17149                 FROM snippets s
17150                 JOIN messages m ON s.message_id = m.id
17151                 WHERE m.conversation_id = ?1
17152                 ORDER BY m.idx, s.id",
17153                fparams![first.conversation_id],
17154                |row| Ok((row.get_typed(0)?, row.get_typed(1)?)),
17155            )
17156            .unwrap();
17157
17158        assert_eq!(message_count, 5);
17159        assert_eq!(
17160            joined_snippets,
17161            vec![
17162                (0, "src/append_initial_0.rs".to_string()),
17163                (1, "src/append_initial_1.rs".to_string()),
17164                (2, "src/append_full_2.rs".to_string()),
17165                (3, "src/append_full_3.rs".to_string()),
17166                (4, "src/append_full_4.rs".to_string()),
17167            ]
17168        );
17169    }
17170
17171    #[test]
17172    fn insert_conversation_tree_rehydrates_external_lookup_after_manual_clear() {
17173        let dir = TempDir::new().unwrap();
17174        let db_path = dir.path().join("external-lookup-rehydrate.db");
17175        let storage = SqliteStorage::open(&db_path).unwrap();
17176        let agent_id = storage
17177            .ensure_agent(&Agent {
17178                id: None,
17179                slug: "codex".into(),
17180                name: "Codex".into(),
17181                version: None,
17182                kind: AgentKind::Cli,
17183            })
17184            .unwrap();
17185        let workspace_id = storage
17186            .ensure_workspace(&PathBuf::from("/ws/profiled-storage-remote"), None)
17187            .unwrap();
17188
17189        let initial = make_profiled_storage_remote_conversation(88, 2);
17190        let first = storage
17191            .insert_conversation_tree(agent_id, Some(workspace_id), &initial)
17192            .unwrap();
17193        let external_id = initial.external_id.as_deref().unwrap();
17194        let lookup_key =
17195            conversation_external_lookup_key(&initial.source_id, agent_id, external_id);
17196        let lookup_id: i64 = storage
17197            .conn
17198            .query_row_map(
17199                "SELECT conversation_id
17200                 FROM conversation_external_tail_lookup
17201                 WHERE lookup_key = ?1",
17202                fparams![lookup_key.as_str()],
17203                |row| row.get_typed(0),
17204            )
17205            .unwrap();
17206        assert_eq!(lookup_id, first.conversation_id);
17207
17208        storage
17209            .conn
17210            .execute_compat(
17211                "DELETE FROM conversation_external_tail_lookup WHERE lookup_key = ?1",
17212                fparams![lookup_key.as_str()],
17213            )
17214            .unwrap();
17215
17216        let appended = make_profiled_storage_remote_conversation(88, 4);
17217        let second = storage
17218            .insert_conversation_tree(agent_id, Some(workspace_id), &appended)
17219            .unwrap();
17220        assert_eq!(second.conversation_id, first.conversation_id);
17221        assert_eq!(second.inserted_indices, vec![2, 3]);
17222
17223        let conversation_count: i64 = storage
17224            .conn
17225            .query_row_map(
17226                "SELECT COUNT(*)
17227                 FROM conversations
17228                 WHERE source_id = ?1 AND agent_id = ?2 AND external_id = ?3",
17229                fparams![initial.source_id.as_str(), agent_id, external_id],
17230                |row| row.get_typed(0),
17231            )
17232            .unwrap();
17233        let restored_lookup: (i64, Option<i64>, Option<i64>, Option<i64>) = storage
17234            .conn
17235            .query_row_map(
17236                "SELECT conversation_id, ended_at, last_message_idx, last_message_created_at
17237                 FROM conversation_external_tail_lookup
17238                 WHERE lookup_key = ?1",
17239                fparams![lookup_key.as_str()],
17240                |row| {
17241                    Ok((
17242                        row.get_typed(0)?,
17243                        row.get_typed(1)?,
17244                        row.get_typed(2)?,
17245                        row.get_typed(3)?,
17246                    ))
17247                },
17248            )
17249            .unwrap();
17250        let tail_state: (Option<i64>, Option<i64>, Option<i64>) = storage
17251            .conn
17252            .query_row_map(
17253                "SELECT ended_at, last_message_idx, last_message_created_at
17254                 FROM conversation_tail_state
17255                 WHERE conversation_id = ?1",
17256                fparams![first.conversation_id],
17257                |row| Ok((row.get_typed(0)?, row.get_typed(1)?, row.get_typed(2)?)),
17258            )
17259            .unwrap();
17260        assert_eq!(conversation_count, 1);
17261        assert_eq!(
17262            restored_lookup,
17263            (
17264                first.conversation_id,
17265                tail_state.0,
17266                tail_state.1,
17267                tail_state.2
17268            )
17269        );
17270        assert_eq!(
17271            tail_state,
17272            (
17273                appended.messages[3].created_at,
17274                Some(3),
17275                appended.messages[3].created_at
17276            )
17277        );
17278    }
17279
17280    #[test]
17281    fn insert_conversation_tree_recreates_daily_stats_after_manual_clear() {
17282        let dir = TempDir::new().unwrap();
17283        let db_path = dir.path().join("test.db");
17284        let storage = SqliteStorage::open(&db_path).unwrap();
17285        let agent_id = storage
17286            .ensure_agent(&Agent {
17287                id: None,
17288                slug: "codex".into(),
17289                name: "Codex".into(),
17290                version: None,
17291                kind: AgentKind::Cli,
17292            })
17293            .unwrap();
17294        let workspace = PathBuf::from("/ws/profiled-storage-remote");
17295        let workspace_id = storage.ensure_workspace(&workspace, None).unwrap();
17296
17297        storage
17298            .insert_conversation_tree(
17299                agent_id,
17300                Some(workspace_id),
17301                &make_profiled_storage_remote_conversation(0, 3),
17302            )
17303            .unwrap();
17304        storage.conn.execute("DELETE FROM daily_stats").unwrap();
17305
17306        storage
17307            .insert_conversation_tree(
17308                agent_id,
17309                Some(workspace_id),
17310                &make_profiled_storage_remote_conversation(1, 2),
17311            )
17312            .unwrap();
17313
17314        let row_count: i64 = storage
17315            .conn
17316            .query_row_map("SELECT COUNT(*) FROM daily_stats", fparams![], |row| {
17317                row.get_typed(0)
17318            })
17319            .unwrap();
17320        let (session_count, message_count): (i64, i64) = storage
17321            .conn
17322            .query_row_map(
17323                "SELECT session_count, message_count
17324                 FROM daily_stats
17325                 WHERE agent_slug = 'all' AND source_id = 'all'",
17326                fparams![],
17327                |row| Ok((row.get_typed(0)?, row.get_typed(1)?)),
17328            )
17329            .unwrap();
17330
17331        assert_eq!(row_count, 4);
17332        assert_eq!(session_count, 1);
17333        assert_eq!(message_count, 2);
17334    }
17335
17336    #[test]
17337    #[serial]
17338    fn insert_conversation_tree_stage_profile_tracks_steady_state_remote_reuse() {
17339        let _defer_guard = set_env_var("CASS_DEFER_LEXICAL_UPDATES", "0");
17340
17341        for &(msg_count, iterations) in &[(5usize, 80usize), (20, 50), (50, 24)] {
17342            let dir = TempDir::new().unwrap();
17343            let db_path = dir.path().join(format!("profile-{msg_count}.db"));
17344            let storage = SqliteStorage::open(&db_path).unwrap();
17345            let agent_id = storage
17346                .ensure_agent(&Agent {
17347                    id: None,
17348                    slug: "codex".into(),
17349                    name: "Codex".into(),
17350                    version: None,
17351                    kind: AgentKind::Cli,
17352                })
17353                .unwrap();
17354            let workspace = PathBuf::from("/ws/profiled-storage-remote");
17355            let workspace_id = storage.ensure_workspace(&workspace, None).unwrap();
17356
17357            storage
17358                .insert_conversation_tree(
17359                    agent_id,
17360                    Some(workspace_id),
17361                    &make_profiled_storage_remote_conversation(0, msg_count),
17362                )
17363                .unwrap();
17364
17365            let mut profile = InsertConversationTreePerfProfile::default();
17366            for external_id in 1..=iterations {
17367                storage
17368                    .insert_conversation_tree_with_profile(
17369                        agent_id,
17370                        Some(workspace_id),
17371                        &make_profiled_storage_remote_conversation(external_id as i64, msg_count),
17372                        &mut profile,
17373                    )
17374                    .unwrap();
17375            }
17376
17377            let accounted_duration = profile.source_duration
17378                + profile.tx_open_duration
17379                + profile.existing_lookup_duration
17380                + profile.conversation_row_duration
17381                + profile.message_insert_duration
17382                + profile.snippet_insert_duration
17383                + profile.fts_entry_duration
17384                + profile.fts_flush_duration
17385                + profile.analytics_duration
17386                + profile.commit_duration;
17387            assert_eq!(profile.invocations, iterations);
17388            assert_eq!(profile.messages, iterations * msg_count);
17389            assert_eq!(profile.inserted_messages, iterations * msg_count);
17390            assert!(
17391                profile.total_duration >= accounted_duration,
17392                "accounted stage durations cannot exceed total duration"
17393            );
17394
17395            profile.log_summary(&format!("remote_reuse_{msg_count}_msgs"));
17396        }
17397    }
17398
17399    #[test]
17400    #[serial]
17401    fn insert_conversation_tree_stage_profile_tracks_append_remote_source_merge() {
17402        let _defer_guard = set_env_var("CASS_DEFER_LEXICAL_UPDATES", "0");
17403
17404        for &(msg_count, iterations) in &[(5usize, 80usize), (20, 50), (50, 24)] {
17405            let dir = TempDir::new().unwrap();
17406            let db_path = dir.path().join(format!("append-profile-{msg_count}.db"));
17407            let storage = SqliteStorage::open(&db_path).unwrap();
17408            let agent_id = storage
17409                .ensure_agent(&Agent {
17410                    id: None,
17411                    slug: "codex".into(),
17412                    name: "Codex".into(),
17413                    version: None,
17414                    kind: AgentKind::Cli,
17415                })
17416                .unwrap();
17417            let workspace = PathBuf::from("/ws/profiled-append-remote");
17418            let workspace_id = storage.ensure_workspace(&workspace, None).unwrap();
17419
17420            for external_id in 0..iterations {
17421                storage
17422                    .insert_conversation_tree(
17423                        agent_id,
17424                        Some(workspace_id),
17425                        &make_profiled_append_remote_merge_conversation(
17426                            external_id as i64,
17427                            msg_count,
17428                        ),
17429                    )
17430                    .unwrap();
17431            }
17432
17433            let mut profile = InsertConversationTreePerfProfile::default();
17434            for external_id in 0..iterations {
17435                storage
17436                    .append_existing_conversation_with_profile(
17437                        agent_id,
17438                        Some(workspace_id),
17439                        &make_profiled_append_remote_merge_conversation(
17440                            external_id as i64,
17441                            msg_count * 2,
17442                        ),
17443                        &mut profile,
17444                    )
17445                    .unwrap();
17446            }
17447
17448            let accounted_duration = profile.source_duration
17449                + profile.tx_open_duration
17450                + profile.existing_lookup_duration
17451                + profile.existing_idx_lookup_duration
17452                + profile.existing_replay_lookup_duration
17453                + profile.dedupe_filter_duration
17454                + profile.conversation_row_duration
17455                + profile.message_insert_duration
17456                + profile.snippet_insert_duration
17457                + profile.fts_entry_duration
17458                + profile.fts_flush_duration
17459                + profile.analytics_duration
17460                + profile.commit_duration;
17461            assert_eq!(profile.invocations, iterations);
17462            assert_eq!(profile.messages, iterations * msg_count * 2);
17463            assert_eq!(profile.inserted_messages, iterations * msg_count);
17464            assert!(
17465                profile.total_duration >= accounted_duration,
17466                "accounted append stage durations cannot exceed total duration"
17467            );
17468
17469            profile.log_summary(&format!("append_remote_merge_{msg_count}_msgs"));
17470        }
17471    }
17472
/// Rebuilding `daily_stats` from scratch must reproduce the materialized totals.
///
/// The fixture is written straight into the tables with SQL: two agents, one
/// conversation per agent, three messages, and one `message_metrics` row per
/// message. `daily_stats` is then emptied and `rebuild_daily_stats` is expected
/// to report two sessions, leave `daily_stats_health` without drift, and store a
/// message count of three in the `'all'`/`'all'` aggregate row.
#[test]
fn rebuild_daily_stats_recomputes_materialized_totals_without_monolithic_group_by() {
    let tmp = TempDir::new().unwrap();
    let db_file = tmp.path().join("test.db");
    let storage = SqliteStorage::open(&db_file).unwrap();
    let base_ms = 1_700_000_000_000_i64;
    let day_id = FrankenStorage::day_id_from_millis(base_ms);
    let hour_id = FrankenStorage::hour_id_from_millis(base_ms);

    // Agents as (id, slug, display name); both are stored with kind "cli".
    for (agent_id, slug, name) in [(1_i64, "codex", "Codex"), (2_i64, "claude", "Claude")] {
        storage
            .conn
            .execute_compat(
                "INSERT INTO agents (id, slug, name, version, kind, created_at, updated_at)
                 VALUES (?1, ?2, ?3, NULL, ?4, 0, 0)",
                fparams![agent_id, slug, name, "cli"],
            )
            .unwrap();
    }

    // Conversations as (id, agent id, external id, title, source path, ended_at).
    // Both start at `base_ms` and belong to the local source.
    for (conversation_id, agent_id, external_id, title, source_path, ended_at) in [
        (
            1_i64,
            1_i64,
            "daily-a",
            "Daily A",
            "/tmp/daily-a.jsonl",
            base_ms + 200,
        ),
        (
            2_i64,
            2_i64,
            "daily-b",
            "Daily B",
            "/tmp/daily-b.jsonl",
            base_ms + 300,
        ),
    ] {
        storage
            .conn
            .execute_compat(
                "INSERT INTO conversations (
                    id, agent_id, workspace_id, source_id, external_id, title, source_path,
                    started_at, ended_at, approx_tokens, metadata_json, origin_host, metadata_bin
                 ) VALUES (?1, ?2, NULL, ?3, ?4, ?5, ?6, ?7, ?8, NULL, ?9, NULL, NULL)",
                fparams![
                    conversation_id,
                    agent_id,
                    LOCAL_SOURCE_ID,
                    external_id,
                    title,
                    source_path,
                    base_ms,
                    ended_at,
                    "{}"
                ],
            )
            .unwrap();
    }

    // Messages as (id, conversation id, idx, role, created_at, content).
    for (message_id, conversation_id, idx, role, created_at, content) in [
        (1_i64, 1_i64, 0_i64, "user", base_ms, "hello"),
        (2_i64, 1_i64, 1_i64, "assistant", base_ms + 100, "response"),
        (3_i64, 2_i64, 0_i64, "user", base_ms + 50, "abc"),
    ] {
        storage
            .conn
            .execute_compat(
                "INSERT INTO messages (
                    id, conversation_id, idx, role, author, created_at, content, extra_json, extra_bin
                 ) VALUES (?1, ?2, ?3, ?4, NULL, ?5, ?6, NULL, NULL)",
                fparams![message_id, conversation_id, idx, role, created_at, content],
            )
            .unwrap();
    }

    // One metrics row per message: (message id, agent slug, role, content length).
    // Every row is stamped with `base_ms` and the day/hour ids derived from it.
    let metric_rows = [
        (1_i64, "codex", "user", 5_i64),
        (2_i64, "codex", "assistant", 8_i64),
        (3_i64, "claude", "user", 3_i64),
    ];
    for (message_id, agent_slug, role, content_chars) in metric_rows {
        let content_tokens_est = content_chars / 4;
        storage
            .conn
            .execute_compat(
                "INSERT INTO message_metrics (
                        message_id, created_at_ms, hour_id, day_id, agent_slug, workspace_id, source_id,
                        role, content_chars, content_tokens_est, api_input_tokens, api_output_tokens,
                        api_cache_read_tokens, api_cache_creation_tokens, api_thinking_tokens,
                        api_service_tier, api_data_source, tool_call_count, has_tool_calls, has_plan,
                        model_name, model_family, model_tier, provider
                     ) VALUES (
                        ?1, ?2, ?3, ?4, ?5, ?6, ?7,
                        ?8, ?9, ?10, ?11, ?12,
                        ?13, ?14, ?15,
                        ?16, ?17, ?18, ?19, ?20,
                        ?21, ?22, ?23, ?24
                     )",
                fparams![
                    message_id,
                    base_ms,
                    hour_id,
                    day_id,
                    agent_slug,
                    0_i64,
                    LOCAL_SOURCE_ID,
                    role,
                    content_chars,
                    content_tokens_est,
                    0_i64,
                    0_i64,
                    0_i64,
                    0_i64,
                    0_i64,
                    "",
                    "estimated",
                    0_i64,
                    0_i64,
                    0_i64,
                    "",
                    "unknown",
                    "unknown",
                    "unknown"
                ],
            )
            .unwrap();
    }

    // Start the rebuild from an empty materialized table.
    storage.conn.execute("DELETE FROM daily_stats").unwrap();

    let rebuild_report = storage.rebuild_daily_stats().unwrap();
    assert_eq!(rebuild_report.total_sessions, 2);

    let health_report = storage.daily_stats_health().unwrap();
    assert_eq!(health_report.conversation_count, 2);
    assert_eq!(health_report.materialized_total, 2);
    assert_eq!(health_report.drift, 0);

    let aggregate_message_count: i64 = storage
        .conn
        .query_row_map(
            "SELECT message_count FROM daily_stats WHERE agent_slug = 'all' AND source_id = 'all'",
            fparams![],
            |row| row.get_typed(0),
        )
        .unwrap();
    assert_eq!(aggregate_message_count, 3);
}
17639
/// `total_chars` must equal the UTF-8 byte length of the content, both when it
/// is accumulated inline and after a full rebuild.
///
/// The single message mixes ASCII with multi-byte characters and has a matching
/// `message_metrics` row whose `content_chars` is that byte length. The test
/// first applies an inline stats delta and checks the aggregate row, then
/// empties `daily_stats`, rebuilds, and checks the aggregate row again.
#[test]
fn rebuild_daily_stats_preserves_byte_counts_with_message_metrics() {
    let tmp = TempDir::new().unwrap();
    let db_file = tmp.path().join("test.db");
    let storage = SqliteStorage::open(&db_file).unwrap();

    // `str::len` counts bytes, so this differs from the character count.
    let text = "ASCII🙂é漢字";
    let byte_len = text.len() as i64;
    let base_ms = 1_704_067_200_000_i64;
    let day_id = FrankenStorage::day_id_from_millis(base_ms);
    let hour_id = FrankenStorage::hour_id_from_millis(base_ms);

    let insert_agent_sql =
        "INSERT INTO agents (id, slug, name, version, kind, created_at, updated_at)
                 VALUES (?1, ?2, ?3, NULL, ?4, 0, 0)";
    storage
        .conn
        .execute_compat(insert_agent_sql, fparams![1_i64, "tester", "Tester", "cli"])
        .unwrap();

    let insert_conversation_sql = "INSERT INTO conversations (
                    id, agent_id, workspace_id, source_id, external_id, title, source_path,
                    started_at, ended_at, approx_tokens, metadata_json, origin_host, metadata_bin
                 ) VALUES (?1, ?2, NULL, ?3, ?4, ?5, ?6, ?7, NULL, NULL, ?8, NULL, NULL)";
    storage
        .conn
        .execute_compat(
            insert_conversation_sql,
            fparams![
                1_i64,
                1_i64,
                LOCAL_SOURCE_ID,
                "unicode-metrics",
                "Unicode Metrics",
                "/tmp/unicode-metrics.jsonl",
                base_ms,
                "{}"
            ],
        )
        .unwrap();

    let insert_message_sql = "INSERT INTO messages (
                    id, conversation_id, idx, role, author, created_at, content, extra_json, extra_bin
                 ) VALUES (?1, ?2, ?3, ?4, NULL, ?5, ?6, NULL, NULL)";
    storage
        .conn
        .execute_compat(
            insert_message_sql,
            fparams![1_i64, 1_i64, 0_i64, "user", base_ms, text],
        )
        .unwrap();

    let insert_metrics_sql = "INSERT INTO message_metrics (
                    message_id, created_at_ms, hour_id, day_id, agent_slug, workspace_id, source_id,
                    role, content_chars, content_tokens_est, api_input_tokens, api_output_tokens,
                    api_cache_read_tokens, api_cache_creation_tokens, api_thinking_tokens,
                    api_service_tier, api_data_source, tool_call_count, has_tool_calls, has_plan,
                    model_name, model_family, model_tier, provider
                 ) VALUES (
                    ?1, ?2, ?3, ?4, ?5, ?6, ?7,
                    ?8, ?9, ?10, ?11, ?12,
                    ?13, ?14, ?15,
                    ?16, ?17, ?18, ?19, ?20,
                    ?21, ?22, ?23, ?24
                 )";
    let tokens_est = byte_len / 4;
    storage
        .conn
        .execute_compat(
            insert_metrics_sql,
            fparams![
                1_i64,
                base_ms,
                hour_id,
                day_id,
                "tester",
                0_i64,
                LOCAL_SOURCE_ID,
                "user",
                byte_len,
                tokens_est,
                0_i64,
                0_i64,
                0_i64,
                0_i64,
                0_i64,
                "",
                "estimated",
                0_i64,
                0_i64,
                0_i64,
                "",
                "unknown",
                "unknown",
                "unknown"
            ],
        )
        .unwrap();

    // Inline path: apply one session / one message / `byte_len` chars as a delta.
    let delta = StatsDelta {
        session_count_delta: 1,
        message_count_delta: 1,
        total_chars_delta: byte_len,
    };
    let mut tx = storage.conn.transaction().unwrap();
    franken_update_daily_stats_in_tx(
        &storage,
        &tx,
        "tester",
        LOCAL_SOURCE_ID,
        Some(base_ms),
        delta,
    )
    .unwrap();
    tx.commit().unwrap();

    // Reads `total_chars` from the aggregate row; used before and after the rebuild.
    let aggregate_total_chars = || -> i64 {
        storage
            .conn
            .query_row_map(
                "SELECT total_chars FROM daily_stats WHERE agent_slug = 'all' AND source_id = 'all'",
                fparams![],
                |row| row.get_typed(0),
            )
            .unwrap()
    };

    assert_eq!(aggregate_total_chars(), byte_len);

    // Rebuild path: wipe the materialized rows and recompute them.
    storage.conn.execute("DELETE FROM daily_stats").unwrap();

    let rebuild_report = storage.rebuild_daily_stats().unwrap();
    assert_eq!(rebuild_report.total_sessions, 1);

    assert_eq!(aggregate_total_chars(), byte_len);
}
17774
/// After a rebuild, `total_chars` must still equal the UTF-8 byte length of the
/// stored content when this test seeds no `message_metrics` row.
///
/// Only an agent, a conversation, and one multi-byte message are inserted, so
/// the rebuild presumably has to fall back to the raw `messages` table (as the
/// test name suggests). An inline delta is committed first, `daily_stats` is
/// emptied, and the rebuilt aggregate row is compared with the byte length.
#[test]
fn rebuild_daily_stats_raw_fallback_preserves_byte_counts() {
    let tmp = TempDir::new().unwrap();
    let db_file = tmp.path().join("test.db");
    let storage = SqliteStorage::open(&db_file).unwrap();

    // `str::len` counts bytes, so this differs from the character count.
    let text = "fallback🙂é漢字";
    let byte_len = text.len() as i64;
    let base_ms = 1_704_067_200_000_i64;

    let insert_agent_sql =
        "INSERT INTO agents (id, slug, name, version, kind, created_at, updated_at)
                 VALUES (?1, ?2, ?3, NULL, ?4, 0, 0)";
    storage
        .conn
        .execute_compat(insert_agent_sql, fparams![1_i64, "tester", "Tester", "cli"])
        .unwrap();

    let insert_conversation_sql = "INSERT INTO conversations (
                    id, agent_id, workspace_id, source_id, external_id, title, source_path,
                    started_at, ended_at, approx_tokens, metadata_json, origin_host, metadata_bin
                 ) VALUES (?1, ?2, NULL, ?3, ?4, ?5, ?6, ?7, NULL, NULL, ?8, NULL, NULL)";
    storage
        .conn
        .execute_compat(
            insert_conversation_sql,
            fparams![
                1_i64,
                1_i64,
                LOCAL_SOURCE_ID,
                "unicode-fallback",
                "Unicode Fallback",
                "/tmp/unicode-fallback.jsonl",
                base_ms,
                "{}"
            ],
        )
        .unwrap();

    let insert_message_sql = "INSERT INTO messages (
                    id, conversation_id, idx, role, author, created_at, content, extra_json, extra_bin
                 ) VALUES (?1, ?2, ?3, ?4, NULL, ?5, ?6, NULL, NULL)";
    storage
        .conn
        .execute_compat(
            insert_message_sql,
            fparams![1_i64, 1_i64, 0_i64, "assistant", base_ms, text],
        )
        .unwrap();

    // Commit an inline delta before wiping, mirroring a normal ingest.
    let delta = StatsDelta {
        session_count_delta: 1,
        message_count_delta: 1,
        total_chars_delta: byte_len,
    };
    let mut tx = storage.conn.transaction().unwrap();
    franken_update_daily_stats_in_tx(
        &storage,
        &tx,
        "tester",
        LOCAL_SOURCE_ID,
        Some(base_ms),
        delta,
    )
    .unwrap();
    tx.commit().unwrap();

    storage.conn.execute("DELETE FROM daily_stats").unwrap();

    let rebuild_report = storage.rebuild_daily_stats().unwrap();
    assert_eq!(rebuild_report.total_sessions, 1);

    let rebuilt_total_chars: i64 = storage
        .conn
        .query_row_map(
            "SELECT total_chars FROM daily_stats WHERE agent_slug = 'all' AND source_id = 'all'",
            fparams![],
            |row| row.get_typed(0),
        )
        .unwrap();
    assert_eq!(rebuilt_total_chars, byte_len);
}
17852
/// Two batch entries sharing one external id must collapse into a single
/// conversation, with the second entry contributing only its new messages.
///
/// The first entry carries messages 0-1; the second repeats them and adds 2-3.
/// The test expects the reported inserted indices to be `[0, 1]` and `[2, 3]`,
/// both outcomes to name the same conversation id, and exactly one conversation
/// row and four message rows to exist. The conversation row is also counted
/// through `NOT INDEXED` and `INDEXED BY idx_conversations_source_id`, on both
/// the original handle and a second handle opened on the same file.
#[test]
fn insert_conversations_batched_appends_duplicate_external_id() {
    use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
    use std::path::PathBuf;

    let tmp = TempDir::new().unwrap();
    let db_path = tmp.path().join("test.db");
    let storage = SqliteStorage::open(&db_path).unwrap();

    let agent_id = storage
        .ensure_agent(&Agent {
            id: None,
            slug: "codex".into(),
            name: "Codex".into(),
            version: Some("0.2.3".into()),
            kind: AgentKind::Cli,
        })
        .unwrap();

    // Builds one message; everything except these four fields is left empty.
    let msg = |idx: i64, role: MessageRole, created_at: i64, content: &str| Message {
        id: None,
        idx,
        role,
        author: None,
        created_at: Some(created_at),
        content: content.into(),
        extra_json: serde_json::Value::Null,
        snippets: Vec::new(),
    };

    // Both conversations share every identifying field and differ only in messages.
    let shared_session = |messages: Vec<Message>| Conversation {
        id: None,
        agent_slug: "codex".into(),
        workspace: Some(PathBuf::from("/tmp/workspace")),
        external_id: Some("shared-session".into()),
        title: Some("Shared Session".into()),
        source_path: PathBuf::from("/tmp/rollout.jsonl"),
        started_at: Some(1_700_000_000_000),
        ended_at: Some(1_700_000_000_999),
        approx_tokens: None,
        metadata_json: serde_json::Value::Null,
        messages,
        source_id: "local".into(),
        origin_host: None,
    };

    let conv_a = shared_session(vec![
        msg(0, MessageRole::User, 1_700_000_000_000, "first"),
        msg(1, MessageRole::Agent, 1_700_000_000_100, "second"),
    ]);
    let conv_b = shared_session(vec![
        msg(0, MessageRole::User, 1_700_000_000_000, "first"),
        msg(1, MessageRole::Agent, 1_700_000_000_100, "second"),
        msg(2, MessageRole::User, 1_700_000_000_200, "third"),
        msg(3, MessageRole::Agent, 1_700_000_000_300, "fourth"),
    ]);

    let outcomes = storage
        .insert_conversations_batched(&[(agent_id, None, &conv_a), (agent_id, None, &conv_b)])
        .unwrap();
    assert_eq!(outcomes.len(), 2);
    assert_eq!(outcomes[0].inserted_indices, vec![0, 1]);
    assert_eq!(outcomes[1].inserted_indices, vec![2, 3]);
    assert_eq!(outcomes[0].conversation_id, outcomes[1].conversation_id);

    // Single-value COUNT(*) reader bound to the original handle.
    let count_on_primary = |sql: &str| -> i64 {
        storage
            .conn
            .query_row_map(sql, fparams![], |row| row.get_typed(0))
            .unwrap()
    };

    let conversation_count = count_on_primary("SELECT COUNT(*) FROM conversations");
    let conversation_count_not_indexed =
        count_on_primary("SELECT COUNT(*) FROM conversations NOT INDEXED");
    let conversation_count_source_index = count_on_primary(
        "SELECT COUNT(*) FROM conversations INDEXED BY idx_conversations_source_id",
    );
    let message_count = count_on_primary("SELECT COUNT(*) FROM messages");

    // A second handle on the same file must observe the same rows.
    let reopened_storage = SqliteStorage::open(&db_path).unwrap();
    let count_on_reopened = |sql: &str| -> i64 {
        reopened_storage
            .conn
            .query_row_map(sql, fparams![], |row| row.get_typed(0))
            .unwrap()
    };
    let ids_on_reopened = |sql: &str| -> Vec<i64> {
        reopened_storage
            .conn
            .query_map_collect(sql, fparams![], |row| row.get_typed(0))
            .unwrap()
    };

    let reopened_conversation_count = count_on_reopened("SELECT COUNT(*) FROM conversations");
    let reopened_conversation_count_not_indexed =
        count_on_reopened("SELECT COUNT(*) FROM conversations NOT INDEXED");
    let reopened_conversation_ids = ids_on_reopened("SELECT id FROM conversations ORDER BY id");
    let reopened_conversation_ids_not_indexed =
        ids_on_reopened("SELECT id FROM conversations NOT INDEXED ORDER BY id");
    let reopened_conversation_ids_source_index = ids_on_reopened(
        "SELECT id FROM conversations INDEXED BY idx_conversations_source_id ORDER BY id",
    );

    // Every access path must yield exactly the one merged conversation.
    let merged_id = outcomes[0].conversation_id;
    assert_eq!(reopened_conversation_ids, vec![merged_id]);
    assert_eq!(reopened_conversation_ids_not_indexed, vec![merged_id]);
    assert_eq!(reopened_conversation_ids_source_index, vec![merged_id]);
    assert_eq!(reopened_conversation_count, 1);
    assert_eq!(reopened_conversation_count_not_indexed, 1);
    assert_eq!(conversation_count_not_indexed, 1);
    assert_eq!(conversation_count_source_index, 1);
    assert_eq!(conversation_count, 1);
    assert_eq!(message_count, 4);
}
18044
/// Inserting a conversation that already exists in the same transaction must
/// resolve to the existing row instead of creating a second one.
///
/// The conversation is first inserted with `franken_insert_conversation`, then
/// passed to `franken_insert_conversation_or_get_existing_after_miss` together
/// with its merge key. The result has to be `Existing` with the id from the
/// first insert, and the transaction must still see exactly one conversation.
#[test]
fn franken_insert_conversation_or_get_existing_recovers_unique_conflict() {
    use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
    use std::path::PathBuf;

    let tmp = TempDir::new().unwrap();
    let db_file = tmp.path().join("test.db");
    let storage = SqliteStorage::open(&db_file).unwrap();

    let agent_id = storage
        .ensure_agent(&Agent {
            id: None,
            slug: "codex".into(),
            name: "Codex".into(),
            version: Some("0.2.3".into()),
            kind: AgentKind::Cli,
        })
        .unwrap();

    let only_message = Message {
        id: None,
        idx: 0,
        role: MessageRole::User,
        author: None,
        created_at: Some(1_700_000_000_000),
        content: "hello".into(),
        extra_json: serde_json::Value::Null,
        snippets: Vec::new(),
    };
    let conv = Conversation {
        id: None,
        agent_slug: "codex".into(),
        workspace: Some(PathBuf::from("/tmp/workspace")),
        external_id: Some("recover-duplicate".into()),
        title: Some("Recover Duplicate".into()),
        source_path: PathBuf::from("/tmp/rollout.jsonl"),
        started_at: Some(1_700_000_000_000),
        ended_at: Some(1_700_000_000_100),
        approx_tokens: None,
        metadata_json: serde_json::Value::Null,
        messages: vec![only_message],
        source_id: "local".into(),
        origin_host: None,
    };

    // Both inserts run inside one transaction, which is never committed.
    let tx = storage.conn.transaction().unwrap();
    let first_id = franken_insert_conversation(&tx, agent_id, None, &conv)
        .unwrap()
        .expect("first insert should succeed");

    let merge_key = conversation_merge_key(agent_id, &conv);
    let second_attempt = franken_insert_conversation_or_get_existing_after_miss(
        &tx,
        agent_id,
        None,
        &conv,
        &merge_key,
    )
    .unwrap();

    // Anything other than `Existing` means a duplicate row was created.
    let resolved_id = match second_attempt {
        ConversationInsertStatus::Existing(existing_id) => existing_id,
        ConversationInsertStatus::Inserted(new_id) => {
            panic!("expected existing conversation id, got freshly inserted {new_id}");
        }
    };
    assert_eq!(resolved_id, first_id);

    let row_count: i64 = tx
        .query_row_map("SELECT COUNT(*) FROM conversations", fparams![], |row| {
            row.get_typed(0)
        })
        .unwrap();
    assert_eq!(row_count, 1);
}
18119
/// Batch entries sharing an external id must merge even when the first entry
/// starts at a later message index than the second.
///
/// The first entry holds messages 2-3; the second holds 0, 1 and a repeat of 3.
/// The test expects inserted indices `[2, 3]` then `[0, 1]`, one shared
/// conversation id, and stored message indices `[0, 1, 2, 3]`.
#[test]
fn insert_conversations_batched_merges_duplicate_external_id_with_gaps() {
    use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
    use std::path::PathBuf;

    let tmp = TempDir::new().unwrap();
    let db_file = tmp.path().join("test.db");
    let storage = SqliteStorage::open(&db_file).unwrap();

    let agent_id = storage
        .ensure_agent(&Agent {
            id: None,
            slug: "codex".into(),
            name: "Codex".into(),
            version: Some("0.2.3".into()),
            kind: AgentKind::Cli,
        })
        .unwrap();

    // Builds one message; everything except these four fields is left empty.
    let msg = |idx: i64, role: MessageRole, created_at: i64, content: &str| Message {
        id: None,
        idx,
        role,
        author: None,
        created_at: Some(created_at),
        content: content.into(),
        extra_json: serde_json::Value::Null,
        snippets: Vec::new(),
    };

    // Both conversations share every identifying field and differ only in messages.
    let gap_session = |messages: Vec<Message>| Conversation {
        id: None,
        agent_slug: "codex".into(),
        workspace: Some(PathBuf::from("/tmp/workspace")),
        external_id: Some("shared-session-gap".into()),
        title: Some("Shared Session Gap".into()),
        source_path: PathBuf::from("/tmp/rollout.jsonl"),
        started_at: Some(1_700_000_000_000),
        ended_at: Some(1_700_000_000_999),
        approx_tokens: None,
        metadata_json: serde_json::Value::Null,
        messages,
        source_id: "local".into(),
        origin_host: None,
    };

    // The later half of the session arrives first.
    let conv_a = gap_session(vec![
        msg(2, MessageRole::User, 1_700_000_000_200, "third"),
        msg(3, MessageRole::Agent, 1_700_000_000_300, "fourth"),
    ]);
    // The earlier half follows, overlapping with the first entry at idx 3.
    let conv_b = gap_session(vec![
        msg(0, MessageRole::User, 1_700_000_000_000, "first"),
        msg(1, MessageRole::Agent, 1_700_000_000_100, "second"),
        msg(3, MessageRole::Agent, 1_700_000_000_300, "fourth"),
    ]);

    let outcomes = storage
        .insert_conversations_batched(&[(agent_id, None, &conv_a), (agent_id, None, &conv_b)])
        .unwrap();
    assert_eq!(outcomes.len(), 2);
    assert_eq!(outcomes[0].inserted_indices, vec![2, 3]);
    assert_eq!(outcomes[1].inserted_indices, vec![0, 1]);
    assert_eq!(outcomes[0].conversation_id, outcomes[1].conversation_id);

    let persisted_indices: Vec<i64> = storage
        .conn
        .query_map_collect("SELECT idx FROM messages ORDER BY idx", fparams![], |row| {
            row.get_typed(0)
        })
        .unwrap();
    assert_eq!(persisted_indices, vec![0, 1, 2, 3]);
}
18225
/// Stores a conversation with messages at idx 0 and idx 20, then submits one
/// batch holding two entries for that same session: a copy of the idx 0
/// message and an idx 20 message with different content. Neither batch entry
/// may insert anything, and the stored rows must keep their original content.
#[test]
fn insert_conversations_batched_refreshes_partial_pending_message_lookup() {
    use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
    use std::path::PathBuf;

    // Only idx 0 is a user turn; every other idx is an agent turn.
    fn message_at(idx: i64, content: &str) -> Message {
        Message {
            id: None,
            idx,
            role: match idx {
                0 => MessageRole::User,
                _ => MessageRole::Agent,
            },
            author: None,
            created_at: Some(1_700_000_000_000 + idx),
            content: content.into(),
            extra_json: serde_json::Value::Null,
            snippets: Vec::new(),
        }
    }

    // All fixtures share the same identity fields and differ only in messages.
    fn session_with(messages: Vec<Message>) -> Conversation {
        Conversation {
            id: None,
            agent_slug: "codex".into(),
            workspace: Some(PathBuf::from("/tmp/workspace")),
            external_id: Some("partial-cache-session".into()),
            title: Some("Partial cache session".into()),
            source_path: PathBuf::from("/tmp/partial-cache.jsonl"),
            started_at: Some(1_700_000_000_000),
            ended_at: Some(1_700_000_000_100),
            approx_tokens: None,
            metadata_json: serde_json::Value::Null,
            messages,
            source_id: "local".into(),
            origin_host: None,
        }
    }

    let dir = TempDir::new().unwrap();
    let db_path = dir.path().join("test.db");
    let storage = SqliteStorage::open(&db_path).unwrap();

    let agent_id = storage
        .ensure_agent(&Agent {
            id: None,
            slug: "codex".into(),
            name: "Codex".into(),
            version: Some("0.2.3".into()),
            kind: AgentKind::Cli,
        })
        .unwrap();

    // Seed the canonical rows through the single-conversation path.
    let canonical = session_with(vec![
        message_at(0, "canonical zero"),
        message_at(20, "canonical twenty"),
    ]);
    storage
        .insert_conversation_tree(agent_id, None, &canonical)
        .unwrap();

    let exact_prefix = session_with(vec![message_at(0, "canonical zero")]);
    let conflicting_tail = session_with(vec![message_at(20, "conflicting twenty")]);

    let outcomes = storage
        .insert_conversations_batched(&[
            (agent_id, None, &exact_prefix),
            (agent_id, None, &conflicting_tail),
        ])
        .unwrap();

    assert_eq!(outcomes.len(), 2);
    assert!(outcomes[0].inserted_indices.is_empty());
    assert!(
        outcomes[1].inserted_indices.is_empty(),
        "the second batch item must refresh the partial pending lookup and retain the canonical idx=20 row"
    );

    // The table must still hold exactly the two canonical rows.
    let expected_rows = vec![
        (0, String::from("canonical zero")),
        (20, String::from("canonical twenty")),
    ];
    let stored_messages: Vec<(i64, String)> = storage
        .conn
        .query_map_collect(
            "SELECT idx, content FROM messages ORDER BY idx",
            fparams![],
            |row| Ok((row.get_typed(0)?, row.get_typed(1)?)),
        )
        .unwrap();
    assert_eq!(stored_messages, expected_rows);
}
18316
/// Sends the same 64-message conversation through the batched insert path
/// twice. The first pass must insert every index, the second pass must insert
/// none, both passes must report the same conversation id, and the database
/// must end up with one conversation row and 64 message rows.
#[test]
fn insert_conversations_batched_reprocessing_conversation_is_idempotent() {
    use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
    use std::path::PathBuf;

    const MESSAGE_COUNT: i64 = 64;

    let dir = TempDir::new().unwrap();
    let db_path = dir.path().join("test.db");
    let storage = SqliteStorage::open(&db_path).unwrap();

    let agent_id = storage
        .ensure_agent(&Agent {
            id: None,
            slug: "codex".into(),
            name: "Codex".into(),
            version: Some("0.2.3".into()),
            kind: AgentKind::Cli,
        })
        .unwrap();

    // Roles alternate, starting with a user turn at idx 0.
    let mut messages: Vec<Message> = Vec::new();
    for idx in 0..MESSAGE_COUNT {
        let role = match idx % 2 {
            0 => MessageRole::User,
            _ => MessageRole::Agent,
        };
        messages.push(Message {
            id: None,
            idx,
            role,
            author: None,
            created_at: Some(1_700_000_000_000 + idx),
            content: format!("message {idx}"),
            extra_json: serde_json::Value::Null,
            snippets: Vec::new(),
        });
    }

    let conversation = Conversation {
        id: None,
        agent_slug: "codex".into(),
        workspace: Some(PathBuf::from("/tmp/workspace")),
        external_id: Some("large-reprocess-session".into()),
        title: Some("Large Reprocess Session".into()),
        source_path: PathBuf::from("/tmp/large-reprocess-session.jsonl"),
        started_at: Some(1_700_000_000_000),
        ended_at: Some(1_700_000_000_000 + MESSAGE_COUNT - 1),
        approx_tokens: None,
        metadata_json: serde_json::Value::Null,
        messages,
        source_id: "local".into(),
        origin_host: None,
    };

    let initial_pass = storage
        .insert_conversations_batched(&[(agent_id, None, &conversation)])
        .unwrap();
    let repeat_pass = storage
        .insert_conversations_batched(&[(agent_id, None, &conversation)])
        .unwrap();

    assert_eq!(initial_pass.len(), 1);
    assert_eq!(repeat_pass.len(), 1);
    assert_eq!(
        initial_pass[0].inserted_indices.len(),
        MESSAGE_COUNT as usize
    );
    assert!(
        repeat_pass[0].inserted_indices.is_empty(),
        "full reprocessing of a large conversation must not attempt duplicate idx inserts"
    );
    assert_eq!(
        initial_pass[0].conversation_id,
        repeat_pass[0].conversation_id
    );

    // Runs a single-value COUNT(*) query and returns the result.
    let count_rows = |sql: &str| -> i64 {
        storage
            .conn
            .query_row_map(sql, fparams![], |row| row.get_typed(0))
            .unwrap()
    };
    let conversation_count = count_rows("SELECT COUNT(*) FROM conversations");
    let message_count = count_rows("SELECT COUNT(*) FROM messages");

    assert_eq!(conversation_count, 1);
    assert_eq!(message_count, MESSAGE_COUNT);
}
18402
/// Inserts ten conversations with distinct external ids from rayon workers,
/// three conversations per chunk, each chunk using its own writer connection
/// to one shared database file. Every conversation must report all three of
/// its message indices as newly inserted, receive its own conversation id,
/// and be visible through a connection opened afterwards.
#[test]
fn parallel_insert_conversation_tree_keeps_unique_external_ids_distinct() {
    use crate::connectors::{NormalizedConversation, NormalizedMessage};
    use crate::indexer::persist::map_to_internal;
    use crate::model::types::{Agent, AgentKind};
    use frankensqlite::compat::{ConnectionExt, RowExt};
    use rand::RngExt;
    use rayon::prelude::*;

    /// Returns true when `err` itself, or otherwise its root cause, is a
    /// `FrankenError` of one of the variants this test retries on.
    fn retryable_franken_error(err: &anyhow::Error) -> bool {
        use frankensqlite::FrankenError;

        let direct = err.downcast_ref::<FrankenError>();
        let Some(inner) = direct.or_else(|| err.root_cause().downcast_ref::<FrankenError>())
        else {
            return false;
        };
        matches!(
            inner,
            FrankenError::Busy
                | FrankenError::BusyRecovery
                | FrankenError::BusySnapshot { .. }
                | FrankenError::WriteConflict { .. }
                | FrankenError::SerializationFailure { .. }
        )
    }

    /// Calls `f` until it succeeds, sleeping with a jittered, doubling backoff
    /// (capped at 512 ms) after each retryable failure. A non-retryable error,
    /// or a failure on the final attempt, is returned to the caller.
    fn with_retry<F, T>(mut f: F) -> anyhow::Result<T>
    where
        F: FnMut() -> anyhow::Result<T>,
    {
        const LAST_ATTEMPT: u32 = 24;

        let mut rng = rand::rng();
        let mut backoff_ms = 4_u64;
        for attempt in 0..=LAST_ATTEMPT {
            let err = match f() {
                Ok(value) => return Ok(value),
                Err(err) => err,
            };
            // The attempt check comes first so the final failure is never classified.
            let may_retry = attempt < LAST_ATTEMPT && retryable_franken_error(&err);
            if !may_retry {
                return Err(err);
            }
            let sleep_ms = backoff_ms + rng.random_range(0..=backoff_ms);
            std::thread::sleep(Duration::from_millis(sleep_ms));
            backoff_ms = (backoff_ms * 2).min(512);
        }
        unreachable!("retry loop must return on success or final failure")
    }

    let dir = TempDir::new().unwrap();
    let db_path = dir.path().join("parallel_insert_conversation_tree.db");
    // Open and immediately drop one handle before any writer connects.
    drop(FrankenStorage::open(&db_path).unwrap());

    // Ten conversations spread over three agent slugs, three messages each.
    let mut conversations: Vec<NormalizedConversation> = Vec::with_capacity(10);
    for i in 0..10 {
        let mut messages = Vec::with_capacity(3);
        for j in 0..3 {
            let role = if j % 2 == 0 { "user" } else { "assistant" };
            messages.push(NormalizedMessage {
                idx: j,
                role: role.to_string(),
                author: Some("tester".into()),
                created_at: Some(1_000 + i * 100 + j * 10),
                content: format!("parallel-distinct-test conv={i} msg={j}"),
                extra: serde_json::json!({}),
                snippets: vec![],
                invocations: Vec::new(),
            });
        }
        conversations.push(NormalizedConversation {
            agent_slug: format!("agent-{}", i % 3),
            external_id: Some(format!("conv-{i}")),
            title: Some(format!("Conversation {i}")),
            workspace: Some(PathBuf::from(format!("/ws/{i}"))),
            source_path: PathBuf::from(format!("/log/{i}.jsonl")),
            started_at: Some(1_000 + i * 100),
            ended_at: Some(1_000 + i * 100 + 50),
            metadata: serde_json::json!({}),
            messages,
        });
    }

    let mut outcomes: Vec<(String, i64, Vec<i64>)> = conversations
        .par_chunks(3)
        .map(|chunk| {
            // Each chunk gets its own writer connection and its own id caches.
            let storage = FrankenStorage::open_writer(&db_path).unwrap();
            let mut agent_ids: HashMap<String, i64> = HashMap::new();
            let mut workspace_ids: HashMap<PathBuf, i64> = HashMap::new();

            let chunk_outcomes: Vec<_> = chunk
                .iter()
                .map(|conv| {
                    let agent_slug = conv.agent_slug.clone();
                    let workspace = conv.workspace.clone();
                    let external_id = conv.external_id.clone().expect("external id");
                    let internal = map_to_internal(conv);

                    // Ids are cached only after a successful ensure_* call, so a
                    // retried attempt looks up whatever is still missing.
                    let outcome = with_retry(|| {
                        let agent_id = match agent_ids.get(&agent_slug).copied() {
                            Some(cached) => cached,
                            None => {
                                let agent = Agent {
                                    id: None,
                                    slug: agent_slug.clone(),
                                    name: agent_slug.clone(),
                                    version: None,
                                    kind: AgentKind::Cli,
                                };
                                let fresh = storage.ensure_agent(&agent)?;
                                agent_ids.insert(agent_slug.clone(), fresh);
                                fresh
                            }
                        };
                        let workspace_id = match workspace.as_ref() {
                            None => None,
                            Some(path) => match workspace_ids.get(path).copied() {
                                Some(cached) => Some(cached),
                                None => {
                                    let fresh = storage.ensure_workspace(path, None)?;
                                    workspace_ids.insert(path.clone(), fresh);
                                    Some(fresh)
                                }
                            },
                        };
                        storage.insert_conversation_tree(agent_id, workspace_id, &internal)
                    })
                    .unwrap();

                    (
                        external_id,
                        outcome.conversation_id,
                        outcome.inserted_indices,
                    )
                })
                .collect();

            storage.close().unwrap();
            chunk_outcomes
        })
        .flatten()
        .collect();
    // Order by external id so failure output does not depend on scheduling.
    outcomes.sort_by(|a, b| a.0.cmp(&b.0));

    let every_index_inserted = outcomes
        .iter()
        .all(|(_, _, inserted)| inserted.as_slice() == [0, 1, 2]);
    assert!(
        every_index_inserted,
        "unique external ids must not be routed through the existing-conversation merge path: {outcomes:?}"
    );

    let mut distinct_ids: HashSet<i64> = HashSet::new();
    for (_, conversation_id, _) in &outcomes {
        distinct_ids.insert(*conversation_id);
    }
    assert_eq!(
        distinct_ids.len(),
        conversations.len(),
        "unique external ids must produce distinct conversation ids: {outcomes:?}"
    );

    // Re-check through a connection opened after all writers have closed.
    let reader = FrankenStorage::open(&db_path).unwrap();
    let stored_rows: Vec<(i64, String)> = reader
        .raw()
        .query_map_collect(
            "SELECT id, external_id FROM conversations ORDER BY id",
            &[],
            |row| Ok((row.get_typed(0)?, row.get_typed(1)?)),
        )
        .unwrap();
    let stored_count: i64 = reader
        .raw()
        .query_row_map("SELECT COUNT(*) FROM conversations", &[], |row| {
            row.get_typed(0)
        })
        .unwrap();

    assert_eq!(
        stored_count as usize,
        conversations.len(),
        "parallel distinct inserts must persist one row per external id; rows={stored_rows:?}; outcomes={outcomes:?}"
    );
    assert_eq!(
        stored_rows.len(),
        conversations.len(),
        "parallel distinct inserts must remain visible after reopening; rows={stored_rows:?}; outcomes={outcomes:?}"
    );
}
18580
/// Inserts two conversations that share one external id through
/// `insert_conversation_tree`: the first carries idx 2 and 3, the second
/// carries idx 0, 1 and 3. Both inserts must resolve to the same conversation
/// id, the second must add only idx 0 and 1, and the table must end up with
/// idx 0 through 3.
#[test]
fn insert_conversation_tree_merges_duplicate_external_id_with_gaps() {
    use crate::connectors::{NormalizedConversation, NormalizedMessage};
    use crate::indexer::persist::map_to_internal;
    use crate::model::types::{Agent, AgentKind};
    use std::path::PathBuf;

    let dir = TempDir::new().unwrap();
    let db_path = dir.path().join("test.db");
    let storage = SqliteStorage::open(&db_path).unwrap();

    let agent_id = storage
        .ensure_agent(&Agent {
            id: None,
            slug: "codex".into(),
            name: "Codex".into(),
            version: Some("0.2.3".into()),
            kind: AgentKind::Cli,
        })
        .unwrap();

    // Builds one normalized message; all remaining fields are empty.
    let normalized_message = |idx, role: &str, created_at, content: &str| NormalizedMessage {
        idx,
        role: role.into(),
        author: None,
        created_at: Some(created_at),
        content: content.into(),
        extra: serde_json::Value::Null,
        snippets: Vec::new(),
        invocations: Vec::new(),
    };

    // Wraps messages in the shared session identity and maps to the internal model.
    let session_with = |messages: Vec<NormalizedMessage>| {
        map_to_internal(&NormalizedConversation {
            agent_slug: "codex".into(),
            workspace: Some(PathBuf::from("/tmp/workspace")),
            external_id: Some("tree-gap-session".into()),
            title: Some("Tree Gap Session".into()),
            source_path: PathBuf::from("/tmp/tree.jsonl"),
            started_at: Some(1_700_000_000_000),
            ended_at: Some(1_700_000_000_999),
            metadata: serde_json::Value::Null,
            messages,
        })
    };

    let conv_a = session_with(vec![
        normalized_message(2, "user", 1_700_000_000_200, "third"),
        normalized_message(3, "assistant", 1_700_000_000_300, "fourth"),
    ]);
    let conv_b = session_with(vec![
        normalized_message(0, "user", 1_700_000_000_000, "first"),
        normalized_message(1, "assistant", 1_700_000_000_100, "second"),
        normalized_message(3, "assistant", 1_700_000_000_300, "fourth"),
    ]);

    let tail_insert = storage
        .insert_conversation_tree(agent_id, None, &conv_a)
        .unwrap();
    let head_insert = storage
        .insert_conversation_tree(agent_id, None, &conv_b)
        .unwrap();

    assert_eq!(tail_insert.inserted_indices, vec![2, 3]);
    assert_eq!(head_insert.inserted_indices, vec![0, 1]);
    assert_eq!(tail_insert.conversation_id, head_insert.conversation_id);

    let stored_indices: Vec<i64> = storage
        .conn
        .query_map_collect("SELECT idx FROM messages ORDER BY idx", fparams![], |row| {
            row.get_typed(0)
        })
        .unwrap();
    assert_eq!(stored_indices, vec![0, 1, 2, 3]);
}
18687
/// Inserts a brand-new conversation whose message list contains idx 0 twice.
/// Only the first idx 0 message may be stored: the outcome must report
/// indices 0 and 1, and the stored idx 0 row must hold the first content.
#[test]
fn insert_conversation_tree_skips_duplicate_message_indices_for_new_conversation() {
    use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
    use std::path::PathBuf;

    let dir = TempDir::new().unwrap();
    let db_path = dir.path().join("test.db");
    let storage = SqliteStorage::open(&db_path).unwrap();

    let agent_id = storage
        .ensure_agent(&Agent {
            id: None,
            slug: "codex".into(),
            name: "Codex".into(),
            version: Some("0.2.3".into()),
            kind: AgentKind::Cli,
        })
        .unwrap();

    // Builds one message; all remaining fields are empty.
    let message = |idx: i64, role: MessageRole, created_at: i64, content: &str| Message {
        id: None,
        idx,
        role,
        author: None,
        created_at: Some(created_at),
        content: content.into(),
        extra_json: serde_json::Value::Null,
        snippets: Vec::new(),
    };

    // The second entry reuses idx 0 with a later timestamp and different text.
    let messages = vec![
        message(0, MessageRole::User, 1_700_000_000_000, "first canonical"),
        message(
            0,
            MessageRole::User,
            1_700_000_000_001,
            "duplicate idx should be skipped",
        ),
        message(1, MessageRole::Agent, 1_700_000_000_100, "second"),
    ];

    let conversation = Conversation {
        id: None,
        agent_slug: "codex".into(),
        workspace: Some(PathBuf::from("/tmp/workspace")),
        external_id: Some("duplicate-new-session".into()),
        title: Some("Duplicate New Session".into()),
        source_path: PathBuf::from("/tmp/duplicate-new-session.jsonl"),
        started_at: Some(1_700_000_000_000),
        ended_at: Some(1_700_000_000_999),
        approx_tokens: None,
        metadata_json: serde_json::Value::Null,
        messages,
        source_id: "local".into(),
        origin_host: None,
    };

    let outcome = storage
        .insert_conversation_tree(agent_id, None, &conversation)
        .unwrap();

    assert_eq!(outcome.inserted_indices, vec![0, 1]);

    let expected_rows = vec![
        (0, String::from("first canonical")),
        (1, String::from("second")),
    ];
    let stored_messages: Vec<(i64, String)> = storage
        .conn
        .query_map_collect(
            "SELECT idx, content FROM messages ORDER BY idx",
            fparams![],
            |row| Ok((row.get_typed(0)?, row.get_typed(1)?)),
        )
        .unwrap();
    assert_eq!(stored_messages, expected_rows);
}
18775
/// Inserts two conversations that have no external id but share a source
/// path and start time: the first carries idx 0 and 1, the second carries
/// idx 1 and 2. Both must resolve to the same conversation id, the second
/// insert must add only idx 2, and the table must end up with idx 0, 1, 2.
#[test]
fn insert_conversation_tree_merges_duplicate_source_path_without_external_id() {
    use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
    use std::path::PathBuf;

    let dir = TempDir::new().unwrap();
    let db_path = dir.path().join("test.db");
    let storage = SqliteStorage::open(&db_path).unwrap();

    let agent_id = storage
        .ensure_agent(&Agent {
            id: None,
            slug: "codex".into(),
            name: "Codex".into(),
            version: Some("0.2.3".into()),
            kind: AgentKind::Cli,
        })
        .unwrap();

    // Builds one message; all remaining fields are empty.
    let message = |idx: i64, role: MessageRole, created_at: i64, content: &str| Message {
        id: None,
        idx,
        role,
        author: None,
        created_at: Some(created_at),
        content: content.into(),
        extra_json: serde_json::Value::Null,
        snippets: Vec::new(),
    };

    // Both fixtures leave `external_id` unset and point at the same source path.
    let session_with = |messages: Vec<Message>| Conversation {
        id: None,
        agent_slug: "codex".into(),
        workspace: Some(PathBuf::from("/tmp/workspace")),
        external_id: None,
        title: Some("Source Path Merge".into()),
        source_path: PathBuf::from("/tmp/shared-session.jsonl"),
        started_at: Some(1_700_000_000_000),
        ended_at: Some(1_700_000_000_999),
        approx_tokens: None,
        metadata_json: serde_json::Value::Null,
        messages,
        source_id: "local".into(),
        origin_host: None,
    };

    let opening = session_with(vec![
        message(0, MessageRole::User, 1_700_000_000_000, "first"),
        message(1, MessageRole::Agent, 1_700_000_000_100, "second"),
    ]);
    let first = storage
        .insert_conversation_tree(agent_id, None, &opening)
        .unwrap();

    let continuation = session_with(vec![
        message(1, MessageRole::Agent, 1_700_000_000_100, "second"),
        message(2, MessageRole::User, 1_700_000_000_200, "third"),
    ]);
    let second = storage
        .insert_conversation_tree(agent_id, None, &continuation)
        .unwrap();

    assert_eq!(first.conversation_id, second.conversation_id);
    assert_eq!(first.inserted_indices, vec![0, 1]);
    assert_eq!(second.inserted_indices, vec![2]);

    let stored_indices: Vec<i64> = storage
        .conn
        .query_map_collect("SELECT idx FROM messages ORDER BY idx", fparams![], |row| {
            row.get_typed(0)
        })
        .unwrap();
    assert_eq!(stored_indices, vec![0, 1, 2]);
}
18880
/// Inserts two conversations that have no external id and share a source
/// path, where the second reports a `started_at` that is 4_000 larger than
/// the first and repeats the idx 1 message. Both must resolve to the same
/// conversation id, and the second insert must add only idx 2.
#[test]
fn insert_conversation_tree_merges_source_path_duplicates_with_start_drift() {
    use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
    use std::path::PathBuf;

    let dir = TempDir::new().unwrap();
    let db_path = dir.path().join("test.db");
    let storage = SqliteStorage::open(&db_path).unwrap();

    let agent_id = storage
        .ensure_agent(&Agent {
            id: None,
            slug: "codex".into(),
            name: "Codex".into(),
            version: Some("0.2.3".into()),
            kind: AgentKind::Cli,
        })
        .unwrap();

    // Builds one message; all remaining fields are empty.
    let message = |idx: i64, role: MessageRole, created_at: i64, content: &str| Message {
        id: None,
        idx,
        role,
        author: None,
        created_at: Some(created_at),
        content: content.into(),
        extra_json: serde_json::Value::Null,
        snippets: Vec::new(),
    };

    // Only `started_at` and the messages vary between the two fixtures.
    let session_with = |started_at: Option<i64>, messages: Vec<Message>| Conversation {
        id: None,
        agent_slug: "codex".into(),
        workspace: Some(PathBuf::from("/tmp/workspace")),
        external_id: None,
        title: Some("Drift Merge".into()),
        source_path: PathBuf::from("/tmp/drift-session.jsonl"),
        started_at,
        ended_at: Some(1_700_000_000_999),
        approx_tokens: None,
        metadata_json: serde_json::Value::Null,
        messages,
        source_id: "local".into(),
        origin_host: None,
    };

    let original = session_with(
        Some(1_700_000_000_000),
        vec![
            message(0, MessageRole::User, 1_700_000_000_000, "first"),
            message(1, MessageRole::Agent, 1_700_000_000_100, "second"),
        ],
    );
    let first = storage
        .insert_conversation_tree(agent_id, None, &original)
        .unwrap();

    let drifted = session_with(
        Some(1_700_000_004_000),
        vec![
            message(1, MessageRole::Agent, 1_700_000_000_100, "second"),
            message(2, MessageRole::User, 1_700_000_004_200, "third"),
        ],
    );
    let second = storage
        .insert_conversation_tree(agent_id, None, &drifted)
        .unwrap();

    assert_eq!(first.conversation_id, second.conversation_id);
    assert_eq!(second.inserted_indices, vec![2]);
}
18982
18983    #[test]
18984    fn insert_conversation_tree_keeps_single_message_overlap_sessions_separate() {
18985        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
18986        use std::path::PathBuf;
18987
18988        let dir = TempDir::new().unwrap();
18989        let db_path = dir.path().join("test.db");
18990        let storage = SqliteStorage::open(&db_path).unwrap();
18991
18992        let agent = Agent {
18993            id: None,
18994            slug: "codex".into(),
18995            name: "Codex".into(),
18996            version: Some("0.2.3".into()),
18997            kind: AgentKind::Cli,
18998        };
18999        let agent_id = storage.ensure_agent(&agent).unwrap();
19000
19001        let make_conv = |started_at: i64, idx: i64, content: &str| Conversation {
19002            id: None,
19003            agent_slug: "codex".into(),
19004            workspace: Some(PathBuf::from("/tmp/workspace")),
19005            external_id: None,
19006            title: Some("Partial overlap".into()),
19007            source_path: PathBuf::from("/tmp/reused-session.jsonl"),
19008            started_at: Some(started_at),
19009            ended_at: Some(started_at + 500),
19010            approx_tokens: None,
19011            metadata_json: serde_json::Value::Null,
19012            messages: vec![Message {
19013                id: None,
19014                idx,
19015                role: MessageRole::User,
19016                author: None,
19017                created_at: Some(started_at),
19018                content: content.into(),
19019                extra_json: serde_json::Value::Null,
19020                snippets: Vec::new(),
19021            }],
19022            source_id: "local".into(),
19023            origin_host: None,
19024        };
19025
19026        storage
19027            .insert_conversation_tree(
19028                agent_id,
19029                None,
19030                &Conversation {
19031                    messages: vec![
19032                        Message {
19033                            id: None,
19034                            idx: 0,
19035                            role: MessageRole::User,
19036                            author: None,
19037                            created_at: Some(1_700_000_000_000),
19038                            content: "shared opener".into(),
19039                            extra_json: serde_json::Value::Null,
19040                            snippets: Vec::new(),
19041                        },
19042                        Message {
19043                            id: None,
19044                            idx: 1,
19045                            role: MessageRole::Agent,
19046                            author: None,
19047                            created_at: Some(1_700_000_000_100),
19048                            content: "first session unique".into(),
19049                            extra_json: serde_json::Value::Null,
19050                            snippets: Vec::new(),
19051                        },
19052                    ],
19053                    ..make_conv(1_700_000_000_000, 0, "unused")
19054                },
19055            )
19056            .unwrap();
19057        storage
19058            .insert_conversation_tree(
19059                agent_id,
19060                None,
19061                &make_conv(1_700_000_900_000, 0, "shared opener"),
19062            )
19063            .unwrap();
19064
19065        let conversation_count: i64 = storage
19066            .conn
19067            .query_row_map("SELECT COUNT(*) FROM conversations", fparams![], |row| {
19068                row.get_typed(0)
19069            })
19070            .unwrap();
19071        assert_eq!(conversation_count, 2);
19072    }
19073
19074    #[test]
19075    fn insert_conversation_tree_keeps_distinct_source_path_sessions_separate() {
19076        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
19077        use std::path::PathBuf;
19078
19079        let dir = TempDir::new().unwrap();
19080        let db_path = dir.path().join("test.db");
19081        let storage = SqliteStorage::open(&db_path).unwrap();
19082
19083        let agent = Agent {
19084            id: None,
19085            slug: "codex".into(),
19086            name: "Codex".into(),
19087            version: Some("0.2.3".into()),
19088            kind: AgentKind::Cli,
19089        };
19090        let agent_id = storage.ensure_agent(&agent).unwrap();
19091
19092        let make_conv = |started_at: i64, created_at: i64, content: &str| Conversation {
19093            id: None,
19094            agent_slug: "codex".into(),
19095            workspace: Some(PathBuf::from("/tmp/workspace")),
19096            external_id: None,
19097            title: Some("Same Path Different Session".into()),
19098            source_path: PathBuf::from("/tmp/reused-session.jsonl"),
19099            started_at: Some(started_at),
19100            ended_at: Some(started_at + 500),
19101            approx_tokens: None,
19102            metadata_json: serde_json::Value::Null,
19103            messages: vec![Message {
19104                id: None,
19105                idx: 0,
19106                role: MessageRole::User,
19107                author: None,
19108                created_at: Some(created_at),
19109                content: content.into(),
19110                extra_json: serde_json::Value::Null,
19111                snippets: Vec::new(),
19112            }],
19113            source_id: "local".into(),
19114            origin_host: None,
19115        };
19116
19117        storage
19118            .insert_conversation_tree(
19119                agent_id,
19120                None,
19121                &make_conv(1_700_000_000_000, 1_700_000_000_000, "first session"),
19122            )
19123            .unwrap();
19124        storage
19125            .insert_conversation_tree(
19126                agent_id,
19127                None,
19128                &make_conv(1_700_000_900_000, 1_700_000_900_000, "second session"),
19129            )
19130            .unwrap();
19131
19132        let conversation_count: i64 = storage
19133            .conn
19134            .query_row_map("SELECT COUNT(*) FROM conversations", fparams![], |row| {
19135                row.get_typed(0)
19136            })
19137            .unwrap();
19138        assert_eq!(conversation_count, 2);
19139    }
19140
19141    #[test]
19142    fn insert_conversation_tree_merges_replay_equivalent_messages_with_shifted_idx() {
19143        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
19144        use std::path::PathBuf;
19145
19146        let dir = TempDir::new().unwrap();
19147        let db_path = dir.path().join("test.db");
19148        let storage = SqliteStorage::open(&db_path).unwrap();
19149
19150        let agent = Agent {
19151            id: None,
19152            slug: "codex".into(),
19153            name: "Codex".into(),
19154            version: Some("0.2.3".into()),
19155            kind: AgentKind::Cli,
19156        };
19157        let agent_id = storage.ensure_agent(&agent).unwrap();
19158
19159        let make_conv = |started_at: i64, messages: Vec<Message>| Conversation {
19160            id: None,
19161            agent_slug: "codex".into(),
19162            workspace: Some(PathBuf::from("/tmp/workspace")),
19163            external_id: None,
19164            title: Some("Shifted replay".into()),
19165            source_path: PathBuf::from("/tmp/replay-session.jsonl"),
19166            started_at: Some(started_at),
19167            ended_at: Some(started_at + 500),
19168            approx_tokens: None,
19169            metadata_json: serde_json::Value::Null,
19170            messages,
19171            source_id: "local".into(),
19172            origin_host: None,
19173        };
19174
19175        let first = storage
19176            .insert_conversation_tree(
19177                agent_id,
19178                None,
19179                &make_conv(
19180                    1_700_000_000_000,
19181                    vec![
19182                        Message {
19183                            id: None,
19184                            idx: 0,
19185                            role: MessageRole::User,
19186                            author: None,
19187                            created_at: Some(1_700_000_000_000),
19188                            content: "first".into(),
19189                            extra_json: serde_json::Value::Null,
19190                            snippets: Vec::new(),
19191                        },
19192                        Message {
19193                            id: None,
19194                            idx: 1,
19195                            role: MessageRole::Agent,
19196                            author: None,
19197                            created_at: Some(1_700_000_000_100),
19198                            content: "second".into(),
19199                            extra_json: serde_json::Value::Null,
19200                            snippets: Vec::new(),
19201                        },
19202                    ],
19203                ),
19204            )
19205            .unwrap();
19206
19207        let second = storage
19208            .insert_conversation_tree(
19209                agent_id,
19210                None,
19211                &make_conv(
19212                    1_700_000_900_000,
19213                    vec![
19214                        Message {
19215                            id: None,
19216                            idx: 10,
19217                            role: MessageRole::User,
19218                            author: None,
19219                            created_at: Some(1_700_000_000_000),
19220                            content: "first".into(),
19221                            extra_json: serde_json::Value::Null,
19222                            snippets: Vec::new(),
19223                        },
19224                        Message {
19225                            id: None,
19226                            idx: 11,
19227                            role: MessageRole::Agent,
19228                            author: None,
19229                            created_at: Some(1_700_000_000_100),
19230                            content: "second".into(),
19231                            extra_json: serde_json::Value::Null,
19232                            snippets: Vec::new(),
19233                        },
19234                        Message {
19235                            id: None,
19236                            idx: 12,
19237                            role: MessageRole::User,
19238                            author: None,
19239                            created_at: Some(1_700_000_000_200),
19240                            content: "third".into(),
19241                            extra_json: serde_json::Value::Null,
19242                            snippets: Vec::new(),
19243                        },
19244                    ],
19245                ),
19246            )
19247            .unwrap();
19248
19249        assert_eq!(first.conversation_id, second.conversation_id);
19250        assert_eq!(second.inserted_indices, vec![12]);
19251
19252        let stored_indices: Vec<i64> = storage
19253            .conn
19254            .query_map_collect(
19255                "SELECT idx FROM messages WHERE conversation_id = ?1 ORDER BY idx",
19256                fparams![first.conversation_id],
19257                |row| row.get_typed(0),
19258            )
19259            .unwrap();
19260        assert_eq!(stored_indices, vec![0, 1, 12]);
19261    }
19262
19263    #[test]
19264    fn salvage_historical_databases_imports_backups_once_and_merges_overlap() {
19265        use crate::model::types::{Conversation, Message, MessageRole};
19266        use std::path::PathBuf;
19267
19268        fn base_conv(source_path: &str, messages: Vec<Message>) -> Conversation {
19269            Conversation {
19270                id: None,
19271                agent_slug: "codex".into(),
19272                workspace: Some(PathBuf::from("/tmp/workspace")),
19273                external_id: None,
19274                title: Some("Recovered".into()),
19275                source_path: PathBuf::from(source_path),
19276                started_at: Some(1_700_000_000_000),
19277                ended_at: Some(1_700_000_000_999),
19278                approx_tokens: None,
19279                metadata_json: serde_json::Value::Null,
19280                messages,
19281                source_id: "local".into(),
19282                origin_host: None,
19283            }
19284        }
19285
19286        let dir = TempDir::new().unwrap();
19287        let canonical_db = dir.path().join("agent_search.db");
19288        let storage = SqliteStorage::open(&canonical_db).unwrap();
19289
19290        let overlapping_a = base_conv(
19291            "/tmp/shared-history.jsonl",
19292            vec![
19293                Message {
19294                    id: None,
19295                    idx: 0,
19296                    role: MessageRole::User,
19297                    author: None,
19298                    created_at: Some(1_700_000_000_000),
19299                    content: "first".into(),
19300                    extra_json: serde_json::Value::Null,
19301                    snippets: Vec::new(),
19302                },
19303                Message {
19304                    id: None,
19305                    idx: 1,
19306                    role: MessageRole::Agent,
19307                    author: None,
19308                    created_at: Some(1_700_000_000_100),
19309                    content: "second".into(),
19310                    extra_json: serde_json::Value::Null,
19311                    snippets: Vec::new(),
19312                },
19313            ],
19314        );
19315        let overlapping_b = base_conv(
19316            "/tmp/shared-history.jsonl",
19317            vec![
19318                Message {
19319                    id: None,
19320                    idx: 1,
19321                    role: MessageRole::Agent,
19322                    author: None,
19323                    created_at: Some(1_700_000_000_100),
19324                    content: "second".into(),
19325                    extra_json: serde_json::Value::Null,
19326                    snippets: Vec::new(),
19327                },
19328                Message {
19329                    id: None,
19330                    idx: 2,
19331                    role: MessageRole::User,
19332                    author: None,
19333                    created_at: Some(1_700_000_000_200),
19334                    content: "third".into(),
19335                    extra_json: serde_json::Value::Null,
19336                    snippets: Vec::new(),
19337                },
19338            ],
19339        );
19340        let unique = Conversation {
19341            source_path: PathBuf::from("/tmp/unique-history.jsonl"),
19342            messages: vec![Message {
19343                id: None,
19344                idx: 0,
19345                role: MessageRole::User,
19346                author: None,
19347                created_at: Some(1_700_000_001_000),
19348                content: "unique".into(),
19349                extra_json: serde_json::Value::Null,
19350                snippets: Vec::new(),
19351            }],
19352            started_at: Some(1_700_000_001_000),
19353            ended_at: Some(1_700_000_001_100),
19354            ..base_conv("/tmp/unique-history.jsonl", Vec::new())
19355        };
19356
19357        seed_historical_db_direct(
19358            &dir.path()
19359                .join("backups/agent_search.db.20260322T020200.bak"),
19360            std::slice::from_ref(&overlapping_a),
19361        );
19362        seed_historical_db_direct(
19363            &dir.path().join("agent_search.corrupt.20260324_212907"),
19364            &[overlapping_b, unique],
19365        );
19366
19367        let first = storage.salvage_historical_databases(&canonical_db).unwrap();
19368        assert_eq!(first.bundles_considered, 2);
19369        assert_eq!(first.bundles_imported, 2);
19370        assert_eq!(first.messages_imported, 4);
19371
19372        let conversations = storage.list_conversations(10, 0).unwrap();
19373        assert_eq!(conversations.len(), 2);
19374
19375        let shared_id = conversations
19376            .iter()
19377            .find(|conv| conv.source_path == std::path::Path::new("/tmp/shared-history.jsonl"))
19378            .and_then(|conv| conv.id)
19379            .unwrap();
19380        let shared_indices: Vec<i64> = storage
19381            .fetch_messages(shared_id)
19382            .unwrap()
19383            .into_iter()
19384            .map(|msg| msg.idx)
19385            .collect();
19386        assert_eq!(shared_indices, vec![0, 1, 2]);
19387
19388        let second = storage.salvage_historical_databases(&canonical_db).unwrap();
19389        assert_eq!(second.bundles_imported, 0);
19390        assert_eq!(second.messages_imported, 0);
19391    }
19392
19393    #[test]
19394    fn salvage_historical_databases_normalizes_host_only_remote_provenance() {
19395        use crate::model::types::{Conversation, Message, MessageRole};
19396        use std::path::PathBuf;
19397
19398        let dir = TempDir::new().unwrap();
19399        let canonical_db = dir.path().join("agent_search.db");
19400        let storage = SqliteStorage::open(&canonical_db).unwrap();
19401
19402        let host_only_remote = Conversation {
19403            id: None,
19404            agent_slug: "codex".into(),
19405            workspace: Some(PathBuf::from("/tmp/workspace")),
19406            external_id: None,
19407            title: Some("Recovered Host Only Remote".into()),
19408            source_path: PathBuf::from("/tmp/host-only-history.jsonl"),
19409            started_at: Some(1_700_000_000_000),
19410            ended_at: Some(1_700_000_000_999),
19411            approx_tokens: None,
19412            metadata_json: serde_json::Value::Null,
19413            messages: vec![Message {
19414                id: None,
19415                idx: 0,
19416                role: MessageRole::User,
19417                author: None,
19418                created_at: Some(1_700_000_000_000),
19419                content: "host-only remote".into(),
19420                extra_json: serde_json::Value::Null,
19421                snippets: Vec::new(),
19422            }],
19423            source_id: "   ".into(),
19424            origin_host: Some("builder-5".into()),
19425        };
19426
19427        let historical_db = dir
19428            .path()
19429            .join("backups/agent_search.db.20260322T020200.bak");
19430        seed_historical_db_direct(&historical_db, std::slice::from_ref(&host_only_remote));
19431
19432        let historical_conn =
19433            FrankenConnection::open(historical_db.to_string_lossy().into_owned()).unwrap();
19434        historical_conn
19435            .execute_compat(
19436                "INSERT INTO sources(id, kind, host_label, created_at, updated_at) VALUES(?1, ?2, ?3, ?4, ?5)",
19437                fparams!["   ", "ssh", "builder-5", 0_i64, 0_i64],
19438            )
19439            .unwrap();
19440        historical_conn
19441            .execute_compat(
19442                "UPDATE conversations SET source_id = ?1, origin_host = ?2 WHERE source_path = ?3",
19443                fparams!["   ", "builder-5", "/tmp/host-only-history.jsonl"],
19444            )
19445            .unwrap();
19446        historical_conn
19447            .execute_compat("DELETE FROM sources WHERE id = ?1", fparams!["builder-5"])
19448            .unwrap();
19449        drop(historical_conn);
19450
19451        let first = storage.salvage_historical_databases(&canonical_db).unwrap();
19452        assert_eq!(first.bundles_imported, 1);
19453        assert_eq!(first.messages_imported, 1);
19454
19455        let source_ids = storage.get_source_ids().unwrap();
19456        assert_eq!(source_ids, vec!["builder-5".to_string()]);
19457
19458        let conversations = storage.list_conversations(10, 0).unwrap();
19459        assert_eq!(conversations.len(), 1);
19460        assert_eq!(conversations[0].source_id, "builder-5");
19461        assert_eq!(conversations[0].origin_host.as_deref(), Some("builder-5"));
19462    }
19463
19464    #[test]
19465    fn historical_salvage_retry_splits_single_conversation_until_it_fits() {
19466        use crate::model::types::{Conversation, Message, MessageRole};
19467        use std::path::PathBuf;
19468
19469        let mut attempts: Vec<Vec<usize>> = Vec::new();
19470        let entry = HistoricalBatchEntry {
19471            source_row_id: 77,
19472            agent_id: 1,
19473            workspace_id: None,
19474            conversation: Conversation {
19475                id: None,
19476                agent_slug: "gemini".into(),
19477                workspace: Some(PathBuf::from("/tmp/workspace")),
19478                external_id: Some("conv-77".into()),
19479                title: Some("Large recovered conversation".into()),
19480                source_path: PathBuf::from("/tmp/history.jsonl"),
19481                started_at: Some(1_700_000_000_000),
19482                ended_at: Some(1_700_000_000_999),
19483                approx_tokens: None,
19484                metadata_json: serde_json::Value::Null,
19485                messages: (0..4)
19486                    .map(|idx| Message {
19487                        id: None,
19488                        idx,
19489                        role: MessageRole::User,
19490                        author: None,
19491                        created_at: Some(1_700_000_000_000 + idx),
19492                        content: format!("message-{idx}"),
19493                        extra_json: serde_json::Value::Null,
19494                        snippets: Vec::new(),
19495                    })
19496                    .collect(),
19497                source_id: LOCAL_SOURCE_ID.into(),
19498                origin_host: None,
19499            },
19500        };
19501
19502        let totals = SqliteStorage::import_historical_batch_with_retry(
19503            std::slice::from_ref(&entry),
19504            &mut |batch| {
19505                attempts.push(
19506                    batch
19507                        .iter()
19508                        .map(|entry| entry.conversation.messages.len())
19509                        .collect(),
19510                );
19511                let total_messages: usize = batch
19512                    .iter()
19513                    .map(|entry| entry.conversation.messages.len())
19514                    .sum();
19515                if total_messages > 1 {
19516                    Err(anyhow!("out of memory"))
19517                } else {
19518                    Ok(HistoricalBatchImportTotals {
19519                        inserted_source_rows: batch.len(),
19520                        inserted_messages: total_messages,
19521                    })
19522                }
19523            },
19524        )
19525        .unwrap();
19526
19527        assert_eq!(
19528            totals,
19529            HistoricalBatchImportTotals {
19530                inserted_source_rows: 1,
19531                inserted_messages: 4,
19532            }
19533        );
19534        assert_eq!(attempts.first().cloned(), Some(vec![4]));
19535        assert!(
19536            attempts.iter().filter(|sizes| sizes == &&vec![1]).count() >= 4,
19537            "expected recursive fallback to reach one-message slices"
19538        );
19539    }
19540
19541    #[test]
19542    fn salvage_historical_databases_resumes_from_progress_checkpoint() {
19543        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
19544        use std::path::PathBuf;
19545
19546        fn make_conv(source_path: &str, idx_seed: i64) -> Conversation {
19547            Conversation {
19548                id: None,
19549                agent_slug: "codex".into(),
19550                workspace: Some(PathBuf::from("/tmp/workspace")),
19551                external_id: Some(format!("conv-{idx_seed}")),
19552                title: Some(format!("Recovered {idx_seed}")),
19553                source_path: PathBuf::from(source_path),
19554                started_at: Some(1_700_000_000_000 + idx_seed),
19555                ended_at: Some(1_700_000_000_100 + idx_seed),
19556                approx_tokens: None,
19557                metadata_json: serde_json::Value::Null,
19558                messages: vec![Message {
19559                    id: None,
19560                    idx: 0,
19561                    role: MessageRole::User,
19562                    author: None,
19563                    created_at: Some(1_700_000_000_000 + idx_seed),
19564                    content: format!("message-{idx_seed}"),
19565                    extra_json: serde_json::Value::Null,
19566                    snippets: Vec::new(),
19567                }],
19568                source_id: LOCAL_SOURCE_ID.into(),
19569                origin_host: None,
19570            }
19571        }
19572
19573        let dir = TempDir::new().unwrap();
19574        let canonical_db = dir.path().join("agent_search.db");
19575        let backup_db = dir
19576            .path()
19577            .join("backups/agent_search.db.20260322T020200.bak");
19578        let storage = SqliteStorage::open(&canonical_db).unwrap();
19579        let conv_a = make_conv("/tmp/one.jsonl", 1);
19580        let conv_b = make_conv("/tmp/two.jsonl", 2);
19581        let conv_c = make_conv("/tmp/three.jsonl", 3);
19582        seed_historical_db_direct(
19583            &backup_db,
19584            &[conv_a.clone(), conv_b.clone(), conv_c.clone()],
19585        );
19586
19587        let agent = Agent {
19588            id: None,
19589            slug: "codex".into(),
19590            name: "Codex".into(),
19591            version: Some("0.2.3".into()),
19592            kind: AgentKind::Cli,
19593        };
19594        let agent_id = storage.ensure_agent(&agent).unwrap();
19595        storage
19596            .insert_conversation_tree(agent_id, None, &conv_a)
19597            .unwrap();
19598
19599        let bundle = discover_historical_database_bundles(&canonical_db)
19600            .into_iter()
19601            .find(|bundle| bundle.root_path == backup_db)
19602            .unwrap();
19603        let first_row_id: i64 = FrankenConnection::open(backup_db.to_string_lossy().into_owned())
19604            .unwrap()
19605            .query_row_map(
19606                "SELECT id FROM conversations WHERE source_path = ?1",
19607                fparams!["/tmp/one.jsonl"],
19608                |row| row.get_typed(0),
19609            )
19610            .unwrap();
19611        storage
19612            .record_historical_bundle_progress(&bundle, "direct-readonly", first_row_id, 50, 99)
19613            .unwrap();
19614
19615        let outcome = storage.salvage_historical_databases(&canonical_db).unwrap();
19616        assert_eq!(outcome.bundles_imported, 1);
19617        assert_eq!(outcome.conversations_imported, 52);
19618        assert_eq!(outcome.messages_imported, 101);
19619        assert_eq!(storage.list_conversations(10, 0).unwrap().len(), 3);
19620
19621        let progress_key = SqliteStorage::historical_bundle_progress_key(&bundle);
19622        let progress_left: Option<String> = storage
19623            .conn
19624            .query_row_map(
19625                "SELECT value FROM meta WHERE key = ?1",
19626                fparams![progress_key.as_str()],
19627                |row| row.get_typed(0),
19628            )
19629            .optional()
19630            .unwrap();
19631        assert!(
19632            progress_left.is_none(),
19633            "completed salvage should clear bundle progress"
19634        );
19635
19636        let second = storage.salvage_historical_databases(&canonical_db).unwrap();
19637        assert_eq!(second.bundles_imported, 0);
19638        assert_eq!(second.messages_imported, 0);
19639    }
19640
19641    #[test]
19642    fn salvage_historical_databases_skips_bundle_when_checkpoint_covers_backup() {
19643        // Regression for issue #247 (coding_agent_session_search-r8pcy): a bundle
19644        // whose progress checkpoint already covers the backup's entire conversation
19645        // row-id space (daemon OOM-killed after the last batch committed but before
19646        // the completion ledger marker landed) must be ledgered + skipped, not
19647        // re-scanned O(n) with imported=0 every batch.
19648        use crate::model::types::{Conversation, Message, MessageRole};
19649        use std::path::PathBuf;
19650
19651        fn make_conv(source_path: &str, idx_seed: i64) -> Conversation {
19652            Conversation {
19653                id: None,
19654                agent_slug: "codex".into(),
19655                workspace: Some(PathBuf::from("/tmp/workspace")),
19656                external_id: Some(format!("conv-{idx_seed}")),
19657                title: Some(format!("Recovered {idx_seed}")),
19658                source_path: PathBuf::from(source_path),
19659                started_at: Some(1_700_000_000_000 + idx_seed),
19660                ended_at: Some(1_700_000_000_100 + idx_seed),
19661                approx_tokens: None,
19662                metadata_json: serde_json::Value::Null,
19663                messages: vec![Message {
19664                    id: None,
19665                    idx: 0,
19666                    role: MessageRole::User,
19667                    author: None,
19668                    created_at: Some(1_700_000_000_000 + idx_seed),
19669                    content: format!("message-{idx_seed}"),
19670                    extra_json: serde_json::Value::Null,
19671                    snippets: Vec::new(),
19672                }],
19673                source_id: LOCAL_SOURCE_ID.into(),
19674                origin_host: None,
19675            }
19676        }
19677
19678        let dir = TempDir::new().unwrap();
19679        let canonical_db = dir.path().join("agent_search.db");
19680        let backup_db = dir
19681            .path()
19682            .join("backups/agent_search.db.20260322T020200.bak");
19683        let storage = SqliteStorage::open(&canonical_db).unwrap();
19684        seed_historical_db_direct(
19685            &backup_db,
19686            &[
19687                make_conv("/tmp/one.jsonl", 1),
19688                make_conv("/tmp/two.jsonl", 2),
19689                make_conv("/tmp/three.jsonl", 3),
19690            ],
19691        );
19692
19693        let bundle = discover_historical_database_bundles(&canonical_db)
19694            .into_iter()
19695            .find(|bundle| bundle.root_path == backup_db)
19696            .unwrap();
19697
19698        // Checkpoint high-water mark == backup's max conversation id.
19699        let backup_max_id: i64 = FrankenConnection::open(backup_db.to_string_lossy().into_owned())
19700            .unwrap()
19701            .query_row_map(
19702                "SELECT COALESCE(MAX(id), 0) FROM conversations",
19703                fparams![],
19704                |row| row.get_typed(0),
19705            )
19706            .unwrap();
19707        assert!(backup_max_id > 0, "seeded backup should have conversations");
19708        storage
19709            .record_historical_bundle_progress(&bundle, "direct-readonly", backup_max_id, 3, 3)
19710            .unwrap();
19711
19712        let outcome = storage.salvage_historical_databases(&canonical_db).unwrap();
19713        assert_eq!(
19714            outcome.bundles_imported, 0,
19715            "fully-checkpointed bundle must not be re-scanned"
19716        );
19717        assert_eq!(outcome.conversations_imported, 0);
19718        assert_eq!(outcome.messages_imported, 0);
19719        assert_eq!(
19720            storage.list_conversations(10, 0).unwrap().len(),
19721            0,
19722            "skip path must not import anything"
19723        );
19724        assert!(
19725            storage.historical_bundle_already_imported(&bundle).unwrap(),
19726            "skipped bundle must be ledgered as salvaged so future runs short-circuit"
19727        );
19728
19729        let progress_key = SqliteStorage::historical_bundle_progress_key(&bundle);
19730        let progress_left: Option<String> = storage
19731            .conn
19732            .query_row_map(
19733                "SELECT value FROM meta WHERE key = ?1",
19734                fparams![progress_key.as_str()],
19735                |row| row.get_typed(0),
19736            )
19737            .optional()
19738            .unwrap();
19739        assert!(
19740            progress_left.is_none(),
19741            "skip path must clear the bundle progress checkpoint"
19742        );
19743    }
19744
19745    #[test]
19746    fn list_conversations_for_lexical_rebuild_uses_stable_id_order() {
19747        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
19748        use std::path::PathBuf;
19749
19750        let dir = TempDir::new().unwrap();
19751        let db_path = dir.path().join("agent_search.db");
19752        let storage = SqliteStorage::open(&db_path).unwrap();
19753        let agent = Agent {
19754            id: None,
19755            slug: "codex".into(),
19756            name: "Codex".into(),
19757            version: Some("0.2.3".into()),
19758            kind: AgentKind::Cli,
19759        };
19760        let agent_id = storage.ensure_agent(&agent).unwrap();
19761
19762        let make_conv = |source_path: &str, started_at: i64| Conversation {
19763            id: None,
19764            agent_slug: "codex".into(),
19765            workspace: Some(PathBuf::from("/tmp/workspace")),
19766            external_id: Some(source_path.to_string()),
19767            title: Some(source_path.to_string()),
19768            source_path: PathBuf::from(source_path),
19769            started_at: Some(started_at),
19770            ended_at: Some(started_at + 1),
19771            approx_tokens: None,
19772            metadata_json: serde_json::Value::Null,
19773            messages: vec![Message {
19774                id: None,
19775                idx: 0,
19776                role: MessageRole::User,
19777                author: None,
19778                created_at: Some(started_at),
19779                content: format!("message for {source_path}"),
19780                extra_json: serde_json::Value::Null,
19781                snippets: Vec::new(),
19782            }],
19783            source_id: LOCAL_SOURCE_ID.into(),
19784            origin_host: None,
19785        };
19786
19787        let conv_a = make_conv("/tmp/a.jsonl", 3_000);
19788        let conv_b = make_conv("/tmp/b.jsonl", 1_000);
19789        let conv_c = make_conv("/tmp/c.jsonl", 2_000);
19790
19791        storage
19792            .insert_conversation_tree(agent_id, None, &conv_a)
19793            .unwrap();
19794        storage
19795            .insert_conversation_tree(agent_id, None, &conv_b)
19796            .unwrap();
19797        storage
19798            .insert_conversation_tree(agent_id, None, &conv_c)
19799            .unwrap();
19800
19801        let user_order: Vec<PathBuf> = storage
19802            .list_conversations(10, 0)
19803            .unwrap()
19804            .into_iter()
19805            .map(|conv| conv.source_path)
19806            .collect();
19807        assert_eq!(
19808            user_order,
19809            vec![
19810                PathBuf::from("/tmp/a.jsonl"),
19811                PathBuf::from("/tmp/c.jsonl"),
19812                PathBuf::from("/tmp/b.jsonl"),
19813            ]
19814        );
19815
19816        let (agent_slugs, workspace_paths) = storage.build_lexical_rebuild_lookups().unwrap();
19817        let rebuild_order: Vec<PathBuf> = storage
19818            .list_conversations_for_lexical_rebuild_after_id(10, 0, &agent_slugs, &workspace_paths)
19819            .unwrap()
19820            .into_iter()
19821            .map(|conv| conv.source_path)
19822            .collect();
19823        assert_eq!(
19824            rebuild_order,
19825            vec![
19826                PathBuf::from("/tmp/a.jsonl"),
19827                PathBuf::from("/tmp/b.jsonl"),
19828                PathBuf::from("/tmp/c.jsonl"),
19829            ]
19830        );
19831
19832        let first_page = storage
19833            .list_conversations_for_lexical_rebuild_after_id(2, 0, &agent_slugs, &workspace_paths)
19834            .unwrap();
19835        let first_page_paths: Vec<PathBuf> = first_page
19836            .iter()
19837            .map(|conv| conv.source_path.clone())
19838            .collect();
19839        assert_eq!(
19840            first_page_paths,
19841            vec![PathBuf::from("/tmp/a.jsonl"), PathBuf::from("/tmp/b.jsonl")]
19842        );
19843
19844        let second_page = storage
19845            .list_conversations_for_lexical_rebuild_after_id(
19846                2,
19847                first_page
19848                    .last()
19849                    .and_then(|conv| conv.id)
19850                    .expect("first page should include an id"),
19851                &agent_slugs,
19852                &workspace_paths,
19853            )
19854            .unwrap();
19855        let second_page_paths: Vec<PathBuf> = second_page
19856            .iter()
19857            .map(|conv| conv.source_path.clone())
19858            .collect();
19859        assert_eq!(second_page_paths, vec![PathBuf::from("/tmp/c.jsonl")]);
19860
19861        let bounded_page = storage
19862            .list_conversations_for_lexical_rebuild_after_id_through_id(
19863                10,
19864                0,
19865                first_page
19866                    .last()
19867                    .and_then(|conv| conv.id)
19868                    .expect("first page should include an id"),
19869                &agent_slugs,
19870                &workspace_paths,
19871            )
19872            .unwrap();
19873        let bounded_paths: Vec<PathBuf> = bounded_page
19874            .iter()
19875            .map(|conv| conv.source_path.clone())
19876            .collect();
19877        assert_eq!(
19878            bounded_paths,
19879            vec![PathBuf::from("/tmp/a.jsonl"), PathBuf::from("/tmp/b.jsonl")]
19880        );
19881    }
19882
19883    #[test]
19884    fn keyset_traversal_handles_sparse_holey_conversation_ids() {
19885        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
19886        use std::path::PathBuf;
19887
19888        let dir = TempDir::new().unwrap();
19889        let db_path = dir.path().join("agent_search.db");
19890        let storage = SqliteStorage::open(&db_path).unwrap();
19891        let agent = Agent {
19892            id: None,
19893            slug: "codex".into(),
19894            name: "Codex".into(),
19895            version: Some("0.2.3".into()),
19896            kind: AgentKind::Cli,
19897        };
19898        let agent_id = storage.ensure_agent(&agent).unwrap();
19899
19900        let make_conv = |label: &str, ts: i64| Conversation {
19901            id: None,
19902            agent_slug: "codex".into(),
19903            workspace: Some(PathBuf::from("/tmp/workspace")),
19904            external_id: Some(label.to_string()),
19905            title: Some(label.to_string()),
19906            source_path: PathBuf::from(format!("/tmp/{label}.jsonl")),
19907            started_at: Some(ts),
19908            ended_at: Some(ts + 1),
19909            approx_tokens: None,
19910            metadata_json: serde_json::Value::Null,
19911            messages: vec![Message {
19912                id: None,
19913                idx: 0,
19914                role: MessageRole::User,
19915                author: None,
19916                created_at: Some(ts),
19917                content: format!("msg for {label}"),
19918                extra_json: serde_json::Value::Null,
19919                snippets: Vec::new(),
19920            }],
19921            source_id: LOCAL_SOURCE_ID.into(),
19922            origin_host: None,
19923        };
19924
19925        for i in 0..6 {
19926            storage
19927                .insert_conversation_tree(
19928                    agent_id,
19929                    None,
19930                    &make_conv(&format!("conv-{i}"), 1000 + i),
19931                )
19932                .unwrap();
19933        }
19934
19935        storage.conn.execute("PRAGMA foreign_keys = OFF").unwrap();
19936        storage
19937            .conn
19938            .execute_compat("DELETE FROM conversations WHERE id IN (2, 4)", fparams![])
19939            .unwrap();
19940        storage
19941            .conn
19942            .execute_compat(
19943                "DELETE FROM messages WHERE conversation_id IN (2, 4)",
19944                fparams![],
19945            )
19946            .unwrap();
19947        storage.conn.execute("PRAGMA foreign_keys = ON").unwrap();
19948
19949        let (agent_slugs, workspace_paths) = storage.build_lexical_rebuild_lookups().unwrap();
19950
19951        let page1 = storage
19952            .list_conversations_for_lexical_rebuild_after_id(2, 0, &agent_slugs, &workspace_paths)
19953            .unwrap();
19954        assert_eq!(page1.len(), 2);
19955        let page1_ids: Vec<i64> = page1.iter().map(|c| c.id.unwrap()).collect();
19956        assert_eq!(page1_ids, vec![1, 3]);
19957
19958        let page2 = storage
19959            .list_conversations_for_lexical_rebuild_after_id(
19960                2,
19961                *page1_ids.last().unwrap(),
19962                &agent_slugs,
19963                &workspace_paths,
19964            )
19965            .unwrap();
19966        assert_eq!(page2.len(), 2);
19967        let page2_ids: Vec<i64> = page2.iter().map(|c| c.id.unwrap()).collect();
19968        assert_eq!(page2_ids, vec![5, 6]);
19969
19970        let page3 = storage
19971            .list_conversations_for_lexical_rebuild_after_id(
19972                2,
19973                *page2_ids.last().unwrap(),
19974                &agent_slugs,
19975                &workspace_paths,
19976            )
19977            .unwrap();
19978        assert!(page3.is_empty());
19979
19980        let all_ids: Vec<i64> = page1_ids.iter().chain(page2_ids.iter()).copied().collect();
19981        assert_eq!(all_ids, vec![1, 3, 5, 6]);
19982    }
19983
19984    #[test]
19985    fn keyset_traversal_through_id_with_sparse_ranges() {
19986        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
19987        use std::path::PathBuf;
19988
19989        let dir = TempDir::new().unwrap();
19990        let db_path = dir.path().join("agent_search.db");
19991        let storage = SqliteStorage::open(&db_path).unwrap();
19992        let agent = Agent {
19993            id: None,
19994            slug: "codex".into(),
19995            name: "Codex".into(),
19996            version: Some("0.2.3".into()),
19997            kind: AgentKind::Cli,
19998        };
19999        let agent_id = storage.ensure_agent(&agent).unwrap();
20000
20001        let make_conv = |label: &str, ts: i64| Conversation {
20002            id: None,
20003            agent_slug: "codex".into(),
20004            workspace: Some(PathBuf::from("/tmp/workspace")),
20005            external_id: Some(label.to_string()),
20006            title: Some(label.to_string()),
20007            source_path: PathBuf::from(format!("/tmp/{label}.jsonl")),
20008            started_at: Some(ts),
20009            ended_at: Some(ts + 1),
20010            approx_tokens: None,
20011            metadata_json: serde_json::Value::Null,
20012            messages: vec![Message {
20013                id: None,
20014                idx: 0,
20015                role: MessageRole::User,
20016                author: None,
20017                created_at: Some(ts),
20018                content: format!("msg for {label}"),
20019                extra_json: serde_json::Value::Null,
20020                snippets: Vec::new(),
20021            }],
20022            source_id: LOCAL_SOURCE_ID.into(),
20023            origin_host: None,
20024        };
20025
20026        for i in 0..10 {
20027            storage
20028                .insert_conversation_tree(
20029                    agent_id,
20030                    None,
20031                    &make_conv(&format!("conv-{i}"), 1000 + i),
20032                )
20033                .unwrap();
20034        }
20035
20036        storage.conn.execute("PRAGMA foreign_keys = OFF").unwrap();
20037        storage
20038            .conn
20039            .execute_compat(
20040                "DELETE FROM conversations WHERE id IN (3, 5, 7, 8)",
20041                fparams![],
20042            )
20043            .unwrap();
20044        storage
20045            .conn
20046            .execute_compat(
20047                "DELETE FROM messages WHERE conversation_id IN (3, 5, 7, 8)",
20048                fparams![],
20049            )
20050            .unwrap();
20051        storage.conn.execute("PRAGMA foreign_keys = ON").unwrap();
20052
20053        let (agent_slugs, workspace_paths) = storage.build_lexical_rebuild_lookups().unwrap();
20054
20055        let through_5 = storage
20056            .list_conversations_for_lexical_rebuild_after_id_through_id(
20057                100,
20058                0,
20059                5,
20060                &agent_slugs,
20061                &workspace_paths,
20062            )
20063            .unwrap();
20064        let through_5_ids: Vec<i64> = through_5.iter().map(|c| c.id.unwrap()).collect();
20065        assert_eq!(through_5_ids, vec![1, 2, 4]);
20066
20067        let after_4_through_10 = storage
20068            .list_conversations_for_lexical_rebuild_after_id_through_id(
20069                100,
20070                4,
20071                10,
20072                &agent_slugs,
20073                &workspace_paths,
20074            )
20075            .unwrap();
20076        let ids: Vec<i64> = after_4_through_10.iter().map(|c| c.id.unwrap()).collect();
20077        assert_eq!(ids, vec![6, 9, 10]);
20078
20079        let after_10 = storage
20080            .list_conversations_for_lexical_rebuild_after_id_through_id(
20081                100,
20082                10,
20083                20,
20084                &agent_slugs,
20085                &workspace_paths,
20086            )
20087            .unwrap();
20088        assert!(after_10.is_empty());
20089    }
20090
20091    #[test]
20092    fn list_conversation_footprints_for_lexical_rebuild_estimates_bytes_and_keeps_empty_conversations()
20093     {
20094        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
20095        use std::path::PathBuf;
20096
20097        let dir = TempDir::new().unwrap();
20098        let db_path = dir.path().join("agent_search.db");
20099        let storage = SqliteStorage::open(&db_path).unwrap();
20100        let agent = Agent {
20101            id: None,
20102            slug: "codex".into(),
20103            name: "Codex".into(),
20104            version: Some("0.2.3".into()),
20105            kind: AgentKind::Cli,
20106        };
20107        let agent_id = storage.ensure_agent(&agent).unwrap();
20108
20109        let insert = |external_id: &str, base_ts: i64, messages: Vec<Message>| {
20110            storage
20111                .insert_conversation_tree(
20112                    agent_id,
20113                    None,
20114                    &Conversation {
20115                        id: None,
20116                        agent_slug: "codex".into(),
20117                        workspace: Some(PathBuf::from("/tmp/workspace")),
20118                        external_id: Some(external_id.to_string()),
20119                        title: Some(external_id.to_string()),
20120                        source_path: PathBuf::from(format!("/tmp/{external_id}.jsonl")),
20121                        started_at: Some(base_ts),
20122                        ended_at: Some(base_ts + 100),
20123                        approx_tokens: None,
20124                        metadata_json: serde_json::Value::Null,
20125                        messages,
20126                        source_id: LOCAL_SOURCE_ID.into(),
20127                        origin_host: None,
20128                    },
20129                )
20130                .unwrap()
20131                .conversation_id
20132        };
20133
20134        let ascii_id = insert(
20135            "footprint-ascii",
20136            1_700_000_000_000,
20137            vec![
20138                Message {
20139                    id: None,
20140                    idx: 0,
20141                    role: MessageRole::User,
20142                    author: None,
20143                    created_at: Some(1_700_000_000_001),
20144                    content: "abc".into(),
20145                    extra_json: serde_json::Value::Null,
20146                    snippets: Vec::new(),
20147                },
20148                Message {
20149                    id: None,
20150                    idx: 1,
20151                    role: MessageRole::Agent,
20152                    author: None,
20153                    created_at: Some(1_700_000_000_002),
20154                    content: "defg".into(),
20155                    extra_json: serde_json::Value::Null,
20156                    snippets: Vec::new(),
20157                },
20158            ],
20159        );
20160        let empty_id = insert("footprint-empty", 1_700_000_001_000, Vec::new());
20161        let utf8_id = insert(
20162            "footprint-utf8",
20163            1_700_000_002_000,
20164            vec![Message {
20165                id: None,
20166                idx: 0,
20167                role: MessageRole::Tool,
20168                author: None,
20169                created_at: Some(1_700_000_002_001),
20170                content: "hé🙂".into(),
20171                extra_json: serde_json::Value::Null,
20172                snippets: Vec::new(),
20173            }],
20174        );
20175        let sparse_id = insert(
20176            "footprint-sparse",
20177            1_700_000_003_000,
20178            vec![Message {
20179                id: None,
20180                idx: 10,
20181                role: MessageRole::User,
20182                author: None,
20183                created_at: Some(1_700_000_003_010),
20184                content: "sparse".into(),
20185                extra_json: serde_json::Value::Null,
20186                snippets: Vec::new(),
20187            }],
20188        );
20189        storage
20190            .conn
20191            .execute_compat(
20192                "DELETE FROM conversation_tail_state WHERE conversation_id = ?1",
20193                fparams![utf8_id],
20194            )
20195            .unwrap();
20196
20197        let footprints = storage
20198            .list_conversation_footprints_for_lexical_rebuild()
20199            .unwrap();
20200        assert_eq!(
20201            footprints,
20202            vec![
20203                LexicalRebuildConversationFootprintRow {
20204                    conversation_id: ascii_id,
20205                    message_count: 2,
20206                    message_bytes: 2 * LEXICAL_REBUILD_PLANNER_ESTIMATED_BYTES_PER_MESSAGE,
20207                },
20208                LexicalRebuildConversationFootprintRow {
20209                    conversation_id: empty_id,
20210                    message_count: 0,
20211                    message_bytes: 0,
20212                },
20213                LexicalRebuildConversationFootprintRow {
20214                    conversation_id: utf8_id,
20215                    message_count: 1,
20216                    message_bytes: LEXICAL_REBUILD_PLANNER_ESTIMATED_BYTES_PER_MESSAGE,
20217                },
20218                LexicalRebuildConversationFootprintRow {
20219                    conversation_id: sparse_id,
20220                    message_count: 11,
20221                    message_bytes: 11 * LEXICAL_REBUILD_PLANNER_ESTIMATED_BYTES_PER_MESSAGE,
20222                },
20223            ]
20224        );
20225    }
20226
20227    #[test]
20228    fn list_conversation_footprints_for_lexical_rebuild_falls_back_for_missing_tail_cache() {
20229        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
20230        use std::path::PathBuf;
20231
20232        let dir = TempDir::new().unwrap();
20233        let db_path = dir.path().join("agent_search.db");
20234        let storage = SqliteStorage::open(&db_path).unwrap();
20235        let agent = Agent {
20236            id: None,
20237            slug: "codex".into(),
20238            name: "Codex".into(),
20239            version: Some("0.2.3".into()),
20240            kind: AgentKind::Cli,
20241        };
20242        let agent_id = storage.ensure_agent(&agent).unwrap();
20243        let conversation_id = storage
20244            .insert_conversation_tree(
20245                agent_id,
20246                None,
20247                &Conversation {
20248                    id: None,
20249                    agent_slug: "codex".into(),
20250                    workspace: Some(PathBuf::from("/tmp/workspace")),
20251                    external_id: Some("footprint-missing-tail".to_string()),
20252                    title: Some("footprint-missing-tail".to_string()),
20253                    source_path: PathBuf::from("/tmp/footprint-missing-tail.jsonl"),
20254                    started_at: Some(1_700_000_000_000),
20255                    ended_at: Some(1_700_000_000_100),
20256                    approx_tokens: None,
20257                    metadata_json: serde_json::Value::Null,
20258                    messages: vec![Message {
20259                        id: None,
20260                        idx: 10,
20261                        role: MessageRole::User,
20262                        author: None,
20263                        created_at: Some(1_700_000_000_010),
20264                        content: "legacy sparse tail".into(),
20265                        extra_json: serde_json::Value::Null,
20266                        snippets: Vec::new(),
20267                    }],
20268                    source_id: LOCAL_SOURCE_ID.into(),
20269                    origin_host: None,
20270                },
20271            )
20272            .unwrap()
20273            .conversation_id;
20274
20275        storage
20276            .conn
20277            .execute_compat(
20278                "UPDATE conversations
20279                 SET last_message_idx = NULL, last_message_created_at = NULL
20280                 WHERE id = ?1",
20281                fparams![conversation_id],
20282            )
20283            .unwrap();
20284        storage
20285            .conn
20286            .execute_compat(
20287                "DELETE FROM conversation_tail_state WHERE conversation_id = ?1",
20288                fparams![conversation_id],
20289            )
20290            .unwrap();
20291
20292        let footprints = storage
20293            .list_conversation_footprints_for_lexical_rebuild()
20294            .unwrap();
20295
20296        assert_eq!(
20297            footprints,
20298            vec![LexicalRebuildConversationFootprintRow {
20299                conversation_id,
20300                message_count: 11,
20301                message_bytes: 11 * LEXICAL_REBUILD_PLANNER_ESTIMATED_BYTES_PER_MESSAGE,
20302            }],
20303            "missing tail-cache metadata should fall back to messages MAX(idx) instead of treating legacy conversations as empty"
20304        );
20305    }
20306
20307    #[test]
20308    fn list_conversation_footprints_for_lexical_rebuild_raises_stale_low_tail_cache() {
20309        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
20310        use std::path::PathBuf;
20311
20312        let dir = TempDir::new().unwrap();
20313        let db_path = dir.path().join("agent_search.db");
20314        let storage = SqliteStorage::open(&db_path).unwrap();
20315        let agent = Agent {
20316            id: None,
20317            slug: "codex".into(),
20318            name: "Codex".into(),
20319            version: Some("0.2.3".into()),
20320            kind: AgentKind::Cli,
20321        };
20322        let agent_id = storage.ensure_agent(&agent).unwrap();
20323        let conversation_id = storage
20324            .insert_conversation_tree(
20325                agent_id,
20326                None,
20327                &Conversation {
20328                    id: None,
20329                    agent_slug: "codex".into(),
20330                    workspace: Some(PathBuf::from("/tmp/workspace")),
20331                    external_id: Some("footprint-stale-tail".to_string()),
20332                    title: Some("footprint-stale-tail".to_string()),
20333                    source_path: PathBuf::from("/tmp/footprint-stale-tail.jsonl"),
20334                    started_at: Some(1_700_000_000_000),
20335                    ended_at: Some(1_700_000_000_100),
20336                    approx_tokens: None,
20337                    metadata_json: serde_json::Value::Null,
20338                    messages: (0..3)
20339                        .map(|idx| Message {
20340                            id: None,
20341                            idx,
20342                            role: MessageRole::User,
20343                            author: None,
20344                            created_at: Some(1_700_000_000_010 + idx),
20345                            content: format!("message {idx}"),
20346                            extra_json: serde_json::Value::Null,
20347                            snippets: Vec::new(),
20348                        })
20349                        .collect(),
20350                    source_id: LOCAL_SOURCE_ID.into(),
20351                    origin_host: None,
20352                },
20353            )
20354            .unwrap()
20355            .conversation_id;
20356
20357        storage
20358            .conn
20359            .execute_compat(
20360                "UPDATE conversations
20361                 SET last_message_idx = 0, last_message_created_at = 1700000000010
20362                 WHERE id = ?1",
20363                fparams![conversation_id],
20364            )
20365            .unwrap();
20366        storage
20367            .conn
20368            .execute_compat(
20369                "UPDATE conversation_tail_state
20370                 SET last_message_idx = 0, last_message_created_at = 1700000000010
20371                 WHERE conversation_id = ?1",
20372                fparams![conversation_id],
20373            )
20374            .unwrap();
20375
20376        let footprints = storage
20377            .list_conversation_footprints_for_lexical_rebuild()
20378            .unwrap();
20379
20380        assert_eq!(
20381            footprints,
20382            vec![LexicalRebuildConversationFootprintRow {
20383                conversation_id,
20384                message_count: 3,
20385                message_bytes: 3 * LEXICAL_REBUILD_PLANNER_ESTIMATED_BYTES_PER_MESSAGE,
20386            }],
20387            "stale-low tail caches must not under-plan lexical shards and trip doc>plan invariants"
20388        );
20389    }
20390
20391    #[test]
20392    fn list_conversation_footprints_for_lexical_rebuild_tolerates_missing_tail_state_table() {
20393        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
20394        use std::path::PathBuf;
20395
20396        let dir = TempDir::new().unwrap();
20397        let db_path = dir.path().join("agent_search.db");
20398        let storage = SqliteStorage::open(&db_path).unwrap();
20399        let agent = Agent {
20400            id: None,
20401            slug: "codex".into(),
20402            name: "Codex".into(),
20403            version: Some("0.2.3".into()),
20404            kind: AgentKind::Cli,
20405        };
20406        let agent_id = storage.ensure_agent(&agent).unwrap();
20407        let conversation_id = storage
20408            .insert_conversation_tree(
20409                agent_id,
20410                None,
20411                &Conversation {
20412                    id: None,
20413                    agent_slug: "codex".into(),
20414                    workspace: Some(PathBuf::from("/tmp/workspace")),
20415                    external_id: Some("footprint-missing-tail-table".to_string()),
20416                    title: Some("footprint-missing-tail-table".to_string()),
20417                    source_path: PathBuf::from("/tmp/footprint-missing-tail-table.jsonl"),
20418                    started_at: Some(1_700_000_000_000),
20419                    ended_at: Some(1_700_000_000_100),
20420                    approx_tokens: None,
20421                    metadata_json: serde_json::Value::Null,
20422                    messages: vec![Message {
20423                        id: None,
20424                        idx: 10,
20425                        role: MessageRole::User,
20426                        author: None,
20427                        created_at: Some(1_700_000_000_010),
20428                        content: "legacy sparse tail without hot table".into(),
20429                        extra_json: serde_json::Value::Null,
20430                        snippets: Vec::new(),
20431                    }],
20432                    source_id: LOCAL_SOURCE_ID.into(),
20433                    origin_host: None,
20434                },
20435            )
20436            .unwrap()
20437            .conversation_id;
20438
20439        storage
20440            .conn
20441            .execute_compat(
20442                "UPDATE conversations
20443                 SET last_message_idx = NULL, last_message_created_at = NULL
20444                 WHERE id = ?1",
20445                fparams![conversation_id],
20446            )
20447            .unwrap();
20448        storage
20449            .conn
20450            .execute_compat("DROP TABLE conversation_tail_state", fparams![])
20451            .unwrap();
20452
20453        let footprints = storage
20454            .list_conversation_footprints_for_lexical_rebuild()
20455            .unwrap();
20456
20457        assert_eq!(
20458            footprints,
20459            vec![LexicalRebuildConversationFootprintRow {
20460                conversation_id,
20461                message_count: 11,
20462                message_bytes: 11 * LEXICAL_REBUILD_PLANNER_ESTIMATED_BYTES_PER_MESSAGE,
20463            }],
20464            "read-only lexical self-heal must tolerate pre-tail-cache databases and use messages MAX(idx)"
20465        );
20466    }
20467
/// Opens the checked-in legacy search demo database read-only, asks for the
/// lexical rebuild footprints, and expects a non-empty list in which every
/// conversation reports a positive message count.
#[test]
fn list_conversation_footprints_for_lexical_rebuild_tolerates_legacy_search_demo_fixture() {
    // <manifest dir>/tests/fixtures/search_demo_data/agent_search.db
    let mut fixture_db = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
    for component in ["tests", "fixtures", "search_demo_data", "agent_search.db"] {
        fixture_db.push(component);
    }
    let storage = FrankenStorage::open_readonly(&fixture_db).unwrap();

    let footprints = storage
        .list_conversation_footprints_for_lexical_rebuild()
        .unwrap();

    let planned_any = !footprints.is_empty();
    assert!(
        planned_any,
        "search self-heal should be able to plan a lexical rebuild from the legacy search demo fixture"
    );

    let every_count_positive = footprints.iter().all(|row| row.message_count > 0);
    assert!(
        every_count_positive,
        "legacy fixture conversations should derive message counts from messages when tail caches are absent"
    );
}
20492
/// Rewrites a stored conversation so that its `source_id` is whitespace-only
/// while `origin_host` is populated, then checks that both
/// `list_conversations` and the lexical rebuild listing report the host as
/// the source id.
#[test]
fn lexical_rebuild_listing_normalizes_host_only_remote_source_from_blank_source_id() {
    use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
    use std::path::PathBuf;

    // Host written into `origin_host`; every assertion below expects it back.
    let remote_host = "dev@laptop";

    let tmp = TempDir::new().unwrap();
    let db_path = tmp.path().join("agent_search.db");
    let storage = SqliteStorage::open(&db_path).unwrap();

    let agent_id = storage
        .ensure_agent(&Agent {
            id: None,
            slug: "codex".into(),
            name: "Codex".into(),
            version: Some("0.2.3".into()),
            kind: AgentKind::Cli,
        })
        .unwrap();

    let greeting = Message {
        id: None,
        idx: 0,
        role: MessageRole::User,
        author: None,
        created_at: Some(1_700_000_000_000),
        content: "hello".into(),
        extra_json: serde_json::Value::Null,
        snippets: Vec::new(),
    };
    let conversation = Conversation {
        id: None,
        agent_slug: "codex".into(),
        external_id: Some("legacy-blank-source".into()),
        title: Some("Legacy blank source".into()),
        workspace: Some(PathBuf::from("/tmp/workspace")),
        source_path: PathBuf::from("/tmp/legacy-blank-source.jsonl"),
        started_at: Some(1_700_000_000_000),
        ended_at: Some(1_700_000_000_100),
        approx_tokens: None,
        metadata_json: serde_json::Value::Null,
        messages: vec![greeting],
        source_id: LOCAL_SOURCE_ID.into(),
        origin_host: None,
    };
    let conversation_id = storage
        .insert_conversation_tree(agent_id, None, &conversation)
        .unwrap()
        .conversation_id;

    // The row is rewritten with foreign-key enforcement switched off, and the
    // pragma is switched back on straight afterwards.
    storage.conn.execute("PRAGMA foreign_keys = OFF").unwrap();
    storage
        .conn
        .execute_compat(
            "UPDATE conversations SET source_id = ?1, origin_host = ?2 WHERE id = ?3",
            fparams!["   ", remote_host, conversation_id],
        )
        .unwrap();
    storage.conn.execute("PRAGMA foreign_keys = ON").unwrap();

    // Regular listing.
    let listed = storage.list_conversations(10, 0).unwrap();
    assert_eq!(listed.len(), 1);
    let listed_row = &listed[0];
    assert_eq!(listed_row.source_id, remote_host);
    assert_eq!(listed_row.origin_host.as_deref(), Some(remote_host));

    // Lexical rebuild listing, driven by the lookups it needs.
    let (agent_slugs, workspace_paths) = storage.build_lexical_rebuild_lookups().unwrap();
    let rebuild_listed = storage
        .list_conversations_for_lexical_rebuild_after_id(10, 0, &agent_slugs, &workspace_paths)
        .unwrap();
    assert_eq!(rebuild_listed.len(), 1);
    let rebuild_row = &rebuild_listed[0];
    assert_eq!(rebuild_row.source_id, remote_host);
    assert_eq!(rebuild_row.origin_host.as_deref(), Some(remote_host));
}
20561
/// Seeds an empty canonical database from a historical backup bundle that has
/// been rewound to schema version 13 and given two raw `fts_messages` rows in
/// `sqlite_master`. Afterwards the canonical database must hold the single
/// conversation and message, report no scan / index / embedding watermarks,
/// and expose exactly one `fts_messages` schema row that can be queried.
#[test]
fn seed_canonical_from_best_historical_bundle_copies_data_and_resets_runtime_meta() {
    use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
    use std::path::PathBuf;

    // Count queries that are issued against several handles below.
    let count_messages_sql = "SELECT COUNT(*) FROM messages";
    let count_fts_schema_rows_sql =
        "SELECT COUNT(*) FROM sqlite_master WHERE name = 'fts_messages'";

    let tmp = TempDir::new().unwrap();
    let canonical_db = tmp.path().join("agent_search.db");
    let source_db = tmp
        .path()
        .join("backups/agent_search.db.20260322T020200.bak");
    fs::create_dir_all(source_db.parent().unwrap()).unwrap();

    // Step 1: write the historical bundle — one conversation with one
    // message, three runtime watermarks and a stale salvage marker. The
    // storage handle is released at the end of this scope.
    {
        let source = SqliteStorage::open(&source_db).unwrap();
        let agent_id = source
            .ensure_agent(&Agent {
                id: None,
                slug: "codex".into(),
                name: "Codex".into(),
                version: Some("0.2.3".into()),
                kind: AgentKind::Cli,
            })
            .unwrap();
        let seeded_message = Message {
            id: None,
            idx: 0,
            role: MessageRole::Agent,
            author: Some("assistant".into()),
            created_at: Some(1_700_000_000_050),
            content: "seeded message".into(),
            extra_json: serde_json::json!({"usage": {"total_tokens": 12}}),
            snippets: Vec::new(),
        };
        let conversation = Conversation {
            id: None,
            agent_slug: "codex".into(),
            external_id: Some("seed-conv".into()),
            title: Some("Historical seed".into()),
            workspace: Some(PathBuf::from("/tmp/workspace")),
            source_path: PathBuf::from("/tmp/historical-seed.jsonl"),
            started_at: Some(1_700_000_000_000),
            ended_at: Some(1_700_000_000_100),
            approx_tokens: Some(42),
            metadata_json: serde_json::json!({"seed": true}),
            messages: vec![seeded_message],
            source_id: LOCAL_SOURCE_ID.into(),
            origin_host: None,
        };
        source
            .insert_conversation_tree(agent_id, None, &conversation)
            .unwrap();
        source.set_last_scan_ts(123).unwrap();
        source.set_last_indexed_at(456).unwrap();
        source.set_last_embedded_message_id(789).unwrap();
        source
            .conn
            .execute_compat(
                "INSERT OR REPLACE INTO meta(key, value) VALUES(?1, ?2)",
                fparams!["historical_bundle_salvaged:stale", "{\"stale\":true}"],
            )
            .unwrap();
    }

    // Step 2: rebuild the legacy "duplicate FTS" fixture.
    //
    // Since the V14 migration, cass drops the V13-era fts_messages virtual
    // table and recreates it lazily, so a freshly opened database carries no
    // fts_messages entry in sqlite_master at all. The historical failure this
    // test targets is a legacy v13 bundle with a duplicated
    // CREATE VIRTUAL TABLE row, so *both* rows have to be injected by hand:
    // the original V13-era contentless row and the buggy duplicate. (Before
    // V14 existed the original was already there after migration and only the
    // duplicate needed injecting.)
    let legacy_v13_fts_sql = "CREATE VIRTUAL TABLE fts_messages USING fts5(content, title, agent, workspace, source_path, created_at UNINDEXED, content='', tokenize='porter')";
    let duplicate_legacy_fts_sql = "CREATE VIRTUAL TABLE fts_messages USING fts5(content, title, agent, workspace, source_path, created_at UNINDEXED, message_id UNINDEXED, tokenize='porter')";
    {
        let legacy = rusqlite_test_fixture_conn(&source_db);
        legacy
            .execute_batch(
                "UPDATE meta SET value = '13' WHERE key = 'schema_version';
                 DELETE FROM _schema_migrations WHERE version = 14;
                 PRAGMA writable_schema = ON;",
            )
            .unwrap();
        legacy
            .execute(
                "DELETE FROM meta WHERE key = ?1",
                [FTS_FRANKEN_REBUILD_META_KEY],
            )
            .unwrap();
        // Insertion order is kept: the V13 original first, then the duplicate
        // that is the real subject of the fixup logic.
        for schema_sql in [legacy_v13_fts_sql, duplicate_legacy_fts_sql] {
            let inserted = legacy.execute(
                "INSERT INTO sqlite_master(type, name, tbl_name, rootpage, sql)
                 VALUES('table', 'fts_messages', 'fts_messages', 0, ?1)",
                [schema_sql],
            );
            inserted.unwrap();
        }
        legacy
            .execute_batch("PRAGMA writable_schema = OFF;")
            .unwrap();
    }

    // Step 3: confirm the fixture through rusqlite with writable_schema on,
    // which shows the raw sqlite_master rows (frankensqlite deduplicates
    // schema entries).
    {
        let raw_view = rusqlite_test_fixture_conn(&source_db);
        raw_view
            .execute_batch("PRAGMA writable_schema = ON;")
            .unwrap();
        let fts_entries: i64 = raw_view
            .query_row(count_fts_schema_rows_sql, [], |row| row.get(0))
            .unwrap();
        assert_eq!(
            fts_entries, 2,
            "test fixture should reproduce the duplicate legacy fts_messages rows"
        );
        let msg_count: i64 = raw_view
            .query_row(count_messages_sql, [], |row| row.get(0))
            .unwrap();
        assert_eq!(msg_count, 1);
    }

    // Step 4: create the canonical database, close it again, then seed it.
    drop(SqliteStorage::open(&canonical_db).unwrap());

    let outcome = seed_canonical_from_best_historical_bundle(&canonical_db)
        .unwrap()
        .unwrap();
    assert_eq!(outcome.bundles_imported, 1);
    assert_eq!(outcome.conversations_imported, 1);
    assert_eq!(outcome.messages_imported, 1);

    // Step 5: inspect the seeded database. Each handle opened from here on
    // stays open until the end of the test.
    let canonical_uri = canonical_db.to_string_lossy();

    let readonly =
        open_franken_with_flags(&canonical_uri, FrankenOpenFlags::SQLITE_OPEN_READ_ONLY).unwrap();
    let readonly_message_count: i64 = readonly
        .query_row_map(count_messages_sql, fparams![], |row| row.get_typed(0))
        .unwrap();
    assert_eq!(readonly_message_count, 1);

    let seeded = SqliteStorage::open(&canonical_db).unwrap();
    let session_count = seeded
        .count_sessions_in_range(None, None, None, None)
        .unwrap()
        .0;
    assert_eq!(session_count, 1);
    let message_count: i64 = seeded
        .conn
        .query_row_map(count_messages_sql, fparams![], |row| row.get_typed(0))
        .unwrap();
    assert_eq!(message_count, 1);

    // The runtime watermarks written in step 1 must be gone.
    assert_eq!(seeded.get_last_scan_ts().unwrap(), None);
    assert_eq!(seeded.get_last_embedded_message_id().unwrap(), None);
    let last_indexed: Option<String> = seeded
        .conn
        .query_row_map(
            "SELECT value FROM meta WHERE key = 'last_indexed_at'",
            fparams![],
            |row| row.get_typed(0),
        )
        .optional()
        .unwrap();
    assert!(last_indexed.is_none());

    let salvage_keys: Vec<String> = seeded
        .conn
        .query_map_collect(
            "SELECT key FROM meta WHERE key LIKE 'historical_bundle_salvaged:%' ORDER BY key",
            fparams![],
            |row| row.get_typed(0),
        )
        .unwrap();
    assert_eq!(salvage_keys.len(), 1);

    let reopened_readonly =
        open_franken_with_flags(&canonical_uri, FrankenOpenFlags::SQLITE_OPEN_READ_ONLY).unwrap();
    let reopened_fts_entries: i64 = reopened_readonly
        .query_row_map(count_fts_schema_rows_sql, fparams![], |row| {
            row.get_typed(0)
        })
        .unwrap();
    assert_eq!(
        reopened_fts_entries, 1,
        "seeded canonical db should keep a single stock-SQLite fts_messages schema row"
    );
    let reopened_message_count: i64 = reopened_readonly
        .query_row_map(count_messages_sql, fparams![], |row| row.get_typed(0))
        .unwrap();
    assert_eq!(reopened_message_count, 1);

    let franken_seeded = FrankenStorage::open(&canonical_db).unwrap();
    assert_eq!(
        franken_seeded.schema_version().unwrap(),
        CURRENT_SCHEMA_VERSION
    );
    // After V14, fts_messages is recreated lazily and `FrankenStorage::open`
    // on its own does not re-register the virtual table with the
    // frankensqlite query engine. The consistency pass does, and normal cass
    // startup runs it before the first search, so it is called explicitly
    // here to check the post-repair state rather than an in-between one.
    franken_seeded
        .ensure_search_fallback_fts_consistency()
        .expect("ensure FTS consistency after seed");
    let post_franken_schema_rows: i64 = franken_seeded
        .raw()
        .query_row_map(count_fts_schema_rows_sql, fparams![], |row| {
            row.get_typed(0)
        })
        .unwrap();
    assert_eq!(post_franken_schema_rows, 1);
    assert!(
        franken_seeded
            .raw()
            .query("SELECT rowid FROM fts_messages LIMIT 1")
            .is_ok()
    );
}
20808
/// A backup bundle whose `schema_version` has been set to 12 must be rejected
/// by `seed_canonical_from_best_historical_bundle`, and the canonical
/// database must come through untouched: its sentinel meta row is still
/// present and it still holds zero conversations, seen through both a
/// read-write and a read-only handle.
#[test]
fn failed_baseline_seed_preserves_existing_canonical_bundle() {
    use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
    use std::path::PathBuf;

    let count_conversations_sql = "SELECT COUNT(*) FROM conversations";

    let tmp = TempDir::new().unwrap();
    let canonical_db = tmp.path().join("agent_search.db");
    let source_db = tmp
        .path()
        .join("backups/agent_search.db.20260325T120000Z.bad-seed.bak");
    fs::create_dir_all(source_db.parent().unwrap()).unwrap();

    // Canonical database: empty apart from a sentinel row that is checked at
    // the end.
    {
        let canonical = SqliteStorage::open(&canonical_db).unwrap();
        canonical
            .conn
            .execute_compat(
                "INSERT OR REPLACE INTO meta(key, value) VALUES(?1, ?2)",
                fparams!["sentinel", "keep-me"],
            )
            .unwrap();
    }

    // Backup bundle holding one conversation.
    {
        let source = SqliteStorage::open(&source_db).unwrap();
        let agent_id = source
            .ensure_agent(&Agent {
                id: None,
                slug: "codex".into(),
                name: "Codex".into(),
                version: Some("0.2.3".into()),
                kind: AgentKind::Cli,
            })
            .unwrap();
        let doomed_message = Message {
            id: None,
            idx: 0,
            role: MessageRole::Agent,
            author: Some("assistant".into()),
            created_at: Some(1_700_000_000_050),
            content: "this seed should fail".into(),
            extra_json: serde_json::Value::Null,
            snippets: Vec::new(),
        };
        let conversation = Conversation {
            id: None,
            agent_slug: "codex".into(),
            external_id: Some("bad-seed-conv".into()),
            title: Some("Bad seed".into()),
            workspace: Some(PathBuf::from("/tmp/workspace")),
            source_path: PathBuf::from("/tmp/bad-seed.jsonl"),
            started_at: Some(1_700_000_000_000),
            ended_at: Some(1_700_000_000_100),
            approx_tokens: Some(42),
            metadata_json: serde_json::json!({"seed": "bad"}),
            messages: vec![doomed_message],
            source_id: LOCAL_SOURCE_ID.into(),
            origin_host: None,
        };
        source
            .insert_conversation_tree(agent_id, None, &conversation)
            .unwrap();
    }

    // Rewind the bundle's recorded schema version to 12.
    {
        let legacy = FrankenConnection::open(source_db.to_string_lossy().into_owned()).unwrap();
        legacy
            .execute("UPDATE meta SET value = '12' WHERE key = 'schema_version'")
            .unwrap();
    }

    let err = seed_canonical_from_best_historical_bundle(&canonical_db).unwrap_err();
    let rejected_as_too_old = err
        .to_string()
        .contains("schema_version 12 is too old for baseline import");
    assert!(rejected_as_too_old, "unexpected seed error: {err:#}");

    // Read-write view of the canonical database after the failed seed.
    let reopened = SqliteStorage::open(&canonical_db).unwrap();
    let sentinel: Option<String> = reopened
        .conn
        .query_row_map(
            "SELECT value FROM meta WHERE key = 'sentinel'",
            fparams![],
            |row| row.get_typed(0),
        )
        .optional()
        .unwrap();
    assert_eq!(sentinel.as_deref(), Some("keep-me"));

    let conversation_count: i64 = reopened
        .conn
        .query_row_map(count_conversations_sql, fparams![], |row| {
            row.get_typed(0)
        })
        .unwrap();
    assert_eq!(conversation_count, 0);

    // Read-only view, opened while the read-write handle is still alive.
    let readonly = open_franken_with_flags(
        &canonical_db.to_string_lossy(),
        FrankenOpenFlags::SQLITE_OPEN_READ_ONLY,
    )
    .unwrap();
    let readonly_conversation_count: i64 = readonly
        .query_row_map(count_conversations_sql, fparams![], |row| {
            row.get_typed(0)
        })
        .unwrap();
    assert_eq!(readonly_conversation_count, 0);
}
20915
/// `fetch_messages` must hand back the stored `extra_json`, whereas
/// `fetch_messages_for_lexical_rebuild` must return the same message with its
/// content and author intact but `extra_json` null.
#[test]
fn fetch_messages_for_lexical_rebuild_skips_extra_json() {
    let tmp = TempDir::new().unwrap();
    let db_path = tmp.path().join("test.db");
    let storage = SqliteStorage::open(&db_path).unwrap();

    let agent_id = storage
        .ensure_agent(&Agent {
            id: None,
            slug: "codex".into(),
            name: "Codex".into(),
            version: Some("0.2.3".into()),
            kind: AgentKind::Cli,
        })
        .unwrap();

    // One message with a non-trivial `extra_json` payload.
    let indexed_message = Message {
        id: None,
        idx: 0,
        role: MessageRole::Agent,
        author: Some("assistant".into()),
        created_at: Some(1_700_000_000_050),
        content: "indexed text".into(),
        extra_json: serde_json::json!({
            "usage": { "total_tokens": 1234 },
            "irrelevant_blob": "still preserved in canonical storage"
        }),
        snippets: Vec::new(),
    };
    let conversation = Conversation {
        id: None,
        agent_slug: "codex".into(),
        external_id: Some("lexical-rebuild-test".into()),
        title: Some("Lexical rebuild".into()),
        workspace: Some(PathBuf::from("/tmp/workspace")),
        source_path: PathBuf::from("/tmp/lexical-rebuild.jsonl"),
        started_at: Some(1_700_000_000_000),
        ended_at: Some(1_700_000_000_100),
        approx_tokens: Some(42),
        metadata_json: serde_json::Value::Null,
        messages: vec![indexed_message],
        source_id: LOCAL_SOURCE_ID.into(),
        origin_host: None,
    };

    let conversation_id = storage
        .insert_conversation_tree(agent_id, None, &conversation)
        .unwrap()
        .conversation_id;

    // Full fetch: the payload is still there.
    let stored = storage.fetch_messages(conversation_id).unwrap();
    assert_eq!(stored.len(), 1);
    assert!(!stored[0].extra_json.is_null());

    // Lexical rebuild fetch: same message, payload null.
    let lexical = storage
        .fetch_messages_for_lexical_rebuild(conversation_id)
        .unwrap();
    assert_eq!(lexical.len(), 1);
    assert_eq!(lexical[0].content, "indexed text");
    assert_eq!(lexical[0].author.as_deref(), Some("assistant"));
    assert!(lexical[0].extra_json.is_null());
}
20976
/// Stores three conversations and batch-fetches the third and the first. The
/// result must be keyed by conversation id, keep the first conversation's two
/// messages in order, carry null `extra_json` everywhere, and contain no
/// entry for the conversation that was not requested.
#[test]
fn fetch_messages_for_lexical_rebuild_batch_groups_and_orders_messages() {
    let tmp = TempDir::new().unwrap();
    let db_path = tmp.path().join("test.db");
    let storage = SqliteStorage::open(&db_path).unwrap();

    let agent_id = storage
        .ensure_agent(&Agent {
            id: None,
            slug: "codex".into(),
            name: "Codex".into(),
            version: Some("0.2.3".into()),
            kind: AgentKind::Cli,
        })
        .unwrap();

    // Every message in this test shares the same opaque `extra_json` payload
    // and has an author, a timestamp and no snippets.
    let opaque_message = |idx, role: MessageRole, author: &str, created_at, content: &str| Message {
        id: None,
        idx,
        role,
        author: Some(author.into()),
        created_at: Some(created_at),
        content: content.into(),
        extra_json: serde_json::json!({"opaque": true}),
        snippets: Vec::new(),
    };

    let first = Conversation {
        id: None,
        agent_slug: "codex".into(),
        external_id: Some("lexical-batch-1".into()),
        title: Some("Lexical batch 1".into()),
        workspace: Some(PathBuf::from("/tmp/workspace")),
        source_path: PathBuf::from("/tmp/lexical-batch-1.jsonl"),
        started_at: Some(1_700_000_000_000),
        ended_at: Some(1_700_000_000_100),
        approx_tokens: Some(42),
        metadata_json: serde_json::Value::Null,
        messages: vec![
            opaque_message(0, MessageRole::User, "user", 1_700_000_000_010, "first-a"),
            opaque_message(
                1,
                MessageRole::Agent,
                "assistant",
                1_700_000_000_020,
                "first-b",
            ),
        ],
        source_id: LOCAL_SOURCE_ID.into(),
        origin_host: None,
    };

    let second = Conversation {
        id: None,
        agent_slug: "codex".into(),
        external_id: Some("lexical-batch-2".into()),
        title: Some("Lexical batch 2".into()),
        workspace: Some(PathBuf::from("/tmp/workspace")),
        source_path: PathBuf::from("/tmp/lexical-batch-2.jsonl"),
        started_at: Some(1_700_000_000_200),
        ended_at: Some(1_700_000_000_300),
        approx_tokens: Some(84),
        metadata_json: serde_json::Value::Null,
        messages: vec![opaque_message(
            0,
            MessageRole::Tool,
            "tool",
            1_700_000_000_210,
            "second-a",
        )],
        source_id: LOCAL_SOURCE_ID.into(),
        origin_host: None,
    };

    // Same as `second` apart from identity, path and its single message.
    let third = Conversation {
        external_id: Some("lexical-batch-3".into()),
        title: Some("Lexical batch 3".into()),
        source_path: PathBuf::from("/tmp/lexical-batch-3.jsonl"),
        messages: vec![opaque_message(
            0,
            MessageRole::System,
            "system",
            1_700_000_000_410,
            "third-a",
        )],
        ..second.clone()
    };

    // Inserted in the order first, second, third.
    let [first_id, second_id, third_id] = [&first, &second, &third].map(|conversation| {
        storage
            .insert_conversation_tree(agent_id, None, conversation)
            .unwrap()
            .conversation_id
    });

    // Request only the third and the first, in that order.
    let lexical = storage
        .fetch_messages_for_lexical_rebuild_batch(&[third_id, first_id], None, None)
        .unwrap();

    let first_messages = lexical.get(&first_id).expect("first conversation");
    assert_eq!(first_messages.len(), 2);
    assert_eq!(first_messages[0].content, "first-a");
    assert_eq!(first_messages[1].content, "first-b");
    assert!(
        first_messages
            .iter()
            .all(|message| message.extra_json.is_null())
    );

    let second_was_skipped = !lexical.contains_key(&second_id);
    assert!(
        second_was_skipped,
        "batch fetch must exclude conversations not requested by the caller"
    );

    let third_messages = lexical.get(&third_id).expect("third conversation");
    assert_eq!(third_messages.len(), 1);
    assert_eq!(third_messages[0].content, "third-a");
    assert!(third_messages[0].extra_json.is_null());
}
21107
/// Stores a conversation with two six-byte messages and batch-fetches it with
/// the limits `Some(10)` and `Some(8)`; the call must fail with an error
/// that mentions the "content-byte guardrail".
#[test]
fn fetch_messages_for_lexical_rebuild_batch_enforces_content_byte_guardrail() {
    let tmp = TempDir::new().unwrap();
    let db_path = tmp.path().join("test.db");
    let storage = SqliteStorage::open(&db_path).unwrap();

    let agent_id = storage
        .ensure_agent(&Agent {
            id: None,
            slug: "codex".into(),
            name: "Codex".into(),
            version: Some("0.2.3".into()),
            kind: AgentKind::Cli,
        })
        .unwrap();

    // Two messages, six bytes of content each.
    let user_turn = Message {
        id: None,
        idx: 0,
        role: MessageRole::User,
        author: Some("user".into()),
        created_at: Some(1_700_000_000_010),
        content: "123456".into(),
        extra_json: serde_json::Value::Null,
        snippets: Vec::new(),
    };
    let agent_turn = Message {
        id: None,
        idx: 1,
        role: MessageRole::Agent,
        author: Some("assistant".into()),
        created_at: Some(1_700_000_000_020),
        content: "abcdef".into(),
        extra_json: serde_json::Value::Null,
        snippets: Vec::new(),
    };
    let conversation = Conversation {
        id: None,
        agent_slug: "codex".into(),
        external_id: Some("lexical-batch-guard".into()),
        title: Some("Lexical batch guard".into()),
        workspace: Some(PathBuf::from("/tmp/workspace")),
        source_path: PathBuf::from("/tmp/lexical-batch-guard.jsonl"),
        started_at: Some(1_700_000_000_000),
        ended_at: Some(1_700_000_000_100),
        approx_tokens: Some(42),
        metadata_json: serde_json::Value::Null,
        messages: vec![user_turn, agent_turn],
        source_id: LOCAL_SOURCE_ID.into(),
        origin_host: None,
    };

    let conversation_id = storage
        .insert_conversation_tree(agent_id, None, &conversation)
        .unwrap()
        .conversation_id;

    let requested = [conversation_id];
    let error = storage
        .fetch_messages_for_lexical_rebuild_batch(&requested, Some(10), Some(8))
        .expect_err("guardrail should reject oversized batch content");

    // Render the whole error chain so the guardrail reason is visible.
    let message = format!("{error:#}");
    let names_guardrail = message.contains("content-byte guardrail");
    assert!(
        names_guardrail,
        "expected guardrail reason in error, got {message}"
    );
}
21175
/// Inserts an agent, a conversation and a message with plain SQL on the raw
/// connection, then checks that both `fetch_messages_for_lexical_rebuild` and
/// `fetch_messages` can read that hand-written message row back.
#[test]
fn fetch_messages_handles_manual_rows_inserted_via_raw_connection() {
    let tmp = TempDir::new().unwrap();
    let db_path = tmp.path().join("manual-rows.db");
    let storage = FrankenStorage::open(&db_path).unwrap();

    // Hand-written rows: agent, then conversation, then message.
    {
        let conn = storage.raw();
        conn.execute(
            "INSERT INTO agents (id, slug, name, kind, created_at, updated_at)
             VALUES (1, 'claude_code', 'Claude Code', 'local', 0, 0)",
        )
        .unwrap();
        conn.execute(
            "INSERT INTO conversations
             (id, agent_id, external_id, title, source_path, source_id, started_at)
             VALUES (1, 1, 'manual-ext', 'Manual Session', '/tmp/manual.jsonl', 'local', 200)",
        )
        .unwrap();
        conn.execute(
            "INSERT INTO messages
             (id, conversation_id, idx, role, author, created_at, content, extra_json, extra_bin)
             VALUES (1, 1, 0, 'user', 'tester', 1700000000000, 'manual body', '{\"k\":1}', NULL)",
        )
        .unwrap();
    }

    // Lexical rebuild fetch sees the row.
    let lexical = storage.fetch_messages_for_lexical_rebuild(1).unwrap();
    assert_eq!(lexical.len(), 1);
    let lexical_row = &lexical[0];
    assert_eq!(lexical_row.content, "manual body");

    // Full fetch sees the row, including author and parsed `extra_json`.
    let full = storage.fetch_messages(1).unwrap();
    assert_eq!(full.len(), 1);
    let full_row = &full[0];
    assert_eq!(full_row.content, "manual body");
    assert_eq!(full_row.author.as_deref(), Some("tester"));
    assert_eq!(full_row.extra_json, serde_json::json!({ "k": 1 }));
}
21211
21212    #[test]
21213    fn lexical_rebuild_batch_messages_query_avoids_sorter_temp_btrees() {
21214        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
21215        use std::path::PathBuf;
21216
21217        let dir = TempDir::new().unwrap();
21218        let db_path = dir.path().join("agent_search.db");
21219        let storage = SqliteStorage::open(&db_path).unwrap();
21220
21221        let agent = Agent {
21222            id: None,
21223            slug: "claude_code".into(),
21224            name: "Claude Code".into(),
21225            version: None,
21226            kind: AgentKind::Cli,
21227        };
21228        let agent_id = storage.ensure_agent(&agent).unwrap();
21229
21230        for (external_id, base_ts) in [
21231            ("conv-1", 1_700_000_000_000_i64),
21232            ("conv-2", 1_700_000_001_000_i64),
21233        ] {
21234            let conversation = Conversation {
21235                id: None,
21236                agent_slug: "claude_code".into(),
21237                workspace: Some(PathBuf::from("/tmp/workspace")),
21238                external_id: Some(external_id.to_string()),
21239                title: Some("Lexical rebuild".into()),
21240                source_path: PathBuf::from(format!("/tmp/{external_id}.jsonl")),
21241                started_at: Some(base_ts),
21242                ended_at: Some(base_ts + 100),
21243                approx_tokens: None,
21244                metadata_json: serde_json::Value::Null,
21245                messages: vec![
21246                    Message {
21247                        id: None,
21248                        idx: 0,
21249                        role: MessageRole::User,
21250                        author: Some("user".into()),
21251                        created_at: Some(base_ts + 10),
21252                        content: format!("{external_id}-first"),
21253                        extra_json: serde_json::Value::Null,
21254                        snippets: Vec::new(),
21255                    },
21256                    Message {
21257                        id: None,
21258                        idx: 1,
21259                        role: MessageRole::Agent,
21260                        author: Some("assistant".into()),
21261                        created_at: Some(base_ts + 20),
21262                        content: format!("{external_id}-second"),
21263                        extra_json: serde_json::Value::Null,
21264                        snippets: Vec::new(),
21265                    },
21266                ],
21267                source_id: LOCAL_SOURCE_ID.into(),
21268                origin_host: None,
21269            };
21270            storage
21271                .insert_conversation_tree(agent_id, None, &conversation)
21272                .unwrap();
21273        }
21274
21275        let conversation_ids: Vec<i64> = storage
21276            .conn
21277            .query_map_collect(
21278                "SELECT id FROM conversations ORDER BY id",
21279                fparams![],
21280                |row| row.get_typed(0),
21281            )
21282            .unwrap();
21283        assert_eq!(conversation_ids.len(), 2);
21284
21285        let plan_details: Vec<String> = storage
21286            .conn
21287            .query_map_collect(
21288                "EXPLAIN QUERY PLAN \
21289                 SELECT conversation_id, id, idx, role, author, created_at, content \
21290                 FROM messages \
21291                 WHERE conversation_id IN (?1, ?2) \
21292                 ORDER BY conversation_id ASC, idx ASC",
21293                fparams![conversation_ids[0], conversation_ids[1]],
21294                |row| row.get_typed(3),
21295            )
21296            .unwrap();
21297
21298        assert!(
21299            plan_details
21300                .iter()
21301                .any(|detail| detail.contains("sqlite_autoindex_messages_1")),
21302            "expected batched lexical rebuild fetch to use the conversation_id/idx composite index, got {plan_details:?}"
21303        );
21304        assert!(
21305            !plan_details
21306                .iter()
21307                .any(|detail| detail.contains("TEMP B-TREE")),
21308            "expected batched lexical rebuild fetch to avoid sorter temp b-trees, got {plan_details:?}"
21309        );
21310    }
21311
21312    #[test]
21313    fn stream_messages_for_lexical_rebuild_groups_and_orders_messages() {
21314        let dir = TempDir::new().unwrap();
21315        let db_path = dir.path().join("test.db");
21316        let storage = SqliteStorage::open(&db_path).unwrap();
21317
21318        let agent = Agent {
21319            id: None,
21320            slug: "codex".into(),
21321            name: "Codex".into(),
21322            version: Some("0.2.3".into()),
21323            kind: AgentKind::Cli,
21324        };
21325        let agent_id = storage.ensure_agent(&agent).unwrap();
21326
21327        let first = Conversation {
21328            id: None,
21329            agent_slug: "codex".into(),
21330            workspace: Some(PathBuf::from("/tmp/workspace")),
21331            external_id: Some("lexical-stream-1".into()),
21332            title: Some("Lexical stream 1".into()),
21333            source_path: PathBuf::from("/tmp/lexical-stream-1.jsonl"),
21334            started_at: Some(1_700_000_000_000),
21335            ended_at: Some(1_700_000_000_100),
21336            approx_tokens: Some(42),
21337            metadata_json: serde_json::Value::Null,
21338            messages: vec![
21339                Message {
21340                    id: None,
21341                    idx: 0,
21342                    role: MessageRole::User,
21343                    author: Some("user".into()),
21344                    created_at: Some(1_700_000_000_010),
21345                    content: "first-a".into(),
21346                    extra_json: serde_json::json!({"opaque": true}),
21347                    snippets: Vec::new(),
21348                },
21349                Message {
21350                    id: None,
21351                    idx: 1,
21352                    role: MessageRole::Agent,
21353                    author: Some("assistant".into()),
21354                    created_at: Some(1_700_000_000_020),
21355                    content: "first-b".into(),
21356                    extra_json: serde_json::json!({"opaque": true}),
21357                    snippets: Vec::new(),
21358                },
21359            ],
21360            source_id: LOCAL_SOURCE_ID.into(),
21361            origin_host: None,
21362        };
21363
21364        let second = Conversation {
21365            id: None,
21366            agent_slug: "codex".into(),
21367            workspace: Some(PathBuf::from("/tmp/workspace")),
21368            external_id: Some("lexical-stream-2".into()),
21369            title: Some("Lexical stream 2".into()),
21370            source_path: PathBuf::from("/tmp/lexical-stream-2.jsonl"),
21371            started_at: Some(1_700_000_000_200),
21372            ended_at: Some(1_700_000_000_300),
21373            approx_tokens: Some(84),
21374            metadata_json: serde_json::Value::Null,
21375            messages: vec![Message {
21376                id: None,
21377                idx: 0,
21378                role: MessageRole::Tool,
21379                author: Some("tool".into()),
21380                created_at: Some(1_700_000_000_210),
21381                content: "second-a".into(),
21382                extra_json: serde_json::json!({"opaque": true}),
21383                snippets: Vec::new(),
21384            }],
21385            source_id: LOCAL_SOURCE_ID.into(),
21386            origin_host: None,
21387        };
21388
21389        let first_id = storage
21390            .insert_conversation_tree(agent_id, None, &first)
21391            .unwrap()
21392            .conversation_id;
21393        let second_id = storage
21394            .insert_conversation_tree(agent_id, None, &second)
21395            .unwrap()
21396            .conversation_id;
21397
21398        let mut streamed = Vec::new();
21399        storage
21400            .stream_messages_for_lexical_rebuild_from_conversation_id(first_id, |row| {
21401                streamed.push((
21402                    row.conversation_id,
21403                    row.idx,
21404                    row.role,
21405                    row.author,
21406                    row.content,
21407                ));
21408                Ok(())
21409            })
21410            .unwrap();
21411
21412        assert_eq!(
21413            streamed,
21414            vec![
21415                (
21416                    first_id,
21417                    0,
21418                    "user".to_string(),
21419                    Some("user".to_string()),
21420                    "first-a".to_string(),
21421                ),
21422                (
21423                    first_id,
21424                    1,
21425                    "agent".to_string(),
21426                    Some("assistant".to_string()),
21427                    "first-b".to_string(),
21428                ),
21429                (
21430                    second_id,
21431                    0,
21432                    "tool".to_string(),
21433                    Some("tool".to_string()),
21434                    "second-a".to_string(),
21435                ),
21436            ]
21437        );
21438    }
21439
21440    #[test]
21441    fn stream_messages_for_lexical_rebuild_between_conversation_ids_respects_upper_bound() {
21442        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
21443        use std::path::PathBuf;
21444
21445        let dir = TempDir::new().unwrap();
21446        let db_path = dir.path().join("agent_search.db");
21447        let storage = SqliteStorage::open(&db_path).unwrap();
21448
21449        let agent = Agent {
21450            id: None,
21451            slug: "claude_code".into(),
21452            name: "Claude Code".into(),
21453            version: Some("1.2.3".into()),
21454            kind: AgentKind::Cli,
21455        };
21456        let agent_id = storage.ensure_agent(&agent).unwrap();
21457
21458        let first = Conversation {
21459            id: None,
21460            agent_slug: "claude_code".into(),
21461            workspace: Some(PathBuf::from("/tmp/workspace")),
21462            external_id: Some("lexical-range-1".into()),
21463            title: Some("Lexical range 1".into()),
21464            source_path: PathBuf::from("/tmp/lexical-range-1.jsonl"),
21465            started_at: Some(1_700_000_000_000),
21466            ended_at: Some(1_700_000_000_100),
21467            approx_tokens: Some(42),
21468            metadata_json: serde_json::Value::Null,
21469            messages: vec![Message {
21470                id: None,
21471                idx: 0,
21472                role: MessageRole::User,
21473                author: Some("user".into()),
21474                created_at: Some(1_700_000_000_010),
21475                content: "first-only".into(),
21476                extra_json: serde_json::json!({"opaque": true}),
21477                snippets: Vec::new(),
21478            }],
21479            source_id: LOCAL_SOURCE_ID.into(),
21480            origin_host: None,
21481        };
21482
21483        let second = Conversation {
21484            id: None,
21485            agent_slug: "claude_code".into(),
21486            workspace: Some(PathBuf::from("/tmp/workspace")),
21487            external_id: Some("lexical-range-2".into()),
21488            title: Some("Lexical range 2".into()),
21489            source_path: PathBuf::from("/tmp/lexical-range-2.jsonl"),
21490            started_at: Some(1_700_000_000_200),
21491            ended_at: Some(1_700_000_000_300),
21492            approx_tokens: Some(84),
21493            metadata_json: serde_json::Value::Null,
21494            messages: vec![Message {
21495                id: None,
21496                idx: 0,
21497                role: MessageRole::Tool,
21498                author: Some("tool".into()),
21499                created_at: Some(1_700_000_000_210),
21500                content: "second-should-not-appear".into(),
21501                extra_json: serde_json::json!({"opaque": true}),
21502                snippets: Vec::new(),
21503            }],
21504            source_id: LOCAL_SOURCE_ID.into(),
21505            origin_host: None,
21506        };
21507
21508        let first_id = storage
21509            .insert_conversation_tree(agent_id, None, &first)
21510            .unwrap()
21511            .conversation_id;
21512        let second_id = storage
21513            .insert_conversation_tree(agent_id, None, &second)
21514            .unwrap()
21515            .conversation_id;
21516
21517        let mut streamed = Vec::new();
21518        storage
21519            .stream_messages_for_lexical_rebuild_between_conversation_ids(
21520                first_id,
21521                first_id,
21522                |row| {
21523                    streamed.push((row.conversation_id, row.idx, row.content));
21524                    Ok(())
21525                },
21526            )
21527            .unwrap();
21528
21529        assert_eq!(streamed, vec![(first_id, 0, "first-only".to_string())]);
21530        assert!(
21531            streamed
21532                .iter()
21533                .all(|(conversation_id, _, _)| *conversation_id != second_id),
21534            "upper bound should exclude later conversation ids"
21535        );
21536    }
21537
21538    #[test]
21539    fn stream_messages_for_lexical_rebuild_between_conversation_ids_handles_mixed_ranges() {
21540        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
21541        use std::path::PathBuf;
21542
21543        let dir = TempDir::new().unwrap();
21544        let db_path = dir.path().join("agent_search.db");
21545        let storage = SqliteStorage::open(&db_path).unwrap();
21546
21547        let claude_agent_id = storage
21548            .ensure_agent(&Agent {
21549                id: None,
21550                slug: "claude_code".into(),
21551                name: "Claude Code".into(),
21552                version: None,
21553                kind: AgentKind::Cli,
21554            })
21555            .unwrap();
21556        let aider_agent_id = storage
21557            .ensure_agent(&Agent {
21558                id: None,
21559                slug: "aider".into(),
21560                name: "Aider".into(),
21561                version: None,
21562                kind: AgentKind::Cli,
21563            })
21564            .unwrap();
21565
21566        type MessageSpec = (i64, MessageRole, Option<String>, Option<i64>, String);
21567
21568        let mut expected = Vec::new();
21569        let mut first_conversation_id = None;
21570        let mut last_conversation_id = None;
21571        let mut insert_conversation =
21572            |agent_id: i64,
21573             external_id: &str,
21574             title: &str,
21575             source_path: &str,
21576             started_at: i64,
21577             message_specs: Vec<MessageSpec>| {
21578                let conversation = Conversation {
21579                    id: None,
21580                    agent_slug: if agent_id == aider_agent_id {
21581                        "aider".into()
21582                    } else {
21583                        "claude_code".into()
21584                    },
21585                    workspace: Some(PathBuf::from("/tmp/workspace")),
21586                    external_id: Some(external_id.to_string()),
21587                    title: Some(title.to_string()),
21588                    source_path: PathBuf::from(source_path),
21589                    started_at: Some(started_at),
21590                    ended_at: Some(started_at + 100),
21591                    approx_tokens: None,
21592                    metadata_json: serde_json::Value::Null,
21593                    messages: message_specs
21594                        .iter()
21595                        .map(|(idx, role, author, created_at, content)| Message {
21596                            id: None,
21597                            idx: *idx,
21598                            role: role.clone(),
21599                            author: author.clone(),
21600                            created_at: *created_at,
21601                            content: content.clone(),
21602                            extra_json: serde_json::Value::Null,
21603                            snippets: Vec::new(),
21604                        })
21605                        .collect(),
21606                    source_id: LOCAL_SOURCE_ID.into(),
21607                    origin_host: None,
21608                };
21609                let conversation_id = storage
21610                    .insert_conversation_tree(agent_id, None, &conversation)
21611                    .unwrap()
21612                    .conversation_id;
21613                if first_conversation_id.is_none() {
21614                    first_conversation_id = Some(conversation_id);
21615                }
21616                last_conversation_id = Some(conversation_id);
21617                expected.extend(message_specs.into_iter().map(
21618                    |(idx, role, author, created_at, content)| {
21619                        (
21620                            conversation_id,
21621                            idx,
21622                            match role {
21623                                MessageRole::User => "user".to_string(),
21624                                MessageRole::Agent => "agent".to_string(),
21625                                MessageRole::Tool => "tool".to_string(),
21626                                MessageRole::System => "system".to_string(),
21627                                MessageRole::Other(other) => other,
21628                            },
21629                            author,
21630                            created_at,
21631                            content,
21632                        )
21633                    },
21634                ));
21635            };
21636
21637        for (label, base_ts) in [
21638            ("alpha", 1_700_000_000_000_i64),
21639            ("beta", 1_700_000_001_000_i64),
21640            ("gamma", 1_700_000_002_000_i64),
21641            ("delta", 1_700_000_003_000_i64),
21642            ("epsilon", 1_700_000_004_000_i64),
21643        ] {
21644            insert_conversation(
21645                claude_agent_id,
21646                &format!("lexical-{label}"),
21647                &format!("Lexical {label}"),
21648                &format!("/tmp/{label}.jsonl"),
21649                base_ts,
21650                vec![
21651                    (
21652                        0,
21653                        MessageRole::User,
21654                        None,
21655                        Some(base_ts + 10),
21656                        format!("{label}_content"),
21657                    ),
21658                    (
21659                        1,
21660                        MessageRole::Agent,
21661                        None,
21662                        Some(base_ts + 20),
21663                        format!("{label}_content_response"),
21664                    ),
21665                ],
21666            );
21667        }
21668
21669        insert_conversation(
21670            aider_agent_id,
21671            "lexical-aider-history",
21672            "Aider Chat: coding_agent_session_search",
21673            "/tmp/.aider.chat.history.md",
21674            1_764_619_673_394,
21675            vec![
21676                (
21677                    0,
21678                    MessageRole::System,
21679                    Some("system".to_string()),
21680                    None,
21681                    "# aider chat started at 2025-12-01 20:07:47".to_string(),
21682                ),
21683                (
21684                    1,
21685                    MessageRole::User,
21686                    Some("user".to_string()),
21687                    None,
21688                    "/tmp/workspace/.venv/bin/aider --no-git --message hello world".to_string(),
21689                ),
21690            ],
21691        );
21692        insert_conversation(
21693            aider_agent_id,
21694            "lexical-aider-fixture",
21695            "Aider Chat: aider",
21696            "/tmp/tests/fixtures/aider/.aider.chat.history.md",
21697            1_764_621_401_399,
21698            vec![
21699                (
21700                    0,
21701                    MessageRole::User,
21702                    Some("user".to_string()),
21703                    None,
21704                    "/add src/main.rs".to_string(),
21705                ),
21706                (
21707                    1,
21708                    MessageRole::Agent,
21709                    Some("assistant".to_string()),
21710                    None,
21711                    "Added src/main.rs to the chat.
21712
21713#### /add src/main.rs"
21714                        .to_string(),
21715                ),
21716                (
21717                    2,
21718                    MessageRole::User,
21719                    Some("user".to_string()),
21720                    None,
21721                    "Please refactor.".to_string(),
21722                ),
21723                (
21724                    3,
21725                    MessageRole::Agent,
21726                    Some("assistant".to_string()),
21727                    None,
21728                    "Sure, here is the code.".to_string(),
21729                ),
21730            ],
21731        );
21732
21733        let mut streamed = Vec::new();
21734        storage
21735            .stream_messages_for_lexical_rebuild_between_conversation_ids(
21736                first_conversation_id.unwrap(),
21737                last_conversation_id.unwrap(),
21738                |row| {
21739                    streamed.push((
21740                        row.conversation_id,
21741                        row.idx,
21742                        row.role,
21743                        row.author,
21744                        row.created_at,
21745                        row.content,
21746                    ));
21747                    Ok(())
21748                },
21749            )
21750            .unwrap();
21751
21752        assert_eq!(streamed, expected);
21753    }
21754
21755    #[test]
21756    fn lexical_rebuild_stream_queries_use_rowid_and_per_conversation_probes() {
21757        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
21758        use std::path::PathBuf;
21759
21760        let dir = TempDir::new().unwrap();
21761        let db_path = dir.path().join("agent_search.db");
21762        let storage = SqliteStorage::open(&db_path).unwrap();
21763
21764        let agent = Agent {
21765            id: None,
21766            slug: "claude_code".into(),
21767            name: "Claude Code".into(),
21768            version: None,
21769            kind: AgentKind::Cli,
21770        };
21771        let agent_id = storage.ensure_agent(&agent).unwrap();
21772
21773        for (external_id, base_ts) in [
21774            ("conv-1", 1_700_000_000_000_i64),
21775            ("conv-2", 1_700_000_001_000_i64),
21776        ] {
21777            let conversation = Conversation {
21778                id: None,
21779                agent_slug: "claude_code".into(),
21780                workspace: Some(PathBuf::from("/tmp/workspace")),
21781                external_id: Some(external_id.to_string()),
21782                title: Some("Lexical rebuild".into()),
21783                source_path: PathBuf::from(format!("/tmp/{external_id}.jsonl")),
21784                started_at: Some(base_ts),
21785                ended_at: Some(base_ts + 100),
21786                approx_tokens: None,
21787                metadata_json: serde_json::Value::Null,
21788                messages: vec![
21789                    Message {
21790                        id: None,
21791                        idx: 0,
21792                        role: MessageRole::User,
21793                        author: Some("user".into()),
21794                        created_at: Some(base_ts + 10),
21795                        content: format!("{external_id}-first"),
21796                        extra_json: serde_json::Value::Null,
21797                        snippets: Vec::new(),
21798                    },
21799                    Message {
21800                        id: None,
21801                        idx: 1,
21802                        role: MessageRole::Agent,
21803                        author: Some("assistant".into()),
21804                        created_at: Some(base_ts + 20),
21805                        content: format!("{external_id}-second"),
21806                        extra_json: serde_json::Value::Null,
21807                        snippets: Vec::new(),
21808                    },
21809                ],
21810                source_id: LOCAL_SOURCE_ID.into(),
21811                origin_host: None,
21812            };
21813            storage
21814                .insert_conversation_tree(agent_id, None, &conversation)
21815                .unwrap();
21816        }
21817
21818        let first_id: i64 = storage
21819            .conn
21820            .query_row_map(
21821                "SELECT id FROM conversations ORDER BY id LIMIT 1",
21822                fparams![],
21823                |row| row.get_typed(0),
21824            )
21825            .unwrap();
21826        let last_id: i64 = storage
21827            .conn
21828            .query_row_map(
21829                "SELECT id FROM conversations ORDER BY id DESC LIMIT 1",
21830                fparams![],
21831                |row| row.get_typed(0),
21832            )
21833            .unwrap();
21834
21835        let conversation_plan_details: Vec<String> = storage
21836            .conn
21837            .query_map_collect(
21838                "EXPLAIN QUERY PLAN                  SELECT id FROM conversations                  WHERE id >= ?1 AND id <= ?2                  ORDER BY id ASC",
21839                fparams![first_id, last_id],
21840                |row| row.get_typed(3),
21841            )
21842            .unwrap();
21843        assert!(
21844            !conversation_plan_details
21845                .iter()
21846                .any(|detail| detail.contains("TEMP B-TREE")),
21847            "expected streamed lexical rebuild conversation listing to avoid sorter temp b-trees, got {conversation_plan_details:?}"
21848        );
21849
21850        let message_plan_details: Vec<String> = storage
21851            .conn
21852            .query_map_collect(
21853                "EXPLAIN QUERY PLAN                  SELECT id, idx, role, author, created_at, content                  FROM messages INDEXED BY sqlite_autoindex_messages_1                  WHERE conversation_id = ?1                  ORDER BY idx",
21854                fparams![first_id],
21855                |row| row.get_typed(3),
21856            )
21857            .unwrap();
21858        assert!(
21859            message_plan_details
21860                .iter()
21861                .any(|detail| detail.contains("sqlite_autoindex_messages_1")
21862                    || detail.contains("idx_messages_conv_idx")),
21863            "expected per-conversation lexical rebuild fetch to use the conversation_id/idx index, got {message_plan_details:?}"
21864        );
21865        assert!(
21866            !message_plan_details
21867                .iter()
21868                .any(|detail| detail.contains("TEMP B-TREE")),
21869            "expected per-conversation lexical rebuild fetch to avoid sorter temp b-trees, got {message_plan_details:?}"
21870        );
21871    }
21872
21873    #[test]
21874    fn discover_historical_database_bundles_prefers_larger_archives_first() {
21875        let dir = TempDir::new().unwrap();
21876        let canonical_db = dir.path().join("agent_search.db");
21877        fs::write(&canonical_db, b"canonical").unwrap();
21878
21879        let smaller = dir.path().join("agent_search.corrupt.small");
21880        fs::write(&smaller, vec![0_u8; 32]).unwrap();
21881
21882        let backups_dir = dir.path().join("backups");
21883        fs::create_dir_all(&backups_dir).unwrap();
21884        let larger = backups_dir.join("agent_search.db.20260322T020200.bak");
21885        fs::write(&larger, vec![0_u8; 128]).unwrap();
21886
21887        let bundles = discover_historical_database_bundles(&canonical_db);
21888        let ordered_paths: Vec<PathBuf> =
21889            bundles.into_iter().map(|bundle| bundle.root_path).collect();
21890
21891        assert_eq!(ordered_paths, vec![larger, smaller]);
21892    }
21893
21894    #[test]
21895    fn discover_historical_database_bundles_prefers_queryable_direct_bundles_first() {
21896        let dir = TempDir::new().unwrap();
21897        let canonical_db = dir.path().join("agent_search.db");
21898        fs::write(&canonical_db, b"canonical").unwrap();
21899
21900        let larger_corrupt = dir.path().join("agent_search.corrupt.20260324_212907");
21901        fs::write(&larger_corrupt, vec![0_u8; 4096]).unwrap();
21902
21903        let backups_dir = dir.path().join("backups");
21904        fs::create_dir_all(&backups_dir).unwrap();
21905        let smaller_healthy = backups_dir.join("agent_search.db.20260322T020200.bak");
21906        let conn = FrankenConnection::open(smaller_healthy.to_string_lossy().into_owned()).unwrap();
21907        conn.execute_batch(
21908            "CREATE TABLE conversations (id INTEGER PRIMARY KEY, source_path TEXT);
21909             CREATE TABLE messages (
21910                 id INTEGER PRIMARY KEY,
21911                 conversation_id INTEGER NOT NULL,
21912                 idx INTEGER NOT NULL,
21913                 content TEXT
21914             );
21915             INSERT INTO conversations(id, source_path) VALUES (1, '/tmp/history.jsonl');
21916             INSERT INTO messages(id, conversation_id, idx, content)
21917             VALUES (1, 1, 0, 'seed');",
21918        )
21919        .unwrap();
21920        drop(conn);
21921
21922        let bundles = discover_historical_database_bundles(&canonical_db);
21923        let ordered_paths: Vec<PathBuf> = bundles
21924            .iter()
21925            .map(|bundle| bundle.root_path.clone())
21926            .collect();
21927
21928        assert_eq!(ordered_paths, vec![smaller_healthy, larger_corrupt]);
21929        assert!(bundles[0].supports_direct_readonly);
21930        assert!(!bundles[1].supports_direct_readonly);
21931    }
21932
21933    #[test]
21934    fn salvage_historical_databases_skips_unreadable_quarantined_bundles() {
21935        let dir = TempDir::new().unwrap();
21936        let canonical_db = dir.path().join("agent_search.db");
21937        let storage = SqliteStorage::open(&canonical_db).unwrap();
21938
21939        let quarantined = dir.path().join("agent_search.corrupt.20260324_212907");
21940        fs::write(&quarantined, b"not a sqlite database").unwrap();
21941
21942        let discovered: Vec<PathBuf> = discover_historical_database_bundles(&canonical_db)
21943            .into_iter()
21944            .map(|bundle| bundle.root_path)
21945            .collect();
21946        assert_eq!(discovered, vec![quarantined]);
21947
21948        let outcome = storage.salvage_historical_databases(&canonical_db).unwrap();
21949        assert_eq!(outcome.bundles_considered, 1);
21950        assert_eq!(outcome.bundles_imported, 0);
21951        assert_eq!(outcome.conversations_imported, 0);
21952        assert_eq!(outcome.messages_imported, 0);
21953        assert!(storage.list_conversations(10, 0).unwrap().is_empty());
21954    }
21955
21956    #[test]
21957    fn discover_historical_database_bundles_includes_repair_lab_and_snapshots_named_roots() {
21958        let dir = TempDir::new().unwrap();
21959        let canonical_db = dir.path().join("agent_search.db");
21960        fs::write(&canonical_db, b"canonical").unwrap();
21961
21962        let repair_lab_dir = dir.path().join("repair-lab").join("live-copy");
21963        fs::create_dir_all(&repair_lab_dir).unwrap();
21964        let repair_lab_db = repair_lab_dir.join("agent_search.db");
21965        fs::write(&repair_lab_db, vec![0_u8; 96]).unwrap();
21966        fs::write(
21967            repair_lab_dir.join("agent_search.rebuild-test.db"),
21968            vec![0_u8; 192],
21969        )
21970        .unwrap();
21971
21972        let snapshots_dir = dir.path().join("snapshots").join("20260324T013201Z");
21973        fs::create_dir_all(&snapshots_dir).unwrap();
21974        let snapshot_db = snapshots_dir.join("agent_search.db");
21975        fs::write(&snapshot_db, vec![0_u8; 64]).unwrap();
21976
21977        let bundles = discover_historical_database_bundles(&canonical_db);
21978        let ordered_paths: Vec<PathBuf> =
21979            bundles.into_iter().map(|bundle| bundle.root_path).collect();
21980
21981        assert!(ordered_paths.contains(&repair_lab_db));
21982        assert!(ordered_paths.contains(&snapshot_db));
21983        assert!(
21984            !ordered_paths
21985                .iter()
21986                .any(|path| path.file_name().and_then(|name| name.to_str())
21987                    == Some("agent_search.rebuild-test.db"))
21988        );
21989    }
21990
21991    #[test]
21992    fn discover_historical_database_bundles_prefers_healthy_backup_over_replay_priority() {
21993        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
21994
21995        let dir = TempDir::new().unwrap();
21996        let canonical_db = dir.path().join("agent_search.db");
21997        fs::write(&canonical_db, b"canonical").unwrap();
21998
21999        let replay_dir = dir
22000            .path()
22001            .join("repair-lab")
22002            .join("replay-20260324T070101Z");
22003        fs::create_dir_all(&replay_dir).unwrap();
22004        let replay_db = replay_dir.join("agent_search.db");
22005        let replay_storage = SqliteStorage::open(&replay_db).unwrap();
22006        let agent = Agent {
22007            id: None,
22008            slug: "codex".into(),
22009            name: "Codex".into(),
22010            version: Some("0.2.3".into()),
22011            kind: AgentKind::Cli,
22012        };
22013        let agent_id = replay_storage.ensure_agent(&agent).unwrap();
22014        let conversation = Conversation {
22015            id: None,
22016            agent_slug: "codex".into(),
22017            workspace: Some(PathBuf::from("/tmp/workspace")),
22018            external_id: Some("replay-conv".into()),
22019            title: Some("Replay bundle".into()),
22020            source_path: PathBuf::from("/tmp/replay.jsonl"),
22021            started_at: Some(1_700_000_000_000),
22022            ended_at: Some(1_700_000_000_100),
22023            approx_tokens: Some(42),
22024            metadata_json: serde_json::Value::Null,
22025            messages: vec![Message {
22026                id: None,
22027                idx: 0,
22028                role: MessageRole::Agent,
22029                author: Some("assistant".into()),
22030                created_at: Some(1_700_000_000_050),
22031                content: "replay message".into(),
22032                extra_json: serde_json::Value::Null,
22033                snippets: Vec::new(),
22034            }],
22035            source_id: LOCAL_SOURCE_ID.into(),
22036            origin_host: None,
22037        };
22038        replay_storage
22039            .insert_conversation_tree(agent_id, None, &conversation)
22040            .unwrap();
22041        drop(replay_storage);
22042
22043        let duplicate_legacy_fts_sql = "CREATE VIRTUAL TABLE fts_messages USING fts5(content, title, agent, workspace, source_path, created_at UNINDEXED, message_id UNINDEXED, tokenize='porter')";
22044        let replay_legacy = rusqlite_test_fixture_conn(&replay_db);
22045        replay_legacy
22046            .execute_batch(
22047                "UPDATE meta SET value = '13' WHERE key = 'schema_version';
22048                 DELETE FROM _schema_migrations WHERE version = 14;
22049                 PRAGMA writable_schema = ON;",
22050            )
22051            .unwrap();
22052        replay_legacy
22053            .execute(
22054                "DELETE FROM meta WHERE key = ?1",
22055                [FTS_FRANKEN_REBUILD_META_KEY],
22056            )
22057            .unwrap();
22058        replay_legacy
22059            .execute(
22060                "INSERT INTO sqlite_master(type, name, tbl_name, rootpage, sql)
22061                 VALUES('table', 'fts_messages', 'fts_messages', 0, ?1)",
22062                [duplicate_legacy_fts_sql],
22063            )
22064            .unwrap();
22065        replay_legacy
22066            .execute_batch("PRAGMA writable_schema = OFF;")
22067            .unwrap();
22068        drop(replay_legacy);
22069
22070        let backups_dir = dir.path().join("backups");
22071        fs::create_dir_all(&backups_dir).unwrap();
22072        let clean_backup = backups_dir.join("agent_search.db.20260322T020200.bak");
22073        let clean_storage = SqliteStorage::open(&clean_backup).unwrap();
22074        let clean_agent_id = clean_storage.ensure_agent(&agent).unwrap();
22075        clean_storage
22076            .insert_conversation_tree(clean_agent_id, None, &conversation)
22077            .unwrap();
22078        drop(clean_storage);
22079
22080        let bundles = discover_historical_database_bundles(&canonical_db);
22081        let ordered_paths: Vec<PathBuf> = bundles
22082            .iter()
22083            .map(|bundle| bundle.root_path.clone())
22084            .collect();
22085
22086        assert_eq!(ordered_paths[0], clean_backup);
22087        assert_eq!(ordered_paths[1], replay_db);
22088        assert_eq!(
22089            bundles[0].probe.schema_version,
22090            Some(CURRENT_SCHEMA_VERSION)
22091        );
22092        // Post-V14 cass drops the fts_messages virtual table during migration
22093        // and recreates it lazily on first open, so a freshly-migrated "clean"
22094        // backup has zero fts_messages rows in sqlite_master. The bundle is
22095        // still ranked as healthy by `bundle_health_rank` because 0 rows is a
22096        // legitimate lazy-FTS state (see comment there).
22097        assert_eq!(bundles[0].probe.fts_schema_rows, Some(0));
22098        // `fts_queryable` mirrors a direct rusqlite probe; with 0 sqlite_master
22099        // rows the table isn't queryable until lazy repair runs.
22100        assert!(!bundles[0].probe.fts_queryable);
22101        assert_eq!(bundles[1].probe.schema_version, Some(13));
22102        // The replay bundle had V14 run (dropping fts_messages → 0 rows), then
22103        // the test rolls meta.schema_version back to 13, deletes the V14
22104        // marker, and manually injects a duplicate sqlite_master row. Net
22105        // result: one synthetic (malformed) fts_messages entry.
22106        assert_eq!(bundles[1].probe.fts_schema_rows, Some(1));
22107    }
22108
22109    #[test]
22110    fn ensure_fts_consistency_via_rusqlite_catches_up_missing_rows() {
22111        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
22112
22113        let dir = TempDir::new().unwrap();
22114        let db_path = dir.path().join("fts-catchup.db");
22115        let storage = SqliteStorage::open(&db_path).unwrap();
22116        let agent = Agent {
22117            id: None,
22118            slug: "codex".into(),
22119            name: "Codex".into(),
22120            version: Some("0.2.3".into()),
22121            kind: AgentKind::Cli,
22122        };
22123        let agent_id = storage.ensure_agent(&agent).unwrap();
22124        let conversation = Conversation {
22125            id: None,
22126            agent_slug: "codex".into(),
22127            workspace: Some(PathBuf::from("/tmp/workspace")),
22128            external_id: Some("fts-catchup".into()),
22129            title: Some("FTS catchup".into()),
22130            source_path: PathBuf::from("/tmp/fts-catchup.jsonl"),
22131            started_at: Some(1_700_000_000_000),
22132            ended_at: Some(1_700_000_000_100),
22133            approx_tokens: Some(42),
22134            metadata_json: serde_json::Value::Null,
22135            messages: vec![Message {
22136                id: None,
22137                idx: 0,
22138                role: MessageRole::User,
22139                author: Some("user".into()),
22140                created_at: Some(1_700_000_000_050),
22141                content: "initial message".into(),
22142                extra_json: serde_json::Value::Null,
22143                snippets: Vec::new(),
22144            }],
22145            source_id: LOCAL_SOURCE_ID.into(),
22146            origin_host: None,
22147        };
22148        storage
22149            .insert_conversation_tree(agent_id, None, &conversation)
22150            .unwrap();
22151        drop(storage);
22152
22153        rebuild_fts_via_rusqlite(&db_path).unwrap();
22154
22155        let conn = FrankenConnection::open(db_path.to_string_lossy().into_owned()).unwrap();
22156        let conversation_id: i64 = conn
22157            .query_row_map("SELECT id FROM conversations LIMIT 1", fparams![], |row| {
22158                row.get_typed(0)
22159            })
22160            .unwrap();
22161        conn.execute_compat(
22162            "INSERT INTO messages(id, conversation_id, idx, role, author, created_at, content, extra_json, extra_bin)
22163             VALUES(2, ?1, 1, 'assistant', 'assistant', 1700000000060, 'authentication catchup', NULL, NULL)",
22164            fparams![conversation_id],
22165        )
22166        .unwrap();
22167        drop(conn);
22168
22169        let repair = ensure_fts_consistency_via_rusqlite(&db_path).unwrap();
22170        assert_eq!(
22171            repair,
22172            FtsConsistencyRepair::IncrementalCatchUp {
22173                inserted_rows: 1,
22174                total_rows: 2
22175            }
22176        );
22177
22178        let conn = FrankenConnection::open(db_path.to_string_lossy().into_owned()).unwrap();
22179        let auth_rows: i64 = conn
22180            .query_row_map(
22181                "SELECT COUNT(*) FROM fts_messages WHERE rowid = 2",
22182                fparams![],
22183                |row| row.get_typed(0),
22184            )
22185            .unwrap();
22186        assert_eq!(auth_rows, 1);
22187    }
22188
22189    #[test]
22190    fn rebuild_fts_via_rusqlite_cleans_duplicate_legacy_schema_rows() {
22191        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
22192
22193        let dir = TempDir::new().unwrap();
22194        let db_path = dir.path().join("fts-duplicate-rebuild.db");
22195
22196        let storage = SqliteStorage::open(&db_path).unwrap();
22197        let agent = Agent {
22198            id: None,
22199            slug: "codex".into(),
22200            name: "Codex".into(),
22201            version: Some("0.2.3".into()),
22202            kind: AgentKind::Cli,
22203        };
22204        let agent_id = storage.ensure_agent(&agent).unwrap();
22205        let conversation = Conversation {
22206            id: None,
22207            agent_slug: "codex".into(),
22208            workspace: Some(PathBuf::from("/ws")),
22209            external_id: Some("retro".into()),
22210            title: Some("retro".into()),
22211            source_path: PathBuf::from("/tmp/retro.jsonl"),
22212            started_at: Some(42),
22213            ended_at: Some(42),
22214            approx_tokens: None,
22215            metadata_json: serde_json::Value::Null,
22216            messages: vec![Message {
22217                id: None,
22218                idx: 0,
22219                role: MessageRole::User,
22220                author: None,
22221                created_at: Some(42),
22222                content: "retro investigation".into(),
22223                extra_json: serde_json::Value::Null,
22224                snippets: Vec::new(),
22225            }],
22226            source_id: LOCAL_SOURCE_ID.into(),
22227            origin_host: None,
22228        };
22229        storage
22230            .insert_conversation_tree(agent_id, None, &conversation)
22231            .unwrap();
22232        drop(storage);
22233        materialize_fresh_fts_schema_via_rusqlite(&db_path).unwrap();
22234
22235        let conn = rusqlite_test_fixture_conn(&db_path);
22236        conn.execute_batch("PRAGMA writable_schema = ON;").unwrap();
22237        conn.execute(
22238            "INSERT INTO sqlite_master(type, name, tbl_name, rootpage, sql)
22239             VALUES('table', 'fts_messages', 'fts_messages', 0, ?1)",
22240            ["CREATE VIRTUAL TABLE fts_messages USING fts5(content, title, agent, workspace, source_path, created_at UNINDEXED, message_id UNINDEXED, tokenize='porter')"],
22241        )
22242        .unwrap();
22243        conn.execute_batch("PRAGMA writable_schema = OFF;").unwrap();
22244        let duplicate_rows: i64 = conn
22245            .query_row(
22246                "SELECT COUNT(*) FROM sqlite_master WHERE name = 'fts_messages'",
22247                [],
22248                |row| row.get(0),
22249            )
22250            .unwrap();
22251        assert_eq!(duplicate_rows, 2);
22252        drop(conn);
22253
22254        let inserted = rebuild_fts_via_rusqlite(&db_path).unwrap();
22255        assert_eq!(inserted, 1);
22256
22257        let conn = FrankenConnection::open(db_path.to_string_lossy().into_owned()).unwrap();
22258        let schema_rows = franken_fts_schema_rows(&conn).unwrap();
22259        assert_eq!(
22260            schema_rows, 1,
22261            "DROP TABLE should leave one clean FTS schema"
22262        );
22263        let match_count: i64 = conn
22264            .query_row_map("SELECT COUNT(*) FROM fts_messages", fparams![], |row| {
22265                row.get_typed(0)
22266            })
22267            .unwrap();
22268        assert_eq!(match_count, 1);
22269    }
22270
22271    // =========================================================================
22272    // Agent storage tests (bead yln.4)
22273    // =========================================================================
22274
22275    #[test]
22276    fn ensure_agent_creates_new() {
22277        let dir = TempDir::new().unwrap();
22278        let db_path = dir.path().join("test.db");
22279        let storage = SqliteStorage::open(&db_path).unwrap();
22280
22281        let agent = Agent {
22282            id: None,
22283            slug: "test_agent".into(),
22284            name: "Test Agent".into(),
22285            version: Some("1.0".into()),
22286            kind: AgentKind::Cli,
22287        };
22288
22289        let id = storage.ensure_agent(&agent).unwrap();
22290        assert!(id > 0);
22291    }
22292
22293    #[test]
22294    fn ensure_agent_returns_existing_id() {
22295        let dir = TempDir::new().unwrap();
22296        let db_path = dir.path().join("test.db");
22297        let storage = SqliteStorage::open(&db_path).unwrap();
22298
22299        let agent = Agent {
22300            id: None,
22301            slug: "codex".into(),
22302            name: "Codex".into(),
22303            version: None,
22304            kind: AgentKind::Cli,
22305        };
22306
22307        let id1 = storage.ensure_agent(&agent).unwrap();
22308        let id2 = storage.ensure_agent(&agent).unwrap();
22309        assert_eq!(id1, id2);
22310    }
22311
22312    #[test]
22313    fn ensure_agent_unchanged_preserves_updated_at() {
22314        let dir = TempDir::new().unwrap();
22315        let db_path = dir.path().join("test.db");
22316        let storage = SqliteStorage::open(&db_path).unwrap();
22317
22318        let agent = Agent {
22319            id: None,
22320            slug: "codex".into(),
22321            name: "Codex".into(),
22322            version: Some("1.0".into()),
22323            kind: AgentKind::Cli,
22324        };
22325
22326        storage.ensure_agent(&agent).unwrap();
22327        let initial_updated_at: i64 = storage
22328            .conn
22329            .query_row_map(
22330                "SELECT updated_at FROM agents WHERE slug = ?1",
22331                fparams![agent.slug.as_str()],
22332                |row| row.get_typed(0),
22333            )
22334            .unwrap();
22335        std::thread::sleep(std::time::Duration::from_millis(5));
22336
22337        storage.ensure_agent(&agent).unwrap();
22338        let fetched_updated_at: i64 = storage
22339            .conn
22340            .query_row_map(
22341                "SELECT updated_at FROM agents WHERE slug = ?1",
22342                fparams![agent.slug.as_str()],
22343                |row| row.get_typed(0),
22344            )
22345            .unwrap();
22346
22347        assert_eq!(fetched_updated_at, initial_updated_at);
22348    }
22349
22350    #[test]
22351    fn ensure_agent_changed_metadata_updates_cached_slug() {
22352        let dir = TempDir::new().unwrap();
22353        let db_path = dir.path().join("test.db");
22354        let storage = SqliteStorage::open(&db_path).unwrap();
22355
22356        let mut agent = Agent {
22357            id: None,
22358            slug: "codex".into(),
22359            name: "Codex".into(),
22360            version: Some("1.0".into()),
22361            kind: AgentKind::Cli,
22362        };
22363
22364        let id1 = storage.ensure_agent(&agent).unwrap();
22365        agent.name = "Codex CLI".into();
22366        agent.version = Some("1.1".into());
22367        let id2 = storage.ensure_agent(&agent).unwrap();
22368
22369        let fetched: (String, Option<String>) = storage
22370            .conn
22371            .query_row_map(
22372                "SELECT name, version FROM agents WHERE slug = ?1",
22373                fparams![agent.slug.as_str()],
22374                |row| Ok((row.get_typed(0)?, row.get_typed(1)?)),
22375            )
22376            .unwrap();
22377
22378        assert_eq!(id1, id2);
22379        assert_eq!(fetched, ("Codex CLI".into(), Some("1.1".into())));
22380    }
22381
22382    #[test]
22383    fn list_agents_returns_inserted() {
22384        let dir = TempDir::new().unwrap();
22385        let db_path = dir.path().join("test.db");
22386        let storage = SqliteStorage::open(&db_path).unwrap();
22387
22388        let agent = Agent {
22389            id: None,
22390            slug: "new_agent".into(),
22391            name: "New Agent".into(),
22392            version: None,
22393            kind: AgentKind::VsCode,
22394        };
22395        storage.ensure_agent(&agent).unwrap();
22396
22397        let agents = storage.list_agents().unwrap();
22398        assert!(agents.iter().any(|a| a.slug == "new_agent"));
22399    }
22400
22401    // =========================================================================
22402    // Workspace storage tests (bead yln.4)
22403    // =========================================================================
22404
22405    #[test]
22406    fn ensure_workspace_creates_new() {
22407        let dir = TempDir::new().unwrap();
22408        let db_path = dir.path().join("test.db");
22409        let storage = SqliteStorage::open(&db_path).unwrap();
22410
22411        let id = storage
22412            .ensure_workspace(Path::new("/home/user/project"), Some("My Project"))
22413            .unwrap();
22414        assert!(id > 0);
22415    }
22416
22417    #[test]
22418    fn ensure_workspace_returns_existing() {
22419        let dir = TempDir::new().unwrap();
22420        let db_path = dir.path().join("test.db");
22421        let storage = SqliteStorage::open(&db_path).unwrap();
22422
22423        let path = Path::new("/home/user/myproject");
22424        let id1 = storage.ensure_workspace(path, None).unwrap();
22425        let id2 = storage.ensure_workspace(path, None).unwrap();
22426        assert_eq!(id1, id2);
22427    }
22428
22429    #[test]
22430    fn ensure_workspace_changed_display_name_updates_cached_path() {
22431        let dir = TempDir::new().unwrap();
22432        let db_path = dir.path().join("test.db");
22433        let storage = SqliteStorage::open(&db_path).unwrap();
22434
22435        let path = Path::new("/home/user/myproject");
22436        let id1 = storage.ensure_workspace(path, Some("Before")).unwrap();
22437        let id2 = storage.ensure_workspace(path, Some("After")).unwrap();
22438
22439        let display_name: Option<String> = storage
22440            .conn
22441            .query_row_map(
22442                "SELECT display_name FROM workspaces WHERE path = ?1",
22443                fparams![path.to_string_lossy().as_ref()],
22444                |row| row.get_typed(0),
22445            )
22446            .unwrap();
22447
22448        assert_eq!(id1, id2);
22449        assert_eq!(display_name.as_deref(), Some("After"));
22450    }
22451
22452    #[test]
22453    fn list_workspaces_returns_inserted() {
22454        let dir = TempDir::new().unwrap();
22455        let db_path = dir.path().join("test.db");
22456        let storage = SqliteStorage::open(&db_path).unwrap();
22457
22458        storage
22459            .ensure_workspace(Path::new("/test/workspace"), Some("Test WS"))
22460            .unwrap();
22461
22462        let workspaces = storage.list_workspaces().unwrap();
22463        assert!(
22464            workspaces
22465                .iter()
22466                .any(|w| w.path.to_str() == Some("/test/workspace"))
22467        );
22468    }
22469
22470    // =========================================================================
22471    // Source storage tests (bead yln.4)
22472    // =========================================================================
22473
22474    #[test]
22475    fn upsert_source_creates_new() {
22476        let dir = TempDir::new().unwrap();
22477        let db_path = dir.path().join("test.db");
22478        let storage = SqliteStorage::open(&db_path).unwrap();
22479
22480        let source = Source {
22481            id: "test-laptop".into(),
22482            kind: SourceKind::Ssh,
22483            host_label: Some("test.local".into()),
22484            machine_id: Some("test-machine-id".into()),
22485            platform: None,
22486            config_json: None,
22487            created_at: Some(SqliteStorage::now_millis()),
22488            updated_at: None,
22489        };
22490
22491        storage.upsert_source(&source).unwrap();
22492        let fetched = storage.get_source("test-laptop").unwrap();
22493        assert!(fetched.is_some());
22494        assert_eq!(fetched.unwrap().host_label, Some("test.local".into()));
22495    }
22496
22497    #[test]
22498    fn upsert_source_updates_existing() {
22499        let dir = TempDir::new().unwrap();
22500        let db_path = dir.path().join("test.db");
22501        let storage = SqliteStorage::open(&db_path).unwrap();
22502
22503        let source1 = Source {
22504            id: "my-source".into(),
22505            kind: SourceKind::Ssh,
22506            host_label: Some("Original Label".into()),
22507            machine_id: None,
22508            platform: None,
22509            config_json: None,
22510            created_at: Some(SqliteStorage::now_millis()),
22511            updated_at: None,
22512        };
22513        storage.upsert_source(&source1).unwrap();
22514
22515        let source2 = Source {
22516            id: "my-source".into(),
22517            kind: SourceKind::Ssh,
22518            host_label: Some("Updated Label".into()),
22519            machine_id: None,
22520            platform: Some("linux".into()),
22521            config_json: None,
22522            created_at: Some(SqliteStorage::now_millis()),
22523            updated_at: Some(SqliteStorage::now_millis()),
22524        };
22525        storage.upsert_source(&source2).unwrap();
22526
22527        let fetched = storage.get_source("my-source").unwrap().unwrap();
22528        assert_eq!(fetched.host_label, Some("Updated Label".into()));
22529        assert!(fetched.platform.is_some());
22530    }
22531
22532    #[test]
22533    fn upsert_source_unchanged_preserves_updated_at() {
22534        let dir = TempDir::new().unwrap();
22535        let db_path = dir.path().join("test.db");
22536        let storage = SqliteStorage::open(&db_path).unwrap();
22537
22538        let source = Source {
22539            id: "stable-source".into(),
22540            kind: SourceKind::Ssh,
22541            host_label: Some("builder.local".into()),
22542            machine_id: None,
22543            platform: Some("linux".into()),
22544            config_json: Some(serde_json::json!({"role": "bench"})),
22545            created_at: None,
22546            updated_at: None,
22547        };
22548
22549        storage.upsert_source(&source).unwrap();
22550        let initial = storage.get_source("stable-source").unwrap().unwrap();
22551        std::thread::sleep(std::time::Duration::from_millis(5));
22552
22553        storage.upsert_source(&source).unwrap();
22554        let fetched = storage.get_source("stable-source").unwrap().unwrap();
22555
22556        assert_eq!(fetched.created_at, initial.created_at);
22557        assert_eq!(fetched.updated_at, initial.updated_at);
22558        assert_eq!(fetched.host_label, initial.host_label);
22559        assert_eq!(fetched.platform, initial.platform);
22560        assert_eq!(fetched.config_json, initial.config_json);
22561    }
22562
22563    #[test]
22564    fn ensure_source_for_conversation_recreates_remote_source_after_delete() {
22565        let dir = TempDir::new().unwrap();
22566        let db_path = dir.path().join("test.db");
22567        let storage = SqliteStorage::open(&db_path).unwrap();
22568
22569        let conversation = Conversation {
22570            id: None,
22571            agent_slug: "codex".into(),
22572            workspace: Some(PathBuf::from("/ws/cache-recreate")),
22573            external_id: Some("cache-recreate".into()),
22574            title: Some("Cache Recreate".into()),
22575            source_path: PathBuf::from("/log/cache-recreate.jsonl"),
22576            started_at: Some(1_700_000_000_000),
22577            ended_at: Some(1_700_000_000_001),
22578            approx_tokens: Some(16),
22579            metadata_json: serde_json::json!({}),
22580            messages: vec![Message {
22581                id: None,
22582                idx: 0,
22583                role: MessageRole::User,
22584                author: Some("tester".into()),
22585                created_at: Some(1_700_000_000_000),
22586                content: "cache recreate".into(),
22587                extra_json: serde_json::json!({}),
22588                snippets: Vec::new(),
22589            }],
22590            source_id: "cache-remote-source".into(),
22591            origin_host: Some("builder-cache".into()),
22592        };
22593
22594        storage
22595            .ensure_source_for_conversation(&conversation)
22596            .unwrap();
22597        assert!(storage.get_source("cache-remote-source").unwrap().is_some());
22598
22599        let deleted = storage.delete_source("cache-remote-source", false).unwrap();
22600        assert!(deleted);
22601        assert!(storage.get_source("cache-remote-source").unwrap().is_none());
22602
22603        storage
22604            .ensure_source_for_conversation(&conversation)
22605            .unwrap();
22606        let recreated = storage.get_source("cache-remote-source").unwrap();
22607        assert!(recreated.is_some());
22608        assert_eq!(
22609            recreated.unwrap().host_label.as_deref(),
22610            Some("builder-cache")
22611        );
22612    }
22613
22614    #[test]
22615    fn delete_source_removes_entry() {
22616        let dir = TempDir::new().unwrap();
22617        let db_path = dir.path().join("test.db");
22618        let storage = SqliteStorage::open(&db_path).unwrap();
22619
22620        let source = Source {
22621            id: "to-delete".into(),
22622            kind: SourceKind::Local,
22623            host_label: None,
22624            machine_id: None,
22625            platform: None,
22626            config_json: None,
22627            created_at: Some(SqliteStorage::now_millis()),
22628            updated_at: None,
22629        };
22630        storage.upsert_source(&source).unwrap();
22631
22632        let deleted = storage.delete_source("to-delete", false).unwrap();
22633        assert!(deleted);
22634
22635        let fetched = storage.get_source("to-delete").unwrap();
22636        assert!(fetched.is_none());
22637    }
22638
22639    #[test]
22640    fn delete_source_cannot_delete_local() {
22641        let dir = TempDir::new().unwrap();
22642        let db_path = dir.path().join("test.db");
22643        let storage = SqliteStorage::open(&db_path).unwrap();
22644
22645        let result = storage.delete_source(LOCAL_SOURCE_ID, false);
22646        assert!(result.is_err());
22647    }
22648
22649    #[test]
22650    fn list_sources_includes_local() {
22651        let dir = TempDir::new().unwrap();
22652        let db_path = dir.path().join("test.db");
22653        let storage = SqliteStorage::open(&db_path).unwrap();
22654
22655        let sources = storage.list_sources().unwrap();
22656        assert!(sources.iter().any(|s| s.id == LOCAL_SOURCE_ID));
22657    }
22658
22659    #[test]
22660    fn insert_conversation_tree_blank_local_source_normalizes_to_local_id() {
22661        let dir = TempDir::new().unwrap();
22662        let db_path = dir.path().join("test.db");
22663        let storage = SqliteStorage::open(&db_path).unwrap();
22664
22665        let agent_id = storage
22666            .ensure_agent(&Agent {
22667                id: None,
22668                slug: "codex".into(),
22669                name: "Codex".into(),
22670                version: None,
22671                kind: AgentKind::Cli,
22672            })
22673            .unwrap();
22674
22675        let conversation = Conversation {
22676            id: None,
22677            agent_slug: "codex".into(),
22678            workspace: None,
22679            external_id: Some("blank-local-source".into()),
22680            title: Some("Blank local source".into()),
22681            source_path: dir.path().join("blank-local.jsonl"),
22682            started_at: Some(1_700_000_000_000),
22683            ended_at: Some(1_700_000_000_001),
22684            approx_tokens: None,
22685            metadata_json: serde_json::Value::Null,
22686            messages: vec![Message {
22687                id: None,
22688                idx: 0,
22689                role: MessageRole::User,
22690                author: None,
22691                created_at: Some(1_700_000_000_000),
22692                content: "hello".into(),
22693                extra_json: serde_json::Value::Null,
22694                snippets: Vec::new(),
22695            }],
22696            source_id: "   ".into(),
22697            origin_host: None,
22698        };
22699
22700        storage
22701            .insert_conversation_tree(agent_id, None, &conversation)
22702            .unwrap();
22703
22704        assert!(storage.get_source("   ").unwrap().is_none());
22705        let source = storage
22706            .get_source(LOCAL_SOURCE_ID)
22707            .unwrap()
22708            .expect("local source row should exist");
22709        assert_eq!(source.kind, SourceKind::Local);
22710        assert_eq!(source.host_label, None);
22711
22712        let conversations = storage.list_conversations(10, 0).unwrap();
22713        assert_eq!(conversations.len(), 1);
22714        assert_eq!(conversations[0].source_id, LOCAL_SOURCE_ID);
22715        assert_eq!(conversations[0].origin_host, None);
22716    }
22717
22718    #[test]
22719    fn repeated_local_inserts_do_not_touch_bootstrap_source_row() {
22720        let dir = TempDir::new().unwrap();
22721        let db_path = dir.path().join("test.db");
22722        let storage = SqliteStorage::open(&db_path).unwrap();
22723
22724        let agent_id = storage
22725            .ensure_agent(&Agent {
22726                id: None,
22727                slug: "codex".into(),
22728                name: "Codex".into(),
22729                version: None,
22730                kind: AgentKind::Cli,
22731            })
22732            .unwrap();
22733
22734        let bootstrap_updated_at: i64 = storage
22735            .conn
22736            .query_row_map(
22737                "SELECT updated_at FROM sources WHERE id = ?1",
22738                fparams![LOCAL_SOURCE_ID],
22739                |row| row.get_typed(0),
22740            )
22741            .unwrap();
22742
22743        let make_conversation = |external_id: &str, suffix: &str| Conversation {
22744            id: None,
22745            agent_slug: "codex".into(),
22746            workspace: None,
22747            external_id: Some(external_id.into()),
22748            title: Some(format!("Local source {suffix}")),
22749            source_path: dir.path().join(format!("local-{suffix}.jsonl")),
22750            started_at: Some(1_700_000_000_000),
22751            ended_at: Some(1_700_000_000_001),
22752            approx_tokens: None,
22753            metadata_json: serde_json::Value::Null,
22754            messages: vec![Message {
22755                id: None,
22756                idx: 0,
22757                role: MessageRole::User,
22758                author: None,
22759                created_at: Some(1_700_000_000_000),
22760                content: format!("hello-{suffix}"),
22761                extra_json: serde_json::Value::Null,
22762                snippets: Vec::new(),
22763            }],
22764            source_id: LOCAL_SOURCE_ID.into(),
22765            origin_host: None,
22766        };
22767
22768        std::thread::sleep(std::time::Duration::from_millis(5));
22769        storage
22770            .insert_conversation_tree(agent_id, None, &make_conversation("local-source-1", "one"))
22771            .unwrap();
22772        let after_first_insert: i64 = storage
22773            .conn
22774            .query_row_map(
22775                "SELECT updated_at FROM sources WHERE id = ?1",
22776                fparams![LOCAL_SOURCE_ID],
22777                |row| row.get_typed(0),
22778            )
22779            .unwrap();
22780
22781        std::thread::sleep(std::time::Duration::from_millis(5));
22782        storage
22783            .insert_conversation_tree(agent_id, None, &make_conversation("local-source-2", "two"))
22784            .unwrap();
22785        let after_second_insert: i64 = storage
22786            .conn
22787            .query_row_map(
22788                "SELECT updated_at FROM sources WHERE id = ?1",
22789                fparams![LOCAL_SOURCE_ID],
22790                |row| row.get_typed(0),
22791            )
22792            .unwrap();
22793
22794        assert_eq!(after_first_insert, bootstrap_updated_at);
22795        assert_eq!(after_second_insert, bootstrap_updated_at);
22796    }
22797
22798    #[test]
22799    fn insert_conversation_tree_blank_remote_source_normalizes_to_origin_host() {
22800        let dir = TempDir::new().unwrap();
22801        let db_path = dir.path().join("test.db");
22802        let storage = SqliteStorage::open(&db_path).unwrap();
22803
22804        let agent_id = storage
22805            .ensure_agent(&Agent {
22806                id: None,
22807                slug: "codex".into(),
22808                name: "Codex".into(),
22809                version: None,
22810                kind: AgentKind::Cli,
22811            })
22812            .unwrap();
22813
22814        let conversation = Conversation {
22815            id: None,
22816            agent_slug: "codex".into(),
22817            workspace: None,
22818            external_id: Some("blank-remote-source".into()),
22819            title: Some("Blank remote source".into()),
22820            source_path: dir.path().join("blank-remote.jsonl"),
22821            started_at: Some(1_700_000_000_000),
22822            ended_at: Some(1_700_000_000_001),
22823            approx_tokens: None,
22824            metadata_json: serde_json::Value::Null,
22825            messages: vec![Message {
22826                id: None,
22827                idx: 0,
22828                role: MessageRole::User,
22829                author: None,
22830                created_at: Some(1_700_000_000_000),
22831                content: "hello".into(),
22832                extra_json: serde_json::Value::Null,
22833                snippets: Vec::new(),
22834            }],
22835            source_id: "   ".into(),
22836            origin_host: Some("user@work-laptop".into()),
22837        };
22838
22839        storage
22840            .insert_conversation_tree(agent_id, None, &conversation)
22841            .unwrap();
22842
22843        assert!(storage.get_source("   ").unwrap().is_none());
22844        let source = storage
22845            .get_source("user@work-laptop")
22846            .unwrap()
22847            .expect("normalized remote source row should exist");
22848        assert_eq!(source.kind, SourceKind::Ssh);
22849        assert_eq!(source.host_label.as_deref(), Some("user@work-laptop"));
22850
22851        let conversations = storage.list_conversations(10, 0).unwrap();
22852        assert_eq!(conversations.len(), 1);
22853        assert_eq!(conversations[0].source_id, "user@work-laptop");
22854        assert_eq!(
22855            conversations[0].origin_host.as_deref(),
22856            Some("user@work-laptop")
22857        );
22858    }
22859
22860    #[test]
22861    fn insert_conversations_batched_normalizes_host_only_remote_source_id() {
22862        let dir = TempDir::new().unwrap();
22863        let db_path = dir.path().join("test.db");
22864        let storage = SqliteStorage::open(&db_path).unwrap();
22865
22866        let agent_id = storage
22867            .ensure_agent(&Agent {
22868                id: None,
22869                slug: "codex".into(),
22870                name: "Codex".into(),
22871                version: None,
22872                kind: AgentKind::Cli,
22873            })
22874            .unwrap();
22875
22876        let conversation = Conversation {
22877            id: None,
22878            agent_slug: "codex".into(),
22879            workspace: None,
22880            external_id: Some("batched-blank-remote-source".into()),
22881            title: Some("Batched blank remote source".into()),
22882            source_path: dir.path().join("batched-blank-remote.jsonl"),
22883            started_at: Some(1_700_000_000_000),
22884            ended_at: Some(1_700_000_000_001),
22885            approx_tokens: None,
22886            metadata_json: serde_json::Value::Null,
22887            messages: vec![Message {
22888                id: None,
22889                idx: 0,
22890                role: MessageRole::User,
22891                author: None,
22892                created_at: Some(1_700_000_000_000),
22893                content: "hello".into(),
22894                extra_json: serde_json::Value::Null,
22895                snippets: Vec::new(),
22896            }],
22897            source_id: "   ".into(),
22898            origin_host: Some("user@batch-host".into()),
22899        };
22900
22901        storage
22902            .insert_conversations_batched(&[(agent_id, None, &conversation)])
22903            .unwrap();
22904
22905        assert!(storage.get_source("   ").unwrap().is_none());
22906        let source = storage
22907            .get_source("user@batch-host")
22908            .unwrap()
22909            .expect("normalized batched remote source row should exist");
22910        assert_eq!(source.kind, SourceKind::Ssh);
22911        assert_eq!(source.host_label.as_deref(), Some("user@batch-host"));
22912
22913        let conversations = storage.list_conversations(10, 0).unwrap();
22914        assert_eq!(conversations.len(), 1);
22915        assert_eq!(conversations[0].source_id, "user@batch-host");
22916        assert_eq!(
22917            conversations[0].origin_host.as_deref(),
22918            Some("user@batch-host")
22919        );
22920    }
22921
22922    #[test]
22923    fn get_source_ids_excludes_local() {
22924        let dir = TempDir::new().unwrap();
22925        let db_path = dir.path().join("test.db");
22926        let storage = SqliteStorage::open(&db_path).unwrap();
22927
22928        // Add a non-local source
22929        let source = Source {
22930            id: "remote-1".into(),
22931            kind: SourceKind::Ssh,
22932            host_label: Some("server".into()),
22933            machine_id: None,
22934            platform: None,
22935            config_json: None,
22936            created_at: Some(SqliteStorage::now_millis()),
22937            updated_at: None,
22938        };
22939        storage.upsert_source(&source).unwrap();
22940
22941        let ids = storage.get_source_ids().unwrap();
22942        assert!(!ids.contains(&LOCAL_SOURCE_ID.to_string()));
22943        assert!(ids.contains(&"remote-1".to_string()));
22944    }
22945
22946    // =========================================================================
22947    // Scan timestamp tests (bead yln.4)
22948    // =========================================================================
22949
22950    #[test]
22951    fn get_last_scan_ts_returns_none_initially() {
22952        let dir = TempDir::new().unwrap();
22953        let db_path = dir.path().join("test.db");
22954        let storage = SqliteStorage::open(&db_path).unwrap();
22955
22956        let ts = storage.get_last_scan_ts().unwrap();
22957        assert!(ts.is_none());
22958    }
22959
22960    #[test]
22961    fn set_and_get_last_scan_ts() {
22962        let dir = TempDir::new().unwrap();
22963        let db_path = dir.path().join("test.db");
22964        let storage = SqliteStorage::open(&db_path).unwrap();
22965
22966        let expected_ts = 1700000000000_i64;
22967        storage.set_last_scan_ts(expected_ts).unwrap();
22968
22969        let actual_ts = storage.get_last_scan_ts().unwrap();
22970        assert_eq!(actual_ts, Some(expected_ts));
22971    }
22972
22973    // =========================================================================
22974    // now_millis utility test (bead yln.4)
22975    // =========================================================================
22976
22977    #[test]
22978    fn now_millis_returns_reasonable_value() {
22979        let ts = SqliteStorage::now_millis();
22980        // Should be after Jan 1, 2020 (approx 1577836800000)
22981        assert!(ts > 1577836800000);
22982        // Should be before Jan 1, 2100 (approx 4102444800000)
22983        assert!(ts < 4102444800000);
22984    }
22985
22986    // =========================================================================
22987    // Binary Metadata Serialization Tests (Opt 3.1)
22988    // =========================================================================
22989
22990    #[test]
22991    fn msgpack_roundtrip_basic_object() {
22992        let value = serde_json::json!({
22993            "key": "value",
22994            "number": 42,
22995            "nested": { "inner": true }
22996        });
22997
22998        let bytes = serialize_json_to_msgpack(&value).expect("should serialize");
22999        let recovered = deserialize_msgpack_to_json(&bytes);
23000
23001        assert_eq!(value, recovered);
23002    }
23003
23004    #[test]
23005    fn msgpack_returns_none_for_null() {
23006        let value = serde_json::Value::Null;
23007        assert!(serialize_json_to_msgpack(&value).is_none());
23008    }
23009
23010    #[test]
23011    fn message_insert_stores_null_extra_json_as_sql_null() {
23012        let dir = TempDir::new().unwrap();
23013        let db_path = dir.path().join("test.db");
23014        let storage = SqliteStorage::open(&db_path).unwrap();
23015        let agent_id = storage
23016            .ensure_agent(&Agent {
23017                id: None,
23018                slug: "codex".into(),
23019                name: "Codex".into(),
23020                version: None,
23021                kind: AgentKind::Cli,
23022            })
23023            .unwrap();
23024        let conversation = Conversation {
23025            id: None,
23026            agent_slug: "codex".into(),
23027            workspace: None,
23028            external_id: Some("null-extra-json".into()),
23029            title: Some("Null extra_json".into()),
23030            source_path: PathBuf::from("/tmp/null-extra-json.jsonl"),
23031            started_at: Some(1_700_000_000_000),
23032            ended_at: Some(1_700_000_000_001),
23033            approx_tokens: None,
23034            metadata_json: serde_json::Value::Null,
23035            messages: vec![Message {
23036                id: None,
23037                idx: 0,
23038                role: MessageRole::User,
23039                author: None,
23040                created_at: Some(1_700_000_000_000),
23041                content: "null metadata message".into(),
23042                extra_json: serde_json::Value::Null,
23043                snippets: Vec::new(),
23044            }],
23045            source_id: LOCAL_SOURCE_ID.into(),
23046            origin_host: None,
23047        };
23048
23049        let conversation_id = storage
23050            .insert_conversation_tree(agent_id, None, &conversation)
23051            .unwrap()
23052            .conversation_id;
23053
23054        let (extra_json, extra_bin): (Option<String>, Option<Vec<u8>>) = storage
23055            .conn
23056            .query_row_map(
23057                "SELECT extra_json, extra_bin FROM messages WHERE conversation_id = ?1",
23058                fparams![conversation_id],
23059                |row| Ok((row.get_typed(0)?, row.get_typed(1)?)),
23060            )
23061            .unwrap();
23062        assert!(extra_json.is_none());
23063        assert!(extra_bin.is_none());
23064
23065        let stored = storage.fetch_messages(conversation_id).unwrap();
23066        assert!(stored[0].extra_json.is_null());
23067    }
23068
23069    #[test]
23070    fn message_insert_stores_nonempty_extra_json_as_msgpack_only() {
23071        let dir = TempDir::new().unwrap();
23072        let db_path = dir.path().join("test.db");
23073        let storage = SqliteStorage::open(&db_path).unwrap();
23074        let agent_id = storage
23075            .ensure_agent(&Agent {
23076                id: None,
23077                slug: "codex".into(),
23078                name: "Codex".into(),
23079                version: None,
23080                kind: AgentKind::Cli,
23081            })
23082            .unwrap();
23083        let extra_json = serde_json::json!({ "idx": 7, "kind": "profile" });
23084        let conversation = Conversation {
23085            id: None,
23086            agent_slug: "codex".into(),
23087            workspace: None,
23088            external_id: Some("msgpack-extra-json".into()),
23089            title: Some("MessagePack extra_json".into()),
23090            source_path: PathBuf::from("/tmp/msgpack-extra-json.jsonl"),
23091            started_at: Some(1_700_000_000_000),
23092            ended_at: Some(1_700_000_000_001),
23093            approx_tokens: None,
23094            metadata_json: serde_json::Value::Null,
23095            messages: vec![Message {
23096                id: None,
23097                idx: 0,
23098                role: MessageRole::User,
23099                author: None,
23100                created_at: Some(1_700_000_000_000),
23101                content: "msgpack metadata message".into(),
23102                extra_json: extra_json.clone(),
23103                snippets: Vec::new(),
23104            }],
23105            source_id: LOCAL_SOURCE_ID.into(),
23106            origin_host: None,
23107        };
23108
23109        let conversation_id = storage
23110            .insert_conversation_tree(agent_id, None, &conversation)
23111            .unwrap()
23112            .conversation_id;
23113
23114        let (extra_json_text, extra_bin): (Option<String>, Option<Vec<u8>>) = storage
23115            .conn
23116            .query_row_map(
23117                "SELECT extra_json, extra_bin FROM messages WHERE conversation_id = ?1",
23118                fparams![conversation_id],
23119                |row| Ok((row.get_typed(0)?, row.get_typed(1)?)),
23120            )
23121            .unwrap();
23122        assert!(extra_json_text.is_none());
23123        assert!(extra_bin.is_some());
23124
23125        let stored = storage.fetch_messages(conversation_id).unwrap();
23126        assert_eq!(stored[0].extra_json, extra_json);
23127    }
23128
23129    #[test]
23130    fn conversation_insert_preserves_null_metadata_json_as_json_null() {
23131        let dir = TempDir::new().unwrap();
23132        let db_path = dir.path().join("test.db");
23133        let storage = SqliteStorage::open(&db_path).unwrap();
23134        let agent_id = storage
23135            .ensure_agent(&Agent {
23136                id: None,
23137                slug: "codex".into(),
23138                name: "Codex".into(),
23139                version: None,
23140                kind: AgentKind::Cli,
23141            })
23142            .unwrap();
23143        let conversation = Conversation {
23144            id: None,
23145            agent_slug: "codex".into(),
23146            workspace: None,
23147            external_id: Some("null-conversation-metadata".into()),
23148            title: Some("Null conversation metadata".into()),
23149            source_path: PathBuf::from("/tmp/null-conversation-metadata.jsonl"),
23150            started_at: Some(1_700_000_000_000),
23151            ended_at: Some(1_700_000_000_001),
23152            approx_tokens: None,
23153            metadata_json: serde_json::Value::Null,
23154            messages: vec![Message {
23155                id: None,
23156                idx: 0,
23157                role: MessageRole::User,
23158                author: None,
23159                created_at: Some(1_700_000_000_000),
23160                content: "null conversation metadata message".into(),
23161                extra_json: serde_json::Value::Null,
23162                snippets: Vec::new(),
23163            }],
23164            source_id: LOCAL_SOURCE_ID.into(),
23165            origin_host: None,
23166        };
23167
23168        storage
23169            .insert_conversation_tree(agent_id, None, &conversation)
23170            .unwrap();
23171
23172        let (metadata_json, metadata_bin): (Option<String>, Option<Vec<u8>>) = storage
23173            .conn
23174            .query_row_map(
23175                "SELECT metadata_json, metadata_bin FROM conversations WHERE external_id = ?1",
23176                fparams!["null-conversation-metadata"],
23177                |row| Ok((row.get_typed(0)?, row.get_typed(1)?)),
23178            )
23179            .unwrap();
23180        assert_eq!(metadata_json.as_deref(), Some("null"));
23181        assert!(metadata_bin.is_none());
23182
23183        let listed = storage.list_conversations(10, 0).unwrap();
23184        assert!(listed[0].metadata_json.is_null());
23185    }
23186
23187    #[test]
23188    fn conversation_insert_stores_nonempty_metadata_as_msgpack_only() {
23189        let dir = TempDir::new().unwrap();
23190        let db_path = dir.path().join("test.db");
23191        let storage = SqliteStorage::open(&db_path).unwrap();
23192        let agent_id = storage
23193            .ensure_agent(&Agent {
23194                id: None,
23195                slug: "codex".into(),
23196                name: "Codex".into(),
23197                version: None,
23198                kind: AgentKind::Cli,
23199            })
23200            .unwrap();
23201        let metadata_json = serde_json::json!({ "bench": true, "source": "profile" });
23202        let conversation = Conversation {
23203            id: None,
23204            agent_slug: "codex".into(),
23205            workspace: None,
23206            external_id: Some("msgpack-conversation-metadata".into()),
23207            title: Some("MessagePack conversation metadata".into()),
23208            source_path: PathBuf::from("/tmp/msgpack-conversation-metadata.jsonl"),
23209            started_at: Some(1_700_000_000_000),
23210            ended_at: Some(1_700_000_000_001),
23211            approx_tokens: None,
23212            metadata_json: metadata_json.clone(),
23213            messages: vec![Message {
23214                id: None,
23215                idx: 0,
23216                role: MessageRole::User,
23217                author: None,
23218                created_at: Some(1_700_000_000_000),
23219                content: "msgpack conversation metadata message".into(),
23220                extra_json: serde_json::Value::Null,
23221                snippets: Vec::new(),
23222            }],
23223            source_id: LOCAL_SOURCE_ID.into(),
23224            origin_host: None,
23225        };
23226
23227        storage
23228            .insert_conversation_tree(agent_id, None, &conversation)
23229            .unwrap();
23230
23231        let (metadata_text, metadata_bin): (Option<String>, Option<Vec<u8>>) = storage
23232            .conn
23233            .query_row_map(
23234                "SELECT metadata_json, metadata_bin FROM conversations WHERE external_id = ?1",
23235                fparams!["msgpack-conversation-metadata"],
23236                |row| Ok((row.get_typed(0)?, row.get_typed(1)?)),
23237            )
23238            .unwrap();
23239        assert!(metadata_text.is_none());
23240        assert!(metadata_bin.is_some());
23241
23242        let listed = storage.list_conversations(10, 0).unwrap();
23243        assert_eq!(listed[0].metadata_json, metadata_json);
23244    }
23245
23246    #[test]
23247    fn msgpack_returns_none_for_empty_object() {
23248        let value = serde_json::json!({});
23249        assert!(serialize_json_to_msgpack(&value).is_none());
23250    }
23251
23252    #[test]
23253    fn parse_historical_json_column_preserves_large_payloads_as_raw_json() {
23254        let raw = format!("{{\"blob\":\"{}\"}}", "x".repeat(1_000_000));
23255
23256        let value = parse_historical_json_column(Some(raw.clone()));
23257
23258        assert_eq!(historical_raw_json(&value), Some(raw.as_str()));
23259        assert_eq!(json_value_size_hint(&value), raw.len());
23260    }
23261
23262    #[test]
23263    fn parse_historical_json_column_preserves_small_payloads_as_raw_json() {
23264        let raw = String::from("{\"ok\":true,\"n\":1}");
23265
23266        let value = parse_historical_json_column(Some(raw.clone()));
23267
23268        assert_eq!(historical_raw_json(&value), Some(raw.as_str()));
23269    }
23270
23271    #[test]
23272    fn msgpack_serializes_non_empty_array() {
23273        let value = serde_json::json!([1, 2, 3]);
23274        let bytes = serialize_json_to_msgpack(&value).expect("should serialize array");
23275        let recovered = deserialize_msgpack_to_json(&bytes);
23276        assert_eq!(value, recovered);
23277    }
23278
23279    #[test]
23280    fn msgpack_smaller_than_json() {
23281        let value = serde_json::json!({
23282            "field_name_one": "some_value",
23283            "field_name_two": 123456,
23284            "field_name_three": [1, 2, 3, 4, 5],
23285            "field_name_four": { "nested": true }
23286        });
23287
23288        let json_bytes = serde_json::to_vec(&value).unwrap();
23289        let msgpack_bytes = serialize_json_to_msgpack(&value).unwrap();
23290
23291        // MessagePack should be smaller due to more compact encoding
23292        assert!(
23293            msgpack_bytes.len() < json_bytes.len(),
23294            "MessagePack ({} bytes) should be smaller than JSON ({} bytes)",
23295            msgpack_bytes.len(),
23296            json_bytes.len()
23297        );
23298    }
23299
23300    #[test]
23301    fn migration_v7_adds_binary_columns() {
23302        let dir = TempDir::new().unwrap();
23303        let db_path = dir.path().join("test.db");
23304        let storage = SqliteStorage::open(&db_path).unwrap();
23305
23306        // Verify metadata_bin column exists
23307        let has_metadata_bin = storage
23308            .raw()
23309            .query("PRAGMA table_info(conversations)")
23310            .unwrap()
23311            .iter()
23312            .any(|row| row.get_typed::<String>(1).unwrap() == "metadata_bin");
23313        assert!(
23314            has_metadata_bin,
23315            "conversations should have metadata_bin column"
23316        );
23317
23318        // Verify extra_bin column exists
23319        let has_extra_bin = storage
23320            .raw()
23321            .query("PRAGMA table_info(messages)")
23322            .unwrap()
23323            .iter()
23324            .any(|row| row.get_typed::<String>(1).unwrap() == "extra_bin");
23325        assert!(has_extra_bin, "messages should have extra_bin column");
23326    }
23327
23328    #[test]
23329    fn insert_conversation_tree_rehydrates_append_tail_state_cache_after_manual_clear() {
23330        let dir = TempDir::new().unwrap();
23331        let db_path = dir.path().join("append-tail-state-cache.db");
23332        let storage = SqliteStorage::open(&db_path).unwrap();
23333        let agent_id = storage
23334            .ensure_agent(&Agent {
23335                id: None,
23336                slug: "codex".into(),
23337                name: "Codex".into(),
23338                version: None,
23339                kind: AgentKind::Cli,
23340            })
23341            .unwrap();
23342        let workspace = PathBuf::from("/ws/profiled-append-remote");
23343        let workspace_id = storage.ensure_workspace(&workspace, None).unwrap();
23344
23345        let initial = make_profiled_append_remote_merge_conversation(11, 5);
23346        let insert_outcome = storage
23347            .insert_conversation_tree(agent_id, Some(workspace_id), &initial)
23348            .unwrap();
23349        let conversation_id = insert_outcome.conversation_id;
23350
23351        let initial_tail: (Option<i64>, Option<i64>, Option<i64>) = storage
23352            .raw()
23353            .query_row_map(
23354                "SELECT ended_at, last_message_idx, last_message_created_at
23355                 FROM conversation_tail_state
23356                 WHERE conversation_id = ?1",
23357                fparams![conversation_id],
23358                |row| Ok((row.get_typed(0)?, row.get_typed(1)?, row.get_typed(2)?)),
23359            )
23360            .unwrap();
23361        assert_eq!(initial_tail, (Some(111_005), Some(4), Some(111_004)));
23362
23363        storage
23364            .raw()
23365            .execute_compat(
23366                "UPDATE conversations SET ended_at = ?1 WHERE id = ?2",
23367                fparams![111_999_i64, conversation_id],
23368            )
23369            .unwrap();
23370        storage
23371            .raw()
23372            .execute_compat(
23373                "DELETE FROM conversation_tail_state WHERE conversation_id = ?1",
23374                fparams![conversation_id],
23375            )
23376            .unwrap();
23377
23378        let appended = make_profiled_append_remote_merge_conversation(11, 10);
23379        let append_outcome = storage
23380            .insert_conversation_tree(agent_id, Some(workspace_id), &appended)
23381            .unwrap();
23382        assert_eq!(append_outcome.inserted_indices, vec![5, 6, 7, 8, 9]);
23383
23384        let final_tail: (Option<i64>, Option<i64>, Option<i64>) = storage
23385            .raw()
23386            .query_row_map(
23387                "SELECT ended_at, last_message_idx, last_message_created_at
23388                 FROM conversation_tail_state
23389                 WHERE conversation_id = ?1",
23390                fparams![conversation_id],
23391                |row| Ok((row.get_typed(0)?, row.get_typed(1)?, row.get_typed(2)?)),
23392            )
23393            .unwrap();
23394        assert_eq!(final_tail, (Some(111_999), Some(9), Some(111_009)));
23395    }
23396
23397    #[test]
23398    fn msgpack_deserialize_empty_returns_default() {
23399        let recovered = deserialize_msgpack_to_json(&[]);
23400        assert_eq!(recovered, serde_json::Value::Object(serde_json::Map::new()));
23401    }
23402
23403    #[test]
23404    fn msgpack_deserialize_garbage_returns_default() {
23405        // Use truncated msgpack data that will fail to parse
23406        // 0x85 indicates a fixmap with 5 elements, but we don't provide them
23407        let recovered = deserialize_msgpack_to_json(&[0x85]);
23408        assert_eq!(recovered, serde_json::Value::Object(serde_json::Map::new()));
23409    }
23410
23411    #[test]
23412    fn stats_aggregator_collects_and_expands() {
23413        let mut agg = StatsAggregator::new();
23414        assert!(agg.is_empty());
23415
23416        // Record some stats
23417        // Day 100, agent "claude", source "local"
23418        agg.record("claude", "local", 100, 5, 500);
23419        // Day 100, agent "codex", source "local"
23420        agg.record("codex", "local", 100, 3, 300);
23421        // Day 101, agent "claude", source "local"
23422        agg.record("claude", "local", 101, 2, 200);
23423
23424        assert!(!agg.is_empty());
23425        assert_eq!(agg.raw_entry_count(), 3);
23426
23427        let entries = agg.expand();
23428        // Each raw entry expands to 4 permutations.
23429        // But (all, local) and (all, all) will aggregate.
23430        //
23431        // Raw:
23432        // 1. (100, claude, local) -> 1 sess, 5 msgs, 500 chars
23433        // 2. (100, codex, local)  -> 1 sess, 3 msgs, 300 chars
23434        // 3. (101, claude, local) -> 1 sess, 2 msgs, 200 chars
23435        //
23436        // Expanded 1 (day 100):
23437        // - (100, claude, local): 1 sess, 5 msgs, 500 chars
23438        // - (100, all, local):    1 (from claude) + 1 (from codex) = 2 sess, 8 msgs, 800 chars
23439        // - (100, claude, all):   1 sess, 5 msgs, 500 chars
23440        // - (100, codex, local):  1 sess, 3 msgs, 300 chars
23441        // - (100, codex, all):    1 sess, 3 msgs, 300 chars
23442        // - (100, all, all):      2 sess, 8 msgs, 800 chars
23443        //
23444        // Expanded 3 (day 101):
23445        // - (101, claude, local): 1 sess, 2 msgs, 200 chars
23446        // - (101, all, local):    1 sess, 2 msgs, 200 chars
23447        // - (101, claude, all):   1 sess, 2 msgs, 200 chars
23448        // - (101, all, all):      1 sess, 2 msgs, 200 chars
23449        //
23450        // Total unique keys in expanded map:
23451        // Day 100: (claude, local), (codex, local), (all, local), (claude, all), (codex, all), (all, all) = 6
23452        // Day 101: (claude, local), (all, local), (claude, all), (all, all) = 4
23453        // Total = 10 entries
23454
23455        assert_eq!(entries.len(), 10);
23456
23457        // Verify totals for day 100, all/all
23458        let day100_all = entries
23459            .iter()
23460            .find(|(d, a, s, _)| *d == 100 && a == "all" && s == "all")
23461            .unwrap();
23462        assert_eq!(day100_all.3.session_count_delta, 2);
23463        assert_eq!(day100_all.3.message_count_delta, 8);
23464        assert_eq!(day100_all.3.total_chars_delta, 800);
23465    }
23466
23467    // =========================================================================
23468    // LazyFrankenDb tests (bd-1ueu)
23469    // =========================================================================
23470
23471    #[test]
23472    fn lazy_franken_db_not_open_before_get() {
23473        let dir = TempDir::new().unwrap();
23474        let db_path = dir.path().join("lazy_test.db");
23475
23476        // Create a real DB so the path exists
23477        let _storage = SqliteStorage::open(&db_path).unwrap();
23478
23479        let lazy = LazyFrankenDb::new(db_path);
23480        assert!(
23481            !lazy.is_open(),
23482            "LazyFrankenDb must not open on construction"
23483        );
23484    }
23485
23486    #[test]
23487    fn lazy_franken_db_opens_on_first_get() {
23488        let dir = TempDir::new().unwrap();
23489        let db_path = dir.path().join("lazy_test.db");
23490
23491        // Create a real DB so the path exists
23492        let _storage = SqliteStorage::open(&db_path).unwrap();
23493        drop(_storage);
23494
23495        let lazy = LazyFrankenDb::new(db_path);
23496        assert!(!lazy.is_open());
23497
23498        let conn = lazy.get("test").expect("should open successfully");
23499        let count: i64 = conn
23500            .query_row_map("SELECT COUNT(*) FROM conversations", fparams![], |r| {
23501                r.get_typed(0)
23502            })
23503            .unwrap();
23504        assert_eq!(count, 0);
23505        drop(conn);
23506
23507        assert!(lazy.is_open(), "LazyFrankenDb must be open after get()");
23508    }
23509
23510    #[test]
23511    fn lazy_franken_db_reuses_connection() {
23512        let dir = TempDir::new().unwrap();
23513        let db_path = dir.path().join("lazy_test.db");
23514        let _storage = SqliteStorage::open(&db_path).unwrap();
23515        drop(_storage);
23516
23517        let lazy = LazyFrankenDb::new(db_path);
23518
23519        // First access opens
23520        {
23521            let conn = lazy.get("first").unwrap();
23522            conn.execute_batch("CREATE TABLE IF NOT EXISTS test_tbl (id INTEGER)")
23523                .unwrap();
23524        }
23525
23526        // Second access reuses (table still exists)
23527        {
23528            let conn = lazy.get("second").unwrap();
23529            let count: i64 = conn
23530                .query_row_map("SELECT COUNT(*) FROM test_tbl", fparams![], |r| {
23531                    r.get_typed(0)
23532                })
23533                .unwrap();
23534            assert_eq!(count, 0);
23535        }
23536    }
23537
23538    #[test]
23539    fn lazy_franken_db_not_found_error() {
23540        let dir = TempDir::new().unwrap();
23541        let db_path = dir.path().join("nonexistent.db");
23542
23543        let lazy = LazyFrankenDb::new(db_path);
23544        let result = lazy.get("test");
23545        assert!(result.is_err());
23546        assert!(
23547            matches!(result.unwrap_err(), LazyDbError::NotFound(_)),
23548            "should return NotFound for missing DB"
23549        );
23550    }
23551
23552    #[test]
23553    fn lazy_franken_db_path_accessor() {
23554        let path = PathBuf::from("/tmp/test_lazy.db");
23555        let lazy = LazyFrankenDb::new(path.clone());
23556        assert_eq!(lazy.path(), path.as_path());
23557    }
23558
23559    // =========================================================================
23560    // Pricing / cost estimation tests (bead z9fse.10)
23561    // =========================================================================
23562
23563    #[test]
23564    fn sql_like_match_basic_patterns() {
23565        assert!(sql_like_match("claude-opus-4-20250101", "claude-opus-4%"));
23566        assert!(sql_like_match("claude-opus-4", "claude-opus-4%"));
23567        assert!(!sql_like_match("claude-sonnet-4", "claude-opus-4%"));
23568
23569        // Middle wildcard (gemini pattern)
23570        assert!(sql_like_match("gemini-2.0-flash-001", "gemini-2%flash%"));
23571        assert!(sql_like_match("gemini-2-flash", "gemini-2%flash%"));
23572        assert!(!sql_like_match("gemini-2-pro", "gemini-2%flash%"));
23573
23574        // Exact match
23575        assert!(sql_like_match("hello", "hello"));
23576        assert!(!sql_like_match("hello!", "hello"));
23577
23578        // Underscore wildcard
23579        assert!(sql_like_match("gpt-4o", "gpt-4_"));
23580        assert!(!sql_like_match("gpt-4oo", "gpt-4_"));
23581
23582        // Case insensitive
23583        assert!(sql_like_match("Claude-Opus-4", "claude-opus-4%"));
23584    }
23585
23586    #[test]
23587    fn date_str_to_day_id_converts_correctly() {
23588        // 2025-10-01 is 2100 days after 2020-01-01
23589        assert_eq!(date_str_to_day_id("2025-10-01").unwrap(), 2100);
23590        // 2024-04-01 is 1552 days after 2020-01-01
23591        assert_eq!(date_str_to_day_id("2024-04-01").unwrap(), 1552);
23592        assert!(date_str_to_day_id("invalid").is_err());
23593    }
23594
23595    #[test]
23596    fn pricing_table_lookup_selects_matching_entry() {
23597        let effective_day = date_str_to_day_id("2025-10-01").unwrap();
23598        let lookup_day = date_str_to_day_id("2026-02-06").unwrap();
23599        let table = PricingTable {
23600            entries: vec![
23601                PricingEntry {
23602                    model_pattern: "claude-opus-4%".into(),
23603                    provider: "anthropic".into(),
23604                    input_cost_per_mtok: 15.0,
23605                    output_cost_per_mtok: 75.0,
23606                    cache_read_cost_per_mtok: Some(1.5),
23607                    cache_creation_cost_per_mtok: Some(18.75),
23608                    effective_day_id: effective_day,
23609                },
23610                PricingEntry {
23611                    model_pattern: "claude-sonnet-4%".into(),
23612                    provider: "anthropic".into(),
23613                    input_cost_per_mtok: 3.0,
23614                    output_cost_per_mtok: 15.0,
23615                    cache_read_cost_per_mtok: Some(0.3),
23616                    cache_creation_cost_per_mtok: Some(3.75),
23617                    effective_day_id: effective_day,
23618                },
23619            ],
23620        };
23621
23622        let result = table.lookup("claude-opus-4-20260101", lookup_day);
23623        assert!(result.is_some());
23624        assert_eq!(result.unwrap().input_cost_per_mtok, 15.0);
23625
23626        let result = table.lookup("claude-sonnet-4-latest", lookup_day);
23627        assert!(result.is_some());
23628        assert_eq!(result.unwrap().input_cost_per_mtok, 3.0);
23629
23630        assert!(table.lookup("unknown-model", lookup_day).is_none());
23631    }
23632
23633    #[test]
23634    fn pricing_table_lookup_respects_effective_date() {
23635        let effective_day_1 = date_str_to_day_id("2025-10-01").unwrap();
23636        let effective_day_2 = date_str_to_day_id("2026-01-01").unwrap();
23637        let table = PricingTable {
23638            entries: vec![
23639                PricingEntry {
23640                    model_pattern: "claude-opus-4%".into(),
23641                    provider: "anthropic".into(),
23642                    input_cost_per_mtok: 15.0,
23643                    output_cost_per_mtok: 75.0,
23644                    cache_read_cost_per_mtok: None,
23645                    cache_creation_cost_per_mtok: None,
23646                    effective_day_id: effective_day_1,
23647                },
23648                PricingEntry {
23649                    model_pattern: "claude-opus-4%".into(),
23650                    provider: "anthropic".into(),
23651                    input_cost_per_mtok: 12.0,
23652                    output_cost_per_mtok: 60.0,
23653                    cache_read_cost_per_mtok: None,
23654                    cache_creation_cost_per_mtok: None,
23655                    effective_day_id: effective_day_2,
23656                },
23657            ],
23658        };
23659
23660        // Before price drop
23661        let result = table.lookup("claude-opus-4", date_str_to_day_id("2025-11-01").unwrap());
23662        assert!(result.is_some());
23663        assert_eq!(result.unwrap().input_cost_per_mtok, 15.0);
23664
23665        // After price drop
23666        let result = table.lookup("claude-opus-4", date_str_to_day_id("2026-02-01").unwrap());
23667        assert!(result.is_some());
23668        assert_eq!(result.unwrap().input_cost_per_mtok, 12.0);
23669
23670        // Before all pricing
23671        assert!(
23672            table
23673                .lookup("claude-opus-4", date_str_to_day_id("2024-01-01").unwrap())
23674                .is_none()
23675        );
23676    }
23677
23678    #[test]
23679    fn pricing_table_lookup_specificity_tiebreak() {
23680        let effective_day = date_str_to_day_id("2025-01-01").unwrap();
23681        let lookup_day = date_str_to_day_id("2026-01-01").unwrap();
23682        let table = PricingTable {
23683            entries: vec![
23684                PricingEntry {
23685                    model_pattern: "gpt-4%".into(),
23686                    provider: "openai".into(),
23687                    input_cost_per_mtok: 10.0,
23688                    output_cost_per_mtok: 30.0,
23689                    cache_read_cost_per_mtok: None,
23690                    cache_creation_cost_per_mtok: None,
23691                    effective_day_id: effective_day,
23692                },
23693                PricingEntry {
23694                    model_pattern: "gpt-4-turbo%".into(),
23695                    provider: "openai".into(),
23696                    input_cost_per_mtok: 5.0,
23697                    output_cost_per_mtok: 15.0,
23698                    cache_read_cost_per_mtok: None,
23699                    cache_creation_cost_per_mtok: None,
23700                    effective_day_id: effective_day,
23701                },
23702            ],
23703        };
23704
23705        // Longer pattern wins for specific model
23706        let result = table.lookup("gpt-4-turbo-2025", lookup_day);
23707        assert!(result.is_some());
23708        assert_eq!(result.unwrap().input_cost_per_mtok, 5.0);
23709
23710        // Shorter pattern matches broader model
23711        let result = table.lookup("gpt-4o", lookup_day);
23712        assert!(result.is_some());
23713        assert_eq!(result.unwrap().input_cost_per_mtok, 10.0);
23714    }
23715
23716    #[test]
23717    fn pricing_table_compute_cost_basic() {
23718        let effective_day = date_str_to_day_id("2025-10-01").unwrap();
23719        let table = PricingTable {
23720            entries: vec![PricingEntry {
23721                model_pattern: "claude-opus-4%".into(),
23722                provider: "anthropic".into(),
23723                input_cost_per_mtok: 15.0,
23724                output_cost_per_mtok: 75.0,
23725                cache_read_cost_per_mtok: Some(1.5),
23726                cache_creation_cost_per_mtok: Some(18.75),
23727                effective_day_id: effective_day,
23728            }],
23729        };
23730
23731        let cost = table.compute_cost(
23732            Some("claude-opus-4-latest"),
23733            date_str_to_day_id("2026-02-06").unwrap(),
23734            Some(1000),
23735            Some(500),
23736            None,
23737            None,
23738        );
23739        assert!(cost.is_some());
23740        // 1000 * 15.0 / 1M + 500 * 75.0 / 1M = 0.015 + 0.0375 = 0.0525
23741        assert!((cost.unwrap() - 0.0525).abs() < 1e-10);
23742    }
23743
23744    #[test]
23745    fn pricing_table_compute_cost_with_cache() {
23746        let effective_day = date_str_to_day_id("2025-10-01").unwrap();
23747        let table = PricingTable {
23748            entries: vec![PricingEntry {
23749                model_pattern: "claude-opus-4%".into(),
23750                provider: "anthropic".into(),
23751                input_cost_per_mtok: 15.0,
23752                output_cost_per_mtok: 75.0,
23753                cache_read_cost_per_mtok: Some(1.5),
23754                cache_creation_cost_per_mtok: Some(18.75),
23755                effective_day_id: effective_day,
23756            }],
23757        };
23758
23759        let cost = table.compute_cost(
23760            Some("claude-opus-4-latest"),
23761            date_str_to_day_id("2026-02-06").unwrap(),
23762            Some(1_000_000),
23763            Some(100_000),
23764            Some(500_000),
23765            Some(200_000),
23766        );
23767        assert!(cost.is_some());
23768        // input excludes cache tokens to avoid double-charging them at both the
23769        // full input rate and the cache-specific rates.
23770        // non-cache input: 300K * 15/1M = 4.5, output: 100K * 75/1M = 7.5
23771        // cache_read: 500K * 1.5/1M = 0.75, cache_creation: 200K * 18.75/1M = 3.75
23772        // total = 16.5
23773        assert!((cost.unwrap() - 16.5).abs() < 1e-10);
23774    }
23775
23776    #[test]
23777    fn pricing_table_compute_cost_returns_none_for_unknown_model() {
23778        let effective_day = date_str_to_day_id("2025-10-01").unwrap();
23779        let lookup_day = date_str_to_day_id("2026-02-06").unwrap();
23780        let table = PricingTable {
23781            entries: vec![PricingEntry {
23782                model_pattern: "claude-opus-4%".into(),
23783                provider: "anthropic".into(),
23784                input_cost_per_mtok: 15.0,
23785                output_cost_per_mtok: 75.0,
23786                cache_read_cost_per_mtok: None,
23787                cache_creation_cost_per_mtok: None,
23788                effective_day_id: effective_day,
23789            }],
23790        };
23791
23792        assert!(
23793            table
23794                .compute_cost(
23795                    Some("unknown-model"),
23796                    lookup_day,
23797                    Some(1000),
23798                    Some(500),
23799                    None,
23800                    None
23801                )
23802                .is_none()
23803        );
23804        assert!(
23805            table
23806                .compute_cost(None, lookup_day, Some(1000), Some(500), None, None)
23807                .is_none()
23808        );
23809        assert!(
23810            table
23811                .compute_cost(Some("claude-opus-4"), lookup_day, None, None, None, None)
23812                .is_none()
23813        );
23814    }
23815
23816    #[test]
23817    fn pricing_table_load_from_db() {
23818        let dir = TempDir::new().unwrap();
23819        let db_path = dir.path().join("test.db");
23820        let storage = SqliteStorage::open(&db_path).unwrap();
23821
23822        let table = PricingTable::load(&storage.conn).unwrap();
23823        assert!(!table.is_empty());
23824
23825        let lookup_day = date_str_to_day_id("2026-02-06").unwrap();
23826
23827        let opus = table.lookup("claude-opus-4-latest", lookup_day);
23828        assert!(opus.is_some());
23829        assert_eq!(opus.unwrap().input_cost_per_mtok, 15.0);
23830
23831        let flash = table.lookup("gemini-2.0-flash-001", lookup_day);
23832        assert!(flash.is_some());
23833        assert_eq!(flash.unwrap().input_cost_per_mtok, 0.075);
23834    }
23835
23836    #[test]
23837    fn pricing_table_load_rejects_invalid_effective_date() {
23838        let dir = TempDir::new().unwrap();
23839        let db_path = dir.path().join("test.db");
23840        let storage = SqliteStorage::open(&db_path).unwrap();
23841
23842        storage
23843            .conn
23844            .execute_compat(
23845                "INSERT INTO model_pricing (
23846                    model_pattern, provider, input_cost_per_mtok, output_cost_per_mtok,
23847                    cache_read_cost_per_mtok, cache_creation_cost_per_mtok, effective_date
23848                 ) VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7)",
23849                fparams![
23850                    "broken-model%",
23851                    "test",
23852                    1.0_f64,
23853                    2.0_f64,
23854                    Option::<f64>::None,
23855                    Option::<f64>::None,
23856                    "not-a-date"
23857                ],
23858            )
23859            .unwrap();
23860
23861        let err = PricingTable::load(&storage.conn).unwrap_err();
23862        assert!(err.to_string().contains("invalid effective_date"));
23863    }
23864
23865    #[test]
23866    fn pricing_diagnostics_tracks_coverage() {
23867        let mut diag = PricingDiagnostics::default();
23868        diag.record_priced();
23869        diag.record_priced();
23870        diag.record_unpriced(Some("custom-model-v1"));
23871        diag.record_unpriced(Some("custom-model-v1"));
23872        diag.record_unpriced(None);
23873
23874        assert_eq!(diag.priced_count, 2);
23875        assert_eq!(diag.unpriced_count, 3);
23876        assert_eq!(diag.unknown_models.len(), 2);
23877        assert_eq!(diag.unknown_models["custom-model-v1"], 2);
23878        assert_eq!(diag.unknown_models["(none)"], 1);
23879    }
23880
23881    // =========================================================================
23882    // FrankenStorage migration tests (bead 2j6p6)
23883    // =========================================================================
23884
23885    /// Helper: create a FrankenStorage wrapping an in-memory connection and
23886    /// run migrations. This exercises the same code path as `open()` but avoids
23887    /// frankensqlite's file-based autoindex renaming limitation (V5 uses
23888    /// ALTER TABLE RENAME which triggers sqlite_autoindex lookup issues on
23889    /// file-based pagers).
23890    fn franken_storage_in_memory() -> FrankenStorage {
23891        let conn = FrankenConnection::open(":memory:").unwrap();
23892        let storage = FrankenStorage::new(conn, PathBuf::from(":memory:"));
23893        storage.run_migrations().unwrap();
23894        storage.apply_config().unwrap();
23895        storage
23896    }
23897
23898    #[test]
23899    fn franken_migrations_create_all_tables() {
23900        let storage = franken_storage_in_memory();
23901
23902        // Should be at CURRENT_SCHEMA_VERSION.
23903        let version = storage.schema_version().unwrap();
23904        assert_eq!(
23905            version, CURRENT_SCHEMA_VERSION,
23906            "fresh FrankenStorage should be at current schema version"
23907        );
23908
23909        // Core tables from V1 should exist.
23910        let rows = storage
23911            .raw()
23912            .query("SELECT name FROM sqlite_master WHERE type='table' ORDER BY name;")
23913            .unwrap();
23914        let table_names: Vec<String> = rows
23915            .iter()
23916            .filter_map(|r| r.get_typed::<String>(0).ok())
23917            .collect();
23918
23919        for required in [
23920            "meta",
23921            "agents",
23922            "workspaces",
23923            "conversations",
23924            "messages",
23925            "snippets",
23926            "tags",
23927            "conversation_tags",
23928        ] {
23929            assert!(
23930                table_names.contains(&required.to_string()),
23931                "missing table: {required}"
23932            );
23933        }
23934
23935        // V4 sources table.
23936        assert!(
23937            table_names.contains(&"sources".to_string()),
23938            "missing sources table"
23939        );
23940
23941        // V8 daily_stats table.
23942        assert!(
23943            table_names.contains(&"daily_stats".to_string()),
23944            "missing daily_stats table"
23945        );
23946
23947        // V9 embedding_jobs table.
23948        assert!(
23949            table_names.contains(&"embedding_jobs".to_string()),
23950            "missing embedding_jobs table"
23951        );
23952
23953        // V11 message_metrics, usage_hourly, usage_daily tables.
23954        for analytics_table in ["message_metrics", "usage_hourly", "usage_daily"] {
23955            assert!(
23956                table_names.contains(&analytics_table.to_string()),
23957                "missing table: {analytics_table}"
23958            );
23959        }
23960        assert!(
23961            table_names.contains(&"conversation_tail_state".to_string()),
23962            "missing conversation_tail_state table"
23963        );
23964        assert!(
23965            table_names.contains(&"conversation_external_lookup".to_string()),
23966            "missing conversation_external_lookup table"
23967        );
23968        assert!(
23969            table_names.contains(&"conversation_external_tail_lookup".to_string()),
23970            "missing conversation_external_tail_lookup table"
23971        );
23972
23973        // Fresh frankensqlite databases should record the combined V13 base
23974        // schema plus every additive post-V13 migration.
23975        let rows = storage
23976            .raw()
23977            .query("SELECT COUNT(*) FROM _schema_migrations;")
23978            .unwrap();
23979        let count: i64 = rows.first().unwrap().get_typed(0).unwrap();
23980        assert_eq!(
23981            count,
23982            (13..=CURRENT_SCHEMA_VERSION).count() as i64,
23983            "_schema_migrations should record the V13 base schema and post-V13 migrations"
23984        );
23985
23986        // The latest applied migration should be the current schema version.
23987        let rows = storage
23988            .raw()
23989            .query("SELECT version FROM _schema_migrations ORDER BY version;")
23990            .unwrap();
23991        let versions: Vec<i64> = rows
23992            .iter()
23993            .map(|row| row.get_typed(0))
23994            .collect::<std::result::Result<_, _>>()
23995            .unwrap();
23996        assert_eq!(
23997            versions,
23998            (13..=CURRENT_SCHEMA_VERSION).collect::<Vec<i64>>(),
23999            "_schema_migrations should contain v13 through current"
24000        );
24001    }
24002
24003    #[test]
24004    fn franken_migrations_idempotent() {
24005        let storage = franken_storage_in_memory();
24006        assert_eq!(storage.schema_version().unwrap(), CURRENT_SCHEMA_VERSION);
24007
24008        // Re-running migrations on the same connection is a no-op.
24009        storage.run_migrations().unwrap();
24010        assert_eq!(storage.schema_version().unwrap(), CURRENT_SCHEMA_VERSION);
24011    }
24012
24013    #[test]
24014    fn migration_v20_backfills_conversation_external_tail_lookup() {
24015        let storage = franken_storage_in_memory();
24016        let agent_id = storage
24017            .ensure_agent(&Agent {
24018                id: None,
24019                slug: "codex".into(),
24020                name: "Codex".into(),
24021                version: None,
24022                kind: AgentKind::Cli,
24023            })
24024            .unwrap();
24025        let workspace_id = storage
24026            .ensure_workspace(&PathBuf::from("/ws/profiled-storage-remote"), None)
24027            .unwrap();
24028        let mut conv = make_profiled_storage_remote_conversation(1919, 2);
24029        conv.source_id = "profiled-storage-remote-source-東京".into();
24030        conv.external_id = Some("profiled-storage-remote-☃-1919".into());
24031        let outcome = storage
24032            .insert_conversation_tree(agent_id, Some(workspace_id), &conv)
24033            .unwrap();
24034        let external_id = conv.external_id.as_deref().unwrap();
24035        let lookup_key = conversation_external_lookup_key(&conv.source_id, agent_id, external_id);
24036
24037        storage
24038            .raw()
24039            .execute("DELETE FROM conversation_external_tail_lookup")
24040            .unwrap();
24041        storage
24042            .raw()
24043            .execute("DELETE FROM _schema_migrations WHERE version = 20")
24044            .unwrap();
24045        storage
24046            .raw()
24047            .execute_compat(
24048                "UPDATE meta SET value = ?1 WHERE key = 'schema_version'",
24049                fparams!["19"],
24050            )
24051            .unwrap();
24052
24053        storage.run_migrations().unwrap();
24054
24055        let backfilled: (i64, Option<i64>, Option<i64>, Option<i64>) = storage
24056            .raw()
24057            .query_row_map(
24058                "SELECT conversation_id, ended_at, last_message_idx, last_message_created_at
24059                 FROM conversation_external_tail_lookup
24060                 WHERE lookup_key = ?1",
24061                fparams![lookup_key.as_str()],
24062                |row| {
24063                    Ok((
24064                        row.get_typed(0)?,
24065                        row.get_typed(1)?,
24066                        row.get_typed(2)?,
24067                        row.get_typed(3)?,
24068                    ))
24069                },
24070            )
24071            .unwrap();
24072        assert_eq!(
24073            backfilled,
24074            (
24075                outcome.conversation_id,
24076                conv.ended_at,
24077                Some(1),
24078                conv.messages[1].created_at
24079            )
24080        );
24081        assert_eq!(storage.schema_version().unwrap(), CURRENT_SCHEMA_VERSION);
24082    }
24083
24084    #[test]
24085    fn migration_v15_creates_lazy_tail_state_cache() {
24086        let conn = FrankenConnection::open(":memory:").unwrap();
24087        conn.execute_batch(
24088            "CREATE TABLE conversations (
24089                 id INTEGER PRIMARY KEY,
24090                 ended_at INTEGER
24091             );
24092             CREATE TABLE messages (
24093                 id INTEGER PRIMARY KEY,
24094                 conversation_id INTEGER NOT NULL,
24095                 idx INTEGER NOT NULL,
24096                 created_at INTEGER
24097             );
24098             INSERT INTO conversations(id, ended_at) VALUES
24099                 (1, 1710000000300),
24100                 (2, NULL);
24101             INSERT INTO messages(id, conversation_id, idx, created_at) VALUES
24102                 (10, 1, 0, 1710000000100),
24103                 (11, 1, 1, 1710000000200),
24104                 (12, 2, 0, 1710000000400);",
24105        )
24106        .unwrap();
24107
24108        conn.execute(
24109            "CREATE TABLE _schema_migrations (
24110                version INTEGER PRIMARY KEY,
24111                name TEXT NOT NULL,
24112                applied_at TEXT NOT NULL DEFAULT (strftime('%Y-%m-%dT%H:%M:%SZ', 'now'))
24113             );",
24114        )
24115        .unwrap();
24116
24117        assert!(
24118            apply_conversation_tail_state_cache_migration(&conn).unwrap(),
24119            "v15 migration should apply once"
24120        );
24121        assert!(
24122            !apply_conversation_tail_state_cache_migration(&conn).unwrap(),
24123            "v15 migration should be idempotent once recorded"
24124        );
24125
24126        let columns = conn.query("PRAGMA table_info(conversations);").unwrap();
24127        let column_names: HashSet<String> = columns
24128            .iter()
24129            .map(|row| row.get_typed(1))
24130            .collect::<std::result::Result<_, frankensqlite::FrankenError>>()
24131            .unwrap();
24132        assert!(column_names.contains("last_message_idx"));
24133        assert!(column_names.contains("last_message_created_at"));
24134
24135        let tail_rows: i64 = conn
24136            .query("SELECT COUNT(*) FROM conversation_tail_state;")
24137            .unwrap()
24138            .first()
24139            .unwrap()
24140            .get_typed(0)
24141            .unwrap();
24142        assert_eq!(
24143            tail_rows, 0,
24144            "v15 should create the cache without an open-time message scan"
24145        );
24146
24147        let applied: i64 = conn
24148            .query("SELECT COUNT(*) FROM _schema_migrations WHERE version = 15;")
24149            .unwrap()
24150            .first()
24151            .unwrap()
24152            .get_typed(0)
24153            .unwrap();
24154        assert_eq!(applied, 1);
24155    }
24156
24157    #[test]
24158    fn schema_repair_adds_missing_conversations_token_columns() {
24159        let conn = FrankenConnection::open(":memory:").unwrap();
24160        conn.execute_batch(
24161            "CREATE TABLE conversations (
24162                 id INTEGER PRIMARY KEY,
24163                 agent_id INTEGER NOT NULL,
24164                 source_path TEXT NOT NULL
24165             );",
24166        )
24167        .unwrap();
24168        let storage = FrankenStorage::new(conn, std::path::PathBuf::from(":memory:"));
24169
24170        storage.repair_missing_conversation_token_columns().unwrap();
24171        storage.repair_missing_conversation_token_columns().unwrap();
24172
24173        let columns = franken_table_column_names(&storage.conn, "conversations").unwrap();
24174        for &(column_name, _) in REQUIRED_CONVERSATION_TOKEN_COLUMNS {
24175            assert!(
24176                columns.contains(column_name),
24177                "schema repair should add conversations.{column_name}"
24178            );
24179        }
24180    }
24181
24182    #[test]
24183    fn franken_meta_schema_version_in_sync() {
24184        let storage = franken_storage_in_memory();
24185
24186        // meta.schema_version should be kept in sync.
24187        let rows = storage
24188            .raw()
24189            .query("SELECT value FROM meta WHERE key = 'schema_version';")
24190            .unwrap();
24191        let meta_version: String = rows.first().unwrap().get_typed(0).unwrap();
24192        assert_eq!(
24193            meta_version,
24194            CURRENT_SCHEMA_VERSION.to_string(),
24195            "meta.schema_version should match CURRENT_SCHEMA_VERSION"
24196        );
24197    }
24198
/// A database that only carries the legacy `meta.schema_version = '10'`
/// marker must end up with `_schema_migrations` rows for versions 1 through
/// 10 once `transition_from_meta_version` has run.
#[test]
fn franken_transition_from_meta_version() {
    let dir = TempDir::new().unwrap();
    let db_path = dir.path().join("test_transition.db");

    // Simulate an existing database created by SqliteStorage at version 10.
    // We create just enough schema to test the transition.
    let conn = FrankenConnection::open(db_path.to_string_lossy().to_string()).unwrap();
    conn.execute("CREATE TABLE meta (key TEXT PRIMARY KEY, value TEXT NOT NULL);")
        .unwrap();
    conn.execute("INSERT INTO meta(key, value) VALUES('schema_version', '10');")
        .unwrap();
    // Create a dummy conversations table so transition doesn't think it's corrupted.
    conn.execute("CREATE TABLE conversations (id INTEGER PRIMARY KEY);")
        .unwrap();
    drop(conn);

    // Now run the transition function on a fresh connection.
    let conn = FrankenConnection::open(db_path.to_string_lossy().to_string()).unwrap();
    transition_from_meta_version(&conn).unwrap();

    // _schema_migrations should exist with entries for versions 1..=10.
    let rows = conn
        .query("SELECT version FROM _schema_migrations ORDER BY version;")
        .unwrap();
    // Decode strictly: a row that cannot be read as i64 should fail loudly
    // here instead of being dropped and surfacing as a confusing length
    // mismatch in the assertion below.
    let versions = rows
        .iter()
        .map(|row| row.get_typed(0))
        .collect::<Result<Vec<i64>, _>>()
        .expect("every _schema_migrations.version should decode as i64");
    assert_eq!(
        versions,
        (1..=10).collect::<Vec<i64>>(),
        "transition should backfill versions 1..=10"
    );
}
24231
/// When the legacy `meta.schema_version` marker already equals
/// `CURRENT_SCHEMA_VERSION`, the transition must record every version from 1
/// through `CURRENT_SCHEMA_VERSION` in `_schema_migrations`.
#[test]
fn franken_transition_from_current_meta_backfills_current_schema_marker() {
    let dir = TempDir::new().unwrap();
    let db_path = dir.path().join("test_current_transition.db");

    // Legacy-style database whose meta marker claims the current version.
    let conn = FrankenConnection::open(db_path.to_string_lossy().to_string()).unwrap();
    conn.execute("CREATE TABLE meta (key TEXT PRIMARY KEY, value TEXT NOT NULL);")
        .unwrap();
    conn.execute_compat(
        "INSERT INTO meta(key, value) VALUES('schema_version', ?1);",
        &[ParamValue::from(CURRENT_SCHEMA_VERSION.to_string())],
    )
    .unwrap();
    conn.execute("CREATE TABLE conversations (id INTEGER PRIMARY KEY);")
        .unwrap();
    drop(conn);

    let conn = FrankenConnection::open(db_path.to_string_lossy().to_string()).unwrap();
    transition_from_meta_version(&conn).unwrap();

    let rows = conn
        .query("SELECT version FROM _schema_migrations ORDER BY version;")
        .unwrap();
    // Decode strictly so an unreadable row is reported as such rather than
    // silently skipped.
    let versions = rows
        .iter()
        .map(|row| row.get_typed(0))
        .collect::<Result<Vec<i64>, _>>()
        .expect("every _schema_migrations.version should decode as i64");
    assert_eq!(
        versions,
        (1..=CURRENT_SCHEMA_VERSION).collect::<Vec<i64>>(),
        "current meta schema marker should backfill every known migration"
    );
}
24262
/// If `_schema_migrations` already exists, `transition_from_meta_version`
/// must leave its contents alone.
#[test]
fn franken_transition_skips_when_already_done() {
    let dir = TempDir::new().unwrap();
    let db_path = dir.path().join("test_transition_skip.db");
    let conn = FrankenConnection::open(db_path.to_string_lossy().into_owned()).unwrap();

    // Seed a database that already carries a `_schema_migrations` table with
    // a single recorded version.
    for seed_sql in [
        "CREATE TABLE _schema_migrations (version INTEGER PRIMARY KEY, name TEXT NOT NULL, applied_at TEXT NOT NULL DEFAULT 'now');",
        "INSERT INTO _schema_migrations (version, name) VALUES (1, 'test');",
    ] {
        conn.execute(seed_sql).unwrap();
    }

    // The transition is expected to be a no-op here.
    transition_from_meta_version(&conn).unwrap();

    // Exactly the one seeded entry should remain.
    let count_rows = conn
        .query("SELECT COUNT(*) FROM _schema_migrations;")
        .unwrap();
    let recorded: i64 = count_rows.first().unwrap().get_typed(0).unwrap();
    assert_eq!(
        recorded, 1,
        "transition should not re-run on already-transitioned DB"
    );
}
24289
/// Running the transition against a database with no tables at all must not
/// create `_schema_migrations`.
#[test]
fn franken_transition_fresh_db_is_noop() {
    let dir = TempDir::new().unwrap();
    let db_path = dir.path().join("test_fresh_noop.db");

    // Nothing is created before the transition runs: no meta table, no schema.
    let conn = FrankenConnection::open(db_path.to_string_lossy().into_owned()).unwrap();
    transition_from_meta_version(&conn).unwrap();

    // Querying the table must fail, which shows it was never created.
    let table_is_missing = conn
        .query("SELECT * FROM \"_schema_migrations\";")
        .is_err();
    assert!(
        table_is_missing,
        "transition should not create _schema_migrations on fresh DB"
    );
}
24306
/// The transition must succeed on a legacy version-13 database that also
/// contains an `fts5` virtual table, and must backfill versions 1 through 13.
#[test]
fn franken_transition_with_fts_virtual_table_succeeds() {
    let dir = TempDir::new().unwrap();
    let db_path = dir.path().join("test_transition_with_fts.db");

    // Legacy layout: meta marker at 13, a stub conversations table, and a
    // contentless FTS5 table.
    let conn = FrankenConnection::open(db_path.to_string_lossy().into_owned()).unwrap();
    conn.execute_batch(
        "CREATE TABLE meta (key TEXT PRIMARY KEY, value TEXT NOT NULL);
             INSERT INTO meta(key, value) VALUES('schema_version', '13');
             CREATE TABLE conversations (id INTEGER PRIMARY KEY);
             CREATE VIRTUAL TABLE fts_messages USING fts5(
                 content,
                 title,
                 agent,
                 workspace,
                 source_path,
                 created_at,
                 content='',
                 tokenize='porter unicode61'
             );",
    )
    .unwrap();
    drop(conn);

    let conn = FrankenConnection::open(db_path.to_string_lossy().to_string()).unwrap();
    transition_from_meta_version(&conn).unwrap();

    let rows = conn
        .query("SELECT version FROM _schema_migrations ORDER BY version;")
        .unwrap();
    // Decode strictly so an unreadable row is reported as such rather than
    // silently skipped.
    let versions = rows
        .iter()
        .map(|row| row.get_typed(0))
        .collect::<Result<Vec<i64>, _>>()
        .expect("every _schema_migrations.version should decode as i64");
    assert_eq!(versions, (1..=13).collect::<Vec<i64>>());
}
24340
/// `FrankenStorage::open` on a hand-built legacy version-13 database (core
/// tables, one conversation with one message, and an `fts5` virtual table)
/// must report `CURRENT_SCHEMA_VERSION` and record every migration from 1
/// through `CURRENT_SCHEMA_VERSION`.
#[test]
fn franken_storage_open_legacy_v13_with_fts_virtual_table_succeeds() {
    let dir = TempDir::new().unwrap();
    let db_path = dir.path().join("test_open_legacy_v13_with_fts.db");

    // Build the legacy fixture over a plain connection, bypassing
    // FrankenStorage so no migrations run yet.
    let conn = FrankenConnection::open(db_path.to_string_lossy().into_owned()).unwrap();
    conn.execute_batch(
        "CREATE TABLE meta (key TEXT PRIMARY KEY, value TEXT NOT NULL);
             INSERT INTO meta(key, value) VALUES('schema_version', '13');
             CREATE TABLE agents (
                 id INTEGER PRIMARY KEY,
                 slug TEXT NOT NULL
             );
             CREATE TABLE workspaces (
                 id INTEGER PRIMARY KEY,
                 path TEXT NOT NULL
             );
             CREATE TABLE sources (
                 id TEXT PRIMARY KEY,
                 kind TEXT NOT NULL,
                 host_label TEXT,
                 machine_id TEXT,
                 platform TEXT,
                 config_json TEXT,
                 created_at INTEGER NOT NULL,
                 updated_at INTEGER NOT NULL
             );
             CREATE TABLE conversations (
                 id INTEGER PRIMARY KEY,
                 agent_id INTEGER NOT NULL,
                 workspace_id INTEGER,
                 source_id TEXT NOT NULL DEFAULT 'local',
                 external_id TEXT,
                 title TEXT,
                 source_path TEXT NOT NULL,
                 started_at INTEGER,
                 ended_at INTEGER
             );
             CREATE TABLE messages (
                 id INTEGER PRIMARY KEY,
                 conversation_id INTEGER NOT NULL,
                 idx INTEGER NOT NULL,
                 role TEXT NOT NULL,
                 author TEXT,
                 created_at INTEGER,
                 content TEXT NOT NULL,
                 extra_json TEXT,
                 extra_bin BLOB
             );
             INSERT INTO agents(id, slug) VALUES (1, 'codex');
             INSERT INTO workspaces(id, path) VALUES (1, '/data/projects/coding_agent_session_search');
             INSERT INTO sources(id, kind, host_label, created_at, updated_at)
             VALUES ('local', 'local', NULL, 1710000000000, 1710000000000);
             INSERT INTO conversations(
                 id,
                 agent_id,
                 workspace_id,
                 source_id,
                 external_id,
                 title,
                 source_path,
                 started_at
             )
             VALUES (
                 1,
                 1,
                 1,
                 'local',
                 'legacy-session',
                 'legacy session',
                 '/tmp/legacy.jsonl',
                 1710000000000
             );
             INSERT INTO messages(id, conversation_id, idx, role, author, created_at, content)
             VALUES (1, 1, 0, 'user', 'tester', 1710000000000, 'legacy content');
             CREATE VIRTUAL TABLE fts_messages USING fts5(
                 content,
                 title,
                 agent,
                 workspace,
                 source_path,
                 created_at,
                 message_id,
                 content='',
                 tokenize='porter unicode61'
             );",
    )
    .unwrap();
    drop(conn);

    // Opening through FrankenStorage is what should upgrade the fixture.
    let storage = FrankenStorage::open(&db_path).unwrap();
    assert_eq!(storage.schema_version().unwrap(), CURRENT_SCHEMA_VERSION);

    let rows = storage
        .raw()
        .query("SELECT version FROM _schema_migrations ORDER BY version;")
        .unwrap();
    // Decode strictly so an unreadable row is reported as such rather than
    // silently skipped.
    let versions = rows
        .iter()
        .map(|row| row.get_typed(0))
        .collect::<Result<Vec<i64>, _>>()
        .expect("every _schema_migrations.version should decode as i64");
    assert_eq!(versions, (1..=CURRENT_SCHEMA_VERSION).collect::<Vec<i64>>());
}
24441
/// A database whose `sqlite_master` holds two `fts_messages` rows must still
/// open at `CURRENT_SCHEMA_VERSION`; open must not write the FTS rebuild meta
/// key, and `ensure_search_fallback_fts_consistency` must leave exactly one
/// schema row with one FTS row per message.
#[test]
fn franken_storage_open_repairs_duplicate_fts_messages_schema_rows() {
    let dir = TempDir::new().unwrap();
    let db_path = dir.path().join("test_open_repairs_duplicate_fts_schema.db");

    // Seed one conversation with a single message, then release the storage
    // handle before touching the file through rusqlite.
    {
        let storage = FrankenStorage::open(&db_path).unwrap();
        let agent_id = storage
            .ensure_agent(&Agent {
                id: None,
                slug: "codex".into(),
                name: "Codex".into(),
                version: None,
                kind: AgentKind::Cli,
            })
            .unwrap();
        let seed_message = Message {
            id: None,
            idx: 0,
            role: MessageRole::User,
            author: Some("user".into()),
            created_at: Some(1_700_000_000_050),
            content: "message that should remain queryable".into(),
            extra_json: serde_json::Value::Null,
            snippets: Vec::new(),
        };
        let conversation = Conversation {
            id: None,
            agent_slug: "codex".into(),
            workspace: Some(PathBuf::from("/tmp/workspace")),
            external_id: Some("dup-fts-schema".into()),
            title: Some("Duplicate FTS schema".into()),
            source_path: PathBuf::from("/tmp/dup-fts-schema.jsonl"),
            started_at: Some(1_700_000_000_000),
            ended_at: Some(1_700_000_000_100),
            approx_tokens: Some(42),
            metadata_json: serde_json::Value::Null,
            messages: vec![seed_message],
            source_id: LOCAL_SOURCE_ID.into(),
            origin_host: None,
        };
        storage
            .insert_conversation_tree(agent_id, None, &conversation)
            .unwrap();
    }
    materialize_fresh_fts_schema_via_rusqlite(&db_path).unwrap();

    // Corrupt the schema: inject a second, legacy-shaped `fts_messages` row
    // straight into sqlite_master.
    let duplicate_legacy_fts_sql = "CREATE VIRTUAL TABLE fts_messages USING fts5(content, title, agent, workspace, source_path, created_at UNINDEXED, message_id UNINDEXED, tokenize='porter')";
    {
        let fixture = rusqlite_test_fixture_conn(&db_path);
        fixture
            .execute_batch("PRAGMA writable_schema = ON;")
            .unwrap();
        fixture
            .execute(
                "INSERT INTO sqlite_master(type, name, tbl_name, rootpage, sql)
             VALUES('table', 'fts_messages', 'fts_messages', 0, ?1)",
                [duplicate_legacy_fts_sql],
            )
            .unwrap();
        // Simulate a pre-fix upgraded database that has never gone through the
        // authoritative frankensqlite FTS rebuild generation yet.
        fixture
            .execute(
                "DELETE FROM meta WHERE key = ?1",
                [FTS_FRANKEN_REBUILD_META_KEY],
            )
            .unwrap();
        fixture
            .execute_batch("PRAGMA writable_schema = OFF;")
            .unwrap();

        // Sanity-check the fixture before handing it to FrankenStorage.
        let schema_row_count: i64 = fixture
            .query_row(
                "SELECT COUNT(*) FROM sqlite_master WHERE name = 'fts_messages'",
                [],
                |row| row.get(0),
            )
            .unwrap();
        assert_eq!(schema_row_count, 2);
    }

    let reopened = FrankenStorage::open(&db_path).unwrap();
    assert_eq!(reopened.schema_version().unwrap(), CURRENT_SCHEMA_VERSION);

    // Plain open must leave the rebuild-generation marker absent.
    let generation_rows: Vec<String> = reopened
        .raw()
        .query_map_collect(
            "SELECT value FROM meta WHERE key = ?1",
            fparams![FTS_FRANKEN_REBUILD_META_KEY],
            |row| row.get_typed(0),
        )
        .unwrap();
    assert_eq!(
        generation_rows.len(),
        0,
        "canonical open should not eagerly rewrite FTS repair metadata"
    );

    // The explicit consistency pass is what collapses the duplicate row.
    reopened.ensure_search_fallback_fts_consistency().unwrap();
    let repaired = FrankenConnection::open(db_path.to_string_lossy().into_owned()).unwrap();
    assert_eq!(franken_fts_schema_rows(&repaired).unwrap(), 1);

    let raw = reopened.raw();
    let total_messages: i64 = raw
        .query_row_map("SELECT COUNT(*) FROM messages", fparams![], |row| {
            row.get_typed(0)
        })
        .unwrap();
    let total_fts_rows: i64 = raw
        .query_row_map("SELECT COUNT(*) FROM fts_messages", fparams![], |row| {
            row.get_typed(0)
        })
        .unwrap();
    assert_eq!(total_fts_rows, total_messages);
}
24547
/// After a fresh open followed by the FTS consistency pass, the database must
/// hold exactly one `fts_messages` schema row and the table must be queryable
/// on the next open.
#[test]
fn franken_storage_open_fresh_db_keeps_single_franken_fts_schema_row() {
    let dir = TempDir::new().unwrap();
    let db_path = dir.path().join("fresh-franken-storage-open.db");

    {
        let storage = FrankenStorage::open(&db_path).unwrap();
        assert_eq!(storage.schema_version().unwrap(), CURRENT_SCHEMA_VERSION);

        // The migration runner no longer creates the FTS5 virtual table
        // eagerly: V14 drops the old internal-content table and the current
        // contentless table is recreated lazily (see MIGRATION_V14). Run the
        // repair path, as normal cass startup does, before checking
        // sqlite_schema for duplicate fts_messages entries.
        storage
            .ensure_search_fallback_fts_consistency()
            .expect("ensure FTS consistency after fresh open");
    }

    {
        let c_reader = FrankenConnection::open(db_path.to_string_lossy().into_owned())
            .expect("open DB via frankensqlite for sqlite_master inspection");
        let schema_rows = franken_fts_schema_rows(&c_reader).unwrap();
        assert_eq!(
            schema_rows, 1,
            "exactly one fts_messages schema row should exist after ensure_search_fallback_fts_consistency"
        );
    }

    let storage = FrankenStorage::open(&db_path).unwrap();
    let fts_probe = storage
        .raw()
        .query("SELECT rowid FROM fts_messages LIMIT 1");
    assert!(
        fts_probe.is_ok(),
        "fts_messages must be queryable through frankensqlite after open"
    );
}
24585
/// If the analytics tables are dropped while `meta.schema_version` still
/// claims the current version, the next `FrankenStorage::open` must recreate
/// all nine of them.
#[test]
fn franken_storage_open_repairs_missing_analytics_tables_when_version_markers_lie() {
    let dir = TempDir::new().unwrap();
    let db_path = dir.path().join("test_repair_missing_analytics.db");

    // First open creates the full schema.
    let initial = FrankenStorage::open(&db_path).unwrap();
    assert_eq!(initial.schema_version().unwrap(), CURRENT_SCHEMA_VERSION);
    drop(initial);

    // Remove the analytics tables over a plain connection and pin the version
    // marker at "current", so the marker no longer reflects reality.
    let conn = FrankenConnection::open(db_path.to_string_lossy().into_owned()).unwrap();
    let dropped_tables = [
        "usage_models_daily",
        "usage_daily",
        "usage_hourly",
        "message_metrics",
        "token_daily_stats",
        "token_usage",
        "model_pricing",
        "embedding_jobs",
        "daily_stats",
    ];
    for table in dropped_tables.iter() {
        let drop_sql = format!("DROP TABLE IF EXISTS {table}");
        conn.execute(&drop_sql).unwrap();
    }
    conn.execute_compat(
        "UPDATE meta SET value = ?1 WHERE key = 'schema_version'",
        &[ParamValue::from(CURRENT_SCHEMA_VERSION.to_string())],
    )
    .unwrap();
    drop(conn);

    let repaired = FrankenStorage::open(&db_path).unwrap();
    assert_eq!(repaired.schema_version().unwrap(), CURRENT_SCHEMA_VERSION);

    // Count how many of the nine analytics tables exist again.
    let analytics_count: i64 = repaired
        .raw()
        .query_row_map(
            "SELECT COUNT(*) FROM sqlite_master
                 WHERE type='table'
                   AND name IN (
                     'daily_stats',
                     'embedding_jobs',
                     'token_usage',
                     'token_daily_stats',
                     'model_pricing',
                     'message_metrics',
                     'usage_hourly',
                     'usage_daily',
                     'usage_models_daily'
                   )",
            fparams![],
            |row| row.get_typed(0),
        )
        .unwrap();
    assert_eq!(
        analytics_count, 9,
        "open() should recreate the missing analytics tables even when schema_version already says current"
    );
}
24647
/// Treating every table named in `REQUIRED_CURRENT_SCHEMA_TABLE_PROBES` as
/// missing, the selected repair batches must between them list each of those
/// tables.
#[test]
fn current_schema_repair_batches_cover_every_required_probe() {
    let missing_tables: Vec<&'static str> = REQUIRED_CURRENT_SCHEMA_TABLE_PROBES
        .iter()
        .map(|probe| probe.0)
        .collect();

    let batches = current_schema_repair_batches_for_missing_tables(&missing_tables).unwrap();

    // Union of the tables each returned batch declares.
    let mut covered_tables: HashSet<&'static str> = HashSet::new();
    for batch in batches.iter() {
        covered_tables.extend(batch.tables.iter().copied());
    }

    for table_name in missing_tables.iter().copied() {
        assert!(
            covered_tables.contains(table_name),
            "missing repair coverage for {table_name}"
        );
    }
}
24668
/// No entry in `CURRENT_SCHEMA_REPAIR_BATCHES` may contain SQL that recreates
/// the core tables or the FTS table, or that drops any table.
#[test]
fn current_schema_repair_batches_do_not_replay_core_schema_bootstrap() {
    // (forbidden SQL fragment, tail of the failure message), checked in order.
    const FORBIDDEN_FRAGMENTS: [(&str, &str); 8] = [
        ("CREATE TABLE IF NOT EXISTS meta", "should not recreate meta"),
        (
            "CREATE TABLE IF NOT EXISTS agents",
            "should not recreate agents",
        ),
        (
            "CREATE TABLE IF NOT EXISTS workspaces",
            "should not recreate workspaces",
        ),
        (
            "CREATE TABLE IF NOT EXISTS conversations",
            "should not recreate conversations",
        ),
        (
            "CREATE TABLE IF NOT EXISTS messages",
            "should not recreate messages",
        ),
        (
            "CREATE TABLE IF NOT EXISTS snippets",
            "should not recreate snippets",
        ),
        (
            "CREATE VIRTUAL TABLE fts_messages",
            "should not recreate FTS tables",
        ),
        ("DROP TABLE", "should never drop tables"),
    ];

    for batch in CURRENT_SCHEMA_REPAIR_BATCHES {
        for &(fragment, complaint) in FORBIDDEN_FRAGMENTS.iter() {
            assert!(
                !batch.sql.contains(fragment),
                "repair batch {} {}",
                batch.name,
                complaint
            );
        }
    }
}
24716
/// On an empty in-memory database the three migration stages (before the
/// tail-state cache, the cache migration itself, and after it) must together
/// apply versions 13 through `CURRENT_SCHEMA_VERSION`.
#[test]
fn build_cass_migrations_applies_combined_v13() {
    let conn = FrankenConnection::open(":memory:").unwrap();

    // Run the stages in order; the middle one reports `true` when it applied.
    let base_result = build_cass_migrations_before_tail_cache()
        .run(&conn)
        .unwrap();
    let tail_cache_applied = apply_conversation_tail_state_cache_migration(&conn).unwrap();
    assert!(tail_cache_applied);
    let post_result = build_cass_migrations_after_tail_cache().run(&conn).unwrap();

    assert!(base_result.was_fresh);

    // Stitch the applied versions together; 15 stands in for the tail-cache
    // migration, which runs outside the two runner stages.
    let mut applied = base_result.applied;
    applied.push(15);
    applied.extend(post_result.applied);
    let expected_versions = (13..=CURRENT_SCHEMA_VERSION).collect::<Vec<i64>>();
    assert_eq!(
        applied, expected_versions,
        "should apply combined V13 plus additive post-V13 migrations"
    );

    let max_rows = conn
        .query("SELECT MAX(version) FROM _schema_migrations;")
        .unwrap();
    let current: i64 = max_rows.first().unwrap().get_typed(0).unwrap();
    assert_eq!(current, CURRENT_SCHEMA_VERSION);
}
24744
/// Inserting one two-message conversation through
/// `insert_conversations_batched` must leave at least one row in each of
/// `daily_stats`, `token_daily_stats`, `usage_daily` and `usage_models_daily`.
#[test]
fn franken_insert_conversations_batched_populates_analytics_rollups() {
    use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
    use frankensqlite::compat::{ConnectionExt, RowExt};
    use std::path::PathBuf;

    let dir = TempDir::new().unwrap();
    let db_path = dir.path().join("franken-index.db");
    let storage = FrankenStorage::open(&db_path).unwrap();

    let agent_id = storage
        .ensure_agent(&Agent {
            id: None,
            slug: "claude_code".into(),
            name: "Claude Code".into(),
            version: Some("1.0".into()),
            kind: AgentKind::Cli,
        })
        .unwrap();

    let ts_ms = 1_770_551_400_000_i64;

    // The user prompt carries no extra metadata.
    let user_message = Message {
        id: None,
        idx: 0,
        role: MessageRole::User,
        author: None,
        created_at: Some(ts_ms),
        content: "Please make a plan.".into(),
        extra_json: serde_json::Value::Null,
        snippets: vec![],
    };
    // The agent reply carries the model name and token usage payload.
    let agent_message = Message {
        id: None,
        idx: 1,
        role: MessageRole::Agent,
        author: None,
        created_at: Some(ts_ms + 30_000),
        content: "## Plan\n\n1. Reproduce\n2. Patch\n3. Verify".into(),
        extra_json: serde_json::json!({
            "message": {
                "model": "claude-opus-4-6",
                "usage": {
                    "input_tokens": 100,
                    "output_tokens": 50,
                    "cache_read_input_tokens": 25,
                    "cache_creation_input_tokens": 10,
                    "service_tier": "standard"
                }
            }
        }),
        snippets: vec![],
    };

    let conv = Conversation {
        id: None,
        agent_slug: "claude_code".into(),
        workspace: Some(PathBuf::from("/tmp/workspace")),
        external_id: Some("franken-batch-upsert".into()),
        title: Some("Franken batch upsert".into()),
        source_path: PathBuf::from("/tmp/franken.jsonl"),
        started_at: Some(ts_ms),
        ended_at: Some(ts_ms + 60_000),
        approx_tokens: None,
        metadata_json: serde_json::Value::Null,
        messages: vec![user_message, agent_message],
        source_id: "local".into(),
        origin_host: None,
    };

    let outcomes = storage
        .insert_conversations_batched(&[(agent_id, None, &conv)])
        .unwrap();
    assert_eq!(outcomes.len(), 1);
    assert_eq!(outcomes[0].inserted_indices, vec![0, 1]);

    // Run a single-value COUNT(*) query and unwrap the result.
    let conn = storage.raw();
    let count_rows = |sql: &str| -> i64 {
        conn.query_row_map(sql, fparams![], |row| row.get_typed(0))
            .unwrap()
    };
    let daily_stats_rows = count_rows("SELECT COUNT(*) FROM daily_stats");
    let token_daily_rows = count_rows("SELECT COUNT(*) FROM token_daily_stats");
    let usage_daily_rows = count_rows("SELECT COUNT(*) FROM usage_daily");
    let model_daily_rows = count_rows("SELECT COUNT(*) FROM usage_models_daily");

    assert!(daily_stats_rows > 0, "daily_stats should be populated");
    assert!(
        token_daily_rows > 0,
        "token_daily_stats should be populated"
    );
    assert!(usage_daily_rows > 0, "usage_daily should be populated");
    assert!(
        model_daily_rows > 0,
        "usage_models_daily should be populated"
    );
}
24858
24859    // =========================================================================
24860    // FrankenConnectionManager tests (bead 3rlf8)
24861    // =========================================================================
24862
/// A manager built with explicit non-zero limits must report those same
/// reader and writer counts.
#[test]
fn connection_manager_creates_readers() {
    let dir = TempDir::new().unwrap();
    let db_path = dir.path().join("cm.db");

    // Create the DB first, then release the handle right away.
    drop(FrankenStorage::open(&db_path).unwrap());

    let mgr = FrankenConnectionManager::new(
        &db_path,
        ConnectionManagerConfig {
            reader_count: 3,
            max_writers: 2,
        },
    )
    .unwrap();

    assert_eq!(mgr.reader_count(), 3);
    assert_eq!(mgr.max_writers(), 2);
}
24880
/// A manager configured with zero readers and zero writers must report one of
/// each, and a writer must be obtainable from another thread within ten
/// seconds.
///
/// The two failure modes are reported separately: a timeout means the writer
/// request never completed, while an `Err` from `writer()` is surfaced with
/// its own debug text.
#[test]
fn connection_manager_clamps_zero_writer_limit_to_prevent_deadlock() {
    let dir = TempDir::new().unwrap();
    let db_path = dir.path().join("cm.db");

    let fs = FrankenStorage::open(&db_path).unwrap();
    drop(fs);

    let mgr = std::sync::Arc::new(
        FrankenConnectionManager::new(
            &db_path,
            ConnectionManagerConfig {
                reader_count: 0,
                max_writers: 0,
            },
        )
        .unwrap(),
    );
    assert_eq!(mgr.reader_count(), 1);
    assert_eq!(mgr.max_writers(), 1);

    // Acquire the writer on a separate thread so a blocked acquisition shows
    // up as a receive timeout instead of hanging the test.
    let (tx, rx) = std::sync::mpsc::channel();
    let mgr_for_thread = std::sync::Arc::clone(&mgr);
    let worker = std::thread::spawn(move || {
        let outcome = mgr_for_thread
            .writer()
            .map(|mut guard| {
                guard.mark_committed();
            })
            // Ship the error as text so the main thread can report it.
            .map_err(|err| format!("{err:?}"));
        tx.send(outcome).expect("writer result send");
    });

    // A timeout here means the writer request never completed.
    let outcome = rx.recv_timeout(Duration::from_secs(10)).expect(
        "writer acquisition should not block forever when configured with zero writer slots",
    );
    // A received `Err` means acquisition completed but failed.
    assert!(
        outcome.is_ok(),
        "writer acquisition should succeed once the zero writer limit is clamped: {outcome:?}"
    );

    // The worker has already sent its result, so this join returns promptly
    // and guarantees the guard is released before the temp dir is removed.
    worker.join().expect("writer thread should exit cleanly");
}
24916
/// Requesting a reader must advance the manager's `reader_idx` counter by
/// exactly one.
#[test]
fn connection_manager_reader_round_robin() {
    let dir = TempDir::new().unwrap();
    let db_path = dir.path().join("cm.db");

    drop(FrankenStorage::open(&db_path).unwrap());

    let mgr = FrankenConnectionManager::new(
        &db_path,
        ConnectionManagerConfig {
            reader_count: 2,
            max_writers: 1,
        },
    )
    .unwrap();

    // Snapshot the counter on either side of a single reader request.
    let read_idx = || mgr.reader_idx.load(std::sync::atomic::Ordering::Relaxed);
    let idx_before = read_idx();
    let _r1 = mgr.reader();
    let idx_after = read_idx();

    assert_eq!(idx_after, idx_before + 1, "reader index should advance");
}
24937
/// Data written through a writer guard must be visible to a reader obtained
/// from the same manager afterwards.
#[test]
fn connection_manager_writer_reads_and_writes() {
    use frankensqlite::compat::RowExt;

    let dir = TempDir::new().unwrap();
    let db_path = dir.path().join("cm.db");

    drop(FrankenStorage::open(&db_path).unwrap());

    let mgr = FrankenConnectionManager::new(&db_path, Default::default()).unwrap();

    // Create a scratch table and insert one row under a writer guard, then
    // let the guard fall out of scope.
    {
        let mut guard = mgr.writer().unwrap();
        for sql in [
            "CREATE TABLE IF NOT EXISTS cm_test (id INTEGER PRIMARY KEY, val TEXT)",
            "INSERT INTO cm_test (val) VALUES ('hello')",
        ] {
            guard.storage().raw().execute(sql).unwrap();
        }
        guard.mark_committed();
    }

    // Read the row back through a reader connection.
    let reader_guard = mgr.reader();
    let rows = reader_guard.query("SELECT val FROM cm_test").unwrap();
    assert_eq!(rows.len(), 1);
    let stored_val: String = rows[0].get_typed(0).unwrap();
    assert_eq!(stored_val, "hello");
}
24972
24973    #[test]
24974    fn connection_manager_writer_guard_drops_releases_slot() {
24975        let dir = TempDir::new().unwrap();
24976        let db_path = dir.path().join("cm.db");
24977
24978        let fs = FrankenStorage::open(&db_path).unwrap();
24979        drop(fs);
24980
24981        let config = ConnectionManagerConfig {
24982            reader_count: 1,
24983            max_writers: 1,
24984        };
24985        let mgr = FrankenConnectionManager::new(&db_path, config).unwrap();
24986
24987        // Acquire and release writer
24988        {
24989            let mut guard = mgr.writer().unwrap();
24990            guard.mark_committed();
24991        }
24992
24993        // Should be able to acquire again (slot released)
24994        let mut guard2 = mgr.writer().unwrap();
24995        guard2.mark_committed();
24996    }
24997
24998    #[test]
24999    fn connection_manager_concurrent_writer_works() {
25000        use frankensqlite::compat::RowExt;
25001
25002        let dir = TempDir::new().unwrap();
25003        let db_path = dir.path().join("cm.db");
25004
25005        let fs = FrankenStorage::open(&db_path).unwrap();
25006        drop(fs);
25007
25008        let config = ConnectionManagerConfig {
25009            reader_count: 1,
25010            max_writers: 2,
25011        };
25012        let mgr = FrankenConnectionManager::new(&db_path, config).unwrap();
25013
25014        {
25015            let mut guard = mgr.concurrent_writer().unwrap();
25016            guard
25017                .storage()
25018                .raw()
25019                .execute("CREATE TABLE IF NOT EXISTS cm_conc (id INTEGER PRIMARY KEY, val TEXT)")
25020                .unwrap();
25021            guard
25022                .storage()
25023                .raw()
25024                .execute("INSERT INTO cm_conc (val) VALUES ('concurrent')")
25025                .unwrap();
25026            guard.mark_committed();
25027        }
25028
25029        let reader_guard = mgr.reader();
25030        let rows = reader_guard.query("SELECT val FROM cm_conc").unwrap();
25031        assert_eq!(rows.len(), 1);
25032        assert_eq!(rows[0].get_typed::<String>(0).unwrap(), "concurrent");
25033    }
25034
25035    #[test]
25036    fn connection_manager_default_config() {
25037        let config = ConnectionManagerConfig::default();
25038        assert_eq!(config.reader_count, 4);
25039        assert!(config.max_writers > 0);
25040    }
25041
25042    #[test]
25043    fn purge_agent_archive_data_removes_only_target_agent_and_rebuilds_derived_tables() {
25044        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
25045        use std::path::PathBuf;
25046
25047        fn seed_conversation(storage: &FrankenStorage, agent_slug: &str, marker: &str) {
25048            let agent = Agent {
25049                id: None,
25050                slug: agent_slug.into(),
25051                name: agent_slug.into(),
25052                version: None,
25053                kind: AgentKind::Cli,
25054            };
25055            let agent_id = storage.ensure_agent(&agent).unwrap();
25056            let conversation = Conversation {
25057                id: None,
25058                agent_slug: agent_slug.into(),
25059                workspace: Some(PathBuf::from("/tmp/workspace")),
25060                external_id: Some(format!("{agent_slug}-{marker}")),
25061                title: Some(format!("{agent_slug} {marker}")),
25062                source_path: PathBuf::from(format!("/tmp/{agent_slug}-{marker}.jsonl")),
25063                started_at: Some(1_700_000_000_000),
25064                ended_at: Some(1_700_000_000_100),
25065                approx_tokens: None,
25066                metadata_json: serde_json::Value::Null,
25067                messages: vec![
25068                    Message {
25069                        id: None,
25070                        idx: 0,
25071                        role: MessageRole::User,
25072                        author: Some("user".into()),
25073                        created_at: Some(1_700_000_000_010),
25074                        content: format!("{agent_slug} {marker} user"),
25075                        extra_json: serde_json::Value::Null,
25076                        snippets: Vec::new(),
25077                    },
25078                    Message {
25079                        id: None,
25080                        idx: 1,
25081                        role: MessageRole::Agent,
25082                        author: Some("assistant".into()),
25083                        created_at: Some(1_700_000_000_020),
25084                        content: format!("{agent_slug} {marker} assistant"),
25085                        extra_json: serde_json::Value::Null,
25086                        snippets: Vec::new(),
25087                    },
25088                ],
25089                source_id: LOCAL_SOURCE_ID.into(),
25090                origin_host: None,
25091            };
25092            storage
25093                .insert_conversation_tree(agent_id, None, &conversation)
25094                .unwrap();
25095        }
25096
25097        let dir = TempDir::new().unwrap();
25098        let db_path = dir.path().join("agent_search.db");
25099        let storage = FrankenStorage::open(&db_path).unwrap();
25100
25101        seed_conversation(&storage, "openclaw", "purge-target");
25102        seed_conversation(&storage, "codex", "keep-target");
25103
25104        let purge = storage.purge_agent_archive_data("openclaw").unwrap();
25105        assert_eq!(purge.conversations_deleted, 1);
25106        assert_eq!(purge.messages_deleted, 2);
25107
25108        storage.rebuild_fts().unwrap();
25109        storage.rebuild_analytics().unwrap();
25110        storage.rebuild_daily_stats().unwrap();
25111        storage.rebuild_token_daily_stats().unwrap();
25112
25113        let agents = storage.list_agents().unwrap();
25114        assert_eq!(agents.len(), 1);
25115        assert_eq!(agents[0].slug, "codex");
25116        assert_eq!(storage.total_conversation_count().unwrap(), 1);
25117        assert_eq!(storage.total_message_count().unwrap(), 2);
25118
25119        let fts_rows: i64 = storage
25120            .raw()
25121            .query_row_map("SELECT COUNT(*) FROM fts_messages", fparams![], |row| {
25122                row.get_typed(0)
25123            })
25124            .unwrap();
25125        assert_eq!(fts_rows, 2);
25126
25127        let total_daily_sessions: i64 = storage
25128            .raw()
25129            .query_row_map(
25130                "SELECT COALESCE(SUM(session_count), 0)
25131                 FROM daily_stats
25132                 WHERE agent_slug = 'all' AND source_id = 'all'",
25133                fparams![],
25134                |row| row.get_typed(0),
25135            )
25136            .unwrap();
25137        assert_eq!(total_daily_sessions, 1);
25138
25139        let openclaw_token_rows: i64 = storage
25140            .raw()
25141            .query_row_map(
25142                "SELECT COUNT(*) FROM token_daily_stats WHERE agent_slug = 'openclaw'",
25143                fparams![],
25144                |row| row.get_typed(0),
25145            )
25146            .unwrap();
25147        assert_eq!(openclaw_token_rows, 0);
25148    }
25149
25150    /// Regression for cass#202: a `Connection` dropped mid-transaction can
25151    /// leave child rows persisted without a matching parent. The next indexer
25152    /// pass then trips `FOREIGN KEY constraint failed` on every write, the
25153    /// session never gets marked indexed, and the pending backlog grows
25154    /// without bound. `cleanup_orphan_fk_rows` is the indexer-startup
25155    /// self-heal that breaks the cycle.
25156    #[test]
25157    fn cleanup_orphan_fk_rows_removes_orphans_and_is_noop_on_clean_db() {
25158        let dir = TempDir::new().unwrap();
25159        let db_path = dir.path().join("orphan_fk_self_heal.db");
25160        let storage = FrankenStorage::open(&db_path).unwrap();
25161
25162        // Plant orphan rows directly: rows whose FK parent does not exist.
25163        // FK enforcement is temporarily off so the planted rows can land.
25164        storage.raw().execute("PRAGMA foreign_keys = OFF").unwrap();
25165
25166        // Seed a real conversation so a subset of children DO have valid
25167        // parents — we want the cleanup to be precise, not a table-flush.
25168        storage
25169            .raw()
25170            .execute_compat(
25171                "INSERT INTO agents(id, slug, name, kind, created_at, updated_at) \
25172                 VALUES(1, 'test-agent', 'Test Agent', 'cli', 0, 0)",
25173                fparams![],
25174            )
25175            .unwrap();
25176        storage
25177            .raw()
25178            .execute_compat(
25179                "INSERT INTO conversations(id, agent_id, source_id, source_path, started_at) \
25180                 VALUES(1, 1, 'local', '/tmp/real.jsonl', 0)",
25181                fparams![],
25182            )
25183            .unwrap();
25184        storage
25185            .raw()
25186            .execute_compat(
25187                "INSERT INTO messages(id, conversation_id, idx, role, content) \
25188                 VALUES(1, 1, 0, 'user', 'real message')",
25189                fparams![],
25190            )
25191            .unwrap();
25192
25193        // Plant orphan messages referencing conversation_id=99999 (does not exist)
25194        // and conversation_id=0 (the specific shape reported in #202). Distinct
25195        // (conversation_id, idx) pairs are required by the UNIQUE constraint.
25196        for (mid, cid, idx) in [(101_i64, 99_999_i64, 0_i64), (102, 0, 0), (103, 0, 1)] {
25197            storage
25198                .raw()
25199                .execute_compat(
25200                    "INSERT INTO messages(id, conversation_id, idx, role, content) \
25201                     VALUES(?1, ?2, ?3, 'user', 'orphan message')",
25202                    fparams![mid, cid, idx],
25203                )
25204                .unwrap();
25205        }
25206
25207        // Rows below are not directly orphaned because their immediate
25208        // `messages` parent exists, but that parent is itself orphaned. The
25209        // cleanup deletes them explicitly before deleting orphan messages so the
25210        // FK cascade engine does not have to run one delete program per orphan.
25211        for message_id in [1_i64, 101_i64, 102_i64] {
25212            storage
25213                .raw()
25214                .execute_compat(
25215                    "INSERT INTO message_metrics(
25216                         message_id, created_at_ms, hour_id, day_id, agent_slug,
25217                         role, content_chars, content_tokens_est
25218                     ) VALUES(?1, 0, 0, 0, 'test-agent', 'user', 13, 2)",
25219                    fparams![message_id],
25220                )
25221                .unwrap();
25222            storage
25223                .raw()
25224                .execute_compat(
25225                    "INSERT INTO token_usage(
25226                         message_id, conversation_id, agent_id, timestamp_ms, day_id,
25227                         role, content_chars
25228                     ) VALUES(?1, 1, 1, 0, 0, 'user', 13)",
25229                    fparams![message_id],
25230                )
25231                .unwrap();
25232        }
25233
25234        // Plant a directly-orphan snippet — message_id=99999 does not exist
25235        // anywhere, so this exercises the snippets DELETE path rather than
25236        // riding on the cascade from the orphan-message DELETE.
25237        storage
25238            .raw()
25239            .execute_compat(
25240                "INSERT INTO snippets(message_id, file_path, start_line, end_line, language, snippet_text) \
25241                 VALUES(99999, '/tmp/orphan-snippet.rs', 1, 2, 'rust', 'fn main() {}')",
25242                fparams![],
25243            )
25244            .unwrap();
25245
25246        storage.raw().execute("PRAGMA foreign_keys = ON").unwrap();
25247
25248        // Sanity: the planted orphans are visible.
25249        let messages_before: i64 = storage
25250            .raw()
25251            .query_row_map("SELECT COUNT(*) FROM messages", fparams![], |row| {
25252                row.get_typed(0)
25253            })
25254            .unwrap();
25255        assert_eq!(messages_before, 4); // 1 real + 3 orphans
25256        let snippets_before: i64 = storage
25257            .raw()
25258            .query_row_map("SELECT COUNT(*) FROM snippets", fparams![], |row| {
25259                row.get_typed(0)
25260            })
25261            .unwrap();
25262        assert_eq!(snippets_before, 1);
25263        let metrics_before: i64 = storage
25264            .raw()
25265            .query_row_map("SELECT COUNT(*) FROM message_metrics", fparams![], |row| {
25266                row.get_typed(0)
25267            })
25268            .unwrap();
25269        assert_eq!(metrics_before, 3);
25270        let token_usage_before: i64 = storage
25271            .raw()
25272            .query_row_map("SELECT COUNT(*) FROM token_usage", fparams![], |row| {
25273                row.get_typed(0)
25274            })
25275            .unwrap();
25276        assert_eq!(token_usage_before, 3);
25277
25278        // Run the self-heal.
25279        let report = storage.cleanup_orphan_fk_rows().unwrap();
25280
25281        // 3 orphan messages + 1 directly-orphan snippet = 4 primary orphans
25282        // reported. Dependent message_metrics/token_usage rows for orphan
25283        // messages are pruned too, but they are not double-counted because the
25284        // orphan message is the root row that made them invalid.
25285        let messages_after: i64 = storage
25286            .raw()
25287            .query_row_map("SELECT COUNT(*) FROM messages", fparams![], |row| {
25288                row.get_typed(0)
25289            })
25290            .unwrap();
25291        assert_eq!(messages_after, 1, "real message must be preserved");
25292        let snippets_after: i64 = storage
25293            .raw()
25294            .query_row_map("SELECT COUNT(*) FROM snippets", fparams![], |row| {
25295                row.get_typed(0)
25296            })
25297            .unwrap();
25298        assert_eq!(snippets_after, 0);
25299        let metrics_after: i64 = storage
25300            .raw()
25301            .query_row_map("SELECT COUNT(*) FROM message_metrics", fparams![], |row| {
25302                row.get_typed(0)
25303            })
25304            .unwrap();
25305        assert_eq!(metrics_after, 1, "real message metric must be preserved");
25306        let token_usage_after: i64 = storage
25307            .raw()
25308            .query_row_map("SELECT COUNT(*) FROM token_usage", fparams![], |row| {
25309                row.get_typed(0)
25310            })
25311            .unwrap();
25312        assert_eq!(token_usage_after, 1, "real token row must be preserved");
25313
25314        assert_eq!(report.total, 4, "report total: {:?}", report);
25315        let messages_count = report
25316            .per_table
25317            .iter()
25318            .find(|(t, _)| *t == "messages")
25319            .map(|(_, c)| *c);
25320        assert_eq!(messages_count, Some(3));
25321        let snippets_count = report
25322            .per_table
25323            .iter()
25324            .find(|(t, _)| *t == "snippets")
25325            .map(|(_, c)| *c);
25326        assert_eq!(snippets_count, Some(1));
25327
25328        // Second invocation on a now-clean DB must be a no-op.
25329        let second = storage.cleanup_orphan_fk_rows().unwrap();
25330        assert_eq!(second.total, 0);
25331        assert!(second.per_table.is_empty());
25332    }
25333
25334    #[test]
25335    fn cleanup_orphan_fk_rows_handles_more_than_one_delete_chunk() {
25336        let dir = TempDir::new().unwrap();
25337        let db_path = dir.path().join("orphan_fk_chunked_self_heal.db");
25338        let storage = FrankenStorage::open(&db_path).unwrap();
25339        let orphan_count = ORPHAN_FK_ID_CHUNK_SIZE + 3;
25340
25341        storage.raw().execute("PRAGMA foreign_keys = OFF").unwrap();
25342        {
25343            let mut tx = storage.raw().transaction().unwrap();
25344            for idx in 0..orphan_count {
25345                let message_id = 10_000_i64 + i64::try_from(idx).unwrap();
25346                let conversation_id = 20_000_i64 + i64::try_from(idx).unwrap();
25347                tx.execute_compat(
25348                    "INSERT INTO messages(id, conversation_id, idx, role, content) \
25349                     VALUES(?1, ?2, 0, 'user', 'orphan message')",
25350                    fparams![message_id, conversation_id],
25351                )
25352                .unwrap();
25353                tx.execute_compat(
25354                    "INSERT INTO message_metrics(
25355                         message_id, created_at_ms, hour_id, day_id, agent_slug,
25356                         role, content_chars, content_tokens_est
25357                     ) VALUES(?1, 0, 0, 0, 'test-agent', 'user', 14, 2)",
25358                    fparams![message_id],
25359                )
25360                .unwrap();
25361            }
25362            tx.commit().unwrap();
25363        }
25364        storage.raw().execute("PRAGMA foreign_keys = ON").unwrap();
25365
25366        let report = storage.cleanup_orphan_fk_rows().unwrap();
25367
25368        assert_eq!(report.total, i64::try_from(orphan_count).unwrap());
25369        let messages_count = report
25370            .per_table
25371            .iter()
25372            .find(|(table, _)| *table == "messages")
25373            .map(|(_, count)| *count);
25374        assert_eq!(messages_count, Some(i64::try_from(orphan_count).unwrap()));
25375        let messages_after: i64 = storage
25376            .raw()
25377            .query_row_map("SELECT COUNT(*) FROM messages", fparams![], |row| {
25378                row.get_typed(0)
25379            })
25380            .unwrap();
25381        assert_eq!(messages_after, 0);
25382        let metrics_after: i64 = storage
25383            .raw()
25384            .query_row_map("SELECT COUNT(*) FROM message_metrics", fparams![], |row| {
25385                row.get_typed(0)
25386            })
25387            .unwrap();
25388        assert_eq!(metrics_after, 0);
25389    }
25390
25391    #[test]
25392    fn cleanup_orphan_fk_rows_pages_direct_child_orphans() {
25393        let dir = TempDir::new().unwrap();
25394        let db_path = dir.path().join("direct_orphan_fk_paged_self_heal.db");
25395        let storage = FrankenStorage::open(&db_path).unwrap();
25396        let orphan_count = (ORPHAN_FK_ID_CHUNK_SIZE * 2) + 5;
25397
25398        storage.raw().execute("PRAGMA foreign_keys = OFF").unwrap();
25399        {
25400            let mut tx = storage.raw().transaction().unwrap();
25401            for idx in 0..orphan_count {
25402                let message_id = 50_000_i64 + i64::try_from(idx).unwrap();
25403                tx.execute_compat(
25404                    "INSERT INTO message_metrics(
25405                         message_id, created_at_ms, hour_id, day_id, agent_slug,
25406                         role, content_chars, content_tokens_est
25407                     ) VALUES(?1, 0, 0, 0, 'test-agent', 'user', 21, 3)",
25408                    fparams![message_id],
25409                )
25410                .unwrap();
25411            }
25412            tx.commit().unwrap();
25413        }
25414        storage.raw().execute("PRAGMA foreign_keys = ON").unwrap();
25415
25416        let report = storage.cleanup_orphan_fk_rows().unwrap();
25417
25418        assert_eq!(report.total, i64::try_from(orphan_count).unwrap());
25419        let metrics_count = report
25420            .per_table
25421            .iter()
25422            .filter(|(table, _)| *table == "message_metrics")
25423            .map(|(_, count)| *count)
25424            .sum::<i64>();
25425        assert_eq!(metrics_count, i64::try_from(orphan_count).unwrap());
25426        assert_eq!(
25427            report
25428                .per_table
25429                .iter()
25430                .filter(|(table, _)| *table == "message_metrics")
25431                .count(),
25432            1,
25433            "paged cleanup should aggregate report entries by table: {report:?}"
25434        );
25435        let metrics_after: i64 = storage
25436            .raw()
25437            .query_row_map("SELECT COUNT(*) FROM message_metrics", fparams![], |row| {
25438                row.get_typed(0)
25439            })
25440            .unwrap();
25441        assert_eq!(metrics_after, 0);
25442    }
25443}