use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole, Snippet};
use crate::sources::provenance::{LOCAL_SOURCE_ID, Source, SourceKind};
use anyhow::{Context, Result, anyhow, bail};
use frankensqlite::{
Connection as FrankenConnection, Row as FrankenRow, SqliteValue,
compat::{
ConnectionExt as FrankenConnectionExt, OpenFlags as FrankenOpenFlags,
OptionalExtension as FrankenOptionalExtension, ParamValue, RowExt as FrankenRowExt,
Transaction as FrankenTransaction, TransactionExt as FrankenTransactionExt,
open_with_flags as open_franken_with_flags, param_slice_to_values, params_from_iter,
},
migrate::MigrationRunner,
};
use serde::{Deserialize, Serialize};
use smallvec::SmallVec;
use std::borrow::Cow;
use std::collections::{HashMap, HashSet};
use std::fs;
use std::io::{BufRead, BufReader, Write};
use std::process::{Command, Stdio};
use std::sync::{
Arc,
atomic::{AtomicBool, AtomicI8, AtomicI64, AtomicU64, AtomicUsize, Ordering},
};
// Builds a `&[ParamValue]` parameter slice.
// With no arguments it expands to an empty slice; otherwise every expression
// is converted with `ParamValue::from` (a trailing comma is accepted).
macro_rules! fparams {
() => {
&[] as &[ParamValue]
};
($($val:expr),+ $(,)?) => {
&[$(ParamValue::from($val)),+] as &[ParamValue]
};
}
use std::path::{Path, PathBuf};
use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH};
use thiserror::Error;
use tracing::info;
/// Timeout `LazyFrankenDb::get` passes when acquiring the doctor mutation
/// lock guard before opening the database.
const DOCTOR_MUTATION_DB_OPEN_LOCK_TIMEOUT: Duration = Duration::from_secs(30);
/// Maximum number of bytes read from the doctor lock file when looking for
/// its `pid=` entry.
const DOCTOR_MUTATION_LOCK_MAX_METADATA_READ: u64 = 64 * 1024;
/// Errors returned by `LazyFrankenDb::get` and `LazyFrankenDb::get_with_timeout`.
#[derive(Debug, Error)]
pub enum LazyDbError {
/// The database path did not exist when the first open was attempted.
#[error("Database not found at {0}")]
NotFound(PathBuf),
/// Opening the database failed. Doctor-lock acquisition failures and open
/// timeouts are also reported through this variant, wrapped as
/// `FrankenError::Internal`.
#[error("Failed to open FrankenSQLite database at {path}: {source}")]
FrankenOpenFailed {
path: PathBuf,
source: frankensqlite::FrankenError,
},
}
/// Wrapper that lets a `FrankenConnection` be moved across threads.
///
/// Tuple fields: the connection, then the index-writer checkpoint-pages value
/// (`i64`), then the index-writer busy timeout in milliseconds (`u64`).
pub struct SendFrankenConnection(FrankenConnection, i64, u64);
// SAFETY(review): the invariant that makes it sound to send a
// `FrankenConnection` to another thread is not visible in this part of the
// file — confirm against frankensqlite's threading guarantees.
unsafe impl Send for SendFrankenConnection {}
impl SendFrankenConnection {
pub(crate) fn new(conn: FrankenConnection) -> Self {
Self(
conn,
UNSET_INDEX_WRITER_CHECKPOINT_PAGES,
UNSET_INDEX_WRITER_BUSY_TIMEOUT_MS,
)
}
pub(crate) fn new_with_index_writer_state(
conn: FrankenConnection,
checkpoint_pages: i64,
busy_timeout_ms: u64,
) -> Self {
Self(conn, checkpoint_pages, busy_timeout_ms)
}
pub(crate) fn into_parts(self) -> (FrankenConnection, i64, u64) {
(self.0, self.1, self.2)
}
}
impl std::ops::Deref for SendFrankenConnection {
type Target = FrankenConnection;
fn deref(&self) -> &FrankenConnection {
&self.0
}
}
/// Database handle that defers opening the connection until the first
/// `get` / `get_with_timeout` call and then keeps it cached.
pub struct LazyFrankenDb {
// Path handed to `FrankenConnection::open` on first use.
path: PathBuf,
// `None` until the first successful open; the mutex serializes both the
// open and every later access through the returned guard.
conn: parking_lot::Mutex<Option<SendFrankenConnection>>,
}
/// Mutex guard over the cached connection of a `LazyFrankenDb`.
pub struct LazyFrankenDbGuard<'a>(parking_lot::MutexGuard<'a, Option<SendFrankenConnection>>);

/// Debug output shows only whether a connection is present, not the connection itself.
impl std::fmt::Debug for LazyFrankenDbGuard<'_> {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let is_open = self.0.is_some();
        let mut tuple = f.debug_tuple("LazyFrankenDbGuard");
        tuple.field(&is_open);
        tuple.finish()
    }
}
impl std::ops::Deref for LazyFrankenDbGuard<'_> {
type Target = FrankenConnection;
fn deref(&self) -> &FrankenConnection {
self.0
.as_ref()
.expect("LazyFrankenDb connection must be initialized before access")
}
}
impl LazyFrankenDb {
    /// Creates a handle for `path`; nothing is opened until the first `get*` call.
    pub fn new(path: PathBuf) -> Self {
        Self {
            path,
            conn: parking_lot::Mutex::new(None),
        }
    }

    /// Resolves the database path from optional overrides.
    ///
    /// `db_override` wins when present; otherwise the path is
    /// `<data_dir>/agent_search.db`, with `data_dir` falling back to
    /// `crate::default_data_dir()`.
    pub fn from_overrides(data_dir: &Option<PathBuf>, db_override: Option<PathBuf>) -> Self {
        let data_dir = data_dir.clone().unwrap_or_else(crate::default_data_dir);
        let path = db_override.unwrap_or_else(|| data_dir.join("agent_search.db"));
        Self::new(path)
    }

    /// Returns a guard over the connection, opening it on first use.
    ///
    /// `reason` is only used in the log line emitted when the open happens.
    ///
    /// # Errors
    /// * [`LazyDbError::NotFound`] if the path does not exist.
    /// * [`LazyDbError::FrankenOpenFailed`] if the doctor mutation lock cannot
    ///   be acquired within `DOCTOR_MUTATION_DB_OPEN_LOCK_TIMEOUT` or the open fails.
    pub fn get(&self, reason: &str) -> std::result::Result<LazyFrankenDbGuard<'_>, LazyDbError> {
        let mut guard = self.conn.lock();
        if guard.is_none() {
            if !self.path.exists() {
                return Err(LazyDbError::NotFound(self.path.clone()));
            }
            let start = Instant::now();
            // Held until the end of this block so the open happens under the doctor lock guard.
            let _doctor_guard = acquire_doctor_mutation_db_open_guard(
                &self.path,
                DOCTOR_MUTATION_DB_OPEN_LOCK_TIMEOUT,
            )
            .map_err(|err| LazyDbError::FrankenOpenFailed {
                path: self.path.clone(),
                source: frankensqlite::FrankenError::Internal(err.to_string()),
            })?;
            let conn =
                FrankenConnection::open(self.path.to_string_lossy().into_owned()).map_err(|e| {
                    LazyDbError::FrankenOpenFailed {
                        path: self.path.clone(),
                        source: e,
                    }
                })?;
            let elapsed_ms = start.elapsed().as_millis();
            info!(
                path = %self.path.display(),
                elapsed_ms = elapsed_ms,
                reason = reason,
                "lazily opened FrankenSQLite database"
            );
            *guard = Some(SendFrankenConnection::new(conn));
        }
        Ok(LazyFrankenDbGuard(guard))
    }

    /// Like [`LazyFrankenDb::get`], but the first open runs on a helper thread
    /// and the caller waits at most `timeout` for it.
    ///
    /// `timeout` is also the budget the helper thread uses for the doctor
    /// mutation lock.
    ///
    /// # Errors
    /// * [`LazyDbError::NotFound`] if the path does not exist.
    /// * [`LazyDbError::FrankenOpenFailed`] if the helper reports a lock or
    ///   open failure, or if no result arrives within `timeout`.
    pub fn get_with_timeout(
        &self,
        reason: &str,
        timeout: Duration,
    ) -> std::result::Result<LazyFrankenDbGuard<'_>, LazyDbError> {
        let mut guard = self.conn.lock();
        if guard.is_none() {
            if !self.path.exists() {
                return Err(LazyDbError::NotFound(self.path.clone()));
            }
            let start = Instant::now();
            let path_owned = self.path.to_string_lossy().into_owned();
            let path_for_guard = self.path.clone();
            let (tx, rx) = std::sync::mpsc::channel();
            std::thread::spawn(move || {
                let _doctor_guard =
                    match acquire_doctor_mutation_db_open_guard(&path_for_guard, timeout) {
                        Ok(guard) => guard,
                        Err(err) => {
                            let _ = tx
                                .send(Err(frankensqlite::FrankenError::Internal(err.to_string())));
                            return;
                        }
                    };
                let opened = FrankenConnection::open(path_owned).map(SendFrankenConnection::new);
                // If the caller already timed out, the receiver is gone and `send`
                // hands the value back. Close an orphaned connection explicitly,
                // matching the best-effort close in `Drop for LazyFrankenDb`.
                if let Err(std::sync::mpsc::SendError(Ok(mut orphan))) = tx.send(opened) {
                    orphan.0.close_best_effort_in_place();
                }
            });
            let conn = rx
                .recv_timeout(timeout)
                .map_err(|_| LazyDbError::FrankenOpenFailed {
                    path: self.path.clone(),
                    source: frankensqlite::FrankenError::Internal(format!(
                        "database open timed out after {}s (possible corruption or lock contention)",
                        timeout.as_secs()
                    )),
                })?
                .map_err(|e| LazyDbError::FrankenOpenFailed {
                    path: self.path.clone(),
                    source: e,
                })?;
            let elapsed_ms = start.elapsed().as_millis();
            info!(
                path = %self.path.display(),
                elapsed_ms = elapsed_ms,
                reason = reason,
                "lazily opened FrankenSQLite database (with timeout)"
            );
            *guard = Some(conn);
        }
        Ok(LazyFrankenDbGuard(guard))
    }

    /// Path this handle opens.
    pub fn path(&self) -> &Path {
        &self.path
    }

    /// Whether a connection is currently cached (takes the mutex briefly).
    pub fn is_open(&self) -> bool {
        self.conn.lock().is_some()
    }
}
// Counter advanced by `next_franken_retry_jitter_ms` on every call; its
// pre-increment value is the input to the jitter mixing steps.
static FRANKEN_RETRY_JITTER_STATE: AtomicU64 = AtomicU64::new(0x9e37_79b9_7f4a_7c15);
// Number of live `DoctorMutationDbOpenBypassGuard`s; while it is non-zero,
// `acquire_doctor_mutation_db_open_guard` returns without taking the lock.
static DOCTOR_MUTATION_DB_OPEN_BYPASS_DEPTH: AtomicUsize = AtomicUsize::new(0);
// Gate for the message-lookup counters below; the `record_*` helpers do
// nothing while this is false.
static MESSAGE_LOOKUP_TRACE_ENABLED: AtomicBool = AtomicBool::new(false);
static MESSAGE_LOOKUP_EXACT_IDX_PROBES: AtomicU64 = AtomicU64::new(0);
static MESSAGE_LOOKUP_BOUNDED_QUERIES: AtomicU64 = AtomicU64::new(0);
static MESSAGE_LOOKUP_FULL_SCAN_QUERIES: AtomicU64 = AtomicU64::new(0);
static MESSAGE_LOOKUP_ROWS_MATERIALIZED: AtomicU64 = AtomicU64::new(0);
/// Point-in-time copy of the process-wide message-lookup trace counters,
/// as produced by `message_lookup_trace_snapshot`.
#[derive(Debug, Clone, Copy, Default, Serialize)]
pub(crate) struct MessageLookupTraceCounters {
pub exact_idx_probes: u64,
pub bounded_lookup_queries: u64,
pub full_scan_queries: u64,
pub rows_materialized: u64,
}
impl MessageLookupTraceCounters {
    /// Field-wise `self - before`, clamping each counter at zero.
    pub(crate) fn saturating_sub(self, before: Self) -> Self {
        let Self {
            exact_idx_probes,
            bounded_lookup_queries,
            full_scan_queries,
            rows_materialized,
        } = self;
        Self {
            exact_idx_probes: exact_idx_probes.saturating_sub(before.exact_idx_probes),
            bounded_lookup_queries: bounded_lookup_queries
                .saturating_sub(before.bounded_lookup_queries),
            full_scan_queries: full_scan_queries.saturating_sub(before.full_scan_queries),
            rows_materialized: rows_materialized.saturating_sub(before.rows_materialized),
        }
    }

    /// Saturating sum of `exact_idx_probes` and `rows_materialized`.
    pub(crate) fn lookups_against_global(self) -> u64 {
        let Self {
            exact_idx_probes,
            rows_materialized,
            ..
        } = self;
        exact_idx_probes.saturating_add(rows_materialized)
    }
}
/// Turns message-lookup tracing on or off and returns the previous setting.
pub(crate) fn set_message_lookup_trace_enabled(enabled: bool) -> bool {
    let previously_enabled = MESSAGE_LOOKUP_TRACE_ENABLED.swap(enabled, Ordering::Relaxed);
    previously_enabled
}
pub(crate) fn message_lookup_trace_snapshot() -> MessageLookupTraceCounters {
MessageLookupTraceCounters {
exact_idx_probes: MESSAGE_LOOKUP_EXACT_IDX_PROBES.load(Ordering::Relaxed),
bounded_lookup_queries: MESSAGE_LOOKUP_BOUNDED_QUERIES.load(Ordering::Relaxed),
full_scan_queries: MESSAGE_LOOKUP_FULL_SCAN_QUERIES.load(Ordering::Relaxed),
rows_materialized: MESSAGE_LOOKUP_ROWS_MATERIALIZED.load(Ordering::Relaxed),
}
}
/// Counts one exact-index probe; a no-op while tracing is disabled.
fn record_message_lookup_exact_idx_probe() {
    if !MESSAGE_LOOKUP_TRACE_ENABLED.load(Ordering::Relaxed) {
        return;
    }
    MESSAGE_LOOKUP_EXACT_IDX_PROBES.fetch_add(1, Ordering::Relaxed);
}
/// Adds `query_count` bounded queries and `rows` materialized rows to the
/// counters; a no-op while tracing is disabled.
fn record_message_lookup_bounded_queries(query_count: u64, rows: usize) {
    if !MESSAGE_LOOKUP_TRACE_ENABLED.load(Ordering::Relaxed) {
        return;
    }
    MESSAGE_LOOKUP_BOUNDED_QUERIES.fetch_add(query_count, Ordering::Relaxed);
    MESSAGE_LOOKUP_ROWS_MATERIALIZED.fetch_add(rows as u64, Ordering::Relaxed);
}
/// Counts one full-scan query plus `rows` materialized rows; a no-op while
/// tracing is disabled.
fn record_message_lookup_full_scan_query(rows: usize) {
    if !MESSAGE_LOOKUP_TRACE_ENABLED.load(Ordering::Relaxed) {
        return;
    }
    MESSAGE_LOOKUP_FULL_SCAN_QUERIES.fetch_add(1, Ordering::Relaxed);
    MESSAGE_LOOKUP_ROWS_MATERIALIZED.fetch_add(rows as u64, Ordering::Relaxed);
}
/// Token returned by `enter_doctor_mutation_db_open_bypass`; dropping it
/// decrements the process-wide bypass depth again.
pub(crate) struct DoctorMutationDbOpenBypassGuard;
impl Drop for DoctorMutationDbOpenBypassGuard {
fn drop(&mut self) {
// Pairs with the `fetch_add` in `enter_doctor_mutation_db_open_bypass`.
DOCTOR_MUTATION_DB_OPEN_BYPASS_DEPTH.fetch_sub(1, Ordering::SeqCst);
}
}
pub(crate) fn enter_doctor_mutation_db_open_bypass() -> DoctorMutationDbOpenBypassGuard {
DOCTOR_MUTATION_DB_OPEN_BYPASS_DEPTH.fetch_add(1, Ordering::SeqCst);
DoctorMutationDbOpenBypassGuard
}
/// True while at least one bypass guard is alive anywhere in the process.
fn doctor_mutation_db_open_bypass_active() -> bool {
    let depth = DOCTOR_MUTATION_DB_OPEN_BYPASS_DEPTH.load(Ordering::SeqCst);
    depth != 0
}
/// Returns a pseudo-random value reduced modulo `max_inclusive + 1`
/// (the `+ 1` saturates, so `u64::MAX` reduces modulo `u64::MAX`).
///
/// Each call advances the shared state by a fixed odd constant and then mixes
/// the previous state with xor-shift and multiply steps.
fn next_franken_retry_jitter_ms(max_inclusive: u64) -> u64 {
    let seed = FRANKEN_RETRY_JITTER_STATE.fetch_add(0x9e37_79b9_7f4a_7c15, Ordering::Relaxed);
    let round_one = (seed ^ (seed >> 30)).wrapping_mul(0xbf58_476d_1ce4_e5b9);
    let round_two = (round_one ^ (round_one >> 27)).wrapping_mul(0x94d0_49bb_1331_11eb);
    let mixed = round_two ^ (round_two >> 31);
    let modulus = max_inclusive.saturating_add(1);
    mixed % modulus
}
/// Sleeps for the current backoff plus jitter, never longer than `remaining`,
/// then doubles `backoff` up to `max_backoff`.
///
/// The jitter is at most the base sleep itself and at most whatever budget is
/// left after the base sleep; when that headroom is under one millisecond no
/// jitter is added.
pub(crate) fn sleep_with_franken_retry_backoff(
    backoff: &mut Duration,
    remaining: Duration,
    max_backoff: Duration,
) {
    let base = remaining.min(*backoff);
    let headroom = remaining.saturating_sub(base).min(base);
    let headroom_ms = u64::try_from(headroom.as_millis()).unwrap_or(u64::MAX);
    let pause = match headroom_ms {
        0 => base,
        limit => {
            let jitter = Duration::from_millis(next_franken_retry_jitter_ms(limit));
            base.saturating_add(jitter).min(remaining)
        }
    };
    std::thread::sleep(pause);
    *backoff = backoff.saturating_mul(2).min(max_backoff);
}
/// Holds the doctor lock file while a database open is in progress.
/// `None` means no lock was taken.
struct DoctorMutationDbOpenGuard(Option<fs::File>);

impl Drop for DoctorMutationDbOpenGuard {
    /// Releases the lock if one is held; unlock errors are ignored.
    fn drop(&mut self) {
        let Some(lock_file) = &self.0 else {
            return;
        };
        let _ = fs2::FileExt::unlock(lock_file);
    }
}
/// Returns `<parent>/doctor/locks/doctor-repair.lock` for a database file
/// named exactly `agent_search.db`.
///
/// Returns `None` for any other file name, or when the path has no parent.
fn doctor_mutation_lock_path_for_db_open(db_path: &Path) -> Option<PathBuf> {
    let file_name = db_path.file_name().and_then(|name| name.to_str());
    if !matches!(file_name, Some("agent_search.db")) {
        return None;
    }
    let data_dir = db_path.parent()?;
    let mut lock_path = data_dir.to_path_buf();
    lock_path.extend(["doctor", "locks", "doctor-repair.lock"]);
    Some(lock_path)
}
/// Returns true when any `key=value` line in `raw` has key `pid` and a value
/// that parses as this process's id. Key and value are trimmed first; lines
/// without `=` are ignored.
fn doctor_lock_metadata_pid_is_current_process(raw: &str) -> bool {
    let own_pid = std::process::id();
    for line in raw.lines() {
        let Some((key, value)) = line.split_once('=') else {
            continue;
        };
        if key.trim() != "pid" {
            continue;
        }
        if matches!(value.trim().parse::<u32>(), Ok(pid) if pid == own_pid) {
            return true;
        }
    }
    false
}
/// Reads at most `DOCTOR_MUTATION_LOCK_MAX_METADATA_READ` bytes from a clone of
/// `file` and reports whether the text names this process as `pid`.
///
/// A failed clone yields `false`; a failed read is ignored and whatever text
/// was collected (possibly none) is checked.
fn doctor_lock_file_pid_is_current_process(file: &fs::File) -> bool {
    use std::io::Read as _;
    let mut handle = match file.try_clone() {
        Ok(handle) => handle,
        Err(_) => return false,
    };
    let mut contents = String::new();
    let mut bounded = std::io::Read::take(&mut handle, DOCTOR_MUTATION_LOCK_MAX_METADATA_READ);
    let _ = bounded.read_to_string(&mut contents);
    doctor_lock_metadata_pid_is_current_process(&contents)
}
/// Takes a shared lock on the doctor repair lock file before `db_path` is opened.
///
/// Returns a guard holding no lock when the path is not one that
/// `doctor_mutation_lock_path_for_db_open` maps to a lock file, when a bypass
/// guard is active, or when the lock file names this process as `pid`.
/// Otherwise retries `try_lock_shared` with backoff until it succeeds or
/// `timeout` has elapsed.
///
/// # Errors
/// Fails if the lock directory or file cannot be created/opened, if the lock
/// is still contended once the deadline passes, or on any other locking error.
fn acquire_doctor_mutation_db_open_guard(
db_path: &Path,
timeout: Duration,
) -> Result<DoctorMutationDbOpenGuard> {
let Some(lock_path) = doctor_mutation_lock_path_for_db_open(db_path) else {
return Ok(DoctorMutationDbOpenGuard(None));
};
if doctor_mutation_db_open_bypass_active() {
return Ok(DoctorMutationDbOpenGuard(None));
}
if let Some(parent) = lock_path.parent() {
fs::create_dir_all(parent).with_context(|| {
format!(
"creating doctor mutation lock directory {} before opening {}",
parent.display(),
db_path.display()
)
})?;
}
let deadline = Instant::now() + timeout;
let mut backoff = Duration::from_millis(4);
loop {
// The file is reopened on every attempt; `truncate(false)` keeps any
// metadata already written to it.
let file = fs::OpenOptions::new()
.create(true)
.truncate(false)
.read(true)
.write(true)
.open(&lock_path)
.with_context(|| {
format!(
"opening doctor mutation lock {} before opening {}",
lock_path.display(),
db_path.display()
)
})?;
// A lock file naming this process as `pid` is treated as already ours,
// so no shared lock is taken.
if doctor_lock_file_pid_is_current_process(&file) {
return Ok(DoctorMutationDbOpenGuard(None));
}
match fs2::FileExt::try_lock_shared(&file) {
Ok(()) => return Ok(DoctorMutationDbOpenGuard(Some(file))),
Err(err) if err.kind() == std::io::ErrorKind::WouldBlock => {
let now = Instant::now();
if now >= deadline {
return Err(anyhow!(
"doctor mutation lock {} is active while opening {}; refusing to open during repair after waiting {}ms",
lock_path.display(),
db_path.display(),
timeout.as_millis()
));
}
let remaining = deadline.saturating_duration_since(now);
sleep_with_franken_retry_backoff(
&mut backoff,
remaining,
Duration::from_millis(128),
);
}
Err(err) => {
return Err(anyhow!(
"failed to acquire shared doctor mutation lock {} before opening {}: {}",
lock_path.display(),
db_path.display(),
err
));
}
}
}
}
pub(crate) fn open_franken_storage_with_timeout(
path: &Path,
timeout: Duration,
) -> Result<FrankenStorage> {
if !path.exists() {
return Err(anyhow!("Database not found at {}", path.display()));
}
let deadline = Instant::now() + timeout;
let mut backoff = Duration::from_millis(4);
loop {
match FrankenStorage::open(path) {
Ok(storage) => return Ok(storage),
Err(err) if retryable_franken_anyhow(&err) => {
let now = Instant::now();
if now >= deadline {
return Err(err);
}
let remaining = deadline.saturating_duration_since(now);
sleep_with_franken_retry_backoff(
&mut backoff,
remaining,
Duration::from_millis(128),
);
}
Err(err) => return Err(err),
}
}
}
/// Opens storage at `path` only if its `meta.schema_version` equals
/// `CURRENT_SCHEMA_VERSION`.
///
/// Returns `Ok(None)` when the file does not exist or the recorded version is
/// missing, unparsable or different; in the latter cases the connection is
/// closed first. On a match, the remaining open steps (version transition,
/// schema-object repair, config) are run and the storage is returned.
pub(crate) fn open_current_schema_storage_with_timeout(
    path: &Path,
    timeout: Duration,
) -> Result<Option<FrankenStorage>> {
    if !path.exists() {
        return Ok(None);
    }
    let raw_conn = open_franken_raw_connection_with_timeout(path, timeout)?;
    let mut storage = FrankenStorage::new(raw_conn, path.to_path_buf());
    storage.apply_open_stage_busy_timeout();

    // Any failure while reading the version (query error, no row, wrong type,
    // non-numeric text) collapses to `None`, i.e. "not the current schema".
    let recorded_version = storage
        .raw()
        .query("SELECT value FROM meta WHERE key = 'schema_version';")
        .ok()
        .and_then(|rows| rows.first().cloned())
        .and_then(|row| row.get_typed::<String>(0).ok())
        .and_then(|raw| raw.parse::<i64>().ok());

    if recorded_version == Some(CURRENT_SCHEMA_VERSION) {
        transition_from_meta_version(&storage.conn)?;
        storage.repair_missing_current_schema_objects()?;
        storage.apply_config()?;
        return Ok(Some(storage));
    }

    if let Err(close_err) = storage.close_without_checkpoint_in_place() {
        tracing::debug!(
            error = %close_err,
            db_path = %path.display(),
            "open_current_schema_storage_with_timeout: close_without_checkpoint_in_place failed; falling back to best-effort close"
        );
        storage.close_best_effort_in_place();
    }
    Ok(None)
}
pub(crate) fn open_franken_readonly_storage_with_timeout(
path: &Path,
timeout: Duration,
) -> Result<FrankenStorage> {
if !path.exists() {
return Err(anyhow!("Database not found at {}", path.display()));
}
let deadline = Instant::now() + timeout;
let mut backoff = Duration::from_millis(4);
loop {
match FrankenStorage::open_readonly(path) {
Ok(storage) => return Ok(storage),
Err(err) if retryable_franken_anyhow(&err) => {
let now = Instant::now();
if now >= deadline {
return Err(err);
}
let remaining = deadline.saturating_duration_since(now);
sleep_with_franken_retry_backoff(
&mut backoff,
remaining,
Duration::from_millis(128),
);
}
Err(err) => return Err(err),
}
}
}
/// Opens a raw read-write `FrankenConnection` at `path` under the doctor
/// mutation lock guard, retrying retryable open failures with backoff.
///
/// The whole call is bounded by `timeout`: the first lock acquisition may wait
/// up to `timeout`, and every retry only waits for whatever is left before the
/// overall deadline.
///
/// # Errors
/// Fails if `path` does not exist, if the doctor lock cannot be acquired, if
/// the open fails with a non-retryable error, or with the last retryable error
/// once the deadline has passed.
pub(crate) fn open_franken_raw_connection_with_timeout(
    path: &Path,
    timeout: Duration,
) -> Result<FrankenConnection> {
    if !path.exists() {
        return Err(anyhow!("Database not found at {}", path.display()));
    }
    let path_str = path.to_string_lossy().to_string();
    let deadline = Instant::now() + timeout;
    let mut backoff = Duration::from_millis(4);
    // Budget for the doctor lock wait. Retries shrink it to the time remaining
    // so repeated lock waits cannot push the call past `deadline`.
    let mut lock_wait = timeout;
    loop {
        let _doctor_guard = acquire_doctor_mutation_db_open_guard(path, lock_wait)?;
        match FrankenConnection::open(&path_str)
            .with_context(|| format!("opening raw frankensqlite db at {}", path.display()))
        {
            Ok(conn) => return Ok(conn),
            Err(err) if retryable_franken_anyhow(&err) => {
                let now = Instant::now();
                if now >= deadline {
                    return Err(err);
                }
                let remaining = deadline.saturating_duration_since(now);
                sleep_with_franken_retry_backoff(
                    &mut backoff,
                    remaining,
                    Duration::from_millis(128),
                );
                lock_wait = deadline.saturating_duration_since(Instant::now());
            }
            Err(err) => return Err(err),
        }
    }
}
/// Opens a raw read-only `FrankenConnection` at `path` under the doctor
/// mutation lock guard, retrying retryable open failures with backoff.
///
/// The whole call is bounded by `timeout`: the first lock acquisition may wait
/// up to `timeout`, and every retry only waits for whatever is left before the
/// overall deadline.
///
/// # Errors
/// Fails if `path` does not exist, if the doctor lock cannot be acquired, if
/// the open fails with a non-retryable error, or with the last retryable error
/// once the deadline has passed.
pub(crate) fn open_franken_raw_readonly_connection_with_timeout(
    path: &Path,
    timeout: Duration,
) -> Result<FrankenConnection> {
    if !path.exists() {
        return Err(anyhow!("Database not found at {}", path.display()));
    }
    let path_str = path.to_string_lossy().to_string();
    let deadline = Instant::now() + timeout;
    let mut backoff = Duration::from_millis(4);
    // Budget for the doctor lock wait. Retries shrink it to the time remaining
    // so repeated lock waits cannot push the call past `deadline`.
    let mut lock_wait = timeout;
    loop {
        let _doctor_guard = acquire_doctor_mutation_db_open_guard(path, lock_wait)?;
        match open_franken_with_flags(&path_str, FrankenOpenFlags::SQLITE_OPEN_READ_ONLY)
            .with_context(|| {
                format!(
                    "opening raw frankensqlite db readonly at {}",
                    path.display()
                )
            }) {
            Ok(conn) => return Ok(conn),
            Err(err) if retryable_franken_anyhow(&err) => {
                let now = Instant::now();
                if now >= deadline {
                    return Err(err);
                }
                let remaining = deadline.saturating_duration_since(now);
                sleep_with_franken_retry_backoff(
                    &mut backoff,
                    remaining,
                    Duration::from_millis(128),
                );
                lock_wait = deadline.saturating_duration_since(Instant::now());
            }
            Err(err) => return Err(err),
        }
    }
}
pub(crate) fn retryable_franken_error(err: &frankensqlite::FrankenError) -> bool {
matches!(
err,
frankensqlite::FrankenError::Busy
| frankensqlite::FrankenError::BusyRecovery
| frankensqlite::FrankenError::BusySnapshot { .. }
| frankensqlite::FrankenError::DatabaseLocked { .. }
| frankensqlite::FrankenError::LockFailed { .. }
| frankensqlite::FrankenError::WriteConflict { .. }
| frankensqlite::FrankenError::SerializationFailure { .. }
) || retryable_storage_error_message(&err.to_string())
}
/// Case-insensitive (ASCII) check for the substrings this module treats as
/// signs of transient contention.
pub(crate) fn retryable_storage_error_message(message: &str) -> bool {
    const CONTENTION_MARKERS: [&str; 6] = [
        "busy",
        "locked",
        "locking",
        "contention",
        "temporarily unavailable",
        "would block",
    ];
    let lowered = message.to_ascii_lowercase();
    CONTENTION_MARKERS
        .iter()
        .any(|marker| lowered.contains(*marker))
}
pub(crate) fn retryable_franken_anyhow(err: &anyhow::Error) -> bool {
err.chain().any(|cause| {
cause
.downcast_ref::<frankensqlite::FrankenError>()
.is_some_and(retryable_franken_error)
|| retryable_storage_error_message(&cause.to_string())
})
}
/// Closes the cached connection, if one was ever opened, on a best-effort basis.
impl Drop for LazyFrankenDb {
    fn drop(&mut self) {
        if let Some(mut cached) = self.conn.get_mut().take() {
            cached.0.close_best_effort_in_place();
        }
    }
}
/// Sizing knobs for `FrankenConnectionManager`.
#[derive(Debug, Clone)]
pub struct ConnectionManagerConfig {
    /// Number of reader connections to open.
    pub reader_count: usize,
    /// Maximum number of writers checked out at the same time.
    pub max_writers: usize,
}

impl Default for ConnectionManagerConfig {
    /// Four readers; one writer per available CPU (4 if parallelism cannot be queried).
    fn default() -> Self {
        let max_writers = match std::thread::available_parallelism() {
            Ok(parallelism) => parallelism.get(),
            Err(_) => 4,
        };
        Self {
            reader_count: 4,
            max_writers,
        }
    }
}
/// Owns a fixed set of reader connections and limits how many writer
/// connections may be checked out at once.
pub struct FrankenConnectionManager {
db_path: PathBuf,
// Reader connections, each behind its own mutex; `reader()` picks one by
// cycling `reader_idx`.
readers: Vec<parking_lot::Mutex<SendFrankenConnection>>,
reader_idx: std::sync::atomic::AtomicUsize,
// Bounded channel pre-filled with one `()` per allowed writer: receiving
// takes a writer slot, sending gives it back.
writer_tokens: (
crossbeam_channel::Sender<()>,
crossbeam_channel::Receiver<()>,
),
config: ConnectionManagerConfig,
}
// SAFETY(review): the invariants that make sharing this type across threads
// sound are not visible in this part of the file — presumably they rest on
// every reader being mutex-guarded; confirm before relying on it.
unsafe impl Send for FrankenConnectionManager {}
unsafe impl Sync for FrankenConnectionManager {}
impl FrankenConnectionManager {
pub fn new(db_path: impl Into<PathBuf>, config: ConnectionManagerConfig) -> Result<Self> {
let db_path = db_path.into();
let path_str = db_path.to_string_lossy().to_string();
let reader_count = config.reader_count.max(1);
let mut readers = Vec::with_capacity(reader_count);
for _ in 0..reader_count {
let conn = FrankenConnection::open(&path_str)
.with_context(|| format!("opening reader connection at {}", db_path.display()))?;
let _ = conn.execute("PRAGMA busy_timeout = 5000;"); let _ = conn.execute("PRAGMA cache_size = -16384;"); readers.push(parking_lot::Mutex::new(SendFrankenConnection::new(conn)));
}
let max_writers = config.max_writers.max(1);
let (tx, rx) = crossbeam_channel::bounded(max_writers);
for _ in 0..max_writers {
tx.send(())
.map_err(|_| anyhow!("writer token channel closed during initialization"))?;
}
Ok(Self {
db_path,
readers,
reader_idx: std::sync::atomic::AtomicUsize::new(0),
writer_tokens: (tx, rx),
config: ConnectionManagerConfig {
reader_count,
max_writers,
},
})
}
pub fn reader(&self) -> parking_lot::MutexGuard<'_, SendFrankenConnection> {
let idx = self
.reader_idx
.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
self.readers[idx % self.readers.len()].lock()
}
pub fn writer(&self) -> Result<WriterGuard<'_>> {
self.writer_tokens
.1
.recv()
.map_err(|_| anyhow!("writer token channel closed"))?;
let path_str = self.db_path.to_string_lossy().to_string();
let conn = match FrankenConnection::open(&path_str) {
Ok(c) => c,
Err(e) => {
let _ = self.writer_tokens.0.send(());
return Err(anyhow::Error::from(e).context(format!(
"opening writer connection at {}",
self.db_path.display()
)));
}
};
let storage = FrankenStorage::new(conn, self.db_path.clone());
if let Err(e) = storage.apply_config() {
let _ = self.writer_tokens.0.send(());
return Err(e);
}
Ok(WriterGuard {
storage,
mgr: self,
committed: false,
})
}
pub fn concurrent_writer(&self) -> Result<WriterGuard<'_>> {
self.writer_tokens
.1
.recv()
.map_err(|_| anyhow!("writer token channel closed"))?;
let path_str = self.db_path.to_string_lossy().to_string();
let conn = match FrankenConnection::open(&path_str) {
Ok(c) => c,
Err(e) => {
let _ = self.writer_tokens.0.send(());
return Err(anyhow::Error::from(e).context(format!(
"opening concurrent writer at {}",
self.db_path.display()
)));
}
};
let storage = FrankenStorage::new(conn, self.db_path.clone());
if let Err(e) = storage.apply_config() {
let _ = self.writer_tokens.0.send(());
return Err(e);
}
let _ = storage.raw().execute("PRAGMA cache_size = -4096;");
Ok(WriterGuard {
storage,
mgr: self,
committed: false,
})
}
pub fn db_path(&self) -> &Path {
&self.db_path
}
pub fn reader_count(&self) -> usize {
self.readers.len()
}
pub fn max_writers(&self) -> usize {
self.config.max_writers
}
}
/// Closes every reader connection on a best-effort basis.
impl Drop for FrankenConnectionManager {
    fn drop(&mut self) {
        self.readers
            .iter_mut()
            .for_each(|slot| slot.get_mut().0.close_best_effort_in_place());
    }
}
/// A checked-out writer: owns its storage/connection and holds one writer
/// slot of the manager until dropped.
pub struct WriterGuard<'a> {
storage: FrankenStorage,
mgr: &'a FrankenConnectionManager,
// When false at drop time, a `ROLLBACK;` is issued before closing.
committed: bool,
}
impl<'a> WriterGuard<'a> {
/// Storage backed by this writer's connection.
pub fn storage(&self) -> &FrankenStorage {
&self.storage
}
/// Marks the work as committed so that drop skips the `ROLLBACK;`.
pub fn mark_committed(&mut self) {
self.committed = true;
}
}
/// Rolls back unless marked committed, closes the connection, then returns the
/// writer slot to the manager. All three steps ignore errors.
impl Drop for WriterGuard<'_> {
    fn drop(&mut self) {
        let needs_rollback = !self.committed;
        if needs_rollback {
            let _ = self.storage.raw().execute("ROLLBACK;");
        }
        self.storage.close_best_effort_in_place();
        let (slot_return, _) = &self.mgr.writer_tokens;
        let _ = slot_return.send(());
    }
}
/// Encodes `value` as MessagePack.
///
/// Returns `None` for JSON `null`, for an empty object, and when encoding fails.
fn serialize_json_to_msgpack(value: &serde_json::Value) -> Option<Vec<u8>> {
    let is_empty_object = matches!(value, serde_json::Value::Object(map) if map.is_empty());
    if value.is_null() || is_empty_object {
        return None;
    }
    match rmp_serde::to_vec(value) {
        Ok(bytes) => Some(bytes),
        Err(_) => None,
    }
}
/// Decodes MessagePack bytes into JSON.
///
/// Empty input and undecodable input both yield an empty JSON object; a decode
/// failure is logged at debug level.
fn deserialize_msgpack_to_json(bytes: &[u8]) -> serde_json::Value {
    let empty_object = || serde_json::Value::Object(serde_json::Map::new());
    if bytes.is_empty() {
        return empty_object();
    }
    match rmp_serde::from_slice(bytes) {
        Ok(value) => value,
        Err(e) => {
            tracing::debug!(
                error = %e,
                bytes_len = bytes.len(),
                "Failed to deserialize metadata - returning empty object"
            );
            empty_object()
        }
    }
}
/// Reads metadata from a row that may carry it as MessagePack (`bin_idx`)
/// and/or JSON text (`json_idx`).
///
/// A non-empty binary column wins. Otherwise the JSON text is parsed; a
/// missing, unreadable or invalid value yields an empty JSON object.
fn franken_read_metadata_compat(
    row: &FrankenRow,
    json_idx: usize,
    bin_idx: usize,
) -> serde_json::Value {
    match row.get_typed::<Option<Vec<u8>>>(bin_idx) {
        Ok(Some(blob)) if !blob.is_empty() => return deserialize_msgpack_to_json(&blob),
        _ => {}
    }
    let empty_object = || serde_json::Value::Object(serde_json::Map::new());
    match row.get_typed::<Option<String>>(json_idx) {
        Ok(Some(text)) => serde_json::from_str(&text).unwrap_or_else(|_| empty_object()),
        _ => empty_object(),
    }
}
/// Reads a message's extra payload from a row that may carry it as
/// MessagePack (`bin_idx`) and/or JSON text (`json_idx`).
///
/// A non-empty binary column wins. Otherwise the JSON text is parsed; a
/// missing, unreadable or invalid value yields JSON `null`.
fn franken_read_message_extra_compat(
    row: &FrankenRow,
    json_idx: usize,
    bin_idx: usize,
) -> serde_json::Value {
    match row.get_typed::<Option<Vec<u8>>>(bin_idx) {
        Ok(Some(blob)) if !blob.is_empty() => return deserialize_msgpack_to_json(&blob),
        _ => {}
    }
    match row.get_typed::<Option<String>>(json_idx) {
        Ok(Some(text)) => {
            serde_json::from_str::<serde_json::Value>(&text).unwrap_or(serde_json::Value::Null)
        }
        _ => serde_json::Value::Null,
    }
}
/// Errors surfaced by the migration and backup routines.
#[derive(Debug, Error)]
pub enum MigrationError {
/// The database must be rebuilt; `backup_path` is optional.
#[error("Rebuild required: {reason}")]
RebuildRequired {
reason: String,
backup_path: Option<std::path::PathBuf>,
},
#[error("Database error: {0}")]
Database(#[from] frankensqlite::FrankenError),
#[error("I/O error: {0}")]
Io(#[from] std::io::Error),
#[error("{0}")]
Other(String),
}
impl From<anyhow::Error> for MigrationError {
// Keeps only the error's `Display` text (`to_string`); the typed cause
// chain is not carried over.
fn from(e: anyhow::Error) -> Self {
MigrationError::Other(e.to_string())
}
}
// NOTE(review): not referenced in this part of the file — presumably the
// `keep_count` callers pass to `cleanup_old_backups`; confirm.
const MAX_BACKUPS: usize = 3;
// Pragma executed on the read-only connection before `VACUUM INTO` in
// `vacuum_into_backup_stage`.
const BACKUP_VACUUM_BUSY_TIMEOUT_PRAGMA: &str = "PRAGMA busy_timeout = 30000;";
/// File names that `is_user_data_file` recognises.
const USER_DATA_FILES: &[&str] = &["bookmarks.db", "tui_state.json", "sources.toml", ".env"];

/// Returns true when the final component of `path` is exactly one of
/// `USER_DATA_FILES`. Paths without a UTF-8 file name never match.
pub fn is_user_data_file(path: &Path) -> bool {
    match path.file_name().and_then(|n| n.to_str()) {
        Some(name) => USER_DATA_FILES.iter().any(|known| *known == name),
        None => false,
    }
}
/// DDL that creates the `fts_messages` FTS5 virtual table if it is missing
/// (`content=''`, porter tokenizer, `created_at` unindexed).
pub const FTS5_REGISTER_SQL: &str = "\
CREATE VIRTUAL TABLE IF NOT EXISTS fts_messages USING fts5(\
content, title, agent, workspace, source_path, \
created_at UNINDEXED, \
content='', tokenize='porter'\
)";
// Meta-table keys and generation numbers for FTS rebuild and daily-stats
// health bookkeeping. NOTE(review): their readers and writers are outside
// this part of the file.
const FTS_FRANKEN_REBUILD_META_KEY: &str = "fts_frankensqlite_rebuild_generation";
const FTS_FRANKEN_REBUILD_FINGERPRINT_META_KEY: &str = "fts_frankensqlite_archive_fingerprint";
const FTS_FRANKEN_REBUILD_GENERATION: i64 = 1;
const DAILY_STATS_HEALTH_META_KEY: &str = "daily_stats_archive_fingerprint";
const DAILY_STATS_HEALTH_GENERATION_META_KEY: &str = "daily_stats_health_generation";
const DAILY_STATS_HEALTH_GENERATION: i64 = 1;
/// FTS5 special command that issues `'delete-all'` against `fts_messages`.
pub const FTS5_DELETE_ALL_SQL: &str =
"INSERT INTO fts_messages(fts_messages) VALUES('delete-all');";
/// Test helper: opens the database with `FrankenStorage` and rebuilds the FTS
/// index, discarding the inserted-row count. Despite the name, no rusqlite
/// call is made here.
#[cfg(test)]
pub(crate) fn materialize_fresh_fts_schema_via_rusqlite(db_path: &Path) -> Result<()> {
    let open_context = || {
        format!(
            "opening frankensqlite db at {} for FTS materialization",
            db_path.display()
        )
    };
    let storage = FrankenStorage::open(db_path).with_context(open_context)?;
    storage.rebuild_fts_via_frankensqlite()?;
    Ok(())
}
/// Test helper: rebuilds the FTS index through `FrankenStorage`, records the
/// rebuild generation and returns the number of rows inserted. Despite the
/// name, no rusqlite call is made here.
#[cfg(test)]
pub(crate) fn rebuild_fts_via_rusqlite(db_path: &Path) -> Result<usize> {
    let open_context = || {
        format!(
            "opening frankensqlite db at {} for FTS rebuild",
            db_path.display()
        )
    };
    let storage = FrankenStorage::open(db_path).with_context(open_context)?;
    let inserted_rows = storage.rebuild_fts_via_frankensqlite()?;
    storage.record_fts_franken_rebuild_generation()?;
    Ok(inserted_rows)
}
/// Opens the database with `FrankenStorage` and runs its search-fallback FTS
/// consistency check, returning the resulting repair report. Despite the
/// name, no rusqlite call is made here.
pub(crate) fn ensure_fts_consistency_via_rusqlite(db_path: &Path) -> Result<FtsConsistencyRepair> {
    let open_context = || {
        format!(
            "opening frankensqlite db at {} for FTS consistency check",
            db_path.display()
        )
    };
    let storage = FrankenStorage::open(db_path).with_context(open_context)?;
    storage.ensure_search_fallback_fts_consistency()
}
/// Creates a backup of the database at `db_path` and returns its path.
///
/// Returns `Ok(None)` when there is nothing at `db_path`. The preferred route
/// is `VACUUM INTO` a staging file that is then renamed into place. A
/// retryable (contention) failure of that step is returned as an error rather
/// than falling back; any other failure falls back to copying the database
/// file and its `-wal` / `-shm` sidecars.
///
/// # Errors
/// Fails if the database or a sidecar is a symlink or not a regular file, on
/// retryable `VACUUM INTO` errors, and on I/O errors while renaming, syncing
/// or copying.
pub fn create_backup(db_path: &Path) -> Result<Option<std::path::PathBuf>, MigrationError> {
if !bundle_path_exists(db_path)? {
return Ok(None);
}
// Rejects symlinks and non-files with an error; `false` is only possible if
// the path disappeared since the check above.
if !copyable_bundle_file_exists(db_path)? {
return Ok(None);
}
// Called only for its validation of the sidecars; the list is discarded.
let _ = copyable_bundle_sidecar_sources(db_path)?;
let backup_path = unique_backup_path(db_path);
let vacuum_stage_path = vacuum_stage_backup_path(&backup_path);
match vacuum_into_backup_stage(db_path, &vacuum_stage_path) {
Ok(()) => {
fs::rename(&vacuum_stage_path, &backup_path)?;
}
Err(err) if backup_vacuum_error_requires_consistent_retry(&err) => {
tracing::warn!(
db_path = %db_path.display(),
error = %err,
"create_backup: VACUUM INTO hit transient contention; refusing raw WAL bundle copy"
);
return Err(MigrationError::Database(err));
}
Err(err) => {
tracing::warn!(
db_path = %db_path.display(),
error = %err,
"create_backup: VACUUM INTO failed; falling back to raw evidence copy"
);
}
}
// Present only when the VACUUM INTO route succeeded and was renamed into place.
if backup_path.exists() {
sync_file_if_exists(&backup_path)?;
if let Some(parent) = backup_path.parent() {
sync_parent_directory(parent)?;
}
return Ok(Some(backup_path));
}
copy_database_bundle(db_path, &backup_path)?;
Ok(Some(backup_path))
}
/// Opens `db_path` read-only and runs `VACUUM INTO` targeting `stage_path`.
///
/// The connection is closed before returning whether or not the vacuum
/// succeeded; a failed close is logged and followed by a best-effort close.
/// The returned result is that of the pragma/vacuum statements, not the close.
fn vacuum_into_backup_stage(
db_path: &Path,
stage_path: &Path,
) -> std::result::Result<(), frankensqlite::FrankenError> {
let mut conn = open_franken_with_flags(
&db_path.to_string_lossy(),
FrankenOpenFlags::SQLITE_OPEN_READ_ONLY,
)?;
// Run inside a closure so that `?` exits only the closure and the close
// below always executes.
let result = (|| {
conn.execute(BACKUP_VACUUM_BUSY_TIMEOUT_PRAGMA)?;
let path_str = stage_path.to_string_lossy();
conn.execute_compat("VACUUM INTO ?", fparams![path_str.as_ref()])?;
Ok(())
})();
if let Err(close_err) = conn.close_in_place() {
tracing::warn!(
error = %close_err,
db_path = %db_path.display(),
"create_backup: close_in_place failed after VACUUM INTO; falling back to best-effort close"
);
conn.close_best_effort_in_place();
}
result
}
/// True when a `VACUUM INTO` failure is retryable per `retryable_franken_error`;
/// `create_backup` returns such errors instead of falling back to a raw copy.
fn backup_vacuum_error_requires_consistent_retry(err: &frankensqlite::FrankenError) -> bool {
retryable_franken_error(err)
}
/// Records which files of a database bundle `move_database_bundle` renamed.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub struct DatabaseBundleMoveResult {
    /// The main database file was moved.
    pub database: bool,
    /// The `-wal` sidecar was moved.
    pub wal: bool,
    /// The `-shm` sidecar was moved.
    pub shm: bool,
}

impl DatabaseBundleMoveResult {
    /// True if at least one of the three files was moved.
    pub fn moved_any(&self) -> bool {
        let Self { database, wal, shm } = *self;
        [database, wal, shm].contains(&true)
    }
}
/// Returns `path` with `suffix` appended to its final component
/// (e.g. `db` + `-wal` gives `db-wal`).
///
/// The suffix is appended at the `OsString` level, so paths that are not
/// valid UTF-8 keep their exact bytes instead of being lossily re-encoded.
fn database_sidecar_path(path: &Path, suffix: &str) -> PathBuf {
    let mut raw = path.as_os_str().to_os_string();
    raw.push(suffix);
    PathBuf::from(raw)
}
/// Renames the database file and its `-wal` / `-shm` sidecars from
/// `source_root` to `destination_root`, skipping whichever do not exist.
///
/// The destination's parent directory is created and synced first. If
/// anything was moved, both parent directories are synced afterwards. The
/// result records which of the three files were moved.
pub(crate) fn move_database_bundle(
    source_root: &Path,
    destination_root: &Path,
) -> std::io::Result<DatabaseBundleMoveResult> {
    let mut moved = DatabaseBundleMoveResult::default();
    let destination_parent = destination_root.parent();
    if let Some(dir) = destination_parent {
        fs::create_dir_all(dir)?;
        sync_parent_directory(dir)?;
    }

    if bundle_path_exists(source_root)? {
        fs::rename(source_root, destination_root)?;
        moved.database = true;
    }
    for suffix in ["-wal", "-shm"] {
        let sidecar_source = database_sidecar_path(source_root, suffix);
        if !bundle_path_exists(&sidecar_source)? {
            continue;
        }
        fs::rename(
            &sidecar_source,
            database_sidecar_path(destination_root, suffix),
        )?;
        match suffix {
            "-wal" => moved.wal = true,
            _ => moved.shm = true,
        }
    }

    if !moved.moved_any() {
        return Ok(moved);
    }
    if let Some(dir) = source_root.parent() {
        sync_parent_directory(dir)?;
    }
    if let Some(dir) = destination_parent {
        sync_parent_directory(dir)?;
    }
    Ok(moved)
}
/// Reports whether anything exists at `path`, without following symlinks.
///
/// `NotFound` maps to `Ok(false)`; every other metadata error is returned.
fn bundle_path_exists(path: &Path) -> std::io::Result<bool> {
    let Err(err) = fs::symlink_metadata(path) else {
        return Ok(true);
    };
    if err.kind() == std::io::ErrorKind::NotFound {
        Ok(false)
    } else {
        Err(err)
    }
}
/// Copies the database file at `source_root`, plus any `-wal` / `-shm`
/// sidecars, to `destination_root`, syncing each copied file and the
/// destination directory.
///
/// # Errors
/// Fails if the source file is missing, if the source or a sidecar is a
/// symlink or not a regular file, or on any I/O error while creating the
/// directory, copying or syncing.
fn copy_database_bundle(source_root: &Path, destination_root: &Path) -> Result<()> {
if let Some(parent) = destination_root.parent() {
fs::create_dir_all(parent).with_context(|| {
format!(
"creating destination directory for database bundle copy: {}",
parent.display()
)
})?;
sync_parent_directory(parent)
.with_context(|| format!("syncing destination directory {}", parent.display()))?;
}
if !copyable_bundle_file_exists(source_root)? {
bail!(
"database bundle root is missing before copy: {}",
source_root.display()
);
}
// Sidecars are validated and listed before anything is copied.
let sidecars = copyable_bundle_sidecar_sources(source_root)?;
fs::copy(source_root, destination_root).with_context(|| {
format!(
"copying database bundle {} -> {}",
source_root.display(),
destination_root.display()
)
})?;
sync_file_if_exists(destination_root).with_context(|| {
format!(
"syncing copied database bundle {}",
destination_root.display()
)
})?;
for (source_sidecar, suffix) in sidecars {
let destination_sidecar = database_sidecar_path(destination_root, suffix);
fs::copy(&source_sidecar, &destination_sidecar).with_context(|| {
format!(
"copying database bundle sidecar {} -> {}",
source_sidecar.display(),
destination_sidecar.display()
)
})?;
sync_file_if_exists(&destination_sidecar).with_context(|| {
format!(
"syncing copied database bundle sidecar {}",
destination_sidecar.display()
)
})?;
}
// Final directory sync after the new files have been created in it.
if let Some(parent) = destination_root.parent() {
sync_parent_directory(parent)
.with_context(|| format!("syncing destination directory {}", parent.display()))?;
}
Ok(())
}
/// Lists the existing `-wal` / `-shm` sidecars of `source_root` as
/// `(path, suffix)` pairs, in that order.
///
/// # Errors
/// Propagates the error from `copyable_bundle_file_exists` when a sidecar is
/// a symlink, is not a regular file, or cannot be inspected.
fn copyable_bundle_sidecar_sources(source_root: &Path) -> Result<Vec<(PathBuf, &'static str)>> {
    const SIDECAR_SUFFIXES: [&str; 2] = ["-wal", "-shm"];
    let mut found = Vec::new();
    for suffix in SIDECAR_SUFFIXES {
        let candidate = database_sidecar_path(source_root, suffix);
        if !copyable_bundle_file_exists(&candidate)? {
            continue;
        }
        found.push((candidate, suffix));
    }
    Ok(found)
}
/// Checks that `path` is safe to copy as part of a database bundle.
///
/// Returns `Ok(true)` for a regular file and `Ok(false)` when nothing exists
/// at `path`.
///
/// # Errors
/// Fails when `path` is a symlink, is not a regular file, or its metadata
/// cannot be read for a reason other than `NotFound`.
fn copyable_bundle_file_exists(path: &Path) -> Result<bool> {
    let metadata = match fs::symlink_metadata(path) {
        Ok(metadata) => metadata,
        Err(err) if err.kind() == std::io::ErrorKind::NotFound => return Ok(false),
        Err(err) => {
            return Err(err).with_context(|| {
                format!(
                    "checking database bundle path before copy: {}",
                    path.display()
                )
            });
        }
    };
    let kind = metadata.file_type();
    if kind.is_symlink() {
        bail!(
            "refusing to copy database bundle symlink: {}",
            path.display()
        );
    }
    if !kind.is_file() {
        bail!(
            "refusing to copy non-file database bundle path: {}",
            path.display()
        );
    }
    Ok(true)
}
pub(crate) fn remove_database_files(path: &Path) -> std::io::Result<()> {
let mut removed_any = false;
match fs::remove_file(path) {
Ok(()) => removed_any = true,
Err(err) if err.kind() == std::io::ErrorKind::NotFound => {}
Err(err) => return Err(err),
}
for suffix in ["-wal", "-shm"] {
match fs::remove_file(database_sidecar_path(path, suffix)) {
Ok(()) => removed_any = true,
Err(err) if err.kind() == std::io::ErrorKind::NotFound => {}
Err(err) => return Err(err),
}
}
if removed_any && let Some(parent) = path.parent() {
sync_parent_directory(parent)?;
}
Ok(())
}
/// Opens the directory at `path` and calls `sync_all` on the resulting handle.
///
/// # Errors
/// Returns any error from opening the directory or from `sync_all`.
#[cfg(not(windows))]
fn sync_parent_directory(path: &Path) -> std::io::Result<()> {
    let directory = fs::File::open(path)?;
    directory.sync_all()
}

/// Windows variant: performs no work and always succeeds.
#[cfg(windows)]
fn sync_parent_directory(_path: &Path) -> std::io::Result<()> {
    Ok(())
}
/// Calls `sync_all` on the file at `path`, doing nothing when `path.exists()` is false.
///
/// # Errors
/// Returns any error from opening the file or from `sync_all`.
fn sync_file_if_exists(path: &Path) -> std::io::Result<()> {
    if !path.exists() {
        return Ok(());
    }
    let file = fs::File::open(path)?;
    file.sync_all()
}
pub fn cleanup_old_backups(db_path: &Path, keep_count: usize) -> Result<(), std::io::Error> {
let parent = match db_path.parent() {
Some(p) => p,
None => return Ok(()),
};
let db_name = db_path.file_name().and_then(|n| n.to_str()).unwrap_or("db");
let prefix = format!("{}.backup.", db_name);
let mut backups: Vec<(std::path::PathBuf, SystemTime)> = Vec::new();
if let Ok(entries) = fs::read_dir(parent) {
for entry in entries.flatten() {
let path = entry.path();
if let Some(name) = path.file_name().and_then(|n| n.to_str())
&& is_backup_root_name(name, &prefix)
&& let Ok(meta) = fs::metadata(&path)
&& meta.is_file()
&& let Ok(mtime) = meta.modified()
{
backups.push((path, mtime));
}
}
}
backups.sort_by_key(|entry| std::cmp::Reverse(entry.1));
for (path, _) in backups.into_iter().skip(keep_count) {
let _ = fs::remove_file(&path);
let _ = fs::remove_file(database_sidecar_path(&path, "-wal"));
let _ = fs::remove_file(database_sidecar_path(&path, "-shm"));
}
Ok(())
}
/// A historical database bundle found on disk, with the facts used to rank it
/// in `discover_historical_database_bundles`.
#[derive(Debug, Clone)]
pub(crate) struct HistoricalDatabaseBundle {
/// Path of the bundle's main database file.
root_path: PathBuf,
/// Size of the main file plus its `-wal` and `-shm` sidecars (see `bundle_total_bytes`).
total_bytes: u64,
/// Modification time of the main file in milliseconds since the Unix epoch; 0 when unreadable.
modified_at_ms: i64,
/// Whether a read-only open succeeded and `conversations` / `messages` could be queried.
supports_direct_readonly: bool,
/// Schema / FTS / message-id facts gathered by `probe_historical_bundle`.
probe: HistoricalBundleProbe,
}
/// Lightweight health facts read from a historical bundle.
#[derive(Debug, Clone, Copy, Default)]
struct HistoricalBundleProbe {
/// Parsed `meta.schema_version`, or `None` when it is missing or unreadable.
schema_version: Option<i64>,
/// Number of `sqlite_master` rows named `fts_messages`, or `None` when the count could not be read.
fts_schema_rows: Option<i64>,
/// Whether a `SELECT COUNT(*) FROM fts_messages` query succeeded.
fts_queryable: bool,
/// Highest `messages.id`; 0 when the table is empty or unreadable.
max_message_id: i64,
}
/// Test-only health snapshot produced by `probe_database_health_via_frankensqlite`.
#[cfg(test)]
#[allow(dead_code)]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(crate) struct SqliteDatabaseHealthProbe {
/// Parsed `meta.schema_version`, if present.
pub schema_version: Option<i64>,
/// Whether `PRAGMA quick_check(1)` reported `ok`.
pub quick_check_ok: bool,
/// Number of `sqlite_master` rows named `fts_messages`.
pub fts_schema_rows: i64,
/// Whether exactly one FTS schema row exists and the FTS table answered a count query.
pub fts_queryable: bool,
/// Row count of `messages`; left at 0 when the quick check failed.
pub message_count: i64,
/// Highest `messages.id`; left at 0 when the quick check failed.
pub max_message_id: i64,
}
/// Outcome of an FTS consistency pass.
///
/// NOTE(review): the code that produces these variants is outside this chunk;
/// the per-variant notes below are inferred from the names — confirm against it.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(crate) enum FtsConsistencyRepair {
/// Presumably: the FTS index already matched and nothing was written.
AlreadyHealthy {
rows: usize,
},
/// Presumably: only missing rows were appended to an existing index.
IncrementalCatchUp {
inserted_rows: usize,
total_rows: usize,
},
/// Presumably: the index was rebuilt from scratch.
Rebuilt {
inserted_rows: usize,
},
}
/// Counters describing what a historical salvage or seed pass did.
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
pub struct HistoricalSalvageOutcome {
    /// Bundles that were looked at.
    pub bundles_considered: usize,
    /// Bundles whose content was imported.
    pub bundles_imported: usize,
    /// Conversations imported across all bundles.
    pub conversations_imported: usize,
    /// Messages imported across all bundles.
    pub messages_imported: usize,
}

impl HistoricalSalvageOutcome {
    /// Adds every counter of `other` onto the matching counter of `self`.
    pub(crate) fn accumulate(&mut self, other: Self) {
        let Self {
            bundles_considered,
            bundles_imported,
            conversations_imported,
            messages_imported,
        } = other;
        self.bundles_considered += bundles_considered;
        self.bundles_imported += bundles_imported;
        self.conversations_imported += conversations_imported;
        self.messages_imported += messages_imported;
    }
}
/// An open read connection onto a historical bundle, plus how it was obtained.
#[derive(Debug)]
struct HistoricalReadConnection {
/// Connection to read historical rows from.
conn: FrankenConnection,
/// How the connection was obtained: `"direct-readonly"` or `"sqlite3-recover"`.
method: &'static str,
/// Database file behind `conn`: the bundle root itself, or the recovered temporary copy.
root_path: PathBuf,
/// Keeps the temporary directory holding a recovered database alive; `None` for direct opens.
_tempdir: Option<tempfile::TempDir>,
}
/// Schema created in the scratch database that receives `sqlite3 .recover` output.
///
/// It declares only the six core tables (`sources`, `agents`, `workspaces`,
/// `conversations`, `messages`, `snippets`) with primary keys and no other
/// constraints; `recover_historical_bundle_via_sqlite3` executes it before
/// replaying the recovered `INSERT` statements.
const HISTORICAL_RECOVERY_CORE_SCHEMA: &str = r"
CREATE TABLE sources (
id TEXT PRIMARY KEY,
kind TEXT,
host_label TEXT,
machine_id TEXT,
platform TEXT,
config_json TEXT,
created_at INTEGER,
updated_at INTEGER
);
CREATE TABLE agents (
id INTEGER PRIMARY KEY,
slug TEXT,
name TEXT,
version TEXT,
kind TEXT,
created_at INTEGER,
updated_at INTEGER
);
CREATE TABLE workspaces (
id INTEGER PRIMARY KEY,
path TEXT,
display_name TEXT
);
CREATE TABLE conversations (
id INTEGER PRIMARY KEY,
agent_id INTEGER,
workspace_id INTEGER,
source_id TEXT,
external_id TEXT,
title TEXT,
source_path TEXT,
started_at INTEGER,
ended_at INTEGER,
approx_tokens INTEGER,
metadata_json TEXT,
origin_host TEXT,
metadata_bin BLOB,
total_input_tokens INTEGER,
total_output_tokens INTEGER,
total_cache_read_tokens INTEGER,
total_cache_creation_tokens INTEGER,
grand_total_tokens INTEGER,
estimated_cost_usd REAL,
primary_model TEXT,
api_call_count INTEGER,
tool_call_count INTEGER,
user_message_count INTEGER,
assistant_message_count INTEGER,
last_message_idx INTEGER,
last_message_created_at INTEGER
);
CREATE TABLE messages (
id INTEGER PRIMARY KEY,
conversation_id INTEGER,
idx INTEGER,
role TEXT,
author TEXT,
created_at INTEGER,
content TEXT,
extra_json TEXT,
extra_bin BLOB
);
CREATE TABLE snippets (
id INTEGER PRIMARY KEY,
message_id INTEGER,
file_path TEXT,
start_line INTEGER,
end_line INTEGER,
language TEXT,
snippet_text TEXT
);
";
/// Written as `salvage_version` in the meta ledger entry recorded by `record_historical_bundle_import`.
const HISTORICAL_SALVAGE_LEDGER_VERSION: u32 = 2;
/// Presumably the format version stored in `HistoricalBundleProgress::progress_version` — confirm at the writer.
const HISTORICAL_SALVAGE_PROGRESS_VERSION: u32 = 1;
/// Five minutes expressed in milliseconds.
/// NOTE(review): the name suggests a start-time tolerance for merging by source path; its use is outside this chunk.
const SOURCE_PATH_MERGE_START_TOLERANCE_MS: i64 = 5 * 60 * 1000;
/// Serializable progress record for one historical bundle.
///
/// NOTE(review): the code that reads and writes this record is outside this
/// chunk; the field notes are inferred from names and types — confirm there.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct HistoricalBundleProgress {
/// Format version of this record.
progress_version: u32,
/// Bundle path, stored as a string.
path: String,
/// Bundle size in bytes.
bytes: u64,
/// Bundle modification time, in milliseconds.
modified_at_ms: i64,
/// Name of the read method used for the bundle.
method: String,
/// Presumably the last source row id whose import finished.
last_completed_source_row_id: i64,
/// Conversations imported so far.
conversations_imported: usize,
/// Messages imported so far.
messages_imported: usize,
/// When this record was last updated, in milliseconds.
updated_at_ms: i64,
}
/// One conversation queued for a historical import batch.
///
/// NOTE(review): producers and consumers are outside this chunk; the field
/// notes are inferred from names — confirm there.
#[derive(Debug, Clone)]
struct HistoricalBatchEntry {
/// Presumably the conversation's row id in the source bundle.
source_row_id: i64,
/// Agent id to attach the conversation to.
agent_id: i64,
/// Workspace id, when the conversation has one.
workspace_id: Option<i64>,
/// The conversation payload itself.
conversation: Conversation,
}
/// Counters for one historical import batch.
///
/// NOTE(review): the code that fills these in is outside this chunk.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
struct HistoricalBatchImportTotals {
/// Source rows inserted by the batch.
inserted_source_rows: usize,
/// Messages inserted by the batch.
inserted_messages: usize,
}
/// Collects candidate historical bundle roots that live near `db_path`.
///
/// Sources scanned, in order:
/// 1. the database's parent directory, for `<file name>.backup.*` and
///    `<file stem>.corrupt.*` entries;
/// 2. `<parent>/backups`, for `<file name>.*` entries ending in `.bak`;
/// 3. `<parent>/repair-lab/*/<file name>` and `<parent>/snapshots/*/<file name>`.
///
/// Entries whose name is not valid UTF-8 or carries a sidecar suffix are
/// skipped in (1) and (2). `db_path` itself and duplicates are never returned.
/// Unreadable directories contribute nothing.
fn historical_bundle_root_paths(db_path: &Path) -> Vec<PathBuf> {
    /// Directory entries with a UTF-8 name that is not a database sidecar.
    fn candidate_entries(dir: &Path) -> Vec<(PathBuf, String)> {
        let Ok(entries) = fs::read_dir(dir) else {
            return Vec::new();
        };
        entries
            .flatten()
            .filter_map(|entry| {
                let path = entry.path();
                let name = path.file_name()?.to_str()?.to_owned();
                if has_db_sidecar_suffix(&name) {
                    None
                } else {
                    Some((path, name))
                }
            })
            .collect()
    }

    let mut roots = Vec::new();
    let Some(parent) = db_path.parent() else {
        return roots;
    };
    let db_name = db_path
        .file_name()
        .and_then(|n| n.to_str())
        .unwrap_or("agent_search.db");
    let db_stem = db_path
        .file_stem()
        .and_then(|n| n.to_str())
        .unwrap_or("agent_search");
    let backup_prefix = format!("{db_name}.backup.");
    let corrupt_prefix = format!("{db_stem}.corrupt.");
    let dotted_name_prefix = format!("{db_name}.");

    let mut push_root = |path: PathBuf| {
        if path == db_path || roots.iter().any(|existing| existing == &path) {
            return;
        }
        roots.push(path);
    };

    for (path, name) in candidate_entries(parent) {
        if name.starts_with(&backup_prefix) || name.starts_with(&corrupt_prefix) {
            push_root(path);
        }
    }
    for (path, name) in candidate_entries(&parent.join("backups")) {
        if name.starts_with(&dotted_name_prefix) && name.ends_with(".bak") {
            push_root(path);
        }
    }

    for subdir in ["repair-lab", "snapshots"] {
        push_named_database_children(&mut roots, db_path, &parent.join(subdir), db_name);
    }
    roots
}
/// Appends `<dir>/<child>/<db_name>` to `roots` for every child entry of `dir`.
///
/// A candidate is added only when it exists on disk, differs from
/// `canonical_db_path`, and is not already in `roots`. An unreadable `dir`
/// leaves `roots` untouched.
fn push_named_database_children(
    roots: &mut Vec<PathBuf>,
    canonical_db_path: &Path,
    dir: &Path,
    db_name: &str,
) {
    let Ok(children) = fs::read_dir(dir) else {
        return;
    };
    for child in children.flatten() {
        let candidate = child.path().join(db_name);
        if candidate == canonical_db_path {
            continue;
        }
        if !candidate.exists() || roots.contains(&candidate) {
            continue;
        }
        roots.push(candidate);
    }
}
/// Returns the modification time of `path` in milliseconds since the Unix epoch.
///
/// Yields 0 when the metadata or timestamp cannot be read, or when the
/// timestamp precedes the epoch.
fn file_mtime_ms(path: &Path) -> i64 {
    let Ok(meta) = fs::metadata(path) else {
        return 0;
    };
    let Ok(modified) = meta.modified() else {
        return 0;
    };
    match modified.duration_since(UNIX_EPOCH) {
        Ok(elapsed) => elapsed.as_millis() as i64,
        Err(_) => 0,
    }
}
/// Sums the on-disk size of a bundle: the root file plus its `-wal` and `-shm` sidecars.
///
/// Files whose metadata cannot be read count as 0 bytes; the sum saturates
/// instead of overflowing.
fn bundle_total_bytes(root_path: &Path) -> u64 {
    /// File length, or 0 when the metadata is unavailable.
    fn len_or_zero(path: &Path) -> u64 {
        match fs::metadata(path) {
            Ok(meta) => meta.len(),
            Err(_) => 0,
        }
    }

    ["-wal", "-shm"]
        .into_iter()
        .fold(len_or_zero(root_path), |total, suffix| {
            let sidecar = database_sidecar_path(root_path, suffix);
            total.saturating_add(len_or_zero(&sidecar))
        })
}
/// Finds historical bundles around `db_path` and orders them best-first.
///
/// Every root from `historical_bundle_root_paths` that still exists is probed
/// (size, mtime, direct read-only support, schema/FTS facts); bundles whose
/// total size is 0 are dropped.
///
/// Ordering, each key descending:
/// 1. health rank (current schema with clean FTS first),
/// 2. highest message id,
/// 3. location priority (`repair-lab/replay-*` > `repair-lab` > `snapshots` >
///    ordinary backups > `.corrupt.` / `failed-baseline-seed`),
/// 4. direct read-only support,
/// 5. total bytes,
/// 6. modification time,
/// 7. path, as a deterministic tie-breaker.
pub(crate) fn discover_historical_database_bundles(
    db_path: &Path,
) -> Vec<HistoricalDatabaseBundle> {
    let mut bundles: Vec<_> = historical_bundle_root_paths(db_path)
        .into_iter()
        .filter(|root| root.exists())
        .map(|root_path| {
            let modified_at_ms = file_mtime_ms(&root_path);
            let total_bytes = bundle_total_bytes(&root_path);
            let supports_direct_readonly = historical_bundle_supports_direct_readonly(&root_path);
            let probe = probe_historical_bundle(&root_path);
            HistoricalDatabaseBundle {
                modified_at_ms,
                total_bytes,
                supports_direct_readonly,
                root_path,
                probe,
            }
        })
        .filter(|bundle| bundle.total_bytes > 0)
        .collect();

    /// Location-based preference for a bundle path.
    ///
    /// Directory membership is decided on path components rather than on
    /// `/`-delimited substrings, so it also works with `\` separators and
    /// with relative paths that start directly at the directory name.
    fn bundle_priority(path: &Path) -> i32 {
        let names: Vec<_> = path
            .components()
            .filter_map(|component| match component {
                std::path::Component::Normal(name) => Some(name.to_string_lossy()),
                _ => None,
            })
            .collect();
        // `windows(2)` requires the directory to be followed by another
        // component, i.e. the path must point at something inside it.
        let inside_dir = |dir: &str| names.windows(2).any(|pair| pair[0] == dir);
        let inside_replay = names
            .windows(2)
            .any(|pair| pair[0] == "repair-lab" && pair[1].starts_with("replay-"));

        if inside_replay {
            return 5;
        }
        if inside_dir("repair-lab") {
            return 4;
        }
        if inside_dir("snapshots") {
            return 3;
        }
        let path_str = path.to_string_lossy();
        if path_str.contains(".corrupt.") || path_str.contains("failed-baseline-seed") {
            return 0;
        }
        1
    }

    /// Health-based preference derived from the bundle's probe results.
    fn bundle_health_rank(bundle: &HistoricalDatabaseBundle) -> i32 {
        let on_current_schema = bundle.probe.schema_version == Some(CURRENT_SCHEMA_VERSION);
        // FTS counts as clean when the single FTS table is queryable, or when
        // a current-schema bundle has no FTS table at all.
        let fts_clean = match bundle.probe.fts_schema_rows {
            Some(1) => bundle.probe.fts_queryable,
            Some(0) => on_current_schema,
            _ => false,
        };
        let clean_current_schema_fts = on_current_schema && fts_clean;
        if clean_current_schema_fts {
            return 5;
        }
        if fts_clean {
            return 4;
        }
        if on_current_schema && bundle.supports_direct_readonly {
            return 3;
        }
        if bundle.supports_direct_readonly {
            return 2;
        }
        1
    }

    bundles.sort_by(|left, right| {
        bundle_health_rank(right)
            .cmp(&bundle_health_rank(left))
            .then_with(|| right.probe.max_message_id.cmp(&left.probe.max_message_id))
            .then_with(|| bundle_priority(&right.root_path).cmp(&bundle_priority(&left.root_path)))
            .then_with(|| {
                right
                    .supports_direct_readonly
                    .cmp(&left.supports_direct_readonly)
            })
            .then_with(|| right.total_bytes.cmp(&left.total_bytes))
            .then_with(|| right.modified_at_ms.cmp(&left.modified_at_ms))
            .then_with(|| right.root_path.cmp(&left.root_path))
    });
    bundles
}
/// Gathers schema version, FTS state and highest message id for a bundle.
///
/// The bundle is opened read-only first. When that open fails, or when it
/// succeeds but none of the three facts could be read, the `sqlite3`
/// command-line probe is tried instead; if that also yields nothing, the
/// (possibly empty) direct result or a default probe is returned.
fn probe_historical_bundle(root_path: &Path) -> HistoricalBundleProbe {
    let conn = match open_historical_bundle_readonly(root_path) {
        Ok(conn) => conn,
        Err(_) => {
            return probe_historical_bundle_via_sqlite3_metadata(root_path).unwrap_or_default();
        }
    };

    // Runs a single-value integer query, mapping any failure to `None`.
    let read_i64 = |sql: &str| -> Option<i64> {
        conn.query_row_map(sql, fparams![], |row| row.get_typed::<i64>(0))
            .ok()
    };

    let schema_version = read_meta_schema_version(&conn).ok().flatten();
    let fts_schema_rows =
        read_i64("SELECT COUNT(*) FROM sqlite_master WHERE name = 'fts_messages'");
    let fts_queryable =
        historical_bundle_fts_queryable_via_frankensqlite(root_path, fts_schema_rows);
    let max_message_id = read_i64("SELECT COALESCE(MAX(id), 0) FROM messages").unwrap_or(0);

    let probe = HistoricalBundleProbe {
        schema_version,
        fts_schema_rows,
        fts_queryable,
        max_message_id,
    };

    let nothing_readable =
        schema_version.is_none() && fts_schema_rows.is_none() && max_message_id == 0;
    if nothing_readable {
        probe_historical_bundle_via_sqlite3_metadata(root_path).unwrap_or(probe)
    } else {
        probe
    }
}
/// Probes a bundle by shelling out to the `sqlite3` command-line tool.
///
/// The bundle is addressed through a `file:` URI with `immutable=1`, and three
/// queries are run whose results are expected one per output line: schema
/// version, number of `fts_messages` schema rows, and highest message id.
///
/// Returns `None` when the tool cannot be launched, exits unsuccessfully, or
/// prints non-UTF-8 output. Lines that do not parse as integers become `None`
/// (or 0 for the message id). `fts_queryable` is always reported as `false`.
fn probe_historical_bundle_via_sqlite3_metadata(root_path: &Path) -> Option<HistoricalBundleProbe> {
    const PROBE_SQL: &str = "PRAGMA writable_schema=ON;\n\
        SELECT COALESCE((SELECT value FROM meta WHERE key = 'schema_version'), '');\n\
        SELECT COUNT(*) FROM sqlite_master WHERE name = 'fts_messages';\n\
        SELECT COALESCE(MAX(id), 0) FROM messages;";

    let bundle_uri = format!("file:{}?immutable=1", root_path.to_string_lossy());
    let output = Command::new("sqlite3")
        .args(["-batch", "-noheader"])
        .arg(&bundle_uri)
        .arg(PROBE_SQL)
        .output()
        .ok()?;
    if !output.status.success() {
        return None;
    }

    let stdout = String::from_utf8(output.stdout).ok()?;
    // One integer per line; a missing or unparsable line yields `None`.
    let mut fields = stdout.lines().map(|raw| raw.trim().parse::<i64>().ok());
    let schema_version = fields.next().flatten();
    let fts_schema_rows = fields.next().flatten();
    let max_message_id = fields.next().flatten().unwrap_or(0);

    Some(HistoricalBundleProbe {
        schema_version,
        fts_schema_rows,
        fts_queryable: false,
        max_message_id,
    })
}
/// Reports whether the bundle's FTS table can actually be queried.
///
/// Only attempted when exactly one `fts_messages` schema row was found; the
/// bundle is then opened read-only and a `COUNT(*)` is run against the table.
/// Any failure along the way yields `false`.
fn historical_bundle_fts_queryable_via_frankensqlite(
    root_path: &Path,
    fts_schema_rows: Option<i64>,
) -> bool {
    if fts_schema_rows != Some(1) {
        return false;
    }
    match FrankenStorage::open_readonly(root_path) {
        Ok(storage) => storage
            .raw()
            .query("SELECT COUNT(*) FROM fts_messages")
            .is_ok(),
        Err(_) => false,
    }
}
/// Reports whether the bundle opens read-only and its core tables answer a probe query.
fn historical_bundle_supports_direct_readonly(root_path: &Path) -> bool {
    match open_historical_bundle_readonly(root_path) {
        Ok(conn) => historical_bundle_has_queryable_core_tables(&conn).is_ok(),
        Err(_) => false,
    }
}
/// Reports whether `sqlite_master` lists a table named `table`.
///
/// # Errors
/// Fails when the lookup query itself cannot be executed.
fn historical_table_exists(conn: &FrankenConnection, table: &str) -> Result<bool> {
    let lookup = conn
        .query_row_map(
            "SELECT 1 FROM sqlite_master WHERE type = 'table' AND name = ?1 LIMIT 1",
            fparams![table],
            |row| row.get_typed::<i64>(0),
        )
        .optional()
        .with_context(|| format!("checking for historical table {table}"))?;
    Ok(lookup.is_some())
}
fn probe_historical_table_reads(conn: &FrankenConnection, table: &str) -> Result<()> {
if !historical_table_exists(conn, table)? {
return Err(anyhow!(
"historical database missing required table {table}"
));
}
let sql = format!("SELECT rowid FROM {table} LIMIT 1");
let _: Option<i64> = conn
.query_row_map(&sql, fparams![], |row| row.get_typed(0))
.optional()
.with_context(|| format!("probing rows from historical table {table}"))?;
Ok(())
}
/// Checks that both `conversations` and `messages` exist and can be read.
///
/// # Errors
/// Returns the first probe failure; `messages` is not checked when
/// `conversations` already failed.
fn historical_bundle_has_queryable_core_tables(conn: &FrankenConnection) -> Result<()> {
    ["conversations", "messages"]
        .into_iter()
        .try_for_each(|table| probe_historical_table_reads(conn, table))
}
/// Opens the database at `root_path` with the read-only open flag.
///
/// # Errors
/// Wraps the open failure with the path that could not be opened.
fn open_historical_bundle_readonly(root_path: &Path) -> Result<FrankenConnection> {
    open_franken_with_flags(
        &root_path.to_string_lossy(),
        FrankenOpenFlags::SQLITE_OPEN_READ_ONLY,
    )
    .with_context(|| format!("opening historical database {}", root_path.display()))
}
/// Reports whether `line` is an `INSERT` into one of the core recovery tables.
///
/// Accepted shapes are `INSERT INTO <q><table><q>…` and
/// `INSERT OR IGNORE INTO <q><table><q>…`, where `<q>` is a single or double
/// quote (the same one on both sides) and `<table>` is one of `sources`,
/// `agents`, `workspaces`, `conversations`, `messages`, `snippets`. The match
/// is anchored at the start of the line and is case-sensitive.
///
/// This runs once per line of `sqlite3 .recover` output, so it matches on
/// string slices and performs no allocation.
fn is_recoverable_insert_line(line: &str) -> bool {
    const RECOVERABLE_TABLES: [&str; 6] = [
        "sources",
        "agents",
        "workspaces",
        "conversations",
        "messages",
        "snippets",
    ];

    let Some(after_verb) = line
        .strip_prefix("INSERT INTO ")
        .or_else(|| line.strip_prefix("INSERT OR IGNORE INTO "))
    else {
        return false;
    };

    // The table name must be wrapped in a matching pair of quotes.
    let mut chars = after_verb.chars();
    let quote = match chars.next() {
        Some(quote @ ('\'' | '"')) => quote,
        _ => return false,
    };
    let after_quote = chars.as_str();

    RECOVERABLE_TABLES.iter().any(|table| {
        after_quote
            .strip_prefix(table)
            .is_some_and(|tail| tail.starts_with(quote))
    })
}
/// Rebuilds a readable copy of a damaged bundle using `sqlite3 .recover`.
///
/// Steps:
/// 1. create a scratch database in a fresh temporary directory and initialize
///    it with `HISTORICAL_RECOVERY_CORE_SCHEMA`;
/// 2. run `sqlite3 <bundle uri> .recover` and stream its stdout line by line;
/// 3. forward only the lines accepted by `is_recoverable_insert_line` to a
///    second `sqlite3` process writing into the scratch database, wrapped in a
///    single `BEGIN` / `COMMIT`;
/// 4. wait for both processes, then open the scratch database read-only and
///    confirm its core tables are queryable.
///
/// The returned connection keeps the temporary directory alive through
/// `_tempdir`.
///
/// # Errors
/// Fails when the scratch database cannot be prepared, either `sqlite3`
/// process cannot be launched or exits unsuccessfully, piping fails, or the
/// recovered database does not pass the core-table probe.
fn recover_historical_bundle_via_sqlite3(
bundle: &HistoricalDatabaseBundle,
) -> Result<HistoricalReadConnection> {
let tempdir = tempfile::TempDir::new().context("creating temporary salvage directory")?;
let recovered_db = tempdir.path().join("historical-recovered.db");
let temp_conn = FrankenConnection::open(recovered_db.to_string_lossy().as_ref())
.with_context(|| format!("creating recovered database {}", recovered_db.display()))?;
temp_conn
.execute_batch(HISTORICAL_RECOVERY_CORE_SCHEMA)
.with_context(|| format!("initializing recovered schema {}", recovered_db.display()))?;
// Release our handle before the external importer writes to the same file.
drop(temp_conn);
// `immutable=1` asks sqlite3 to treat the source bundle as read-only and unchanging.
let bundle_uri = format!("file:{}?immutable=1", bundle.root_path.to_string_lossy());
let mut recover = Command::new("sqlite3")
.arg(&bundle_uri)
.arg(".recover")
.stdout(Stdio::piped())
.spawn()
.with_context(|| {
format!(
"launching sqlite3 .recover for historical bundle {}",
bundle.root_path.display()
)
})?;
let recover_stdout = recover
.stdout
.take()
.context("capturing sqlite3 .recover stdout")?;
let mut importer = Command::new("sqlite3")
.arg(&recovered_db)
.stdin(Stdio::piped())
.spawn()
.with_context(|| {
format!(
"launching sqlite3 importer for recovered bundle {}",
recovered_db.display()
)
})?;
// Scope the stdin borrow and the stdout reader so both end before the waits below.
{
let importer_stdin = importer
.stdin
.as_mut()
.context("opening sqlite3 importer stdin")?;
importer_stdin
.write_all(b"BEGIN;\n")
.context("starting recovery import transaction")?;
let reader = BufReader::new(recover_stdout);
for line in reader.lines() {
let line = line.context("reading sqlite3 .recover output")?;
// Schema statements and inserts into other tables are dropped here.
if is_recoverable_insert_line(&line) {
importer_stdin
.write_all(line.as_bytes())
.context("writing recovered INSERT")?;
importer_stdin
.write_all(b"\n")
.context("writing recovered INSERT newline")?;
}
}
importer_stdin
.write_all(b"COMMIT;\n")
.context("committing recovery import transaction")?;
}
let recover_status = recover
.wait()
.context("waiting for sqlite3 .recover process")?;
if !recover_status.success() {
anyhow::bail!(
"sqlite3 .recover exited with status {} for {}",
recover_status,
bundle.root_path.display()
);
}
// `importer` still owns its stdin handle; std's `Child::wait` closes it before waiting.
let importer_status = importer
.wait()
.context("waiting for sqlite3 recovery importer")?;
if !importer_status.success() {
anyhow::bail!(
"sqlite3 recovery importer exited with status {} for {}",
importer_status,
recovered_db.display()
);
}
let conn = open_historical_bundle_readonly(&recovered_db)?;
historical_bundle_has_queryable_core_tables(&conn)?;
Ok(HistoricalReadConnection {
conn,
method: "sqlite3-recover",
root_path: recovered_db,
_tempdir: Some(tempdir),
})
}
/// Opens a bundle for reading, preferring a direct read-only connection.
///
/// A direct connection is used when it opens and its core tables are
/// queryable. Otherwise — including when the open itself fails, which is
/// logged — the bundle is rebuilt through `sqlite3 .recover`.
///
/// # Errors
/// Returns the recovery error when the fallback path fails as well.
fn open_historical_bundle_for_salvage(
    bundle: &HistoricalDatabaseBundle,
) -> Result<HistoricalReadConnection> {
    let direct = open_historical_bundle_readonly(&bundle.root_path);

    if let Err(err) = &direct {
        tracing::warn!(
            path = %bundle.root_path.display(),
            error = %err,
            "historical bundle direct open failed; falling back to sqlite3 .recover"
        );
    }

    if let Ok(conn) = direct {
        if historical_bundle_has_queryable_core_tables(&conn).is_ok() {
            return Ok(HistoricalReadConnection {
                conn,
                method: "direct-readonly",
                root_path: bundle.root_path.clone(),
                _tempdir: None,
            });
        }
    }

    recover_historical_bundle_via_sqlite3(bundle)
}
/// Returns `(conversation_count, message_count)` for the connected database.
///
/// Negative counts are clamped to 0, and values too large for `usize` become
/// `usize::MAX`.
///
/// # Errors
/// Fails when either `COUNT(*)` query fails.
fn historical_bundle_counts(conn: &FrankenConnection) -> Result<(usize, usize)> {
    let count_rows = |sql: &str| -> Result<usize> {
        let raw: i64 = conn.query_row_map(sql, fparams![], |row| row.get_typed(0))?;
        Ok(usize::try_from(raw.max(0)).unwrap_or(usize::MAX))
    };

    let conversations = count_rows("SELECT COUNT(*) FROM conversations")?;
    let messages = count_rows("SELECT COUNT(*) FROM messages")?;
    Ok((conversations, messages))
}
/// Deletes runtime bookkeeping rows from `meta`.
///
/// Removed keys: every `historical_bundle_salvaged:*` entry, plus
/// `last_scan_ts`, `last_indexed_at` and `last_embedded_message_id`.
///
/// # Errors
/// Fails when the `DELETE` statement fails.
fn clear_seeded_runtime_meta(conn: &FrankenConnection) -> Result<()> {
    const CLEAR_RUNTIME_META_SQL: &str = "DELETE FROM meta\n\
        WHERE key LIKE 'historical_bundle_salvaged:%'\n\
        OR key IN ('last_scan_ts', 'last_indexed_at', 'last_embedded_message_id')";

    conn.execute(CLEAR_RUNTIME_META_SQL)?;
    Ok(())
}
/// Records in `meta` that `bundle` has been imported.
///
/// The row is keyed by `FrankenStorage::historical_bundle_meta_key(bundle)`
/// and holds a JSON object with the ledger version, bundle path, size,
/// modification time, import method, imported counts and a timestamp. An
/// existing row with the same key is replaced.
///
/// # Errors
/// Fails when the JSON cannot be serialized or the insert fails.
fn record_historical_bundle_import(
    conn: &FrankenConnection,
    bundle: &HistoricalDatabaseBundle,
    method: &str,
    conversations_imported: usize,
    messages_imported: usize,
) -> Result<()> {
    let ledger_entry = serde_json::json!({
        "salvage_version": HISTORICAL_SALVAGE_LEDGER_VERSION,
        "path": bundle.root_path.display().to_string(),
        "bytes": bundle.total_bytes,
        "modified_at_ms": bundle.modified_at_ms,
        "method": method,
        "conversations_imported": conversations_imported,
        "messages_imported": messages_imported,
        "recorded_at_ms": FrankenStorage::now_millis(),
    });
    let encoded_entry = serde_json::to_string(&ledger_entry)?;
    let ledger_key = FrankenStorage::historical_bundle_meta_key(bundle);

    conn.execute_compat(
        "INSERT OR REPLACE INTO meta(key, value) VALUES(?1, ?2)",
        fparams![ledger_key, encoded_entry],
    )?;
    Ok(())
}
/// Finalizes a staged copy of a historical bundle so it can become the canonical database.
///
/// In order: repairs FTS consistency, opens the database, sets a 30 s busy
/// timeout, validates the stored schema version, clears runtime bookkeeping
/// from `meta`, stamps `schema_version` with `CURRENT_SCHEMA_VERSION`, records
/// the `fts_contentless` migration row, and writes the import ledger entry
/// with method `baseline-bulk-sql-copy`.
///
/// Despite the `_via_rusqlite` suffix, the connection opened here is a
/// `FrankenConnection`.
///
/// # Errors
/// Fails when any step fails, or when the stored schema version is rejected
/// by the gate below.
fn finalize_seeded_canonical_bundle_via_rusqlite(
canonical_db_path: &Path,
bundle: &HistoricalDatabaseBundle,
conversations_imported: usize,
messages_imported: usize,
) -> Result<()> {
let _fts_repair =
ensure_fts_consistency_via_rusqlite(canonical_db_path).with_context(|| {
format!(
"repairing staged canonical FTS consistency before finalization: {}",
canonical_db_path.display()
)
})?;
let path_str = canonical_db_path.to_string_lossy();
let conn = FrankenConnection::open(path_str.as_ref()).with_context(|| {
format!(
"opening seeded canonical database for post-seed finalization: {}",
canonical_db_path.display()
)
})?;
conn.execute("PRAGMA busy_timeout = 30000;")
.with_context(|| {
format!(
"configuring busy timeout for seeded canonical database {}",
canonical_db_path.display()
)
})?;
let schema_version = read_meta_schema_version(&conn)?;
// NOTE(review): this rejects every version below CURRENT_SCHEMA_VERSION except 13,
// so versions 14..CURRENT_SCHEMA_VERSION-1 are also reported as "too old" — confirm
// this is intended now that the current version is no longer 14.
if let Some(version) = schema_version
&& version < CURRENT_SCHEMA_VERSION
&& version != 13
{
anyhow::bail!(
"seeded canonical bundle schema_version {version} is too old for baseline import and cannot be finalized automatically"
);
}
clear_seeded_runtime_meta(&conn)?;
// The version is stamped directly; no migrations are executed in this function.
conn.execute_compat(
"INSERT OR REPLACE INTO meta(key, value) VALUES('schema_version', ?1)",
fparams![CURRENT_SCHEMA_VERSION.to_string()],
)?;
conn.execute_compat(
"INSERT OR IGNORE INTO _schema_migrations(version, name) VALUES(?1, 'fts_contentless')",
fparams![CURRENT_SCHEMA_VERSION],
)?;
record_historical_bundle_import(
&conn,
bundle,
"baseline-bulk-sql-copy",
conversations_imported,
messages_imported,
)?;
Ok(())
}
/// Reads `meta.schema_version` and parses it as an integer.
///
/// Returns `Ok(None)` when the row is absent or its value is not a valid `i64`.
///
/// # Errors
/// Fails when the query itself fails (for example when `meta` does not exist).
fn read_meta_schema_version(conn: &FrankenConnection) -> Result<Option<i64>> {
    let stored = conn
        .query_row_map(
            "SELECT value FROM meta WHERE key = 'schema_version'",
            fparams![],
            |row| row.get_typed::<String>(0),
        )
        .optional()?;
    Ok(stored.and_then(|text| text.parse::<i64>().ok()))
}
/// Test-only: counts the `sqlite_master` rows named `fts_messages`.
///
/// # Errors
/// Fails when the count query fails.
#[cfg(test)]
fn franken_fts_schema_rows(conn: &FrankenConnection) -> Result<i64> {
    let sql = "SELECT COUNT(*) FROM sqlite_master WHERE name = 'fts_messages'";
    let row_count = conn.query_row_map(sql, fparams![], |row| row.get_typed::<i64>(0));
    row_count.context("counting sqlite_master rows for fts_messages via frankensqlite")
}
/// Test-only: reports whether a `COUNT(*)` over `fts_messages` can be executed.
#[cfg(test)]
fn franken_fts_limit_probe(conn: &FrankenConnection) -> bool {
    let outcome = conn.query("SELECT COUNT(*) FROM fts_messages");
    outcome.is_ok()
}
/// Test-only: opens `db_path` and collects a `SqliteDatabaseHealthProbe`.
///
/// The schema version, `PRAGMA quick_check(1)` result and FTS state are always
/// read. Message statistics are queried only when the quick check reports
/// `ok`; otherwise both are reported as 0.
///
/// # Errors
/// Fails when the database cannot be opened or when any executed query other
/// than the FTS count probe fails.
#[cfg(test)]
#[allow(dead_code)]
pub(crate) fn probe_database_health_via_frankensqlite(
    db_path: &Path,
) -> Result<SqliteDatabaseHealthProbe> {
    let conn = FrankenConnection::open(db_path.to_string_lossy().as_ref()).with_context(|| {
        format!(
            "opening frankensqlite db at {} for database health probe",
            db_path.display()
        )
    })?;
    conn.execute_batch("PRAGMA busy_timeout = 30000;")
        .with_context(|| {
            format!(
                "configuring busy timeout for database health probe at {}",
                db_path.display()
            )
        })?;

    let schema_version = read_meta_schema_version(&conn)?;

    let quick_check_status: String = conn
        .query_row_map("PRAGMA quick_check(1)", fparams![], |row| row.get_typed(0))
        .with_context(|| format!("running PRAGMA quick_check(1) for {}", db_path.display()))?;
    let quick_check_ok = quick_check_status.trim().eq_ignore_ascii_case("ok");

    let fts_schema_rows = franken_fts_schema_rows(&conn)?;
    let fts_queryable = fts_schema_rows == 1 && franken_fts_limit_probe(&conn);

    // Message statistics are only trusted when the quick check passed.
    let (message_count, max_message_id) = if quick_check_ok {
        let count: i64 = conn
            .query_row_map("SELECT COUNT(*) FROM messages", fparams![], |row| {
                row.get_typed(0)
            })
            .context("counting messages during frankensqlite database health probe")?;
        let max_id: i64 = conn
            .query_row_map(
                "SELECT COALESCE(MAX(id), 0) FROM messages",
                fparams![],
                |row| row.get_typed(0),
            )
            .context("reading max message id during frankensqlite database health probe")?;
        (count, max_id)
    } else {
        (0, 0)
    };

    Ok(SqliteDatabaseHealthProbe {
        schema_version,
        quick_check_ok,
        fts_schema_rows,
        fts_queryable,
        message_count,
        max_message_id,
    })
}
/// A historical bundle copied into a temporary directory, awaiting promotion.
struct StagedHistoricalSeed {
/// Temporary directory, created next to the canonical database, that holds the staged copy.
tempdir: tempfile::TempDir,
/// Path of the staged database file inside `tempdir`.
db_path: PathBuf,
}
/// Copies the bundle at `source_root_path` into a temporary directory beside the canonical database.
///
/// The canonical database's parent directory is created if needed, and the
/// temporary directory is created inside it. The copy is named
/// `baseline-seed-output.db`.
///
/// # Errors
/// Fails when the parent directory or temporary directory cannot be created,
/// or when copying the bundle fails.
fn stage_historical_bundle_for_seed(
    canonical_db_path: &Path,
    source_root_path: &Path,
) -> Result<StagedHistoricalSeed> {
    let canonical_parent = match canonical_db_path.parent() {
        Some(parent) => parent,
        None => Path::new("."),
    };
    fs::create_dir_all(canonical_parent).with_context(|| {
        format!(
            "creating canonical database directory before bulk historical seed import: {}",
            canonical_parent.display()
        )
    })?;

    let tempdir = tempfile::TempDir::new_in(canonical_parent)
        .context("creating temporary baseline seed directory")?;
    let db_path = tempdir.path().join("baseline-seed-output.db");
    copy_database_bundle(source_root_path, &db_path)?;

    Ok(StagedHistoricalSeed { tempdir, db_path })
}
fn promote_staged_historical_seed(
canonical_db_path: &Path,
staged_seed: &StagedHistoricalSeed,
) -> Result<()> {
let canonical_backup = staged_seed
.tempdir
.path()
.join("pre-seed-canonical-backup.db");
let had_canonical = canonical_db_path.exists()
|| database_sidecar_path(canonical_db_path, "-wal").exists()
|| database_sidecar_path(canonical_db_path, "-shm").exists();
if had_canonical {
move_database_bundle(canonical_db_path, &canonical_backup).with_context(|| {
format!(
"backing up canonical database before promoting staged historical seed import: {}",
canonical_db_path.display()
)
})?;
}
if let Err(err) =
move_database_bundle(&staged_seed.db_path, canonical_db_path).with_context(|| {
format!(
"promoting staged historical seed database bundle {} into canonical path {}",
staged_seed.db_path.display(),
canonical_db_path.display()
)
})
{
if had_canonical {
let _ = move_database_bundle(&canonical_backup, canonical_db_path);
}
return Err(err);
}
Ok(())
}
/// Seeds the canonical database from the best usable historical bundle.
///
/// Candidates come from `discover_historical_database_bundles`, best first.
/// For each one: bundles with a known schema version below 13 are skipped;
/// the bundle is opened (directly or through recovery), counted, staged next
/// to the canonical database, finalized, and promoted into place. The first
/// candidate that completes every step wins.
///
/// A failure at any step is logged and the next candidate is tried, so one
/// unreadable bundle does not block seeding from a healthy one further down.
///
/// Returns `Ok(Some(outcome))` on success and `Ok(None)` when there were no
/// candidates at all.
///
/// # Errors
/// When every candidate failed, returns the error of the last one tried.
pub(crate) fn seed_canonical_from_best_historical_bundle(
    canonical_db_path: &Path,
) -> Result<Option<HistoricalSalvageOutcome>> {
    let ordered_bundles = discover_historical_database_bundles(canonical_db_path);
    let mut last_seed_error: Option<anyhow::Error> = None;
    for bundle in ordered_bundles {
        if let Some(version) = bundle.probe.schema_version
            && version < 13
        {
            let err = anyhow!(
                "historical bundle {} schema_version {version} is too old for baseline import",
                bundle.root_path.display()
            );
            tracing::warn!(
                path = %bundle.root_path.display(),
                schema_version = version,
                "historical bundle is too old for baseline seed import"
            );
            last_seed_error = Some(err);
            continue;
        }

        let opened = open_historical_bundle_for_salvage(&bundle).with_context(|| {
            format!(
                "opening historical seed bundle {} for baseline import",
                bundle.root_path.display()
            )
        });
        let source = match opened {
            Ok(source) => source,
            Err(err) => {
                tracing::warn!(
                    path = %bundle.root_path.display(),
                    error = %err,
                    "opening historical seed bundle failed; trying next candidate"
                );
                last_seed_error = Some(err);
                continue;
            }
        };

        let (conversations_imported, messages_imported) =
            match historical_bundle_counts(&source.conn) {
                Ok(counts) => counts,
                Err(err) => {
                    tracing::warn!(
                        path = %bundle.root_path.display(),
                        error = %err,
                        "counting rows in historical seed bundle failed; trying next candidate"
                    );
                    last_seed_error = Some(err);
                    continue;
                }
            };

        // Stage from the path actually backing the connection: this is the
        // recovered copy when the bundle had to go through `.recover`.
        let staged_seed = match stage_historical_bundle_for_seed(
            canonical_db_path,
            &source.root_path,
        ) {
            Ok(staged_seed) => staged_seed,
            Err(err) => {
                tracing::warn!(
                    path = %bundle.root_path.display(),
                    error = %err,
                    "bulk baseline seed staging from historical bundle failed; trying next candidate"
                );
                last_seed_error = Some(err);
                continue;
            }
        };
        if let Err(err) = finalize_seeded_canonical_bundle_via_rusqlite(
            &staged_seed.db_path,
            &bundle,
            conversations_imported,
            messages_imported,
        ) {
            tracing::warn!(
                path = %bundle.root_path.display(),
                error = %err,
                "finalizing staged historical seed import failed; trying next candidate"
            );
            last_seed_error = Some(err);
            continue;
        }
        if let Err(err) = promote_staged_historical_seed(canonical_db_path, &staged_seed) {
            tracing::warn!(
                path = %bundle.root_path.display(),
                error = %err,
                "promoting staged historical seed import failed; trying next candidate"
            );
            last_seed_error = Some(err);
            continue;
        }
        tracing::info!(
            path = %bundle.root_path.display(),
            conversations_imported,
            messages_imported,
            "seeded empty canonical database from largest healthy historical bundle"
        );
        return Ok(Some(HistoricalSalvageOutcome {
            bundles_considered: 0,
            bundles_imported: 1,
            conversations_imported,
            messages_imported,
        }));
    }
    if let Some(err) = last_seed_error {
        return Err(err);
    }
    Ok(None)
}
/// Parses an optional JSON text column, yielding `Null` when it is absent or invalid.
fn parse_json_column(value: Option<String>) -> serde_json::Value {
    match value {
        Some(raw) => serde_json::from_str(&raw).unwrap_or(serde_json::Value::Null),
        None => serde_json::Value::Null,
    }
}
/// Key of the single-entry object that carries unparsed historical JSON text.
const HISTORICAL_RAW_JSON_SENTINEL_KEY: &str = "__cass_historical_raw_json__";

/// Wraps raw JSON text in a one-key object under `HISTORICAL_RAW_JSON_SENTINEL_KEY`.
fn wrap_historical_raw_json(raw: String) -> serde_json::Value {
    let mut envelope = serde_json::Map::new();
    envelope.insert(
        HISTORICAL_RAW_JSON_SENTINEL_KEY.to_owned(),
        serde_json::Value::String(raw),
    );
    serde_json::Value::Object(envelope)
}
/// Extracts the raw JSON text from a sentinel wrapper.
///
/// Returns `Some` only when `value` is an object with exactly one entry whose
/// key is `HISTORICAL_RAW_JSON_SENTINEL_KEY` and whose value is a string.
fn historical_raw_json(value: &serde_json::Value) -> Option<&str> {
    let map = value.as_object()?;
    if map.len() != 1 {
        return None;
    }
    map.get(HISTORICAL_RAW_JSON_SENTINEL_KEY)?.as_str()
}
fn parse_historical_json_column(value: Option<String>) -> serde_json::Value {
match value {
Some(raw) if raw.trim().is_empty() => serde_json::Value::Null,
Some(raw) => wrap_historical_raw_json(raw),
None => serde_json::Value::Null,
}
}
/// Reports whether the `CASS_DEBUG_HISTORICAL_SALVAGE` environment variable is set (to any value).
fn historical_salvage_debug_enabled() -> bool {
    let flag = std::env::var_os("CASS_DEBUG_HISTORICAL_SALVAGE");
    flag.is_some()
}
/// Size limits for one historical import batch, as chosen by `historical_import_batch_limits`.
///
/// NOTE(review): how the limits are enforced is outside this chunk.
#[derive(Debug, Clone, Copy)]
struct HistoricalImportBatchLimits {
/// Limit on conversations per batch.
conversations: usize,
/// Limit on messages per batch.
messages: usize,
/// Limit on payload size per batch.
payload_chars: usize,
}
/// Reads `key` through `dotenvy` and returns it when it parses as a `usize` greater than 0.
fn env_positive_usize(key: &str) -> Option<usize> {
    let raw = dotenvy::var(key).ok()?;
    match raw.parse::<usize>() {
        Ok(parsed) if parsed > 0 => Some(parsed),
        _ => None,
    }
}
/// Chooses batch limits for historical imports.
///
/// Defaults depend on the available parallelism (falling back to 1 when it is
/// unknown): machines with at least 32 CPUs get the larger limits. Each limit
/// can be overridden with a positive integer in
/// `CASS_HISTORICAL_IMPORT_BATCH_CONVERSATIONS`,
/// `CASS_HISTORICAL_IMPORT_BATCH_MESSAGES` or
/// `CASS_HISTORICAL_IMPORT_BATCH_CHARS`.
fn historical_import_batch_limits() -> HistoricalImportBatchLimits {
    let cpu_count = match std::thread::available_parallelism() {
        Ok(count) => count.get(),
        Err(_) => 1,
    };
    let (default_conversations, default_messages, default_payload_chars) = if cpu_count >= 32 {
        (128, 16_384, 12_000_000)
    } else {
        (32, 4_096, 3_000_000)
    };

    let conversations = env_positive_usize("CASS_HISTORICAL_IMPORT_BATCH_CONVERSATIONS")
        .unwrap_or(default_conversations);
    let messages =
        env_positive_usize("CASS_HISTORICAL_IMPORT_BATCH_MESSAGES").unwrap_or(default_messages);
    let payload_chars =
        env_positive_usize("CASS_HISTORICAL_IMPORT_BATCH_CHARS").unwrap_or(default_payload_chars);

    HistoricalImportBatchLimits {
        conversations,
        messages,
        payload_chars,
    }
}
/// Estimates the serialized size of a JSON value in bytes.
///
/// Sentinel-wrapped raw JSON reports the length of the raw text, `Null`
/// reports 0, and any other value reports the length of its serialized form
/// (0 if serialization fails).
fn json_value_size_hint(value: &serde_json::Value) -> usize {
    if let Some(raw) = historical_raw_json(value) {
        return raw.len();
    }
    if value.is_null() {
        return 0;
    }
    serde_json::to_string(value).map_or(0, |encoded| encoded.len())
}
/// Estimates a message's payload size: content length plus the size hint of its extra JSON.
///
/// The sum saturates instead of overflowing.
fn message_payload_size_hint(message: &Message) -> usize {
    let content_len = message.content.len();
    let extra_len = json_value_size_hint(&message.extra_json);
    content_len.saturating_add(extra_len)
}
/// Reports whether `name` is the main file of a backup bundle.
///
/// The name must start with `prefix` and must not be a database sidecar. All
/// sidecar kinds known to `has_db_sidecar_suffix` are excluded, so a backup's
/// lock files are not mistaken for separate backups when old backups are
/// counted and pruned.
fn is_backup_root_name(name: &str, prefix: &str) -> bool {
    name.starts_with(prefix) && !has_db_sidecar_suffix(name)
}

/// Reports whether `name` ends with one of the database sidecar suffixes:
/// `-wal`, `-shm`, `-lock-shared`, `-lock-reserved` or `-lock-pending`.
fn has_db_sidecar_suffix(name: &str) -> bool {
    const SIDECAR_SUFFIXES: &[&str] = &[
        "-wal",
        "-shm",
        "-lock-shared",
        "-lock-reserved",
        "-lock-pending",
    ];
    SIDECAR_SUFFIXES.iter().any(|suffix| name.ends_with(suffix))
}
/// Schema version this build reads and writes; stored in `meta` under `schema_version`.
pub const CURRENT_SCHEMA_VERSION: i64 = 20;
/// Oldest schema version `check_schema_compatibility` reports as migratable in place;
/// older databases are reported as needing a rebuild.
const MIN_IN_PLACE_MIGRATION_SCHEMA_VERSION: i64 = 13;
/// Result of comparing an on-disk database's schema with the version this build expects.
#[derive(Debug, Clone)]
pub enum SchemaCheck {
/// The stored version equals the expected version.
Compatible,
/// The database has no tables at all, or its version lies in the in-place migration range.
NeedsMigration,
/// The database cannot be migrated in place; the string explains why.
NeedsRebuild(String),
}
fn schema_check_error_requires_rebuild(err: &frankensqlite::FrankenError) -> bool {
matches!(
err,
frankensqlite::FrankenError::DatabaseCorrupt { .. }
| frankensqlite::FrankenError::WalCorrupt { .. }
| frankensqlite::FrankenError::NotADatabase { .. }
| frankensqlite::FrankenError::ShortRead { .. }
)
}
/// Builds a fresh backup path next to `path`.
///
/// The file name has the form `<name>.backup.<pid>.<nanos>.<nonce>`, where
/// `<nanos>` is the current time in nanoseconds since the Unix epoch (0 if the
/// clock is before the epoch) and `<nonce>` is a process-wide counter. A
/// non-UTF-8 or missing file name falls back to `db`.
fn unique_backup_path(path: &Path) -> PathBuf {
    static NEXT_NONCE: std::sync::atomic::AtomicU64 = std::sync::atomic::AtomicU64::new(0);

    let nanos = match SystemTime::now().duration_since(UNIX_EPOCH) {
        Ok(elapsed) => elapsed.as_nanos(),
        Err(_) => 0,
    };
    let nonce = NEXT_NONCE.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
    let pid = std::process::id();
    let base_name = path.file_name().and_then(|n| n.to_str()).unwrap_or("db");

    path.with_file_name(format!("{base_name}.backup.{pid}.{nanos}.{nonce}"))
}
/// Derives the hidden staging path used while a backup is being written.
///
/// The result sits next to `backup_path` and is named
/// `.<file name>.vacuum-in-progress`; a non-UTF-8 or missing file name falls
/// back to `db.backup`.
fn vacuum_stage_backup_path(backup_path: &Path) -> PathBuf {
    let base_name = match backup_path.file_name().and_then(|name| name.to_str()) {
        Some(name) => name,
        None => "db.backup",
    };
    let staged_name = format!(".{base_name}.vacuum-in-progress");
    backup_path.with_file_name(staged_name)
}
/// Opens the database at `path` read-only and classifies its schema version.
///
/// * No `meta` table and no tables at all → `NeedsMigration` (fresh database).
/// * No `meta` table but other tables exist → `NeedsRebuild`.
/// * Version equal to `SCHEMA_VERSION` → `Compatible`.
/// * Version in `MIN_IN_PLACE_MIGRATION_SCHEMA_VERSION..SCHEMA_VERSION` →
///   `NeedsMigration`.
/// * Version above `SCHEMA_VERSION` → `NeedsRebuild` ("newer than supported").
/// * Any other version, including zero and negative values → `NeedsRebuild`
///   ("too old").
/// * Missing, unreadable or non-numeric version → `NeedsRebuild`.
///
/// The connection is always closed before returning, falling back to a
/// best-effort close when the regular close fails.
///
/// # Errors
/// Returns the underlying error when the database cannot be opened or a
/// `sqlite_master` query fails.
fn check_schema_compatibility(
    path: &Path,
) -> std::result::Result<SchemaCheck, frankensqlite::FrankenError> {
    let mut conn = open_franken_with_flags(
        &path.to_string_lossy(),
        FrankenOpenFlags::SQLITE_OPEN_READ_ONLY,
    )?;
    // Run the checks in a closure so the connection is closed on every path.
    let result = (|| {
        let meta_exists: i32 = conn.query_row_map(
            "SELECT COUNT(*) FROM sqlite_master WHERE type='table' AND name='meta'",
            fparams![],
            |row| row.get_typed(0),
        )?;
        if meta_exists == 0 {
            let table_count: i32 = conn.query_row_map(
                "SELECT COUNT(*) FROM sqlite_master WHERE type='table'",
                fparams![],
                |row| row.get_typed(0),
            )?;
            if table_count == 0 {
                return Ok(SchemaCheck::NeedsMigration);
            }
            return Ok(SchemaCheck::NeedsRebuild(
                "Database missing schema version metadata".to_string(),
            ));
        }
        // A failed read and an unparsable value both collapse to `None`.
        let version: Option<i64> = conn
            .query_row_map(
                "SELECT value FROM meta WHERE key = 'schema_version'",
                fparams![],
                |row| Ok(row.get_typed::<String>(0)?.parse().ok()),
            )
            .ok()
            .flatten();
        match version {
            Some(v) if v == SCHEMA_VERSION => Ok(SchemaCheck::Compatible),
            Some(v) if (MIN_IN_PLACE_MIGRATION_SCHEMA_VERSION..SCHEMA_VERSION).contains(&v) => {
                Ok(SchemaCheck::NeedsMigration)
            }
            Some(v) if v > SCHEMA_VERSION => Ok(SchemaCheck::NeedsRebuild(format!(
                "Schema version {} is newer than supported version {}",
                v, SCHEMA_VERSION
            ))),
            // Only versions below the migration floor remain, zero and
            // negative values included; they must not be reported as "newer".
            Some(v) => Ok(SchemaCheck::NeedsRebuild(format!(
                "Schema version {} is too old for in-place migration; supported upgrade path starts at version {}",
                v, MIN_IN_PLACE_MIGRATION_SCHEMA_VERSION
            ))),
            None => Ok(SchemaCheck::NeedsRebuild(
                "Schema version not found or invalid".to_string(),
            )),
        }
    })();
    if let Err(close_err) = conn.close_in_place() {
        tracing::warn!(
            error = %close_err,
            db_path = %path.display(),
            "check_schema_compatibility: close_in_place failed; falling back to best-effort close"
        );
        conn.close_best_effort_in_place();
    }
    result
}
/// Schema version this build treats as current; an alias of `CURRENT_SCHEMA_VERSION`.
const SCHEMA_VERSION: i64 = CURRENT_SCHEMA_VERSION;
/// Test-only SQL for schema V1: enables foreign keys and creates the base
/// tables (`meta`, `agents`, `workspaces`, `conversations`, `messages`,
/// `snippets`, `tags`, `conversation_tags`) plus two lookup indexes.
#[cfg(test)]
const MIGRATION_V1: &str = r"
PRAGMA foreign_keys = ON;
CREATE TABLE IF NOT EXISTS meta (
key TEXT PRIMARY KEY,
value TEXT NOT NULL
);
CREATE TABLE IF NOT EXISTS agents (
id INTEGER PRIMARY KEY,
slug TEXT NOT NULL UNIQUE,
name TEXT NOT NULL,
version TEXT,
kind TEXT NOT NULL,
created_at INTEGER NOT NULL,
updated_at INTEGER NOT NULL
);
CREATE TABLE IF NOT EXISTS workspaces (
id INTEGER PRIMARY KEY,
path TEXT NOT NULL UNIQUE,
display_name TEXT
);
CREATE TABLE IF NOT EXISTS conversations (
id INTEGER PRIMARY KEY,
agent_id INTEGER NOT NULL REFERENCES agents(id),
workspace_id INTEGER REFERENCES workspaces(id),
external_id TEXT,
title TEXT,
source_path TEXT NOT NULL,
started_at INTEGER,
ended_at INTEGER,
approx_tokens INTEGER,
metadata_json TEXT,
UNIQUE(agent_id, external_id)
);
CREATE TABLE IF NOT EXISTS messages (
id INTEGER PRIMARY KEY,
conversation_id INTEGER NOT NULL REFERENCES conversations(id) ON DELETE CASCADE,
idx INTEGER NOT NULL,
role TEXT NOT NULL,
author TEXT,
created_at INTEGER,
content TEXT NOT NULL,
extra_json TEXT,
UNIQUE(conversation_id, idx)
);
CREATE TABLE IF NOT EXISTS snippets (
id INTEGER PRIMARY KEY,
message_id INTEGER NOT NULL REFERENCES messages(id) ON DELETE CASCADE,
file_path TEXT,
start_line INTEGER,
end_line INTEGER,
language TEXT,
snippet_text TEXT
);
CREATE TABLE IF NOT EXISTS tags (
id INTEGER PRIMARY KEY,
name TEXT NOT NULL UNIQUE
);
CREATE TABLE IF NOT EXISTS conversation_tags (
conversation_id INTEGER NOT NULL REFERENCES conversations(id) ON DELETE CASCADE,
tag_id INTEGER NOT NULL REFERENCES tags(id) ON DELETE CASCADE,
PRIMARY KEY (conversation_id, tag_id)
);
CREATE INDEX IF NOT EXISTS idx_conversations_agent_started
ON conversations(agent_id, started_at DESC);
CREATE INDEX IF NOT EXISTS idx_messages_conv_idx
ON messages(conversation_id, idx);
";
/// Test-only SQL for schema V2: creates the `fts_messages` FTS5 table (porter
/// tokenizer) if absent and backfills it from `messages` joined with
/// `conversations`, `agents` and (optionally) `workspaces`.
#[cfg(test)]
const MIGRATION_V2: &str = r"
CREATE VIRTUAL TABLE IF NOT EXISTS fts_messages USING fts5(
content,
title,
agent,
workspace,
source_path,
created_at UNINDEXED,
message_id UNINDEXED,
tokenize='porter'
);
INSERT INTO fts_messages(content, title, agent, workspace, source_path, created_at, message_id)
SELECT
m.content,
c.title,
a.slug,
w.path,
c.source_path,
m.created_at,
m.id
FROM messages m
JOIN conversations c ON m.conversation_id = c.id
JOIN agents a ON c.agent_id = a.id
LEFT JOIN workspaces w ON c.workspace_id = w.id;
";
/// Test-only SQL for schema V3: drops `fts_messages`, recreates it with the
/// same column layout as V2, and repopulates it from the relational tables.
/// Marked `dead_code`-tolerant because not every test build references it.
#[cfg(test)]
#[allow(dead_code)]
const MIGRATION_V3: &str = r"
DROP TABLE IF EXISTS fts_messages;
CREATE VIRTUAL TABLE fts_messages USING fts5(
content,
title,
agent,
workspace,
source_path,
created_at UNINDEXED,
message_id UNINDEXED,
tokenize='porter'
);
INSERT INTO fts_messages(content, title, agent, workspace, source_path, created_at, message_id)
SELECT
m.content,
c.title,
a.slug,
w.path,
c.source_path,
m.created_at,
m.id
FROM messages m
JOIN conversations c ON m.conversation_id = c.id
JOIN agents a ON c.agent_id = a.id
LEFT JOIN workspaces w ON c.workspace_id = w.id;
";
/// Test-only SQL for schema V4: creates the `sources` table and inserts the
/// default `'local'` source row if it is not already present.
#[cfg(test)]
const MIGRATION_V4: &str = r"
-- Sources table for tracking where conversations come from
CREATE TABLE IF NOT EXISTS sources (
id TEXT PRIMARY KEY, -- source_id (e.g., 'local', 'work-laptop')
kind TEXT NOT NULL, -- 'local', 'ssh', etc.
host_label TEXT, -- display label
machine_id TEXT, -- optional stable machine id
platform TEXT, -- 'macos', 'linux', 'windows'
config_json TEXT, -- JSON blob for extra config (SSH params, path rewrites)
created_at INTEGER NOT NULL,
updated_at INTEGER NOT NULL
);
-- Bootstrap: Insert the default 'local' source
INSERT OR IGNORE INTO sources (id, kind, host_label, created_at, updated_at)
VALUES ('local', 'local', NULL, strftime('%s','now')*1000, strftime('%s','now')*1000);
";
/// Test-only SQL for schema V5: rebuilds `conversations` with `source_id` and
/// `origin_host` columns and a `UNIQUE(source_id, agent_id, external_id)`
/// constraint, copying every existing row with `source_id = 'local'`, then
/// recreates the conversation indexes.
#[cfg(test)]
const MIGRATION_V5: &str = r"
-- Add provenance columns to conversations table
-- SQLite cannot alter unique constraints, so we need to recreate the table
-- Create new table with provenance columns and updated unique constraint
CREATE TABLE conversations_new (
id INTEGER PRIMARY KEY,
agent_id INTEGER NOT NULL REFERENCES agents(id),
workspace_id INTEGER REFERENCES workspaces(id),
source_id TEXT NOT NULL DEFAULT 'local' REFERENCES sources(id),
external_id TEXT,
title TEXT,
source_path TEXT NOT NULL,
started_at INTEGER,
ended_at INTEGER,
approx_tokens INTEGER,
metadata_json TEXT,
origin_host TEXT,
UNIQUE(source_id, agent_id, external_id)
);
-- Copy data from old table (all existing conversations get source_id='local')
INSERT INTO conversations_new (id, agent_id, workspace_id, source_id, external_id, title,
source_path, started_at, ended_at, approx_tokens, metadata_json, origin_host)
SELECT id, agent_id, workspace_id, 'local', external_id, title,
source_path, started_at, ended_at, approx_tokens, metadata_json, NULL
FROM conversations;
-- Drop old table and rename new
DROP TABLE conversations;
ALTER TABLE conversations_new RENAME TO conversations;
-- Recreate indexes
CREATE INDEX IF NOT EXISTS idx_conversations_agent_started ON conversations(agent_id, started_at DESC);
CREATE INDEX IF NOT EXISTS idx_conversations_source_id ON conversations(source_id);
";
/// Test-only SQL for schema V6: adds an index on `conversations(source_path)`.
#[cfg(test)]
const MIGRATION_V6: &str = r"
-- Optimize lookup by source_path (used by TUI detail view)
CREATE INDEX IF NOT EXISTS idx_conversations_source_path ON conversations(source_path);
";
/// Test-only SQL for schema V7: adds the BLOB columns
/// `conversations.metadata_bin` and `messages.extra_bin`.
#[cfg(test)]
const MIGRATION_V7: &str = r"
-- Add binary columns for MessagePack serialization (Opt 3.1)
-- Binary format is 50-70% smaller than JSON and faster to parse
ALTER TABLE conversations ADD COLUMN metadata_bin BLOB;
ALTER TABLE messages ADD COLUMN extra_bin BLOB;
";
/// Test-only SQL for schema V8: creates the `daily_stats` rollup table keyed
/// by `(day_id, agent_slug, source_id)` and two secondary indexes on it.
#[cfg(test)]
const MIGRATION_V8: &str = r"
-- Opt 3.2: Daily stats materialized table for O(1) time-range histograms
-- Provides fast aggregated queries for stats/dashboard without full table scans
CREATE TABLE IF NOT EXISTS daily_stats (
day_id INTEGER NOT NULL, -- Days since 2020-01-01 (Unix epoch + offset)
agent_slug TEXT NOT NULL, -- 'all' for totals, or specific agent slug
source_id TEXT NOT NULL DEFAULT 'all', -- 'all' for totals, or specific source
session_count INTEGER NOT NULL DEFAULT 0,
message_count INTEGER NOT NULL DEFAULT 0,
total_chars INTEGER NOT NULL DEFAULT 0,
last_updated INTEGER NOT NULL,
PRIMARY KEY (day_id, agent_slug, source_id)
);
CREATE INDEX IF NOT EXISTS idx_daily_stats_agent ON daily_stats(agent_slug, day_id);
CREATE INDEX IF NOT EXISTS idx_daily_stats_source ON daily_stats(source_id, day_id);
";
/// Test-only SQL for schema V9: creates the `embedding_jobs` table and a
/// partial unique index that allows at most one `pending`/`running` job per
/// `(db_path, model_id)`.
#[cfg(test)]
const MIGRATION_V9: &str = r"
-- Background embedding jobs tracking table
CREATE TABLE IF NOT EXISTS embedding_jobs (
id INTEGER PRIMARY KEY AUTOINCREMENT,
db_path TEXT NOT NULL,
model_id TEXT NOT NULL,
status TEXT NOT NULL DEFAULT 'pending',
total_docs INTEGER NOT NULL DEFAULT 0,
completed_docs INTEGER NOT NULL DEFAULT 0,
error_message TEXT,
created_at TEXT NOT NULL DEFAULT (datetime('now')),
started_at TEXT,
completed_at TEXT
);
-- Only one pending or running job per (db_path, model_id) at a time.
-- Multiple completed/failed/cancelled jobs are allowed for history.
CREATE UNIQUE INDEX IF NOT EXISTS idx_embedding_jobs_active
ON embedding_jobs(db_path, model_id)
WHERE status IN ('pending', 'running');
";
/// Test-only SQL for schema V10: creates the token analytics tables
/// (`token_usage`, `token_daily_stats`, `model_pricing`) with their indexes,
/// seeds `model_pricing`, and adds token summary columns to `conversations`.
#[cfg(test)]
const MIGRATION_V10: &str = r"
-- Token analytics: per-message token usage ledger
CREATE TABLE IF NOT EXISTS token_usage (
id INTEGER PRIMARY KEY AUTOINCREMENT,
message_id INTEGER NOT NULL REFERENCES messages(id) ON DELETE CASCADE,
conversation_id INTEGER NOT NULL,
agent_id INTEGER NOT NULL,
workspace_id INTEGER,
source_id TEXT NOT NULL DEFAULT 'local',
-- Timing
timestamp_ms INTEGER NOT NULL,
day_id INTEGER NOT NULL,
-- Model identification
model_name TEXT,
model_family TEXT,
model_tier TEXT,
service_tier TEXT,
provider TEXT,
-- Token counts (nullable — not all agents provide all fields)
input_tokens INTEGER,
output_tokens INTEGER,
cache_read_tokens INTEGER,
cache_creation_tokens INTEGER,
thinking_tokens INTEGER,
total_tokens INTEGER,
-- Cost estimation
estimated_cost_usd REAL,
-- Message context
role TEXT NOT NULL,
content_chars INTEGER NOT NULL,
has_tool_calls INTEGER NOT NULL DEFAULT 0,
tool_call_count INTEGER NOT NULL DEFAULT 0,
-- Data quality
data_source TEXT NOT NULL DEFAULT 'api',
UNIQUE(message_id)
);
CREATE INDEX IF NOT EXISTS idx_token_usage_day ON token_usage(day_id, agent_id);
CREATE INDEX IF NOT EXISTS idx_token_usage_conv ON token_usage(conversation_id);
CREATE INDEX IF NOT EXISTS idx_token_usage_model ON token_usage(model_family, day_id);
CREATE INDEX IF NOT EXISTS idx_token_usage_workspace ON token_usage(workspace_id, day_id);
CREATE INDEX IF NOT EXISTS idx_token_usage_timestamp ON token_usage(timestamp_ms);
-- Token analytics: pre-aggregated daily rollups
CREATE TABLE IF NOT EXISTS token_daily_stats (
day_id INTEGER NOT NULL,
agent_slug TEXT NOT NULL,
source_id TEXT NOT NULL DEFAULT 'all',
model_family TEXT NOT NULL DEFAULT 'all',
api_call_count INTEGER NOT NULL DEFAULT 0,
user_message_count INTEGER NOT NULL DEFAULT 0,
assistant_message_count INTEGER NOT NULL DEFAULT 0,
tool_message_count INTEGER NOT NULL DEFAULT 0,
total_input_tokens INTEGER NOT NULL DEFAULT 0,
total_output_tokens INTEGER NOT NULL DEFAULT 0,
total_cache_read_tokens INTEGER NOT NULL DEFAULT 0,
total_cache_creation_tokens INTEGER NOT NULL DEFAULT 0,
total_thinking_tokens INTEGER NOT NULL DEFAULT 0,
grand_total_tokens INTEGER NOT NULL DEFAULT 0,
total_content_chars INTEGER NOT NULL DEFAULT 0,
total_tool_calls INTEGER NOT NULL DEFAULT 0,
estimated_cost_usd REAL NOT NULL DEFAULT 0.0,
session_count INTEGER NOT NULL DEFAULT 0,
last_updated INTEGER NOT NULL,
PRIMARY KEY (day_id, agent_slug, source_id, model_family)
);
CREATE INDEX IF NOT EXISTS idx_token_daily_stats_agent ON token_daily_stats(agent_slug, day_id);
CREATE INDEX IF NOT EXISTS idx_token_daily_stats_model ON token_daily_stats(model_family, day_id);
-- Model pricing lookup table
CREATE TABLE IF NOT EXISTS model_pricing (
model_pattern TEXT NOT NULL,
provider TEXT NOT NULL,
input_cost_per_mtok REAL NOT NULL,
output_cost_per_mtok REAL NOT NULL,
cache_read_cost_per_mtok REAL,
cache_creation_cost_per_mtok REAL,
effective_date TEXT NOT NULL,
PRIMARY KEY (model_pattern, effective_date)
);
-- Seed with current pricing (as of 2026-02)
INSERT OR IGNORE INTO model_pricing VALUES
('claude-opus-4%', 'anthropic', 15.0, 75.0, 1.5, 18.75, '2025-10-01'),
('claude-sonnet-4%', 'anthropic', 3.0, 15.0, 0.3, 3.75, '2025-10-01'),
('claude-haiku-4%', 'anthropic', 0.80, 4.0, 0.08, 1.0, '2025-10-01'),
('gpt-4o%', 'openai', 2.50, 10.0, NULL, NULL, '2025-01-01'),
('gpt-4-turbo%', 'openai', 10.0, 30.0, NULL, NULL, '2024-04-01'),
('gpt-4.1%', 'openai', 2.0, 8.0, NULL, NULL, '2025-04-01'),
('o3%', 'openai', 2.0, 8.0, NULL, NULL, '2025-04-01'),
('o4-mini%', 'openai', 1.10, 4.40, NULL, NULL, '2025-04-01'),
('gemini-2%flash%', 'google', 0.075, 0.30, NULL, NULL, '2025-01-01'),
('gemini-2%pro%', 'google', 1.25, 10.0, NULL, NULL, '2025-01-01');
-- Extend conversations table with token summary columns
ALTER TABLE conversations ADD COLUMN total_input_tokens INTEGER;
ALTER TABLE conversations ADD COLUMN total_output_tokens INTEGER;
ALTER TABLE conversations ADD COLUMN total_cache_read_tokens INTEGER;
ALTER TABLE conversations ADD COLUMN total_cache_creation_tokens INTEGER;
ALTER TABLE conversations ADD COLUMN grand_total_tokens INTEGER;
ALTER TABLE conversations ADD COLUMN estimated_cost_usd REAL;
ALTER TABLE conversations ADD COLUMN primary_model TEXT;
ALTER TABLE conversations ADD COLUMN api_call_count INTEGER;
ALTER TABLE conversations ADD COLUMN tool_call_count INTEGER;
ALTER TABLE conversations ADD COLUMN user_message_count INTEGER;
ALTER TABLE conversations ADD COLUMN assistant_message_count INTEGER;
";
/// SQL for schema V14: drops `fts_messages` so no stale virtual-table
/// definition remains; the SQL's own comments explain why and when the table
/// is recreated.
const MIGRATION_V14: &str = r"
-- Switch FTS5 from internal-content to contentless mode (CASS #163).
-- Drop the old V13 internal-content fts_messages first so that
-- sqlite_schema does not contain two conflicting CREATE VIRTUAL TABLE
-- entries, which makes the database completely unreadable.
-- The current contentless table is recreated lazily after open() only when the
-- frankensqlite FTS consistency check finds it missing or malformed.
DROP TABLE IF EXISTS fts_messages;
";
/// SQL that creates the `conversation_tail_state` table only (no backfill).
/// The same table definition also appears at the top of `MIGRATION_V18`.
const MIGRATION_V15_TAIL_STATE_TABLE: &str = r"
CREATE TABLE IF NOT EXISTS conversation_tail_state (
-- Deliberately no FOREIGN KEY: this hot row is maintained by insert/append
-- paths, and FK metadata keeps frankensqlite off the direct rowid update path.
conversation_id INTEGER PRIMARY KEY,
ended_at INTEGER,
last_message_idx INTEGER,
last_message_created_at INTEGER
);
";
/// SQL for schema V16: drops the redundant `idx_messages_conv_idx` index.
const MIGRATION_V16: &str = r"
-- UNIQUE(conversation_id, idx) already creates sqlite_autoindex_messages_1,
-- which covers the same lookup/order key as idx_messages_conv_idx. Keeping both
-- doubles message insert index maintenance on the hot indexing path.
DROP INDEX IF EXISTS idx_messages_conv_idx;
";
/// SQL for schema V17: drops the `idx_messages_created` index.
const MIGRATION_V17: &str = r"
-- Drop the global messages(created_at) secondary index from the ingest hot
-- path. Search/time filters are served by the derived search layer and
-- conversation/analytics indexes, while this index is maintained on every
-- message insert.
DROP INDEX IF EXISTS idx_messages_created;
";
/// SQL for schema V18: creates `conversation_tail_state` if missing and
/// backfills it from every `conversations` row that has at least one non-NULL
/// tail column (`ended_at`, `last_message_idx`, `last_message_created_at`).
const MIGRATION_V18: &str = r"
-- Move append-tail state out of the wide, indexed conversations row. The hot
-- append path updates this cache for every appended conversation; keeping it in
-- a tiny rowid table avoids rewriting the large conversation record.
CREATE TABLE IF NOT EXISTS conversation_tail_state (
-- Deliberately no FOREIGN KEY: this hot row is maintained by insert/append
-- paths, and FK metadata keeps frankensqlite off the direct rowid update path.
conversation_id INTEGER PRIMARY KEY,
ended_at INTEGER,
last_message_idx INTEGER,
last_message_created_at INTEGER
);
INSERT OR REPLACE INTO conversation_tail_state (
conversation_id, ended_at, last_message_idx, last_message_created_at
)
SELECT id, ended_at, last_message_idx, last_message_created_at
FROM conversations
WHERE ended_at IS NOT NULL
OR last_message_idx IS NOT NULL
OR last_message_created_at IS NOT NULL;
";
/// SQL for schema V19: creates `conversation_external_lookup` and backfills it
/// for every conversation with a non-NULL `external_id`. The key has the form
/// `<len(source_id)>:<source_id>:<agent_id>:<len(external_id)>:<external_id>`.
///
/// NOTE(review): SQLite `length()` counts characters for TEXT values; confirm
/// the Rust-side key builder uses the same length semantics for non-ASCII ids.
const MIGRATION_V19: &str = r"
-- Materialize external conversation provenance into one compact lookup key.
-- This keeps the hot append/new-conversation probe on a single primary-key
-- lookup instead of a composite conversations-table predicate.
CREATE TABLE IF NOT EXISTS conversation_external_lookup (
lookup_key TEXT PRIMARY KEY,
conversation_id INTEGER NOT NULL
);
INSERT OR REPLACE INTO conversation_external_lookup (lookup_key, conversation_id)
SELECT
CAST(length(source_id) AS TEXT) || ':' || source_id || ':' ||
CAST(agent_id AS TEXT) || ':' ||
CAST(length(external_id) AS TEXT) || ':' || external_id,
id
FROM conversations
WHERE external_id IS NOT NULL;
";
/// SQL for schema V20: creates `conversation_external_tail_lookup`, which
/// stores the V19-style lookup key together with the tail columns copied from
/// `conversation_tail_state` (NULL when no tail row exists), and backfills it
/// for every conversation with a non-NULL `external_id`.
///
/// NOTE(review): the key uses SQLite `length()`, which counts characters for
/// TEXT values; confirm the Rust-side key builder agrees for non-ASCII ids.
const MIGRATION_V20: &str = r"
-- Fuse external conversation lookup with append-tail state. Append-heavy
-- workloads can resolve both the conversation id and tail plan from one
-- primary-key probe.
CREATE TABLE IF NOT EXISTS conversation_external_tail_lookup (
lookup_key TEXT PRIMARY KEY,
conversation_id INTEGER NOT NULL,
ended_at INTEGER,
last_message_idx INTEGER,
last_message_created_at INTEGER
);
INSERT OR REPLACE INTO conversation_external_tail_lookup (
lookup_key,
conversation_id,
ended_at,
last_message_idx,
last_message_created_at
)
SELECT
CAST(length(c.source_id) AS TEXT) || ':' || c.source_id || ':' ||
CAST(c.agent_id AS TEXT) || ':' ||
CAST(length(c.external_id) AS TEXT) || ':' || c.external_id,
c.id,
(SELECT ts.ended_at
FROM conversation_tail_state ts
WHERE ts.conversation_id = c.id),
(SELECT ts.last_message_idx
FROM conversation_tail_state ts
WHERE ts.conversation_id = c.id),
(SELECT ts.last_message_created_at
FROM conversation_tail_state ts
WHERE ts.conversation_id = c.id)
FROM conversations c
WHERE c.external_id IS NOT NULL;
";
/// Plain-data view of an embedding job. The field names match the columns of
/// the `embedding_jobs` table as declared in `MIGRATION_V9`; timestamps are
/// carried as strings.
#[derive(Debug, Clone)]
pub struct EmbeddingJobRow {
pub id: i64,
pub db_path: String,
pub model_id: String,
// Free-form status text; the V9 schema mentions 'pending' and 'running'.
pub status: String,
pub total_docs: i64,
pub completed_docs: i64,
pub error_message: Option<String>,
pub created_at: String,
pub started_at: Option<String>,
pub completed_at: Option<String>,
}
/// Conversation-level fields used by the lexical rebuild path.
/// NOTE(review): purpose inferred from the type name; confirm against the
/// queries that populate this row.
#[derive(Debug, Clone)]
pub struct LexicalRebuildConversationRow {
// Optional row id; `None` is representable, the producing query decides when.
pub id: Option<i64>,
pub agent_slug: String,
pub workspace: Option<PathBuf>,
pub external_id: Option<String>,
pub title: Option<String>,
pub source_path: PathBuf,
pub started_at: Option<i64>,
pub ended_at: Option<i64>,
pub source_id: String,
pub origin_host: Option<String>,
}
/// Size summary for one conversation: how many messages it has and how many
/// bytes they occupy. `lexical_rebuild_conversation_footprint_from_count`
/// fills `message_bytes` with an estimate rather than a measured value.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct LexicalRebuildConversationFootprintRow {
pub conversation_id: i64,
pub message_count: usize,
pub message_bytes: usize,
}
/// Assumed size of one message (4 KiB), used to estimate `message_bytes` from
/// a message count.
pub(crate) const LEXICAL_REBUILD_PLANNER_ESTIMATED_BYTES_PER_MESSAGE: usize = 4 * 1024;
/// Largest number of conversations that may lack tail metadata while coverage
/// is still considered sufficient.
const LEXICAL_REBUILD_FOOTPRINT_POINT_TAIL_FALLBACK_LIMIT: usize = 64;
/// Returns `true` when tail metadata covers enough conversations: either there
/// are no conversations at all, or the number of uncovered conversations does
/// not exceed `LEXICAL_REBUILD_FOOTPRINT_POINT_TAIL_FALLBACK_LIMIT`.
///
/// A `covered_conversations` value larger than `total_conversations` is
/// treated as full coverage.
fn lexical_rebuild_tail_metadata_coverage_is_sufficient(
    total_conversations: usize,
    covered_conversations: usize,
) -> bool {
    if total_conversations == 0 {
        return true;
    }
    // Saturating subtraction clamps over-reported coverage to zero uncovered.
    let uncovered = total_conversations.saturating_sub(covered_conversations);
    uncovered <= LEXICAL_REBUILD_FOOTPRINT_POINT_TAIL_FALLBACK_LIMIT
}
/// Converts a zero-based last message index into a message count (`idx + 1`).
///
/// Returns `None` when the index is absent, negative, or when the resulting
/// count overflows `u64` or does not fit in `usize`.
fn lexical_rebuild_message_count_from_tail_idx(last_message_idx: Option<i64>) -> Option<usize> {
    last_message_idx
        .and_then(|idx| u64::try_from(idx).ok())
        .and_then(|idx| idx.checked_add(1))
        .and_then(|count| usize::try_from(count).ok())
}
/// Builds a footprint row from a message count alone, estimating the byte
/// size as `message_count * LEXICAL_REBUILD_PLANNER_ESTIMATED_BYTES_PER_MESSAGE`
/// (saturating at `usize::MAX`).
fn lexical_rebuild_conversation_footprint_from_count(
    conversation_id: i64,
    message_count: usize,
) -> LexicalRebuildConversationFootprintRow {
    let message_bytes =
        message_count.saturating_mul(LEXICAL_REBUILD_PLANNER_ESTIMATED_BYTES_PER_MESSAGE);
    LexicalRebuildConversationFootprintRow {
        conversation_id,
        message_count,
        message_bytes,
    }
}
/// One message as consumed by the lexical rebuild path, tagged with the id of
/// the conversation it belongs to.
/// NOTE(review): purpose inferred from the type name; confirm against callers.
#[derive(Debug, Clone)]
pub struct LexicalRebuildMessageRow {
pub conversation_id: i64,
pub id: i64,
pub idx: i64,
pub role: String,
pub author: Option<String>,
pub created_at: Option<i64>,
pub content: String,
}
/// Reduced message record that carries a boolean `is_tool_role` flag instead
/// of a role string and has no conversation or message id of its own.
/// NOTE(review): presumably stored grouped per conversation; confirm against callers.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct LexicalRebuildGroupedMessageRow {
pub idx: i64,
pub is_tool_role: bool,
pub created_at: Option<i64>,
pub content: String,
}
/// Collection of grouped message rows; up to 32 rows are stored inline before
/// the `SmallVec` spills to the heap.
pub type LexicalRebuildGroupedMessageRows = SmallVec<[LexicalRebuildGroupedMessageRow; 32]>;
/// Alias that lets code written against the `SqliteStorage` name use `FrankenStorage`.
pub type SqliteStorage = FrankenStorage;
/// Storage handle wrapping one frankensqlite connection, plus the bookkeeping
/// shared with the extra writer connections it opens against the same file.
pub struct FrankenStorage {
// Primary connection owned by this handle.
conn: FrankenConnection,
// Path the connection was opened from; reused to open additional writers.
db_path: PathBuf,
// Starts as `false`. NOTE(review): set elsewhere in the file; semantics not visible here.
ephemeral_writer_preflight_verified: AtomicBool,
// Recorded wal_autocheckpoint page count; `UNSET_INDEX_WRITER_CHECKPOINT_PAGES` until set.
index_writer_checkpoint_pages: AtomicI64,
// Recorded busy timeout in ms; `UNSET_INDEX_WRITER_BUSY_TIMEOUT_MS` until set.
index_writer_busy_timeout_ms: AtomicU64,
// Slot holding at most one parked writer connection for reuse.
cached_ephemeral_writer: parking_lot::Mutex<CachedEphemeralWriter>,
// Ensure-caches; `Arc`-shared so writers opened from this handle see the same entries.
ensured_agents: Arc<parking_lot::Mutex<HashMap<EnsuredAgentKey, i64>>>,
ensured_workspaces: Arc<parking_lot::Mutex<HashMap<EnsuredWorkspaceKey, i64>>>,
ensured_conversation_sources: Arc<parking_lot::Mutex<HashSet<EnsuredConversationSourceKey>>>,
ensured_daily_stats_keys: Arc<parking_lot::Mutex<HashSet<EnsuredDailyStatsKey>>>,
// Tri-state cache of whether `fts_messages` exists (see `FTS_MESSAGES_PRESENT_*`).
fts_messages_present_cache: AtomicI8,
}
// Page count passed to `PRAGMA wal_autocheckpoint` by `apply_config`.
const DEFAULT_WAL_AUTOCHECKPOINT_PAGES: i64 = 4096;
// Sentinel: no checkpoint page count has been recorded for the connection yet.
const UNSET_INDEX_WRITER_CHECKPOINT_PAGES: i64 = i64::MIN;
// Sentinel: no busy timeout has been recorded for the connection yet.
const UNSET_INDEX_WRITER_BUSY_TIMEOUT_MS: u64 = 0;
// States stored in `FrankenStorage::fts_messages_present_cache`.
const FTS_MESSAGES_PRESENT_UNKNOWN: i8 = 0;
const FTS_MESSAGES_PRESENT_ABSENT: i8 = 1;
const FTS_MESSAGES_PRESENT_PRESENT: i8 = 2;
/// State of the single reusable writer slot on a `FrankenStorage`.
enum CachedEphemeralWriter {
// No writer has been parked yet (also the state after a discard or close).
Uninitialized,
// A parked connection together with its recorded writer settings.
Cached(Box<SendFrankenConnection>),
// The slot's writer is currently handed out to a caller.
InUse,
}
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
struct EnsuredAgentKey {
slug: String,
name: String,
version: Option<String>,
kind: String,
}
impl EnsuredAgentKey {
fn from_agent(agent: &Agent) -> Self {
Self {
slug: agent.slug.clone(),
name: agent.name.clone(),
version: agent.version.clone(),
kind: agent_kind_str(agent.kind.clone()),
}
}
}
/// Cache key identifying a workspace by its path and optional display name.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
struct EnsuredWorkspaceKey {
    path: String,
    display_name: Option<String>,
}
impl EnsuredWorkspaceKey {
    /// Takes ownership of `path` and copies `display_name` when present.
    fn new(path: String, display_name: Option<&str>) -> Self {
        let display_name = display_name.map(|label| label.to_string());
        Self { path, display_name }
    }
}
/// Cache key identifying a conversation source by id, kind and host label.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
struct EnsuredConversationSourceKey {
    id: String,
    kind: SourceKind,
    host_label: Option<String>,
}
impl EnsuredConversationSourceKey {
    /// Copies the identifying fields out of `source`.
    fn from_source(source: &Source) -> Self {
        let id = source.id.clone();
        let kind = source.kind;
        let host_label = source.host_label.clone();
        Self {
            id,
            kind,
            host_label,
        }
    }
}
/// Cache key for one daily-stats row: day, agent slug and source id.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
struct EnsuredDailyStatsKey {
    day_id: i64,
    agent_slug: String,
    source_id: String,
}
impl EnsuredDailyStatsKey {
    /// Builds an owned key from borrowed slug and source id.
    fn new(day_id: i64, agent_slug: &str, source_id: &str) -> Self {
        let agent_slug = String::from(agent_slug);
        let source_id = String::from(source_id);
        Self {
            day_id,
            agent_slug,
            source_id,
        }
    }
}
/// PRAGMA spellings tried, in order, to turn `autocommit_retain` off.
const AUTOCOMMIT_RETAIN_OFF_PRAGMAS: [&str; 2] = [
    "PRAGMA fsqlite.autocommit_retain = OFF;",
    "PRAGMA autocommit_retain = OFF;",
];
/// Runs each PRAGMA in `AUTOCOMMIT_RETAIN_OFF_PRAGMAS` through `execute` until
/// one succeeds and returns the spelling that worked.
///
/// # Errors
/// If every variant fails, returns an error listing each attempted PRAGMA
/// with its failure message.
fn disable_autocommit_retain<E>(
    mut execute: impl FnMut(&'static str) -> std::result::Result<(), E>,
) -> Result<&'static str>
where
    E: std::fmt::Display,
{
    let mut attempts: Vec<String> = Vec::new();
    for pragma in AUTOCOMMIT_RETAIN_OFF_PRAGMAS {
        // The first accepted spelling wins; anything else is recorded and skipped.
        let error = match execute(pragma) {
            Ok(()) => return Ok(pragma),
            Err(err) => err.to_string(),
        };
        tracing::debug!(
            %pragma,
            error = %error,
            "autocommit_retain PRAGMA variant not supported"
        );
        attempts.push(format!("{pragma}: {error}"));
    }
    let attempts = attempts.join("; ");
    Err(anyhow!(
        "failed to disable autocommit_retain on frankensqlite connection; \
         refusing to keep a long-lived MVCC connection that may accumulate \
         unbounded write snapshots. Upgrade frankensqlite to a version that \
         supports one of these PRAGMAs or use a short-lived connection path. \
         attempts: {}",
        attempts
    ))
}
impl FrankenStorage {
/// Wraps `conn` in a storage handle whose ensure-caches are freshly allocated
/// and not shared with any other handle.
fn new(conn: FrankenConnection, db_path: PathBuf) -> Self {
    let ensured_agents = Arc::new(parking_lot::Mutex::new(HashMap::new()));
    let ensured_workspaces = Arc::new(parking_lot::Mutex::new(HashMap::new()));
    let ensured_conversation_sources = Arc::new(parking_lot::Mutex::new(HashSet::new()));
    let ensured_daily_stats_keys = Arc::new(parking_lot::Mutex::new(HashSet::new()));
    Self::new_with_shared_caches(
        conn,
        db_path,
        ensured_agents,
        ensured_workspaces,
        ensured_conversation_sources,
        ensured_daily_stats_keys,
    )
}
/// Assembles a storage handle around `conn` using caller-supplied ensure-caches.
///
/// All per-connection state starts unset: the preflight flag is `false`, the
/// recorded checkpoint/busy-timeout values hold their `UNSET_*` sentinels, the
/// writer slot is `Uninitialized`, and the FTS presence cache is unknown.
fn new_with_shared_caches(
    conn: FrankenConnection,
    db_path: PathBuf,
    ensured_agents: Arc<parking_lot::Mutex<HashMap<EnsuredAgentKey, i64>>>,
    ensured_workspaces: Arc<parking_lot::Mutex<HashMap<EnsuredWorkspaceKey, i64>>>,
    ensured_conversation_sources: Arc<
        parking_lot::Mutex<HashSet<EnsuredConversationSourceKey>>,
    >,
    ensured_daily_stats_keys: Arc<parking_lot::Mutex<HashSet<EnsuredDailyStatsKey>>>,
) -> Self {
    let checkpoint_pages = AtomicI64::new(UNSET_INDEX_WRITER_CHECKPOINT_PAGES);
    let busy_timeout_ms = AtomicU64::new(UNSET_INDEX_WRITER_BUSY_TIMEOUT_MS);
    let writer_slot = parking_lot::Mutex::new(CachedEphemeralWriter::Uninitialized);
    let fts_presence = AtomicI8::new(FTS_MESSAGES_PRESENT_UNKNOWN);
    Self {
        conn,
        db_path,
        ephemeral_writer_preflight_verified: AtomicBool::new(false),
        index_writer_checkpoint_pages: checkpoint_pages,
        index_writer_busy_timeout_ms: busy_timeout_ms,
        cached_ephemeral_writer: writer_slot,
        ensured_agents,
        ensured_workspaces,
        ensured_conversation_sources,
        ensured_daily_stats_keys,
        fts_messages_present_cache: fts_presence,
    }
}
/// Best-effort: sets a 5000 ms busy timeout on the connection. A failure is
/// logged at debug level and otherwise ignored.
fn apply_open_stage_busy_timeout(&self) {
    match self.conn.execute("PRAGMA busy_timeout = 5000;") {
        Ok(_) => {}
        Err(err) => {
            tracing::debug!(
                error = %err,
                "failed to apply open-stage busy_timeout before migrations"
            );
        }
    }
}
/// Opens (creating if needed) the database at `path` and brings it to the
/// current schema.
///
/// Steps, in order: create the parent directory, take the doctor-mutation
/// open guard, open the connection, apply the open-stage busy timeout, run
/// migrations, repair missing schema objects, and apply connection config.
///
/// # Errors
/// Fails if any of those steps other than the busy timeout fails.
pub fn open(path: &Path) -> Result<Self> {
    path.parent()
        .map(|parent| {
            fs::create_dir_all(parent)
                .with_context(|| format!("creating db directory {}", parent.display()))
        })
        .transpose()?;
    let db_location: String = path.to_string_lossy().into_owned();
    // Held until this function returns so the whole open sequence is guarded.
    let _open_guard =
        acquire_doctor_mutation_db_open_guard(path, DOCTOR_MUTATION_DB_OPEN_LOCK_TIMEOUT)?;
    let conn = FrankenConnection::open(&db_location)
        .with_context(|| format!("opening frankensqlite db at {}", path.display()))?;
    let opened = Self::new(conn, path.to_path_buf());
    opened.apply_open_stage_busy_timeout();
    opened.run_migrations()?;
    opened.repair_missing_current_schema_objects()?;
    opened.apply_config()?;
    Ok(opened)
}
/// Opens a writer connection at `path` with its own, unshared ensure-caches.
/// Delegates to `open_writer_with_shared_caches`.
pub fn open_writer(path: &Path) -> Result<Self> {
    let ensured_agents = Arc::new(parking_lot::Mutex::new(HashMap::new()));
    let ensured_workspaces = Arc::new(parking_lot::Mutex::new(HashMap::new()));
    let ensured_conversation_sources = Arc::new(parking_lot::Mutex::new(HashSet::new()));
    let ensured_daily_stats_keys = Arc::new(parking_lot::Mutex::new(HashSet::new()));
    Self::open_writer_with_shared_caches(
        path,
        ensured_agents,
        ensured_workspaces,
        ensured_conversation_sources,
        ensured_daily_stats_keys,
    )
}
/// Opens a writer connection at `path` that shares the given ensure-caches.
///
/// Unlike `open`, this does not create directories or run migrations; it only
/// takes the doctor-mutation open guard, opens the connection and applies the
/// connection config.
fn open_writer_with_shared_caches(
    path: &Path,
    ensured_agents: Arc<parking_lot::Mutex<HashMap<EnsuredAgentKey, i64>>>,
    ensured_workspaces: Arc<parking_lot::Mutex<HashMap<EnsuredWorkspaceKey, i64>>>,
    ensured_conversation_sources: Arc<
        parking_lot::Mutex<HashSet<EnsuredConversationSourceKey>>,
    >,
    ensured_daily_stats_keys: Arc<parking_lot::Mutex<HashSet<EnsuredDailyStatsKey>>>,
) -> Result<Self> {
    let db_location: String = path.to_string_lossy().into_owned();
    // Held until this function returns.
    let _open_guard =
        acquire_doctor_mutation_db_open_guard(path, DOCTOR_MUTATION_DB_OPEN_LOCK_TIMEOUT)?;
    let conn = FrankenConnection::open(&db_location)
        .with_context(|| format!("opening frankensqlite writer at {}", path.display()))?;
    let writer = Self::new_with_shared_caches(
        conn,
        path.to_path_buf(),
        ensured_agents,
        ensured_workspaces,
        ensured_conversation_sources,
        ensured_daily_stats_keys,
    );
    writer.apply_config()?;
    Ok(writer)
}
/// Hands out a writer that shares this handle's ensure-caches, reusing the
/// parked connection when one is available.
///
/// The returned flag is `true` when the writer occupies the cache slot (the
/// slot is left `InUse`), and `false` when the slot was already taken and an
/// extra, unslotted writer was opened instead.
///
/// # Errors
/// Propagates failures from opening a new writer connection.
pub(crate) fn acquire_cached_ephemeral_writer(&self) -> Result<(Self, bool)> {
let mut cached = self.cached_ephemeral_writer.lock();
// Mark the slot in-use up front; the match below decides based on its previous state.
match std::mem::replace(&mut *cached, CachedEphemeralWriter::InUse) {
CachedEphemeralWriter::Cached(conn) => {
// Rewrap the parked connection and restore its recorded writer settings.
let (conn, checkpoint_pages, busy_timeout_ms) = (*conn).into_parts();
let writer = Self::new_with_shared_caches(
conn,
self.db_path.clone(),
Arc::clone(&self.ensured_agents),
Arc::clone(&self.ensured_workspaces),
Arc::clone(&self.ensured_conversation_sources),
Arc::clone(&self.ensured_daily_stats_keys),
);
writer
.index_writer_checkpoint_pages
.store(checkpoint_pages, Ordering::Relaxed);
writer
.index_writer_busy_timeout_ms
.store(busy_timeout_ms, Ordering::Relaxed);
Ok((writer, true))
}
CachedEphemeralWriter::Uninitialized => {
// Release the slot lock before opening a new connection.
drop(cached);
match Self::open_writer_with_shared_caches(
&self.db_path,
Arc::clone(&self.ensured_agents),
Arc::clone(&self.ensured_workspaces),
Arc::clone(&self.ensured_conversation_sources),
Arc::clone(&self.ensured_daily_stats_keys),
) {
Ok(writer) => Ok((writer, true)),
Err(err) => {
// Opening failed: give the slot back so a later call can retry.
let mut cached = self.cached_ephemeral_writer.lock();
if matches!(&*cached, CachedEphemeralWriter::InUse) {
*cached = CachedEphemeralWriter::Uninitialized;
}
Err(err)
}
}
}
CachedEphemeralWriter::InUse => {
// The slot belongs to someone else; keep it in-use and open a separate writer.
*cached = CachedEphemeralWriter::InUse;
drop(cached);
Ok((
Self::open_writer_with_shared_caches(
&self.db_path,
Arc::clone(&self.ensured_agents),
Arc::clone(&self.ensured_workspaces),
Arc::clone(&self.ensured_conversation_sources),
Arc::clone(&self.ensured_daily_stats_keys),
)?,
false,
))
}
}
}
/// Parks `writer`'s connection in the cache slot, together with its recorded
/// checkpoint-pages and busy-timeout values, so a later acquire can reuse it.
///
/// The slot is expected to be `InUse`; this is checked in debug builds only.
pub(crate) fn release_cached_ephemeral_writer(&self, writer: Self) {
    let recorded_pages = writer.index_writer_checkpoint_pages.load(Ordering::Relaxed);
    let recorded_timeout_ms = writer.index_writer_busy_timeout_ms.load(Ordering::Relaxed);
    let raw_conn = writer.into_raw();
    let parked = Box::new(SendFrankenConnection::new_with_index_writer_state(
        raw_conn,
        recorded_pages,
        recorded_timeout_ms,
    ));
    let mut slot = self.cached_ephemeral_writer.lock();
    debug_assert!(
        matches!(&*slot, CachedEphemeralWriter::InUse),
        "cached ephemeral writer state should be in-use when releasing"
    );
    *slot = CachedEphemeralWriter::Cached(parked);
}
/// Closes `writer` on a best-effort basis instead of parking it, and resets
/// the cache slot from `InUse` back to `Uninitialized`.
pub(crate) fn discard_cached_ephemeral_writer(&self, mut writer: Self) {
    writer.close_best_effort_in_place();
    let mut slot = self.cached_ephemeral_writer.lock();
    if let CachedEphemeralWriter::InUse = *slot {
        *slot = CachedEphemeralWriter::Uninitialized;
    }
}
/// Returns the cached id for `key`, if this agent was already ensured.
fn cached_agent_id(&self, key: &EnsuredAgentKey) -> Option<i64> {
    let agents = self.ensured_agents.lock();
    agents.get(key).copied()
}
/// Records `id` as the ensured id for `key`, replacing any previous entry.
fn mark_agent_ensured(&self, key: EnsuredAgentKey, id: i64) {
    let mut agents = self.ensured_agents.lock();
    agents.insert(key, id);
}
/// Returns the cached id for `key`, if this workspace was already ensured.
fn cached_workspace_id(&self, key: &EnsuredWorkspaceKey) -> Option<i64> {
    let workspaces = self.ensured_workspaces.lock();
    workspaces.get(key).copied()
}
/// Records `id` as the ensured id for `key`, replacing any previous entry.
fn mark_workspace_ensured(&self, key: EnsuredWorkspaceKey, id: i64) {
    let mut workspaces = self.ensured_workspaces.lock();
    workspaces.insert(key, id);
}
/// Reports whether `key` is present in the ensured-sources cache.
fn conversation_source_already_ensured(&self, key: &EnsuredConversationSourceKey) -> bool {
    let sources = self.ensured_conversation_sources.lock();
    sources.contains(key)
}
/// Adds `key` to the ensured-sources cache.
fn mark_conversation_source_ensured(&self, key: EnsuredConversationSourceKey) {
    let mut sources = self.ensured_conversation_sources.lock();
    sources.insert(key);
}
/// Reports whether `key` is present in the ensured daily-stats cache.
fn daily_stats_key_already_ensured(&self, key: &EnsuredDailyStatsKey) -> bool {
    let stats_keys = self.ensured_daily_stats_keys.lock();
    stats_keys.contains(key)
}
/// Reports whether all four `keys` are present in the ensured daily-stats
/// cache, checking them under a single lock acquisition.
fn daily_stats_keys_already_ensured(&self, keys: &[EnsuredDailyStatsKey; 4]) -> bool {
    let stats_keys = self.ensured_daily_stats_keys.lock();
    let any_missing = keys.iter().any(|key| !stats_keys.contains(key));
    !any_missing
}
/// Adds `key` to the ensured daily-stats cache.
fn mark_daily_stats_key_ensured(&self, key: EnsuredDailyStatsKey) {
    let mut stats_keys = self.ensured_daily_stats_keys.lock();
    stats_keys.insert(key);
}
fn fts_messages_present_cached(&self, tx: &FrankenTransaction<'_>) -> bool {
match self.fts_messages_present_cache.load(Ordering::Acquire) {
FTS_MESSAGES_PRESENT_PRESENT => return true,
FTS_MESSAGES_PRESENT_ABSENT => return false,
_ => {}
}
let present = tx
.query_row_map(
"SELECT COUNT(*) FROM sqlite_master
WHERE name = 'fts_messages'
AND rootpage > 0",
fparams![],
|row| row.get_typed::<i64>(0),
)
.map(|count| count > 0)
.unwrap_or_else(|err| {
tracing::debug!(
error = %err,
"failed to probe fts_messages presence; skipping db-resident FTS maintenance"
);
false
});
self.set_fts_messages_present_cache(present);
present
}
/// Stores a definite present/absent answer in the FTS presence cache.
fn set_fts_messages_present_cache(&self, present: bool) {
    let state = if present {
        FTS_MESSAGES_PRESENT_PRESENT
    } else {
        FTS_MESSAGES_PRESENT_ABSENT
    };
    self.fts_messages_present_cache
        .store(state, Ordering::Release);
}
/// Resets the FTS presence cache to "unknown" so the next check probes again.
fn invalidate_fts_messages_present_cache(&self) {
    let presence = &self.fts_messages_present_cache;
    presence.store(FTS_MESSAGES_PRESENT_UNKNOWN, Ordering::Release);
}
/// Drops every ensured-source cache entry whose id equals `source_id`.
fn invalidate_conversation_source_cache(&self, source_id: &str) {
    let mut sources = self.ensured_conversation_sources.lock();
    sources.retain(|key| key.id != source_id);
}
/// Empties the writer slot (leaving it `Uninitialized`) and, if a connection
/// was parked there, closes it on a best-effort basis.
fn close_cached_ephemeral_writer_best_effort_in_place(&mut self) {
    let slot = self.cached_ephemeral_writer.get_mut();
    let previous = std::mem::replace(slot, CachedEphemeralWriter::Uninitialized);
    if let CachedEphemeralWriter::Cached(mut parked) = previous {
        parked.0.close_best_effort_in_place();
    }
}
/// Empties the writer slot (leaving it `Uninitialized`) and, if a connection
/// was parked there, closes it without a final checkpoint.
///
/// # Errors
/// Returns the close error of the parked connection, with context attached.
fn close_cached_ephemeral_writer_without_checkpoint_in_place(&mut self) -> Result<()> {
    let slot = self.cached_ephemeral_writer.get_mut();
    let previous = std::mem::replace(slot, CachedEphemeralWriter::Uninitialized);
    if let CachedEphemeralWriter::Cached(mut parked) = previous {
        return parked
            .0
            .close_without_checkpoint_in_place()
            .with_context(|| "closing cached frankensqlite writer without final checkpoint");
    }
    // Nothing was parked (slot was `Uninitialized` or `InUse`).
    Ok(())
}
/// Opens the database at `path` read-only, waiting up to
/// `DOCTOR_MUTATION_DB_OPEN_LOCK_TIMEOUT` for the doctor-mutation open guard.
pub fn open_readonly(path: &Path) -> Result<Self> {
    let timeout = DOCTOR_MUTATION_DB_OPEN_LOCK_TIMEOUT;
    Self::open_readonly_with_doctor_lock_timeout(path, timeout)
}
/// Opens the database at `path` read-only and applies the read-only
/// connection config, using `timeout` for the doctor-mutation open guard.
///
/// # Errors
/// Fails if the guard cannot be acquired, the database cannot be opened, or a
/// config PRAGMA fails.
pub fn open_readonly_with_doctor_lock_timeout(path: &Path, timeout: Duration) -> Result<Self> {
    let db_location: String = path.to_string_lossy().into_owned();
    // Held until this function returns.
    let _open_guard = acquire_doctor_mutation_db_open_guard(path, timeout)?;
    let conn = open_franken_with_flags(&db_location, FrankenOpenFlags::SQLITE_OPEN_READ_ONLY)
        .with_context(|| format!("opening frankensqlite db readonly at {}", path.display()))?;
    let reader = Self::new(conn, path.to_path_buf());
    reader.apply_readonly_config()?;
    Ok(reader)
}
/// Consumes the handle: closes any parked writer on a best-effort basis, then
/// closes the primary connection and reports its result.
pub fn close(mut self) -> Result<()> {
    self.close_cached_ephemeral_writer_best_effort_in_place();
    let closed = self.conn.close();
    closed.with_context(|| "closing frankensqlite connection")
}
/// Consumes the handle and closes both the parked writer (if any) and the
/// primary connection without a final checkpoint.
///
/// # Errors
/// Returns early if closing the parked writer fails; otherwise reports the
/// result of closing the primary connection.
pub fn close_without_checkpoint(mut self) -> Result<()> {
    self.close_cached_ephemeral_writer_without_checkpoint_in_place()?;
    let closed = self.conn.close_without_checkpoint();
    closed.with_context(|| "closing frankensqlite connection without final checkpoint")
}
pub fn close_best_effort_in_place(&mut self) {
self.close_cached_ephemeral_writer_best_effort_in_place();
self.conn.close_best_effort_in_place();
}
/// Closes the parked writer (if any) and then the primary connection without
/// a final checkpoint, leaving the handle in place.
///
/// # Errors
/// Returns early if closing the parked writer fails; otherwise reports the
/// result of closing the primary connection.
pub fn close_without_checkpoint_in_place(&mut self) -> Result<()> {
    self.close_cached_ephemeral_writer_without_checkpoint_in_place()?;
    let closed = self.conn.close_without_checkpoint_in_place();
    closed.with_context(|| "closing frankensqlite connection without final checkpoint")
}
pub fn raw(&self) -> &FrankenConnection {
&self.conn
}
/// Consumes the handle and returns the primary connection, first closing any
/// parked writer on a best-effort basis.
pub fn into_raw(mut self) -> FrankenConnection {
    self.close_cached_ephemeral_writer_best_effort_in_place();
    self.conn
}
/// Applies the read-write connection configuration.
///
/// Required pragmas (WAL journal mode, `synchronous = NORMAL`, cache size,
/// foreign keys, busy timeout) are applied in order and any failure aborts.
/// The `wal_autocheckpoint` pragma and both `concurrent_mode` pragmas are
/// attempted with their results ignored. Finally `autocommit_retain` is
/// disabled through `disable_autocommit_retain`, whose failure is propagated.
///
/// # Errors
/// Returns the first failure of a required pragma or of
/// `disable_autocommit_retain`.
pub fn apply_config(&self) -> Result<()> {
    // (pragma statement, context attached when it fails)
    const REQUIRED_PRAGMAS: [(&str, &str); 5] = [
        ("PRAGMA journal_mode = WAL;", "setting journal_mode"),
        ("PRAGMA synchronous = NORMAL;", "setting synchronous"),
        ("PRAGMA cache_size = -65536;", "setting cache_size"),
        ("PRAGMA foreign_keys = ON;", "setting foreign_keys"),
        ("PRAGMA busy_timeout = 5000;", "setting busy_timeout"),
    ];
    for (pragma, failure_context) in REQUIRED_PRAGMAS {
        self.conn
            .execute(pragma)
            .with_context(|| failure_context)?;
    }

    // The autocheckpoint pragma result is ignored; the cached page count is
    // recorded regardless of whether the pragma was accepted.
    let autocheckpoint_sql =
        format!("PRAGMA wal_autocheckpoint = {DEFAULT_WAL_AUTOCHECKPOINT_PAGES};");
    let _ = self.conn.execute(&autocheckpoint_sql);
    self.index_writer_checkpoint_pages
        .store(DEFAULT_WAL_AUTOCHECKPOINT_PAGES, Ordering::Relaxed);

    // Both spellings of the concurrent-mode pragma are tried; failures are ignored.
    for optional_pragma in [
        "PRAGMA fsqlite.concurrent_mode = ON;",
        "PRAGMA concurrent_mode = ON;",
    ] {
        let _ = self.conn.execute(optional_pragma);
    }

    let autocommit_pragma =
        disable_autocommit_retain(|pragma| self.conn.execute(pragma).map(|_| ()))?;
    tracing::debug!(
        pragma = autocommit_pragma,
        "disabled frankensqlite autocommit_retain for storage connection"
    );
    Ok(())
}
/// Applies the pragmas used for read-only connections: `query_only`,
/// busy timeout, cache size and foreign keys, in that order.
///
/// # Errors
/// Returns the first pragma failure, wrapped with context naming the pragma.
fn apply_readonly_config(&self) -> Result<()> {
    // (pragma statement, context attached when it fails)
    const READONLY_PRAGMAS: [(&str, &str); 4] = [
        ("PRAGMA query_only = 1;", "setting query_only"),
        ("PRAGMA busy_timeout = 5000;", "setting busy_timeout"),
        ("PRAGMA cache_size = -65536;", "setting cache_size"),
        ("PRAGMA foreign_keys = ON;", "setting foreign_keys"),
    ];
    for (pragma, failure_context) in READONLY_PRAGMAS {
        self.conn
            .execute(pragma)
            .with_context(|| failure_context)?;
    }
    Ok(())
}
/// Brings the schema up to date and mirrors the resulting version into `meta`.
///
/// Steps, in order: `transition_from_meta_version`, the runner for
/// migrations 13-14, the dedicated version-15 tail-state migration, the
/// runner for migrations 16-20, then `sync_meta_schema_version` with the
/// version read back from `_schema_migrations`. An info event is emitted
/// only when at least one migration was applied.
///
/// # Errors
/// Returns the first failing step, wrapped with context.
pub fn run_migrations(&self) -> Result<()> {
    transition_from_meta_version(&self.conn)?;

    let base_result = build_cass_migrations_before_tail_cache()
        .run(&self.conn)
        .with_context(|| "running base schema migrations")?;
    let mut applied_versions = base_result.applied;

    let tail_cache_applied = apply_conversation_tail_state_cache_migration(&self.conn)
        .with_context(|| "running conversation tail-state cache migration")?;
    if tail_cache_applied {
        applied_versions.push(15);
    }

    let post_result = build_cass_migrations_after_tail_cache()
        .run(&self.conn)
        .with_context(|| "running post-tail-cache schema migrations")?;
    applied_versions.extend(post_result.applied);

    let current_version = self.schema_version()?;
    if !applied_versions.is_empty() {
        info!(
            applied = ?applied_versions,
            current = current_version,
            was_fresh = base_result.was_fresh,
            "frankensqlite schema migrations applied"
        );
    }

    self.sync_meta_schema_version(current_version)?;
    Ok(())
}
/// Recreates required current-schema tables that are absent, then repairs
/// missing token-summary columns on `conversations`.
///
/// Every entry of `REQUIRED_CURRENT_SCHEMA_TABLE_PROBES` is probed; a probe
/// failure classified as "missing table" marks the table for repair, any
/// other probe failure aborts. When tables are missing, the matching repair
/// batches are executed and the affected probes are re-run as verification.
///
/// # Errors
/// Returns unclassified probe errors, repair-batch failures, verification
/// failures, or errors from the token-column repair.
fn repair_missing_current_schema_objects(&self) -> Result<()> {
    let mut missing_tables = Vec::new();
    for &(table_name, probe_sql) in REQUIRED_CURRENT_SCHEMA_TABLE_PROBES {
        match self.conn.query(probe_sql) {
            Ok(_) => {}
            Err(err) if error_indicates_missing_table(&err) => missing_tables.push(table_name),
            Err(err) => {
                return Err(err).with_context(|| {
                    format!("probing required schema table {table_name} for completeness")
                });
            }
        }
    }

    if !missing_tables.is_empty() {
        info!(
            missing_tables = ?missing_tables,
            "repairing missing current-schema tables on an already-versioned cass database"
        );

        let repair_batches = current_schema_repair_batches_for_missing_tables(&missing_tables)?;
        for batch in repair_batches {
            self.conn
                .execute_batch(batch.sql)
                .with_context(|| format!("repairing current-schema batch {}", batch.name))?;
        }

        // Re-probe only the tables that were reported missing.
        for &(table_name, probe_sql) in REQUIRED_CURRENT_SCHEMA_TABLE_PROBES {
            if missing_tables.contains(&table_name) {
                self.conn
                    .query(probe_sql)
                    .with_context(|| format!("verifying repaired schema table {table_name}"))?;
            }
        }
    }

    self.repair_missing_conversation_token_columns()?;
    Ok(())
}
/// Adds any column listed in `REQUIRED_CONVERSATION_TOKEN_COLUMNS` that the
/// `conversations` table does not yet have, and logs a warning naming the
/// columns that were added.
///
/// The set of existing columns is read once up front; each missing column is
/// added with its own `ALTER TABLE ... ADD COLUMN` statement.
///
/// # Errors
/// Returns a failure to inspect the table or to add a column.
fn repair_missing_conversation_token_columns(&self) -> Result<()> {
    let existing_columns = franken_table_column_names(&self.conn, "conversations")
        .with_context(|| "inspecting conversations columns for token-summary repair")?;

    let mut added_columns = Vec::new();
    for &(column_name, column_type) in REQUIRED_CONVERSATION_TOKEN_COLUMNS {
        if !existing_columns.contains(column_name) {
            let alter_sql =
                format!("ALTER TABLE conversations ADD COLUMN {column_name} {column_type};");
            self.conn.execute(&alter_sql).with_context(|| {
                format!("adding missing conversations.{column_name} token-summary column")
            })?;
            added_columns.push(column_name);
        }
    }

    if added_columns.is_empty() {
        return Ok(());
    }
    tracing::warn!(
        target: "cass::schema_repair",
        db_path = %self.db_path.display(),
        missing_columns = ?added_columns,
        "cass#222: repaired missing conversations token-summary columns"
    );
    Ok(())
}
/// Deletes rows whose foreign-key parent no longer exists and reports how
/// many were removed per table.
///
/// Orphan messages are handled first: their ids are collected, counted in
/// the report, and deleted together with their dependent children. Then each
/// entry of `ORPHAN_DIRECT_CHILD_TABLES` is drained page by page until a
/// probe returns no ids or a delete removes nothing. A warning summarising
/// the cleanup is logged only when at least one row was recorded.
///
/// # Errors
/// Returns probe errors that are not classified as a missing table/column,
/// and any delete failure.
pub(crate) fn cleanup_orphan_fk_rows(&self) -> Result<OrphanFkCleanupReport> {
    let mut report = OrphanFkCleanupReport::default();

    // NOTE(review): this guard only tolerates a missing table, while the log
    // message (and the per-table probe below) also mention a missing column.
    // Confirm whether `error_indicates_missing_column` should be accepted here.
    let orphan_message_ids = match collect_orphan_message_ids(&self.conn) {
        Ok(ids) => ids,
        Err(err) if error_indicates_missing_table(&err) => {
            tracing::debug!(
                target: "cass::fk_repair",
                child_table = "messages",
                error = %err,
                "skipping orphan-message probe (table or column unavailable)"
            );
            Vec::new()
        }
        Err(err) => return Err(err),
    };
    if !orphan_message_ids.is_empty() {
        // Checked conversion, matching how the per-table counts are recorded below.
        report.record(
            "messages",
            i64::try_from(orphan_message_ids.len()).unwrap_or(i64::MAX),
        );
        delete_orphan_message_ids_bisecting_oom(&self.conn, &orphan_message_ids)
            .context("deleting orphan message rows and dependent children")?;
    }

    for entry in ORPHAN_DIRECT_CHILD_TABLES {
        loop {
            let ids = match collect_direct_orphan_id_page(&self.conn, entry) {
                Ok(ids) => ids,
                Err(err)
                    if error_indicates_missing_table(&err)
                        || error_indicates_missing_column(&err) =>
                {
                    tracing::debug!(
                        target: "cass::fk_repair",
                        child_table = entry.child_table,
                        error = %err,
                        "skipping orphan probe (table or column unavailable)"
                    );
                    break;
                }
                Err(err) => {
                    return Err(err).with_context(|| {
                        format!("probing orphan rows in {}", entry.child_table)
                    });
                }
            };
            if ids.is_empty() {
                break;
            }
            let deleted = delete_direct_orphan_ids_bisecting_oom(&self.conn, entry, &ids)
                .with_context(|| format!("deleting orphan rows from {}", entry.child_table))?;
            // Stop if a non-empty page deleted nothing, otherwise the same
            // page could be returned again on the next iteration.
            if deleted == 0 {
                break;
            }
            report.record(
                entry.child_table,
                i64::try_from(deleted).unwrap_or(i64::MAX),
            );
        }
    }

    if report.total == 0 {
        return Ok(report);
    }
    tracing::warn!(
        target: "cass::fk_repair",
        db_path = %self.db_path.display(),
        total_orphans = report.total,
        per_table = ?report.per_table,
        "cass#202: removed orphan rows left behind by interrupted index transactions"
    );
    Ok(report)
}
pub fn schema_version(&self) -> Result<i64> {
let rows = self
.conn
.query("SELECT MAX(version) FROM _schema_migrations;")
.with_context(|| "reading schema version from _schema_migrations")?;
if let Some(row) = rows.first()
&& let Ok(v) = row.get_typed::<Option<i64>>(0)
{
return Ok(v.unwrap_or(0));
}
Ok(0)
}
/// Mirrors `version` into the `meta` table under the `schema_version` key.
///
/// Does nothing when the `meta` probe query fails or when the stored value
/// already equals `version` rendered as text; otherwise the row is written
/// with `INSERT OR REPLACE`.
///
/// # Errors
/// Returns the write failure, wrapped with context.
fn sync_meta_schema_version(&self, version: i64) -> Result<()> {
    let meta_is_queryable = self.conn.query("SELECT key FROM meta LIMIT 1;").is_ok();
    if !meta_is_queryable {
        return Ok(());
    }

    let wanted = version.to_string();
    let stored = self
        .conn
        .query("SELECT value FROM meta WHERE key = 'schema_version';")
        .ok()
        .and_then(|rows| {
            rows.first()
                .and_then(|row| row.get_typed::<String>(0).ok())
        });
    if stored.as_deref() == Some(wanted.as_str()) {
        return Ok(());
    }

    self.conn
        .execute_compat(
            "INSERT OR REPLACE INTO meta(key, value) VALUES('schema_version', ?1);",
            &[ParamValue::from(wanted)],
        )
        .with_context(|| "syncing meta schema_version")?;
    Ok(())
}
/// Returns a clone of the path stored in `db_path`.
///
/// This implementation always returns `Ok`.
pub fn database_path(&self) -> Result<PathBuf> {
Ok(self.db_path.clone())
}
/// Reads the ephemeral-writer preflight flag (relaxed atomic load).
pub(crate) fn ephemeral_writer_preflight_verified(&self) -> bool {
    let flag = &self.ephemeral_writer_preflight_verified;
    flag.load(Ordering::Relaxed)
}
/// Sets the ephemeral-writer preflight flag to `true` (relaxed atomic store).
pub(crate) fn mark_ephemeral_writer_preflight_verified(&self) {
self.ephemeral_writer_preflight_verified
.store(true, Ordering::Relaxed);
}
/// Returns the recorded index-writer checkpoint page count, or `None` while
/// the stored value is still `UNSET_INDEX_WRITER_CHECKPOINT_PAGES`.
pub(crate) fn index_writer_checkpoint_pages(&self) -> Option<i64> {
    let recorded = self.index_writer_checkpoint_pages.load(Ordering::Relaxed);
    if recorded == UNSET_INDEX_WRITER_CHECKPOINT_PAGES {
        None
    } else {
        Some(recorded)
    }
}
/// Records `pages` as the index-writer checkpoint page count (relaxed atomic
/// store). Only the cached value is updated; no pragma is executed here.
pub(crate) fn mark_index_writer_checkpoint_pages(&self, pages: i64) {
self.index_writer_checkpoint_pages
.store(pages, Ordering::Relaxed);
}
/// Returns the recorded index-writer busy timeout in milliseconds, or `None`
/// while the stored value is still `UNSET_INDEX_WRITER_BUSY_TIMEOUT_MS`.
pub(crate) fn index_writer_busy_timeout_ms(&self) -> Option<u64> {
    let recorded = self.index_writer_busy_timeout_ms.load(Ordering::Relaxed);
    if recorded == UNSET_INDEX_WRITER_BUSY_TIMEOUT_MS {
        None
    } else {
        Some(recorded)
    }
}
/// Records `timeout_ms` as the index-writer busy timeout (relaxed atomic
/// store). Only the cached value is updated; no pragma is executed here.
pub(crate) fn mark_index_writer_busy_timeout_ms(&self, timeout_ms: u64) {
self.index_writer_busy_timeout_ms
.store(timeout_ms, Ordering::Relaxed);
}
/// Opens the database at `path`, or backs it up and removes it when the
/// existing file cannot be used with the current schema.
///
/// The parent directory is created if needed. For an existing file the
/// schema compatibility check decides what happens:
/// - compatible / needs migration: fall through to a normal `Self::open`;
/// - needs rebuild, or a check error classified as requiring a rebuild:
///   create a backup, prune old backups down to `MAX_BACKUPS`, remove the
///   database files and return `MigrationError::RebuildRequired`;
/// - any other check error: return `MigrationError::Database`.
///
/// # Errors
/// Besides the cases above, returns I/O and backup errors via `?`, and
/// `MigrationError::Other` when the final open fails. The message of
/// `Other` is rendered with `{:#}` so that the underlying causes of the open
/// failure are kept instead of only the outermost context line.
pub fn open_or_rebuild(path: &Path) -> std::result::Result<Self, MigrationError> {
    if let Some(parent) = path.parent() {
        fs::create_dir_all(parent)?;
    }

    if path.exists() {
        // Both rebuild triggers share one backup-and-remove path below.
        let rebuild_reason = match check_schema_compatibility(path) {
            Ok(SchemaCheck::Compatible) | Ok(SchemaCheck::NeedsMigration) => None,
            Ok(SchemaCheck::NeedsRebuild(reason)) => Some(reason),
            Err(err) if schema_check_error_requires_rebuild(&err) => {
                Some(format!("Database appears corrupted: {err}"))
            }
            Err(err) => return Err(MigrationError::Database(err)),
        };

        if let Some(reason) = rebuild_reason {
            // Back up first so the old data survives the removal.
            let backup_path = create_backup(path)?;
            cleanup_old_backups(path, MAX_BACKUPS)?;
            remove_database_files(path)?;
            return Err(MigrationError::RebuildRequired {
                reason,
                backup_path,
            });
        }
    }

    Self::open(path).map_err(|e| MigrationError::Other(format!("{e:#}")))
}
}
/// Builds the runner for the migrations that precede the version-15
/// tail-state cache step: 13 (`full_schema_v13`) and 14 (`fts_contentless`).
fn build_cass_migrations_before_tail_cache() -> MigrationRunner {
    let runner = MigrationRunner::new();
    let runner = runner.add(13, "full_schema_v13", MIGRATION_FRESH_SCHEMA);
    runner.add(14, "fts_contentless", MIGRATION_V14)
}
/// Builds the runner for the migrations that follow the version-15
/// tail-state cache step: versions 16 through 20.
fn build_cass_migrations_after_tail_cache() -> MigrationRunner {
    let runner = MigrationRunner::new();
    let runner = runner.add(16, "drop_redundant_message_conv_idx", MIGRATION_V16);
    let runner = runner.add(17, "drop_message_created_idx", MIGRATION_V17);
    let runner = runner.add(18, "conversation_tail_state_hot_table", MIGRATION_V18);
    let runner = runner.add(19, "conversation_external_lookup", MIGRATION_V19);
    runner.add(20, "conversation_external_tail_lookup", MIGRATION_V20)
}
/// Reports whether `_schema_migrations` contains a row for `version`.
///
/// # Errors
/// Returns the query failure, wrapped with context naming the version.
fn schema_migration_is_applied(conn: &FrankenConnection, version: i64) -> Result<bool> {
    let params = [SqliteValue::from(version)];
    let matching_rows = conn
        .query_with_params(
            "SELECT 1 FROM _schema_migrations WHERE version = ?1 LIMIT 1;",
            &params,
        )
        .with_context(|| format!("checking schema migration version {version}"))?;
    Ok(!matching_rows.is_empty())
}
/// Applies schema migration 15 (conversation tail-state cache) inside a
/// `BEGIN IMMEDIATE` transaction.
///
/// Returns `Ok(false)` when version 15 is already recorded (the transaction
/// is committed without changes) and `Ok(true)` after applying it. Applying
/// means: add `last_message_idx` / `last_message_created_at` to
/// `conversations` when absent, run `MIGRATION_V15_TAIL_STATE_TABLE`, record
/// version 15 in `_schema_migrations`, and commit.
///
/// # Errors
/// Any failing step is returned with context. On error a `ROLLBACK` is
/// attempted and its own result is ignored.
fn apply_conversation_tail_state_cache_migration(conn: &FrankenConnection) -> Result<bool> {
    conn.execute("BEGIN IMMEDIATE;")
        .with_context(|| "starting v15 conversation tail-state migration transaction")?;

    let run_in_transaction = || -> Result<bool> {
        if schema_migration_is_applied(conn, 15)? {
            conn.execute("COMMIT;")
                .with_context(|| "committing already-applied v15 migration transaction")?;
            return Ok(false);
        }

        let started = Instant::now();
        let existing_columns = franken_table_column_names(conn, "conversations")
            .with_context(|| "inspecting conversations columns before v15 migration")?;

        // (column, ALTER statement, context attached when the ALTER fails)
        let column_additions = [
            (
                "last_message_idx",
                "ALTER TABLE conversations ADD COLUMN last_message_idx INTEGER;",
                "adding v15 conversations.last_message_idx column",
            ),
            (
                "last_message_created_at",
                "ALTER TABLE conversations ADD COLUMN last_message_created_at INTEGER;",
                "adding v15 conversations.last_message_created_at column",
            ),
        ];
        for (column, alter_sql, failure_context) in column_additions {
            if !existing_columns.contains(column) {
                conn.execute(alter_sql)
                    .with_context(|| failure_context)?;
            }
        }

        conn.execute_batch(MIGRATION_V15_TAIL_STATE_TABLE)
            .with_context(|| "applying v15 conversation tail-state table schema")?;
        conn.execute_compat(
            "INSERT INTO _schema_migrations (version, name) VALUES (?1, ?2);",
            fparams![15_i64, "conversation_tail_state_cache"],
        )
        .with_context(|| "recording v15 conversation tail-state migration")?;
        conn.execute("COMMIT;")
            .with_context(|| "committing v15 conversation tail-state migration")?;

        info!(
            elapsed_ms = started.elapsed().as_millis(),
            "applied v15 conversation tail-state cache migration"
        );
        Ok(true)
    };

    let outcome = run_in_transaction();
    if outcome.is_err() {
        let _ = conn.execute("ROLLBACK;");
    }
    outcome
}
/// Returns the set of column names of `table_name`, read via
/// `PRAGMA table_info`.
///
/// The table name is interpolated into the pragma text, so it is only
/// accepted when it is non-empty and consists solely of ASCII alphanumerics
/// and underscores.
///
/// # Errors
/// Returns an error for an empty or otherwise unsafe `table_name`, and when
/// the pragma query or the decoding of a column name fails.
fn franken_table_column_names(
    conn: &FrankenConnection,
    table_name: &str,
) -> Result<HashSet<String>> {
    // An empty name would pass the character check vacuously and yield the
    // malformed statement `PRAGMA table_info()`, so reject it explicitly.
    let is_safe_identifier = !table_name.is_empty()
        && table_name
            .chars()
            .all(|c| c.is_ascii_alphanumeric() || c == '_');
    if !is_safe_identifier {
        return Err(anyhow!(
            "unsafe table name for PRAGMA table_info: {table_name}"
        ));
    }

    let columns = conn
        .query_map_collect(
            &format!("PRAGMA table_info({table_name})"),
            fparams![],
            // Column 1 is the one decoded here as the column name.
            |row: &FrankenRow| row.get_typed::<String>(1),
        )
        .with_context(|| format!("reading PRAGMA table_info({table_name})"))?;
    Ok(columns.into_iter().collect())
}
/// SQL batch registered as migration 13 (`full_schema_v13`) by
/// `build_cass_migrations_before_tail_cache`.
///
/// Creates the core tables (`meta`, `agents`, `workspaces`, `sources`,
/// `conversations`, `messages`, `snippets`, `tags`, `conversation_tags`), the
/// stats/analytics tables and their indexes. Every `CREATE` uses
/// `IF NOT EXISTS`, and the seed rows (the `local` source and the default
/// `model_pricing` entries) use `INSERT OR IGNORE`. The `conversations` table
/// already includes the two V15 tail columns; see the SQL comment inside the
/// table definition for the reason.
const MIGRATION_FRESH_SCHEMA: &str = r"
-- Core tables (V1)
CREATE TABLE IF NOT EXISTS meta (
key TEXT PRIMARY KEY,
value TEXT NOT NULL
);
CREATE TABLE IF NOT EXISTS agents (
id INTEGER PRIMARY KEY,
slug TEXT NOT NULL UNIQUE,
name TEXT NOT NULL,
version TEXT,
kind TEXT NOT NULL,
created_at INTEGER NOT NULL,
updated_at INTEGER NOT NULL
);
CREATE TABLE IF NOT EXISTS workspaces (
id INTEGER PRIMARY KEY,
path TEXT NOT NULL UNIQUE,
display_name TEXT
);
-- Sources (V4)
CREATE TABLE IF NOT EXISTS sources (
id TEXT PRIMARY KEY,
kind TEXT NOT NULL,
host_label TEXT,
machine_id TEXT,
platform TEXT,
config_json TEXT,
created_at INTEGER NOT NULL,
updated_at INTEGER NOT NULL
);
INSERT OR IGNORE INTO sources (id, kind, host_label, created_at, updated_at)
VALUES ('local', 'local', NULL, strftime('%s','now')*1000, strftime('%s','now')*1000);
-- Conversations: V1 base + V5 provenance + V7 metadata_bin + V10 token summary
CREATE TABLE IF NOT EXISTS conversations (
id INTEGER PRIMARY KEY,
agent_id INTEGER NOT NULL REFERENCES agents(id),
workspace_id INTEGER REFERENCES workspaces(id),
source_id TEXT NOT NULL DEFAULT 'local' REFERENCES sources(id),
external_id TEXT,
title TEXT,
source_path TEXT NOT NULL,
started_at INTEGER,
ended_at INTEGER,
approx_tokens INTEGER,
metadata_json TEXT,
origin_host TEXT,
metadata_bin BLOB,
total_input_tokens INTEGER,
total_output_tokens INTEGER,
total_cache_read_tokens INTEGER,
total_cache_creation_tokens INTEGER,
grand_total_tokens INTEGER,
estimated_cost_usd REAL,
primary_model TEXT,
api_call_count INTEGER,
tool_call_count INTEGER,
user_message_count INTEGER,
assistant_message_count INTEGER,
-- V15 columns are included in the fresh schema so fresh DB creation does
-- not need ALTER TABLE on conversations. That ALTER path can duplicate
-- provenance autoindex state in frankensqlite when the named unique
-- provenance index already exists.
last_message_idx INTEGER,
last_message_created_at INTEGER
);
-- Named unique index avoids autoindex issues if table is ever recreated
CREATE UNIQUE INDEX IF NOT EXISTS idx_conversations_provenance
ON conversations(source_id, agent_id, external_id);
-- Messages: V1 base + V7 extra_bin
CREATE TABLE IF NOT EXISTS messages (
id INTEGER PRIMARY KEY,
conversation_id INTEGER NOT NULL REFERENCES conversations(id) ON DELETE CASCADE,
idx INTEGER NOT NULL,
role TEXT NOT NULL,
author TEXT,
created_at INTEGER,
content TEXT NOT NULL,
extra_json TEXT,
extra_bin BLOB,
UNIQUE(conversation_id, idx)
);
CREATE TABLE IF NOT EXISTS snippets (
id INTEGER PRIMARY KEY,
message_id INTEGER NOT NULL REFERENCES messages(id) ON DELETE CASCADE,
file_path TEXT,
start_line INTEGER,
end_line INTEGER,
language TEXT,
snippet_text TEXT
);
CREATE TABLE IF NOT EXISTS tags (
id INTEGER PRIMARY KEY,
name TEXT NOT NULL UNIQUE
);
CREATE TABLE IF NOT EXISTS conversation_tags (
conversation_id INTEGER NOT NULL REFERENCES conversations(id) ON DELETE CASCADE,
tag_id INTEGER NOT NULL REFERENCES tags(id) ON DELETE CASCADE,
PRIMARY KEY (conversation_id, tag_id)
);
-- Daily stats (V8)
CREATE TABLE IF NOT EXISTS daily_stats (
day_id INTEGER NOT NULL,
agent_slug TEXT NOT NULL,
source_id TEXT NOT NULL DEFAULT 'all',
session_count INTEGER NOT NULL DEFAULT 0,
message_count INTEGER NOT NULL DEFAULT 0,
total_chars INTEGER NOT NULL DEFAULT 0,
last_updated INTEGER NOT NULL,
PRIMARY KEY (day_id, agent_slug, source_id)
);
-- Embedding jobs (V9)
CREATE TABLE IF NOT EXISTS embedding_jobs (
id INTEGER PRIMARY KEY AUTOINCREMENT,
db_path TEXT NOT NULL,
model_id TEXT NOT NULL,
status TEXT NOT NULL DEFAULT 'pending',
total_docs INTEGER NOT NULL DEFAULT 0,
completed_docs INTEGER NOT NULL DEFAULT 0,
error_message TEXT,
created_at TEXT NOT NULL DEFAULT (datetime('now')),
started_at TEXT,
completed_at TEXT
);
CREATE UNIQUE INDEX IF NOT EXISTS idx_embedding_jobs_active
ON embedding_jobs(db_path, model_id)
WHERE status IN ('pending', 'running');
-- Token usage ledger (V10)
CREATE TABLE IF NOT EXISTS token_usage (
id INTEGER PRIMARY KEY AUTOINCREMENT,
message_id INTEGER NOT NULL REFERENCES messages(id) ON DELETE CASCADE,
conversation_id INTEGER NOT NULL,
agent_id INTEGER NOT NULL,
workspace_id INTEGER,
source_id TEXT NOT NULL DEFAULT 'local',
timestamp_ms INTEGER NOT NULL,
day_id INTEGER NOT NULL,
model_name TEXT,
model_family TEXT,
model_tier TEXT,
service_tier TEXT,
provider TEXT,
input_tokens INTEGER,
output_tokens INTEGER,
cache_read_tokens INTEGER,
cache_creation_tokens INTEGER,
thinking_tokens INTEGER,
total_tokens INTEGER,
estimated_cost_usd REAL,
role TEXT NOT NULL,
content_chars INTEGER NOT NULL,
has_tool_calls INTEGER NOT NULL DEFAULT 0,
tool_call_count INTEGER NOT NULL DEFAULT 0,
data_source TEXT NOT NULL DEFAULT 'api',
UNIQUE(message_id)
);
-- Token daily stats (V10)
CREATE TABLE IF NOT EXISTS token_daily_stats (
day_id INTEGER NOT NULL,
agent_slug TEXT NOT NULL,
source_id TEXT NOT NULL DEFAULT 'all',
model_family TEXT NOT NULL DEFAULT 'all',
api_call_count INTEGER NOT NULL DEFAULT 0,
user_message_count INTEGER NOT NULL DEFAULT 0,
assistant_message_count INTEGER NOT NULL DEFAULT 0,
tool_message_count INTEGER NOT NULL DEFAULT 0,
total_input_tokens INTEGER NOT NULL DEFAULT 0,
total_output_tokens INTEGER NOT NULL DEFAULT 0,
total_cache_read_tokens INTEGER NOT NULL DEFAULT 0,
total_cache_creation_tokens INTEGER NOT NULL DEFAULT 0,
total_thinking_tokens INTEGER NOT NULL DEFAULT 0,
grand_total_tokens INTEGER NOT NULL DEFAULT 0,
total_content_chars INTEGER NOT NULL DEFAULT 0,
total_tool_calls INTEGER NOT NULL DEFAULT 0,
estimated_cost_usd REAL NOT NULL DEFAULT 0.0,
session_count INTEGER NOT NULL DEFAULT 0,
last_updated INTEGER NOT NULL,
PRIMARY KEY (day_id, agent_slug, source_id, model_family)
);
-- Model pricing (V10)
CREATE TABLE IF NOT EXISTS model_pricing (
model_pattern TEXT NOT NULL,
provider TEXT NOT NULL,
input_cost_per_mtok REAL NOT NULL,
output_cost_per_mtok REAL NOT NULL,
cache_read_cost_per_mtok REAL,
cache_creation_cost_per_mtok REAL,
effective_date TEXT NOT NULL,
PRIMARY KEY (model_pattern, effective_date)
);
INSERT OR IGNORE INTO model_pricing VALUES
('claude-opus-4%', 'anthropic', 15.0, 75.0, 1.5, 18.75, '2025-10-01'),
('claude-sonnet-4%', 'anthropic', 3.0, 15.0, 0.3, 3.75, '2025-10-01'),
('claude-haiku-4%', 'anthropic', 0.80, 4.0, 0.08, 1.0, '2025-10-01'),
('gpt-4o%', 'openai', 2.50, 10.0, NULL, NULL, '2025-01-01'),
('gpt-4-turbo%', 'openai', 10.0, 30.0, NULL, NULL, '2024-04-01'),
('gpt-4.1%', 'openai', 2.0, 8.0, NULL, NULL, '2025-04-01'),
('o3%', 'openai', 2.0, 8.0, NULL, NULL, '2025-04-01'),
('o4-mini%', 'openai', 1.10, 4.40, NULL, NULL, '2025-04-01'),
('gemini-2%flash%', 'google', 0.075, 0.30, NULL, NULL, '2025-01-01'),
('gemini-2%pro%', 'google', 1.25, 10.0, NULL, NULL, '2025-01-01');
-- Message metrics: V11 base + V12 model dimensions
CREATE TABLE IF NOT EXISTS message_metrics (
message_id INTEGER PRIMARY KEY REFERENCES messages(id) ON DELETE CASCADE,
created_at_ms INTEGER NOT NULL,
hour_id INTEGER NOT NULL,
day_id INTEGER NOT NULL,
agent_slug TEXT NOT NULL,
workspace_id INTEGER NOT NULL DEFAULT 0,
source_id TEXT NOT NULL DEFAULT 'local',
role TEXT NOT NULL,
content_chars INTEGER NOT NULL,
content_tokens_est INTEGER NOT NULL,
api_input_tokens INTEGER,
api_output_tokens INTEGER,
api_cache_read_tokens INTEGER,
api_cache_creation_tokens INTEGER,
api_thinking_tokens INTEGER,
api_service_tier TEXT,
api_data_source TEXT NOT NULL DEFAULT 'estimated',
tool_call_count INTEGER NOT NULL DEFAULT 0,
has_tool_calls INTEGER NOT NULL DEFAULT 0,
has_plan INTEGER NOT NULL DEFAULT 0,
model_name TEXT,
model_family TEXT NOT NULL DEFAULT 'unknown',
model_tier TEXT NOT NULL DEFAULT 'unknown',
provider TEXT NOT NULL DEFAULT 'unknown'
);
-- Hourly rollups: V11 base + V13 plan columns
CREATE TABLE IF NOT EXISTS usage_hourly (
hour_id INTEGER NOT NULL,
agent_slug TEXT NOT NULL,
workspace_id INTEGER NOT NULL DEFAULT 0,
source_id TEXT NOT NULL DEFAULT 'local',
message_count INTEGER NOT NULL DEFAULT 0,
user_message_count INTEGER NOT NULL DEFAULT 0,
assistant_message_count INTEGER NOT NULL DEFAULT 0,
tool_call_count INTEGER NOT NULL DEFAULT 0,
plan_message_count INTEGER NOT NULL DEFAULT 0,
api_coverage_message_count INTEGER NOT NULL DEFAULT 0,
content_tokens_est_total INTEGER NOT NULL DEFAULT 0,
content_tokens_est_user INTEGER NOT NULL DEFAULT 0,
content_tokens_est_assistant INTEGER NOT NULL DEFAULT 0,
api_tokens_total INTEGER NOT NULL DEFAULT 0,
api_input_tokens_total INTEGER NOT NULL DEFAULT 0,
api_output_tokens_total INTEGER NOT NULL DEFAULT 0,
api_cache_read_tokens_total INTEGER NOT NULL DEFAULT 0,
api_cache_creation_tokens_total INTEGER NOT NULL DEFAULT 0,
api_thinking_tokens_total INTEGER NOT NULL DEFAULT 0,
last_updated INTEGER NOT NULL DEFAULT 0,
plan_content_tokens_est_total INTEGER NOT NULL DEFAULT 0,
plan_api_tokens_total INTEGER NOT NULL DEFAULT 0,
PRIMARY KEY (hour_id, agent_slug, workspace_id, source_id)
);
-- Daily rollups: V11 base + V13 plan columns
CREATE TABLE IF NOT EXISTS usage_daily (
day_id INTEGER NOT NULL,
agent_slug TEXT NOT NULL,
workspace_id INTEGER NOT NULL DEFAULT 0,
source_id TEXT NOT NULL DEFAULT 'local',
message_count INTEGER NOT NULL DEFAULT 0,
user_message_count INTEGER NOT NULL DEFAULT 0,
assistant_message_count INTEGER NOT NULL DEFAULT 0,
tool_call_count INTEGER NOT NULL DEFAULT 0,
plan_message_count INTEGER NOT NULL DEFAULT 0,
api_coverage_message_count INTEGER NOT NULL DEFAULT 0,
content_tokens_est_total INTEGER NOT NULL DEFAULT 0,
content_tokens_est_user INTEGER NOT NULL DEFAULT 0,
content_tokens_est_assistant INTEGER NOT NULL DEFAULT 0,
api_tokens_total INTEGER NOT NULL DEFAULT 0,
api_input_tokens_total INTEGER NOT NULL DEFAULT 0,
api_output_tokens_total INTEGER NOT NULL DEFAULT 0,
api_cache_read_tokens_total INTEGER NOT NULL DEFAULT 0,
api_cache_creation_tokens_total INTEGER NOT NULL DEFAULT 0,
api_thinking_tokens_total INTEGER NOT NULL DEFAULT 0,
last_updated INTEGER NOT NULL DEFAULT 0,
plan_content_tokens_est_total INTEGER NOT NULL DEFAULT 0,
plan_api_tokens_total INTEGER NOT NULL DEFAULT 0,
PRIMARY KEY (day_id, agent_slug, workspace_id, source_id)
);
-- Model daily rollups (V12)
CREATE TABLE IF NOT EXISTS usage_models_daily (
day_id INTEGER NOT NULL,
agent_slug TEXT NOT NULL,
workspace_id INTEGER NOT NULL DEFAULT 0,
source_id TEXT NOT NULL DEFAULT 'local',
model_family TEXT NOT NULL DEFAULT 'unknown',
model_tier TEXT NOT NULL DEFAULT 'unknown',
message_count INTEGER NOT NULL DEFAULT 0,
user_message_count INTEGER NOT NULL DEFAULT 0,
assistant_message_count INTEGER NOT NULL DEFAULT 0,
tool_call_count INTEGER NOT NULL DEFAULT 0,
plan_message_count INTEGER NOT NULL DEFAULT 0,
api_coverage_message_count INTEGER NOT NULL DEFAULT 0,
content_tokens_est_total INTEGER NOT NULL DEFAULT 0,
content_tokens_est_user INTEGER NOT NULL DEFAULT 0,
content_tokens_est_assistant INTEGER NOT NULL DEFAULT 0,
api_tokens_total INTEGER NOT NULL DEFAULT 0,
api_input_tokens_total INTEGER NOT NULL DEFAULT 0,
api_output_tokens_total INTEGER NOT NULL DEFAULT 0,
api_cache_read_tokens_total INTEGER NOT NULL DEFAULT 0,
api_cache_creation_tokens_total INTEGER NOT NULL DEFAULT 0,
api_thinking_tokens_total INTEGER NOT NULL DEFAULT 0,
last_updated INTEGER NOT NULL DEFAULT 0,
PRIMARY KEY (day_id, agent_slug, workspace_id, source_id, model_family, model_tier)
);
-- All indexes
CREATE INDEX IF NOT EXISTS idx_conversations_agent_started ON conversations(agent_id, started_at DESC);
CREATE INDEX IF NOT EXISTS idx_conversations_source_id ON conversations(source_id);
CREATE INDEX IF NOT EXISTS idx_conversations_source_path ON conversations(source_path);
CREATE INDEX IF NOT EXISTS idx_daily_stats_agent ON daily_stats(agent_slug, day_id);
CREATE INDEX IF NOT EXISTS idx_daily_stats_source ON daily_stats(source_id, day_id);
CREATE INDEX IF NOT EXISTS idx_token_usage_day ON token_usage(day_id, agent_id);
CREATE INDEX IF NOT EXISTS idx_token_usage_conv ON token_usage(conversation_id);
CREATE INDEX IF NOT EXISTS idx_token_usage_model ON token_usage(model_family, day_id);
CREATE INDEX IF NOT EXISTS idx_token_usage_workspace ON token_usage(workspace_id, day_id);
CREATE INDEX IF NOT EXISTS idx_token_usage_timestamp ON token_usage(timestamp_ms);
CREATE INDEX IF NOT EXISTS idx_token_daily_stats_agent ON token_daily_stats(agent_slug, day_id);
CREATE INDEX IF NOT EXISTS idx_token_daily_stats_model ON token_daily_stats(model_family, day_id);
CREATE INDEX IF NOT EXISTS idx_mm_hour ON message_metrics(hour_id);
CREATE INDEX IF NOT EXISTS idx_mm_day ON message_metrics(day_id);
CREATE INDEX IF NOT EXISTS idx_mm_agent_hour ON message_metrics(agent_slug, hour_id);
CREATE INDEX IF NOT EXISTS idx_mm_agent_day ON message_metrics(agent_slug, day_id);
CREATE INDEX IF NOT EXISTS idx_mm_workspace_hour ON message_metrics(workspace_id, hour_id);
CREATE INDEX IF NOT EXISTS idx_mm_source_hour ON message_metrics(source_id, hour_id);
CREATE INDEX IF NOT EXISTS idx_mm_model_family_day ON message_metrics(model_family, day_id);
CREATE INDEX IF NOT EXISTS idx_mm_provider_day ON message_metrics(provider, day_id);
CREATE INDEX IF NOT EXISTS idx_uh_agent ON usage_hourly(agent_slug, hour_id);
CREATE INDEX IF NOT EXISTS idx_uh_workspace ON usage_hourly(workspace_id, hour_id);
CREATE INDEX IF NOT EXISTS idx_uh_source ON usage_hourly(source_id, hour_id);
CREATE INDEX IF NOT EXISTS idx_ud_agent ON usage_daily(agent_slug, day_id);
CREATE INDEX IF NOT EXISTS idx_ud_workspace ON usage_daily(workspace_id, day_id);
CREATE INDEX IF NOT EXISTS idx_ud_source ON usage_daily(source_id, day_id);
CREATE INDEX IF NOT EXISTS idx_umd_model_day ON usage_models_daily(model_family, day_id);
CREATE INDEX IF NOT EXISTS idx_umd_agent_day ON usage_models_daily(agent_slug, day_id);
CREATE INDEX IF NOT EXISTS idx_umd_workspace_day ON usage_models_daily(workspace_id, day_id);
CREATE INDEX IF NOT EXISTS idx_umd_source_day ON usage_models_daily(source_id, day_id);
";
/// One named batch of repair SQL together with the tables it is associated
/// with.
///
/// `Debug` is derived so a batch can be printed in logs and error contexts.
#[derive(Debug, Clone, Copy)]
struct SchemaRepairBatch {
    /// Short label for the batch; used in error context when the batch fails.
    name: &'static str,
    /// Table names associated with this batch.
    tables: &'static [&'static str],
    /// SQL text executed as a single batch.
    sql: &'static str,
}
/// Repair SQL for the `sources` table: creates it if absent and seeds the
/// `local` source row with `INSERT OR IGNORE`, so re-running it leaves an
/// existing table and row in place.
const CURRENT_SCHEMA_REPAIR_SOURCES_SQL: &str = r"
CREATE TABLE IF NOT EXISTS sources (
id TEXT PRIMARY KEY,
kind TEXT NOT NULL,
host_label TEXT,
machine_id TEXT,
platform TEXT,
config_json TEXT,
created_at INTEGER NOT NULL,
updated_at INTEGER NOT NULL
);
INSERT OR IGNORE INTO sources (id, kind, host_label, created_at, updated_at)
VALUES ('local', 'local', NULL, strftime('%s','now')*1000, strftime('%s','now')*1000);
";
/// Repair SQL for the `daily_stats` table and its two indexes; every
/// statement uses `IF NOT EXISTS`.
const CURRENT_SCHEMA_REPAIR_DAILY_STATS_SQL: &str = r"
CREATE TABLE IF NOT EXISTS daily_stats (
day_id INTEGER NOT NULL,
agent_slug TEXT NOT NULL,
source_id TEXT NOT NULL DEFAULT 'all',
session_count INTEGER NOT NULL DEFAULT 0,
message_count INTEGER NOT NULL DEFAULT 0,
total_chars INTEGER NOT NULL DEFAULT 0,
last_updated INTEGER NOT NULL,
PRIMARY KEY (day_id, agent_slug, source_id)
);
CREATE INDEX IF NOT EXISTS idx_daily_stats_agent ON daily_stats(agent_slug, day_id);
CREATE INDEX IF NOT EXISTS idx_daily_stats_source ON daily_stats(source_id, day_id);
";
/// Repair SQL for `conversation_external_lookup`: creates the table if absent
/// and (re)populates it from every conversation with a non-NULL
/// `external_id`, using `INSERT OR REPLACE`.
///
/// The lookup key is built as
/// `<len(source_id)>:<source_id>:<agent_id>:<len(external_id)>:<external_id>`.
const CURRENT_SCHEMA_REPAIR_CONVERSATION_EXTERNAL_LOOKUP_SQL: &str = r"
CREATE TABLE IF NOT EXISTS conversation_external_lookup (
lookup_key TEXT PRIMARY KEY,
conversation_id INTEGER NOT NULL
);
INSERT OR REPLACE INTO conversation_external_lookup (lookup_key, conversation_id)
SELECT
CAST(length(source_id) AS TEXT) || ':' || source_id || ':' ||
CAST(agent_id AS TEXT) || ':' ||
CAST(length(external_id) AS TEXT) || ':' || external_id,
id
FROM conversations
WHERE external_id IS NOT NULL;
";
/// Repair SQL for `conversation_external_tail_lookup`.
///
/// Creates `conversation_tail_state` as well (if absent) because the
/// population query joins against it, then creates the lookup table and
/// (re)populates it with `INSERT OR REPLACE` for every conversation with a
/// non-NULL `external_id`. The join is a `LEFT JOIN`, so conversations
/// without a tail-state row get NULL tail columns. The lookup key has the
/// form `<len(source_id)>:<source_id>:<agent_id>:<len(external_id)>:<external_id>`.
const CURRENT_SCHEMA_REPAIR_CONVERSATION_EXTERNAL_TAIL_LOOKUP_SQL: &str = r"
CREATE TABLE IF NOT EXISTS conversation_tail_state (
conversation_id INTEGER PRIMARY KEY,
ended_at INTEGER,
last_message_idx INTEGER,
last_message_created_at INTEGER
);
CREATE TABLE IF NOT EXISTS conversation_external_tail_lookup (
lookup_key TEXT PRIMARY KEY,
conversation_id INTEGER NOT NULL,
ended_at INTEGER,
last_message_idx INTEGER,
last_message_created_at INTEGER
);
INSERT OR REPLACE INTO conversation_external_tail_lookup (
lookup_key,
conversation_id,
ended_at,
last_message_idx,
last_message_created_at
)
SELECT
CAST(length(c.source_id) AS TEXT) || ':' || c.source_id || ':' ||
CAST(c.agent_id AS TEXT) || ':' ||
CAST(length(c.external_id) AS TEXT) || ':' || c.external_id,
c.id,
ts.ended_at,
ts.last_message_idx,
ts.last_message_created_at
FROM conversations c
LEFT JOIN conversation_tail_state ts ON ts.conversation_id = c.id
WHERE c.external_id IS NOT NULL;
";
/// Repair SQL for the `embedding_jobs` table and its partial unique index on
/// `(db_path, model_id)` for rows whose status is `pending` or `running`;
/// both statements use `IF NOT EXISTS`.
const CURRENT_SCHEMA_REPAIR_EMBEDDING_JOBS_SQL: &str = r"
CREATE TABLE IF NOT EXISTS embedding_jobs (
id INTEGER PRIMARY KEY AUTOINCREMENT,
db_path TEXT NOT NULL,
model_id TEXT NOT NULL,
status TEXT NOT NULL DEFAULT 'pending',
total_docs INTEGER NOT NULL DEFAULT 0,
completed_docs INTEGER NOT NULL DEFAULT 0,
error_message TEXT,
created_at TEXT NOT NULL DEFAULT (datetime('now')),
started_at TEXT,
completed_at TEXT
);
CREATE UNIQUE INDEX IF NOT EXISTS idx_embedding_jobs_active
ON embedding_jobs(db_path, model_id)
WHERE status IN ('pending', 'running');
";
/// Repair SQL for the token analytics tables: `token_usage`,
/// `token_daily_stats` and `model_pricing`, plus their indexes.
///
/// All `CREATE` statements use `IF NOT EXISTS`, and the default
/// `model_pricing` rows are seeded with `INSERT OR IGNORE`, so existing
/// tables and pricing rows are left as they are.
const CURRENT_SCHEMA_REPAIR_TOKEN_ANALYTICS_SQL: &str = r"
CREATE TABLE IF NOT EXISTS token_usage (
id INTEGER PRIMARY KEY AUTOINCREMENT,
message_id INTEGER NOT NULL REFERENCES messages(id) ON DELETE CASCADE,
conversation_id INTEGER NOT NULL,
agent_id INTEGER NOT NULL,
workspace_id INTEGER,
source_id TEXT NOT NULL DEFAULT 'local',
timestamp_ms INTEGER NOT NULL,
day_id INTEGER NOT NULL,
model_name TEXT,
model_family TEXT,
model_tier TEXT,
service_tier TEXT,
provider TEXT,
input_tokens INTEGER,
output_tokens INTEGER,
cache_read_tokens INTEGER,
cache_creation_tokens INTEGER,
thinking_tokens INTEGER,
total_tokens INTEGER,
estimated_cost_usd REAL,
role TEXT NOT NULL,
content_chars INTEGER NOT NULL,
has_tool_calls INTEGER NOT NULL DEFAULT 0,
tool_call_count INTEGER NOT NULL DEFAULT 0,
data_source TEXT NOT NULL DEFAULT 'api',
UNIQUE(message_id)
);
CREATE INDEX IF NOT EXISTS idx_token_usage_day ON token_usage(day_id, agent_id);
CREATE INDEX IF NOT EXISTS idx_token_usage_conv ON token_usage(conversation_id);
CREATE INDEX IF NOT EXISTS idx_token_usage_model ON token_usage(model_family, day_id);
CREATE INDEX IF NOT EXISTS idx_token_usage_workspace ON token_usage(workspace_id, day_id);
CREATE INDEX IF NOT EXISTS idx_token_usage_timestamp ON token_usage(timestamp_ms);
CREATE TABLE IF NOT EXISTS token_daily_stats (
day_id INTEGER NOT NULL,
agent_slug TEXT NOT NULL,
source_id TEXT NOT NULL DEFAULT 'all',
model_family TEXT NOT NULL DEFAULT 'all',
api_call_count INTEGER NOT NULL DEFAULT 0,
user_message_count INTEGER NOT NULL DEFAULT 0,
assistant_message_count INTEGER NOT NULL DEFAULT 0,
tool_message_count INTEGER NOT NULL DEFAULT 0,
total_input_tokens INTEGER NOT NULL DEFAULT 0,
total_output_tokens INTEGER NOT NULL DEFAULT 0,
total_cache_read_tokens INTEGER NOT NULL DEFAULT 0,
total_cache_creation_tokens INTEGER NOT NULL DEFAULT 0,
total_thinking_tokens INTEGER NOT NULL DEFAULT 0,
grand_total_tokens INTEGER NOT NULL DEFAULT 0,
total_content_chars INTEGER NOT NULL DEFAULT 0,
total_tool_calls INTEGER NOT NULL DEFAULT 0,
estimated_cost_usd REAL NOT NULL DEFAULT 0.0,
session_count INTEGER NOT NULL DEFAULT 0,
last_updated INTEGER NOT NULL,
PRIMARY KEY (day_id, agent_slug, source_id, model_family)
);
CREATE INDEX IF NOT EXISTS idx_token_daily_stats_agent ON token_daily_stats(agent_slug, day_id);
CREATE INDEX IF NOT EXISTS idx_token_daily_stats_model ON token_daily_stats(model_family, day_id);
CREATE TABLE IF NOT EXISTS model_pricing (
model_pattern TEXT NOT NULL,
provider TEXT NOT NULL,
input_cost_per_mtok REAL NOT NULL,
output_cost_per_mtok REAL NOT NULL,
cache_read_cost_per_mtok REAL,
cache_creation_cost_per_mtok REAL,
effective_date TEXT NOT NULL,
PRIMARY KEY (model_pattern, effective_date)
);
INSERT OR IGNORE INTO model_pricing VALUES
('claude-opus-4%', 'anthropic', 15.0, 75.0, 1.5, 18.75, '2025-10-01'),
('claude-sonnet-4%', 'anthropic', 3.0, 15.0, 0.3, 3.75, '2025-10-01'),
('claude-haiku-4%', 'anthropic', 0.80, 4.0, 0.08, 1.0, '2025-10-01'),
('gpt-4o%', 'openai', 2.50, 10.0, NULL, NULL, '2025-01-01'),
('gpt-4-turbo%', 'openai', 10.0, 30.0, NULL, NULL, '2024-04-01'),
('gpt-4.1%', 'openai', 2.0, 8.0, NULL, NULL, '2025-04-01'),
('o3%', 'openai', 2.0, 8.0, NULL, NULL, '2025-04-01'),
('o4-mini%', 'openai', 1.10, 4.40, NULL, NULL, '2025-04-01'),
('gemini-2%flash%', 'google', 0.075, 0.30, NULL, NULL, '2025-01-01'),
('gemini-2%pro%', 'google', 1.25, 10.0, NULL, NULL, '2025-01-01');
";
/// SQL batch that creates the message-rollup tables (`message_metrics`,
/// `usage_hourly`, `usage_daily`, `usage_models_daily`) together with their
/// secondary indexes.
///
/// Every statement is guarded by `IF NOT EXISTS`, so objects that are already
/// present are left untouched when the batch is executed.
const CURRENT_SCHEMA_REPAIR_MESSAGE_METRICS_SQL: &str = r"
CREATE TABLE IF NOT EXISTS message_metrics (
message_id INTEGER PRIMARY KEY REFERENCES messages(id) ON DELETE CASCADE,
created_at_ms INTEGER NOT NULL,
hour_id INTEGER NOT NULL,
day_id INTEGER NOT NULL,
agent_slug TEXT NOT NULL,
workspace_id INTEGER NOT NULL DEFAULT 0,
source_id TEXT NOT NULL DEFAULT 'local',
role TEXT NOT NULL,
content_chars INTEGER NOT NULL,
content_tokens_est INTEGER NOT NULL,
api_input_tokens INTEGER,
api_output_tokens INTEGER,
api_cache_read_tokens INTEGER,
api_cache_creation_tokens INTEGER,
api_thinking_tokens INTEGER,
api_service_tier TEXT,
api_data_source TEXT NOT NULL DEFAULT 'estimated',
tool_call_count INTEGER NOT NULL DEFAULT 0,
has_tool_calls INTEGER NOT NULL DEFAULT 0,
has_plan INTEGER NOT NULL DEFAULT 0,
model_name TEXT,
model_family TEXT NOT NULL DEFAULT 'unknown',
model_tier TEXT NOT NULL DEFAULT 'unknown',
provider TEXT NOT NULL DEFAULT 'unknown'
);
CREATE TABLE IF NOT EXISTS usage_hourly (
hour_id INTEGER NOT NULL,
agent_slug TEXT NOT NULL,
workspace_id INTEGER NOT NULL DEFAULT 0,
source_id TEXT NOT NULL DEFAULT 'local',
message_count INTEGER NOT NULL DEFAULT 0,
user_message_count INTEGER NOT NULL DEFAULT 0,
assistant_message_count INTEGER NOT NULL DEFAULT 0,
tool_call_count INTEGER NOT NULL DEFAULT 0,
plan_message_count INTEGER NOT NULL DEFAULT 0,
api_coverage_message_count INTEGER NOT NULL DEFAULT 0,
content_tokens_est_total INTEGER NOT NULL DEFAULT 0,
content_tokens_est_user INTEGER NOT NULL DEFAULT 0,
content_tokens_est_assistant INTEGER NOT NULL DEFAULT 0,
api_tokens_total INTEGER NOT NULL DEFAULT 0,
api_input_tokens_total INTEGER NOT NULL DEFAULT 0,
api_output_tokens_total INTEGER NOT NULL DEFAULT 0,
api_cache_read_tokens_total INTEGER NOT NULL DEFAULT 0,
api_cache_creation_tokens_total INTEGER NOT NULL DEFAULT 0,
api_thinking_tokens_total INTEGER NOT NULL DEFAULT 0,
last_updated INTEGER NOT NULL DEFAULT 0,
plan_content_tokens_est_total INTEGER NOT NULL DEFAULT 0,
plan_api_tokens_total INTEGER NOT NULL DEFAULT 0,
PRIMARY KEY (hour_id, agent_slug, workspace_id, source_id)
);
CREATE TABLE IF NOT EXISTS usage_daily (
day_id INTEGER NOT NULL,
agent_slug TEXT NOT NULL,
workspace_id INTEGER NOT NULL DEFAULT 0,
source_id TEXT NOT NULL DEFAULT 'local',
message_count INTEGER NOT NULL DEFAULT 0,
user_message_count INTEGER NOT NULL DEFAULT 0,
assistant_message_count INTEGER NOT NULL DEFAULT 0,
tool_call_count INTEGER NOT NULL DEFAULT 0,
plan_message_count INTEGER NOT NULL DEFAULT 0,
api_coverage_message_count INTEGER NOT NULL DEFAULT 0,
content_tokens_est_total INTEGER NOT NULL DEFAULT 0,
content_tokens_est_user INTEGER NOT NULL DEFAULT 0,
content_tokens_est_assistant INTEGER NOT NULL DEFAULT 0,
api_tokens_total INTEGER NOT NULL DEFAULT 0,
api_input_tokens_total INTEGER NOT NULL DEFAULT 0,
api_output_tokens_total INTEGER NOT NULL DEFAULT 0,
api_cache_read_tokens_total INTEGER NOT NULL DEFAULT 0,
api_cache_creation_tokens_total INTEGER NOT NULL DEFAULT 0,
api_thinking_tokens_total INTEGER NOT NULL DEFAULT 0,
last_updated INTEGER NOT NULL DEFAULT 0,
plan_content_tokens_est_total INTEGER NOT NULL DEFAULT 0,
plan_api_tokens_total INTEGER NOT NULL DEFAULT 0,
PRIMARY KEY (day_id, agent_slug, workspace_id, source_id)
);
CREATE TABLE IF NOT EXISTS usage_models_daily (
day_id INTEGER NOT NULL,
agent_slug TEXT NOT NULL,
workspace_id INTEGER NOT NULL DEFAULT 0,
source_id TEXT NOT NULL DEFAULT 'local',
model_family TEXT NOT NULL DEFAULT 'unknown',
model_tier TEXT NOT NULL DEFAULT 'unknown',
message_count INTEGER NOT NULL DEFAULT 0,
user_message_count INTEGER NOT NULL DEFAULT 0,
assistant_message_count INTEGER NOT NULL DEFAULT 0,
tool_call_count INTEGER NOT NULL DEFAULT 0,
plan_message_count INTEGER NOT NULL DEFAULT 0,
api_coverage_message_count INTEGER NOT NULL DEFAULT 0,
content_tokens_est_total INTEGER NOT NULL DEFAULT 0,
content_tokens_est_user INTEGER NOT NULL DEFAULT 0,
content_tokens_est_assistant INTEGER NOT NULL DEFAULT 0,
api_tokens_total INTEGER NOT NULL DEFAULT 0,
api_input_tokens_total INTEGER NOT NULL DEFAULT 0,
api_output_tokens_total INTEGER NOT NULL DEFAULT 0,
api_cache_read_tokens_total INTEGER NOT NULL DEFAULT 0,
api_cache_creation_tokens_total INTEGER NOT NULL DEFAULT 0,
api_thinking_tokens_total INTEGER NOT NULL DEFAULT 0,
last_updated INTEGER NOT NULL DEFAULT 0,
PRIMARY KEY (day_id, agent_slug, workspace_id, source_id, model_family, model_tier)
);
CREATE INDEX IF NOT EXISTS idx_mm_hour ON message_metrics(hour_id);
CREATE INDEX IF NOT EXISTS idx_mm_day ON message_metrics(day_id);
CREATE INDEX IF NOT EXISTS idx_mm_agent_hour ON message_metrics(agent_slug, hour_id);
CREATE INDEX IF NOT EXISTS idx_mm_agent_day ON message_metrics(agent_slug, day_id);
CREATE INDEX IF NOT EXISTS idx_mm_workspace_hour ON message_metrics(workspace_id, hour_id);
CREATE INDEX IF NOT EXISTS idx_mm_source_hour ON message_metrics(source_id, hour_id);
CREATE INDEX IF NOT EXISTS idx_mm_model_family_day ON message_metrics(model_family, day_id);
CREATE INDEX IF NOT EXISTS idx_mm_provider_day ON message_metrics(provider, day_id);
CREATE INDEX IF NOT EXISTS idx_uh_agent ON usage_hourly(agent_slug, hour_id);
CREATE INDEX IF NOT EXISTS idx_uh_workspace ON usage_hourly(workspace_id, hour_id);
CREATE INDEX IF NOT EXISTS idx_uh_source ON usage_hourly(source_id, hour_id);
CREATE INDEX IF NOT EXISTS idx_ud_agent ON usage_daily(agent_slug, day_id);
CREATE INDEX IF NOT EXISTS idx_ud_workspace ON usage_daily(workspace_id, day_id);
CREATE INDEX IF NOT EXISTS idx_ud_source ON usage_daily(source_id, day_id);
CREATE INDEX IF NOT EXISTS idx_umd_model_day ON usage_models_daily(model_family, day_id);
CREATE INDEX IF NOT EXISTS idx_umd_agent_day ON usage_models_daily(agent_slug, day_id);
CREATE INDEX IF NOT EXISTS idx_umd_workspace_day ON usage_models_daily(workspace_id, day_id);
CREATE INDEX IF NOT EXISTS idx_umd_source_day ON usage_models_daily(source_id, day_id);
";
/// Registry of schema-repair batches.
///
/// Each entry pairs a SQL batch with the list of tables it is registered for.
/// `current_schema_repair_batches_for_missing_tables` scans this list in
/// declaration order and selects every batch that lists at least one missing
/// table, so a batch covering several tables is selected as a whole.
const CURRENT_SCHEMA_REPAIR_BATCHES: &[SchemaRepairBatch] = &[
SchemaRepairBatch {
name: "sources",
tables: &["sources"],
sql: CURRENT_SCHEMA_REPAIR_SOURCES_SQL,
},
SchemaRepairBatch {
name: "daily_stats",
tables: &["daily_stats"],
sql: CURRENT_SCHEMA_REPAIR_DAILY_STATS_SQL,
},
SchemaRepairBatch {
name: "conversation_external_lookup",
tables: &["conversation_external_lookup"],
sql: CURRENT_SCHEMA_REPAIR_CONVERSATION_EXTERNAL_LOOKUP_SQL,
},
// One batch registered for two tables: either one missing selects it.
SchemaRepairBatch {
name: "conversation_external_tail_lookup",
tables: &[
"conversation_tail_state",
"conversation_external_tail_lookup",
],
sql: CURRENT_SCHEMA_REPAIR_CONVERSATION_EXTERNAL_TAIL_LOOKUP_SQL,
},
SchemaRepairBatch {
name: "embedding_jobs",
tables: &["embedding_jobs"],
sql: CURRENT_SCHEMA_REPAIR_EMBEDDING_JOBS_SQL,
},
SchemaRepairBatch {
name: "token_analytics",
tables: &["token_usage", "token_daily_stats", "model_pricing"],
sql: CURRENT_SCHEMA_REPAIR_TOKEN_ANALYTICS_SQL,
},
SchemaRepairBatch {
name: "message_rollups",
tables: &[
"message_metrics",
"usage_hourly",
"usage_daily",
"usage_models_daily",
],
sql: CURRENT_SCHEMA_REPAIR_MESSAGE_METRICS_SQL,
},
];
/// Picks the repair batches needed to recreate `missing_tables`.
///
/// A batch from `CURRENT_SCHEMA_REPAIR_BATCHES` is selected when it lists at
/// least one of the missing tables; the result keeps registry order.
///
/// # Errors
///
/// Fails with the name of the first entry of `missing_tables` (in input
/// order) that none of the selected batches lists.
fn current_schema_repair_batches_for_missing_tables(
    missing_tables: &[&'static str],
) -> Result<Vec<&'static SchemaRepairBatch>> {
    let wanted: HashSet<&'static str> = missing_tables.iter().copied().collect();

    let selected_batches: Vec<&'static SchemaRepairBatch> = CURRENT_SCHEMA_REPAIR_BATCHES
        .iter()
        .filter(|batch| {
            batch
                .tables
                .iter()
                .any(|table_name| wanted.contains(table_name))
        })
        .collect();

    // Everything the selected batches can create, including tables that were
    // not reported missing.
    let covered: HashSet<&'static str> = selected_batches
        .iter()
        .flat_map(|batch| batch.tables.iter().copied())
        .collect();

    let uncovered = missing_tables
        .iter()
        .copied()
        .find(|table_name| !covered.contains(table_name));
    if let Some(table_name) = uncovered {
        return Err(anyhow!(
            "no current-schema repair batch registered for missing table {table_name}"
        ));
    }

    Ok(selected_batches)
}
/// `(version, name)` pairs for the 20 schema migrations, in ascending version
/// order.
///
/// `transition_from_meta_version` walks this list front to back and stops at
/// the first version above its cutoff, so entries must stay sorted by version.
const MIGRATION_NAMES: [(i64, &str); 20] = [
(1, "core_tables"),
(2, "fts_messages"),
(3, "fts_messages_rebuild"),
(4, "sources"),
(5, "provenance_columns"),
(6, "source_path_index"),
(7, "msgpack_columns"),
(8, "daily_stats"),
(9, "embedding_jobs"),
(10, "token_analytics"),
(11, "message_metrics"),
(12, "model_dimensions"),
(13, "plan_token_rollups"),
(14, "fts_contentless"),
(15, "conversation_tail_state_cache"),
(16, "drop_redundant_message_conv_idx"),
(17, "drop_message_created_idx"),
(18, "conversation_tail_state_hot_table"),
(19, "conversation_external_lookup"),
(20, "conversation_external_tail_lookup"),
];
/// Moves legacy schema-version tracking from the `meta` table into
/// `_schema_migrations`.
///
/// Decision sequence:
/// 1. `_schema_migrations` is already queryable: nothing to do.
/// 2. `meta` is not queryable: nothing to do.
/// 3. `meta.schema_version` is absent, not a parsable integer, or `0`: the
///    transition is skipped (with a log line when `conversations` is
///    queryable, since that combination is treated as a corrupted state).
/// 4. Otherwise `_schema_migrations` is created and one row per entry of
///    `MIGRATION_NAMES` is inserted, up to and including the cutoff version.
///    Recorded versions 10, 11 and 12 use 13 as the cutoff; every other
///    version is its own cutoff.
///
/// # Errors
///
/// Returns an error when reading `schema_version`, creating the table, or
/// inserting a backfill row fails.
fn transition_from_meta_version(conn: &FrankenConnection) -> Result<()> {
    // Already on the new tracking table.
    if conn
        .query("SELECT version FROM \"_schema_migrations\";")
        .is_ok()
    {
        return Ok(());
    }
    // No legacy table either: nothing to transition from.
    if conn.query("SELECT key FROM meta;").is_err() {
        return Ok(());
    }
    let rows = conn
        .query("SELECT value FROM meta WHERE key = 'schema_version';")
        .with_context(|| "reading schema_version from meta")?;
    // A missing row, a non-text value and an unparsable value all collapse to 0.
    let current_version: i64 = rows
        .first()
        .and_then(|row| row.get_typed::<String>(0).ok())
        .and_then(|s| s.parse().ok())
        .unwrap_or(0);
    if current_version == 0 {
        if conn.query("SELECT id FROM conversations LIMIT 1;").is_err() {
            return Ok(());
        }
        info!("meta.schema_version=0 but tables exist; skipping transition (corrupted state)");
        return Ok(());
    }
    info!(
        current_version,
        "transitioning schema tracking from meta table to _schema_migrations"
    );
    conn.execute(
        "CREATE TABLE IF NOT EXISTS _schema_migrations (\
         version INTEGER PRIMARY KEY, \
         name TEXT NOT NULL, \
         applied_at TEXT NOT NULL DEFAULT (strftime('%Y-%m-%dT%H:%M:%SZ', 'now'))\
         );",
    )
    .with_context(|| "creating _schema_migrations table for transition")?;
    // Legacy versions 10..=12 are recorded as having reached version 13.
    let backfill_through_version = if (10..13).contains(&current_version) {
        13
    } else {
        current_version
    };
    // MIGRATION_NAMES is sorted ascending, so the first version above the
    // cutoff ends the backfill.
    for &(version, name) in &MIGRATION_NAMES {
        if version > backfill_through_version {
            break;
        }
        conn.execute_compat(
            "INSERT INTO _schema_migrations (version, name) VALUES (?1, ?2);",
            &[ParamValue::from(version), ParamValue::from(name)],
        )
        .with_context(|| format!("backfilling _schema_migrations version {version}"))?;
    }
    info!(
        current_version,
        backfill_through_version,
        "schema version transition complete: backfilled legacy meta schema versions"
    );
    Ok(())
}
/// `(table name, probe query)` pairs, one per table listed here.
///
/// Each probe is a `SELECT ... LIMIT 1` naming one or more columns of the
/// table. NOTE(review): the consumer is outside this view; presumably a
/// failing probe marks the table as missing and feeds
/// `current_schema_repair_batches_for_missing_tables` — confirm at the call site.
const REQUIRED_CURRENT_SCHEMA_TABLE_PROBES: &[(&str, &str)] = &[
("sources", "SELECT id FROM sources LIMIT 1;"),
("daily_stats", "SELECT day_id FROM daily_stats LIMIT 1;"),
(
"conversation_external_lookup",
"SELECT lookup_key FROM conversation_external_lookup LIMIT 1;",
),
(
"conversation_tail_state",
"SELECT conversation_id FROM conversation_tail_state LIMIT 1;",
),
(
"conversation_external_tail_lookup",
"SELECT lookup_key, last_message_idx FROM conversation_external_tail_lookup LIMIT 1;",
),
("embedding_jobs", "SELECT id FROM embedding_jobs LIMIT 1;"),
("token_usage", "SELECT id FROM token_usage LIMIT 1;"),
(
"token_daily_stats",
"SELECT day_id FROM token_daily_stats LIMIT 1;",
),
(
"model_pricing",
"SELECT model_pattern FROM model_pricing LIMIT 1;",
),
(
"message_metrics",
"SELECT message_id FROM message_metrics LIMIT 1;",
),
("usage_hourly", "SELECT hour_id FROM usage_hourly LIMIT 1;"),
("usage_daily", "SELECT day_id FROM usage_daily LIMIT 1;"),
(
"usage_models_daily",
"SELECT day_id FROM usage_models_daily LIMIT 1;",
),
];
/// `(column name, SQL type)` pairs for token/usage summary columns.
///
/// NOTE(review): the name suggests these are columns expected on the
/// `conversations` table; the code that consumes this list is outside this
/// view — confirm there.
const REQUIRED_CONVERSATION_TOKEN_COLUMNS: &[(&str, &str)] = &[
("total_input_tokens", "INTEGER"),
("total_output_tokens", "INTEGER"),
("total_cache_read_tokens", "INTEGER"),
("total_cache_creation_tokens", "INTEGER"),
("grand_total_tokens", "INTEGER"),
("estimated_cost_usd", "REAL"),
("primary_model", "TEXT"),
("api_call_count", "INTEGER"),
("tool_call_count", "INTEGER"),
("user_message_count", "INTEGER"),
("assistant_message_count", "INTEGER"),
];
/// Returns `true` when the error's `Display` text contains `no such table`,
/// compared ASCII-case-insensitively.
fn error_indicates_missing_table(err: &impl std::fmt::Display) -> bool {
    let mut rendered = err.to_string();
    rendered.make_ascii_lowercase();
    rendered.contains("no such table")
}
/// Returns `true` when the error's `Display` text contains `no such column`,
/// compared ASCII-case-insensitively.
fn error_indicates_missing_column(err: &impl std::fmt::Display) -> bool {
    let mut rendered = err.to_string();
    rendered.make_ascii_lowercase();
    rendered.contains("no such column")
}
/// Batch size for orphan foreign-key cleanup: the number of ids bound into one
/// `IN (...)` delete statement and the `LIMIT` used when paging orphan ids.
const ORPHAN_FK_ID_CHUNK_SIZE: usize = 256;
/// Collects the ids of messages whose `conversation_id` has no matching row in
/// `conversations`.
///
/// The function reads the smallest and largest `conversation_id` present in
/// `messages`, lists the `conversations.id` values inside that range in
/// ascending order, and then gathers message ids for every id range that falls
/// between consecutive existing conversations (plus the range after the last
/// one). Returns an empty vector when `messages` has no rows.
///
/// # Errors
///
/// Propagates any query failure, with context naming the failing step.
fn collect_orphan_message_ids(conn: &FrankenConnection) -> Result<Vec<i64>> {
// Smallest referenced conversation id; no row at all means no messages.
let min_conversation_id = conn
.query_map_collect(
"SELECT conversation_id
FROM messages
ORDER BY conversation_id ASC
LIMIT 1",
fparams![],
|row| row.get_typed(0),
)
.context("finding minimum message conversation id for orphan FK cleanup")?
.into_iter()
.next();
let Some(min_conversation_id) = min_conversation_id else {
return Ok(Vec::new());
};
// Largest referenced conversation id.
let max_conversation_id: i64 = conn
.query_row_map(
"SELECT conversation_id
FROM messages
ORDER BY conversation_id DESC
LIMIT 1",
fparams![],
|row| row.get_typed(0),
)
.context("finding maximum message conversation id for orphan FK cleanup")?;
// Existing parents within the referenced range, ascending.
let parent_conversation_ids: Vec<i64> = conn
.query_map_collect(
"SELECT id
FROM conversations
WHERE id BETWEEN ?1 AND ?2
ORDER BY id",
fparams![min_conversation_id, max_conversation_id],
|row| row.get_typed(0),
)
.context("listing parent conversation ids for orphan FK cleanup")?;
let mut message_ids = Vec::new();
// `gap_start` is the lowest conversation id not yet matched to a parent.
let mut gap_start = min_conversation_id;
for parent_id in parent_conversation_ids {
// Defensive bounds: ids outside [gap_start, max] contribute nothing.
if parent_id < gap_start {
continue;
}
if parent_id > max_conversation_id {
break;
}
// Ids in gap_start..=parent_id-1 have no parent row.
if gap_start < parent_id {
collect_message_ids_for_conversation_gap(
conn,
gap_start,
parent_id.saturating_sub(1),
&mut message_ids,
)?;
}
// `parent_id + 1` would overflow; nothing can lie beyond i64::MAX anyway.
if parent_id == i64::MAX {
return Ok(message_ids);
}
gap_start = parent_id + 1;
}
// Trailing gap after the last existing parent.
if gap_start <= max_conversation_id {
collect_message_ids_for_conversation_gap(
conn,
gap_start,
max_conversation_id,
&mut message_ids,
)?;
}
Ok(message_ids)
}
/// Appends to `message_ids` the id of every message whose `conversation_id`
/// lies in the inclusive range `gap_start..=gap_end`.
///
/// A single-id range uses an equality predicate; wider ranges use `BETWEEN`.
///
/// # Errors
///
/// Fails when the query fails (with the gap bounds as context) or when a
/// returned id cannot be read as `i64`.
fn collect_message_ids_for_conversation_gap(
    conn: &FrankenConnection,
    gap_start: i64,
    gap_end: i64,
    message_ids: &mut Vec<i64>,
) -> Result<()> {
    let (sql, params) = if gap_start == gap_end {
        (
            "SELECT id FROM messages WHERE conversation_id = ?1",
            vec![SqliteValue::from(gap_start)],
        )
    } else {
        (
            "SELECT id FROM messages WHERE conversation_id BETWEEN ?1 AND ?2",
            vec![SqliteValue::from(gap_start), SqliteValue::from(gap_end)],
        )
    };
    let rows = conn.query_with_params(sql, &params).with_context(|| {
        format!("listing orphan message ids for conversation-id gap {gap_start}..={gap_end}")
    })?;
    // One id per row, so grow the output once up front.
    message_ids.reserve(rows.len());
    for row in rows {
        message_ids.push(row.get_typed(0)?);
    }
    Ok(())
}
/// Executes `<delete_many_sql_prefix> (?, ?, ...)` for `ids`, at most
/// `ORPHAN_FK_ID_CHUNK_SIZE` ids per statement, inside the caller's transaction.
///
/// The SQL text is built once for full-size chunks and once for the shorter
/// final chunk (when there is one), then reused across iterations.
///
/// Returns the sum of the counts reported by each statement; `0` for empty `ids`.
///
/// # Errors
///
/// Propagates the first statement failure; earlier statements are not undone
/// by this function.
fn delete_rows_by_i64_chunks(
    tx: &FrankenTransaction<'_>,
    delete_many_sql_prefix: &'static str,
    ids: &[i64],
) -> Result<usize> {
    if ids.is_empty() {
        return Ok(0);
    }
    let full_chunk_sql = delete_rows_by_i64_sql(delete_many_sql_prefix, ORPHAN_FK_ID_CHUNK_SIZE);
    // Only the last chunk can be short, and its length is `len % chunk size`.
    let tail_len = ids.len() % ORPHAN_FK_ID_CHUNK_SIZE;
    let tail_sql =
        (tail_len != 0).then(|| delete_rows_by_i64_sql(delete_many_sql_prefix, tail_len));
    let mut deleted = 0;
    for chunk in ids.chunks(ORPHAN_FK_ID_CHUNK_SIZE) {
        let sql = if chunk.len() == ORPHAN_FK_ID_CHUNK_SIZE {
            &full_chunk_sql
        } else {
            tail_sql.as_ref().unwrap_or(&full_chunk_sql)
        };
        let params = chunk
            .iter()
            .map(|id| SqliteValue::from(*id))
            .collect::<Vec<_>>();
        deleted += tx.execute_with_params(sql, &params)?;
    }
    Ok(deleted)
}
/// Builds `<delete_many_sql_prefix> (<count placeholders>)`: the prefix, one
/// space, then the parenthesised placeholder list from `sql_placeholders`.
fn delete_rows_by_i64_sql(delete_many_sql_prefix: &'static str, count: usize) -> String {
    let mut sql = String::from(delete_many_sql_prefix);
    sql.push_str(" (");
    sql.push_str(&sql_placeholders(count));
    sql.push(')');
    sql
}
/// Returns `count` anonymous `?` placeholders joined by `", "`; an empty
/// string when `count` is zero.
fn sql_placeholders(count: usize) -> String {
    let marks: Vec<&str> = std::iter::repeat("?").take(count).collect();
    marks.join(", ")
}
/// Deletes the given orphan messages in batches of `ORPHAN_FK_ID_CHUNK_SIZE`,
/// delegating each batch to `delete_orphan_message_id_chunk`.
///
/// Returns the saturating sum of the per-batch counts and stops at the first
/// batch that fails.
fn delete_orphan_message_ids_bisecting_oom(conn: &FrankenConnection, ids: &[i64]) -> Result<usize> {
    ids.chunks(ORPHAN_FK_ID_CHUNK_SIZE)
        .try_fold(0usize, |removed, batch| {
            delete_orphan_message_id_chunk(conn, batch).map(|count| removed.saturating_add(count))
        })
}
/// Deletes one batch of orphan messages, halving the batch and retrying each
/// half when the attempt fails with an out-of-memory error.
///
/// Errors that `is_out_of_memory_error` does not recognise, and out-of-memory
/// errors on a single id, are returned unchanged. Returns `0` for an empty batch.
fn delete_orphan_message_id_chunk(conn: &FrankenConnection, ids: &[i64]) -> Result<usize> {
    if ids.is_empty() {
        return Ok(0);
    }
    let err = match delete_orphan_message_id_chunk_once(conn, ids) {
        Ok(deleted) => return Ok(deleted),
        Err(err) => err,
    };
    // Only out-of-memory failures on a splittable batch are retried.
    if !is_out_of_memory_error(&err) || ids.len() <= 1 {
        return Err(err);
    }
    let (lower_half, upper_half) = ids.split_at(ids.len() / 2);
    tracing::warn!(
        target: "cass::fk_repair",
        rows = ids.len(),
        left = lower_half.len(),
        right = upper_half.len(),
        error = %err,
        "orphan-message cleanup ran out of memory; retrying as smaller batches"
    );
    let removed_lower = delete_orphan_message_id_chunk(conn, lower_half)?;
    let removed_upper = delete_orphan_message_id_chunk(conn, upper_half)?;
    Ok(removed_lower.saturating_add(removed_upper))
}
/// Deletes the given messages and the rows that reference them, in a single
/// transaction.
///
/// For every entry of `ORPHAN_MESSAGE_DEPENDENT_TABLES` the dependent rows are
/// deleted first; a dependent table reported as missing is skipped with a
/// debug log. The `messages` rows are deleted last, then the transaction is
/// committed.
///
/// Returns the saturating sum of dependent rows and message rows deleted.
///
/// # Errors
///
/// Any failure other than a missing dependent table returns early without
/// committing. NOTE(review): presumably dropping the uncommitted transaction
/// rolls it back — confirm against the `FrankenTransaction` implementation.
fn delete_orphan_message_id_chunk_once(conn: &FrankenConnection, ids: &[i64]) -> Result<usize> {
let mut tx = conn.transaction()?;
let mut deleted = 0usize;
// Children first, so no dependent row still points at a message being removed.
for entry in ORPHAN_MESSAGE_DEPENDENT_TABLES {
match delete_rows_by_i64_chunks(&tx, entry.delete_many_sql_prefix, ids) {
Ok(count) => {
deleted = deleted.saturating_add(count);
}
// A dependent table that does not exist has nothing to clean up.
Err(err) if error_indicates_missing_table(&err) => {
tracing::debug!(
target: "cass::fk_repair",
child_table = entry.child_table,
error = %err,
"skipping orphan-message dependent cleanup (table unavailable)"
);
}
Err(err) => {
return Err(err).with_context(|| {
format!(
"deleting rows from {} that depend on orphan messages",
entry.child_table
)
});
}
}
}
// Parents last.
deleted = deleted.saturating_add(
delete_rows_by_i64_chunks(&tx, "DELETE FROM messages WHERE id IN", ids)
.context("deleting orphan rows from messages")?,
);
tx.commit()?;
Ok(deleted)
}
fn collect_direct_orphan_id_page(
conn: &FrankenConnection,
entry: &'static OrphanFkTable,
) -> Result<Vec<i64>> {
Ok(conn.query_map_collect(
entry.orphan_id_page_sql,
fparams![i64::try_from(ORPHAN_FK_ID_CHUNK_SIZE).unwrap_or(i64::MAX)],
|row| row.get_typed(0),
)?)
}
/// Deletes orphan rows of `entry`'s child table in batches of
/// `ORPHAN_FK_ID_CHUNK_SIZE`, delegating each batch to
/// `delete_direct_orphan_id_chunk`.
///
/// Returns the saturating sum of the per-batch counts and stops at the first
/// batch that fails.
fn delete_direct_orphan_ids_bisecting_oom(
    conn: &FrankenConnection,
    entry: &'static OrphanFkTable,
    ids: &[i64],
) -> Result<usize> {
    ids.chunks(ORPHAN_FK_ID_CHUNK_SIZE)
        .try_fold(0usize, |removed, batch| {
            delete_direct_orphan_id_chunk(conn, entry, batch)
                .map(|count| removed.saturating_add(count))
        })
}
/// Deletes one batch of orphan rows from `entry`'s child table, halving the
/// batch and retrying each half when the attempt fails with an out-of-memory
/// error.
///
/// Errors that `is_out_of_memory_error` does not recognise, and out-of-memory
/// errors on a single id, are returned unchanged. Returns `0` for an empty batch.
fn delete_direct_orphan_id_chunk(
    conn: &FrankenConnection,
    entry: &'static OrphanFkTable,
    ids: &[i64],
) -> Result<usize> {
    if ids.is_empty() {
        return Ok(0);
    }
    let err = match delete_direct_orphan_id_chunk_once(conn, entry, ids) {
        Ok(deleted) => return Ok(deleted),
        Err(err) => err,
    };
    // Only out-of-memory failures on a splittable batch are retried.
    if !is_out_of_memory_error(&err) || ids.len() <= 1 {
        return Err(err);
    }
    let (lower_half, upper_half) = ids.split_at(ids.len() / 2);
    tracing::warn!(
        target: "cass::fk_repair",
        child_table = entry.child_table,
        rows = ids.len(),
        left = lower_half.len(),
        right = upper_half.len(),
        error = %err,
        "direct orphan cleanup ran out of memory; retrying as smaller batches"
    );
    let removed_lower = delete_direct_orphan_id_chunk(conn, entry, lower_half)?;
    let removed_upper = delete_direct_orphan_id_chunk(conn, entry, upper_half)?;
    Ok(removed_lower.saturating_add(removed_upper))
}
/// Deletes the rows of `entry`'s child table matching `ids` in one
/// transaction and commits it.
///
/// Returns the count reported by `delete_rows_by_i64_chunks`. On failure the
/// function returns before `commit` is reached.
fn delete_direct_orphan_id_chunk_once(
conn: &FrankenConnection,
entry: &'static OrphanFkTable,
ids: &[i64],
) -> Result<usize> {
let mut tx = conn.transaction()?;
let deleted = delete_rows_by_i64_chunks(&tx, entry.delete_many_sql_prefix, ids)?;
tx.commit()?;
Ok(deleted)
}
/// Describes one child table that orphan cleanup scans directly.
struct OrphanFkTable {
/// Table name, used in log fields.
child_table: &'static str,
/// Query returning one page of orphaned key values; takes the page size as `?1`.
orphan_id_page_sql: &'static str,
/// `DELETE ... WHERE <key> IN` prefix; the parenthesised placeholder list is appended to it.
delete_many_sql_prefix: &'static str,
}
/// Child tables checked directly for rows whose parent row no longer exists.
///
/// The first three entries look for `message_id` values with no matching
/// `messages.id`; the last looks for `conversation_id` values with no matching
/// `conversations.id`. Each page query orders by the key and limits to `?1` rows.
const ORPHAN_DIRECT_CHILD_TABLES: &[OrphanFkTable] = &[
OrphanFkTable {
child_table: "message_metrics",
orphan_id_page_sql: "SELECT message_id FROM message_metrics \
WHERE NOT EXISTS (\
SELECT 1 FROM messages \
WHERE messages.id = message_metrics.message_id\
) \
ORDER BY message_id \
LIMIT ?1",
delete_many_sql_prefix: "DELETE FROM message_metrics WHERE message_id IN",
},
OrphanFkTable {
child_table: "token_usage",
orphan_id_page_sql: "SELECT message_id FROM token_usage \
WHERE NOT EXISTS (\
SELECT 1 FROM messages \
WHERE messages.id = token_usage.message_id\
) \
ORDER BY message_id \
LIMIT ?1",
delete_many_sql_prefix: "DELETE FROM token_usage WHERE message_id IN",
},
OrphanFkTable {
child_table: "snippets",
orphan_id_page_sql: "SELECT message_id FROM snippets \
WHERE NOT EXISTS (\
SELECT 1 FROM messages \
WHERE messages.id = snippets.message_id\
) \
ORDER BY message_id \
LIMIT ?1",
delete_many_sql_prefix: "DELETE FROM snippets WHERE message_id IN",
},
OrphanFkTable {
child_table: "conversation_tags",
orphan_id_page_sql: "SELECT conversation_id FROM conversation_tags \
WHERE NOT EXISTS (\
SELECT 1 FROM conversations \
WHERE conversations.id = conversation_tags.conversation_id\
) \
ORDER BY conversation_id \
LIMIT ?1",
delete_many_sql_prefix: "DELETE FROM conversation_tags WHERE conversation_id IN",
},
];
/// Describes a table whose rows are deleted alongside the orphan messages they
/// reference.
struct OrphanMessageDependentTable {
/// Table name, used in log fields and error context.
child_table: &'static str,
/// `DELETE ... WHERE message_id IN` prefix; the placeholder list is appended to it.
delete_many_sql_prefix: &'static str,
}
/// Tables cleared by `message_id` before the orphan `messages` rows themselves
/// are deleted (see `delete_orphan_message_id_chunk_once`).
const ORPHAN_MESSAGE_DEPENDENT_TABLES: &[OrphanMessageDependentTable] = &[
OrphanMessageDependentTable {
child_table: "message_metrics",
delete_many_sql_prefix: "DELETE FROM message_metrics WHERE message_id IN",
},
OrphanMessageDependentTable {
child_table: "token_usage",
delete_many_sql_prefix: "DELETE FROM token_usage WHERE message_id IN",
},
OrphanMessageDependentTable {
child_table: "snippets",
delete_many_sql_prefix: "DELETE FROM snippets WHERE message_id IN",
},
];
/// Running tally of rows removed by orphan foreign-key cleanup.
#[derive(Debug, Default, Clone)]
pub(crate) struct OrphanFkCleanupReport {
    /// Saturating sum of every recorded count.
    pub total: i64,
    /// One `(table, count)` entry per table, in first-recorded order.
    pub per_table: Vec<(&'static str, i64)>,
}

impl OrphanFkCleanupReport {
    /// Adds `count` to the entry for `child_table` (appending a new entry when
    /// the table has not been seen yet) and to `total`. Both additions saturate.
    fn record(&mut self, child_table: &'static str, count: i64) {
        let slot = self
            .per_table
            .iter()
            .position(|&(table, _)| table == child_table);
        match slot {
            Some(index) => {
                let tally = &mut self.per_table[index].1;
                *tally = tally.saturating_add(count);
            }
            None => self.per_table.push((child_table, count)),
        }
        self.total = self.total.saturating_add(count);
    }
}
/// Result of inserting a conversation and its messages.
///
/// NOTE(review): the producer is outside this view; field meanings below are
/// taken from the names — confirm against the code that builds this struct.
pub struct InsertOutcome {
/// Row id of the conversation the messages belong to.
pub conversation_id: i64,
/// Presumably `true` when a new conversation row was created rather than an existing one reused.
pub conversation_inserted: bool,
/// Presumably the `idx` values of the messages that were actually inserted.
pub inserted_indices: Vec<i64>,
}
/// Test-only counters and timings for the sub-stages of message insertion.
///
/// NOTE(review): the code that fills these fields is outside this view; the
/// durations are reported by `InsertConversationTreePerfProfile::log_summary`
/// as `msg_payload_ms`, `msg_sql_ms`, `msg_param_ms`, `msg_execute_ms` and
/// `msg_rowid_ms`.
#[cfg(test)]
#[derive(Debug, Clone, Default)]
struct MessageInsertSubstageProfile {
single_row_calls: usize,
batch_calls: usize,
batch_rows: usize,
payload_duration: Duration,
sql_build_duration: Duration,
param_build_duration: Duration,
execute_duration: Duration,
rowid_duration: Duration,
}
/// Test-only accumulated timings for the stages of inserting a conversation
/// tree.
///
/// `log_summary` prints every duration in milliseconds, divides selected ones
/// by `invocations` for per-call averages, and reports whatever part of
/// `total_duration` the stage durations do not account for as a residual.
#[cfg(test)]
#[derive(Debug, Clone, Default)]
struct InsertConversationTreePerfProfile {
/// Number of calls profiled; the divisor for per-call averages.
invocations: usize,
messages: usize,
inserted_messages: usize,
/// Overall duration that the stage durations below are compared against.
total_duration: Duration,
source_duration: Duration,
tx_open_duration: Duration,
existing_lookup_duration: Duration,
existing_idx_lookup_duration: Duration,
existing_replay_lookup_duration: Duration,
dedupe_filter_duration: Duration,
conversation_row_duration: Duration,
message_insert_duration: Duration,
/// Finer-grained timings reported next to `message_insert_duration`.
message_insert_breakdown: MessageInsertSubstageProfile,
snippet_insert_duration: Duration,
fts_entry_duration: Duration,
fts_flush_duration: Duration,
analytics_duration: Duration,
commit_duration: Duration,
}
#[cfg(test)]
impl InsertConversationTreePerfProfile {
/// Converts a duration to fractional milliseconds.
fn millis(duration: Duration) -> f64 {
duration.as_secs_f64() * 1000.0
}
/// Writes a single `CASS_INSERT_TREE_STAGE_PROFILE` line to stderr with
/// `key=value` pairs for every stage, tagged with `label`.
///
/// Averages divide by `invocations`, clamped to at least 1 so an empty
/// profile does not divide by zero. `residual_ms` is `total_duration` minus
/// the sum of the top-level stage durations, saturating at zero; the
/// `message_insert_breakdown` durations are printed but not part of that sum.
fn log_summary(&self, label: &str) {
let calls = self.invocations.max(1) as f64;
let accounted_duration = self.source_duration
+ self.tx_open_duration
+ self.existing_lookup_duration
+ self.existing_idx_lookup_duration
+ self.existing_replay_lookup_duration
+ self.dedupe_filter_duration
+ self.conversation_row_duration
+ self.message_insert_duration
+ self.snippet_insert_duration
+ self.fts_entry_duration
+ self.fts_flush_duration
+ self.analytics_duration
+ self.commit_duration;
let residual_duration = self.total_duration.saturating_sub(accounted_duration);
// Argument order below must match the placeholder order in the format string.
eprintln!(
concat!(
"CASS_INSERT_TREE_STAGE_PROFILE ",
"label={} calls={} messages={} inserted_messages={} ",
"total_ms={:.3} source_ms={:.3} tx_open_ms={:.3} existing_lookup_ms={:.3} ",
"existing_idx_lookup_ms={:.3} existing_replay_lookup_ms={:.3} dedupe_filter_ms={:.3} ",
"conversation_row_ms={:.3} message_insert_ms={:.3} snippet_insert_ms={:.3} ",
"fts_entry_ms={:.3} fts_flush_ms={:.3} analytics_ms={:.3} commit_ms={:.3} ",
"msg_payload_ms={:.3} msg_sql_ms={:.3} msg_param_ms={:.3} msg_execute_ms={:.3} msg_rowid_ms={:.3} ",
"residual_ms={:.3} avg_total_ms={:.3} avg_message_insert_ms={:.3} ",
"avg_msg_execute_ms={:.3} avg_msg_payload_ms={:.3} avg_snippet_insert_ms={:.3} avg_fts_entry_ms={:.3} avg_commit_ms={:.3}"
),
label,
self.invocations,
self.messages,
self.inserted_messages,
Self::millis(self.total_duration),
Self::millis(self.source_duration),
Self::millis(self.tx_open_duration),
Self::millis(self.existing_lookup_duration),
Self::millis(self.existing_idx_lookup_duration),
Self::millis(self.existing_replay_lookup_duration),
Self::millis(self.dedupe_filter_duration),
Self::millis(self.conversation_row_duration),
Self::millis(self.message_insert_duration),
Self::millis(self.snippet_insert_duration),
Self::millis(self.fts_entry_duration),
Self::millis(self.fts_flush_duration),
Self::millis(self.analytics_duration),
Self::millis(self.commit_duration),
Self::millis(self.message_insert_breakdown.payload_duration),
Self::millis(self.message_insert_breakdown.sql_build_duration),
Self::millis(self.message_insert_breakdown.param_build_duration),
Self::millis(self.message_insert_breakdown.execute_duration),
Self::millis(self.message_insert_breakdown.rowid_duration),
Self::millis(residual_duration),
Self::millis(self.total_duration) / calls,
Self::millis(self.message_insert_duration) / calls,
Self::millis(self.message_insert_breakdown.execute_duration) / calls,
Self::millis(self.message_insert_breakdown.payload_duration) / calls,
Self::millis(self.snippet_insert_duration) / calls,
Self::millis(self.fts_entry_duration) / calls,
Self::millis(self.commit_duration) / calls,
);
}
}
/// Hashable identity used to look up a conversation.
///
/// `franken_find_existing_conversation_with_tail_by_key` resolves the
/// `External` variant through the external lookup key and every other key
/// through `franken_find_existing_conversation_by_key`.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
enum PendingConversationKey {
/// Identified by source, agent and an external id.
External {
source_id: String,
agent_id: i64,
external_id: String,
},
/// Identified by source, agent, source path and an optional start timestamp.
SourcePath {
source_id: String,
agent_id: i64,
source_path: String,
started_at: Option<i64>,
},
}
/// Builds the lookup key
/// `<len(source_id)>:<source_id>:<agent_id>:<len(external_id)>:<external_id>`.
///
/// Lengths are counted in `char`s (Unicode scalar values), not bytes.
/// NOTE(review): the length prefixes presumably keep `:` inside either string
/// from making two distinct inputs produce the same key.
fn conversation_external_lookup_key(source_id: &str, agent_id: i64, external_id: &str) -> String {
    let source_len = source_id.chars().count();
    let external_len = external_id.chars().count();
    format!("{source_len}:{source_id}:{agent_id}:{external_len}:{external_id}")
}
/// Returns the external lookup key for `conv` under `agent_id`, or `None` when
/// the conversation has no `external_id`.
fn conversation_external_lookup_key_for_conv(agent_id: i64, conv: &Conversation) -> Option<String> {
    let external_id = conv.external_id.as_deref()?;
    Some(conversation_external_lookup_key(
        &conv.source_id,
        agent_id,
        external_id,
    ))
}
/// Identity of a message including its position: index, timestamp, role,
/// author and a hash of the content. Built by `message_merge_fingerprint`.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
struct MessageMergeFingerprint {
idx: i64,
created_at: Option<i64>,
role: MessageRole,
author: Option<String>,
/// BLAKE3 hash of the message content bytes.
content_hash: [u8; 32],
}
/// Identity of a message that ignores its position: the same fields as
/// `MessageMergeFingerprint` without `idx`, so the same message stored at a
/// different index compares equal.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
struct MessageReplayFingerprint {
created_at: Option<i64>,
role: MessageRole,
author: Option<String>,
/// BLAKE3 hash of the message content bytes.
content_hash: [u8; 32],
}
/// Measurements used when deciding whether two conversations are the same.
///
/// NOTE(review): the code that fills and reads this struct is outside this
/// view; the field notes below are inferred from the names — confirm there.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
struct ConversationMergeEvidence {
/// Presumably the number of shared `MessageMergeFingerprint`s.
exact_overlap: usize,
/// Presumably the number of shared `MessageReplayFingerprint`s.
replay_overlap: usize,
/// Presumably the size of the smaller of the two replay-fingerprint sets.
smaller_replay_set: usize,
/// Presumably whether the two start times are within some tolerance.
started_close: bool,
/// Presumably the distance between the two start times, in milliseconds.
start_distance_ms: i64,
}
/// Output of `collect_new_messages_for_existing_conversation`.
struct ExistingConversationNewMessages<'a> {
/// Incoming messages not already present, in incoming order.
messages: Vec<&'a Message>,
/// Sum of `content.len()` (UTF-8 byte length) over `messages`.
new_chars: i64,
/// Incoming messages whose `idx` was already taken by a different fingerprint.
idx_collision_count: usize,
/// `idx` of the first such collision, if any.
first_collision_idx: Option<i64>,
}
/// Tail information for a stored conversation: index and timestamp of its
/// last message, plus the conversation's `ended_at` when one is recorded.
#[derive(Debug, Clone, Copy)]
struct ExistingConversationTailState {
last_message_idx: i64,
last_message_created_at: i64,
ended_at: Option<i64>,
}
/// A stored conversation's row id together with its tail state, when one
/// could be determined.
#[derive(Debug, Clone, Copy)]
struct ExistingConversationWithTail {
id: i64,
/// `None` when no tail state is available for the conversation.
tail_state: Option<ExistingConversationTailState>,
}
/// Returns the conversation's `started_at`, falling back to the earliest
/// message `created_at` when it is unset. `None` when neither is available.
fn conversation_effective_started_at(conv: &Conversation) -> Option<i64> {
    if let Some(started_at) = conv.started_at {
        return Some(started_at);
    }
    conv.messages
        .iter()
        .filter_map(|msg| msg.created_at)
        .min()
}
/// Returns `(largest message idx, latest message created_at)` over the
/// conversation's messages. Each component is `None` when no message supplies
/// a value for it.
fn conversation_tail_state(conv: &Conversation) -> (Option<i64>, Option<i64>) {
    let last_idx = conv.messages.iter().map(|msg| msg.idx).max();
    let last_created_at = conv
        .messages
        .iter()
        .filter_map(|msg| msg.created_at)
        .max();
    (last_idx, last_created_at)
}
/// Returns `(largest idx, latest created_at)` over a slice of borrowed
/// messages. Each component is `None` when no message supplies a value for it.
fn borrowed_messages_tail_state(messages: &[&Message]) -> (Option<i64>, Option<i64>) {
    let last_idx = messages.iter().map(|msg| msg.idx).max();
    let last_created_at = messages
        .iter()
        .filter_map(|msg| msg.created_at)
        .max();
    (last_idx, last_created_at)
}
/// Maps a stored role string to a `MessageRole`.
///
/// `"agent"` and `"assistant"` both map to `MessageRole::Agent`; any string
/// that is not recognised is preserved in `MessageRole::Other`. Matching is
/// exact and case-sensitive.
fn role_from_str(role: &str) -> MessageRole {
    if role == "user" {
        MessageRole::User
    } else if role == "agent" || role == "assistant" {
        MessageRole::Agent
    } else if role == "tool" {
        MessageRole::Tool
    } else if role == "system" {
        MessageRole::System
    } else {
        MessageRole::Other(role.to_string())
    }
}
/// Builds the position-aware fingerprint of a message: its `idx`,
/// `created_at`, role, author and the BLAKE3 hash of its content bytes.
fn message_merge_fingerprint(msg: &Message) -> MessageMergeFingerprint {
    let digest = blake3::hash(msg.content.as_bytes());
    let role = msg.role.clone();
    let author = msg.author.clone();
    MessageMergeFingerprint {
        idx: msg.idx,
        created_at: msg.created_at,
        role,
        author,
        content_hash: *digest.as_bytes(),
    }
}
/// Builds the position-independent fingerprint of a message: its
/// `created_at`, role, author and the BLAKE3 hash of its content bytes.
fn message_replay_fingerprint(msg: &Message) -> MessageReplayFingerprint {
    let digest = blake3::hash(msg.content.as_bytes());
    let role = msg.role.clone();
    let author = msg.author.clone();
    MessageReplayFingerprint {
        created_at: msg.created_at,
        role,
        author,
        content_hash: *digest.as_bytes(),
    }
}
fn conversation_message_fingerprints(conv: &Conversation) -> HashSet<MessageMergeFingerprint> {
conv.messages
.iter()
.map(message_merge_fingerprint)
.collect()
}
fn conversation_message_replay_fingerprints(
conv: &Conversation,
) -> HashSet<MessageReplayFingerprint> {
conv.messages
.iter()
.map(message_replay_fingerprint)
.collect()
}
fn replay_fingerprint_from_merge(
fingerprint: &MessageMergeFingerprint,
) -> MessageReplayFingerprint {
MessageReplayFingerprint {
created_at: fingerprint.created_at,
role: fingerprint.role.clone(),
author: fingerprint.author.clone(),
content_hash: fingerprint.content_hash,
}
}
fn replay_fingerprints_from_merge_set(
fingerprints: &HashSet<MessageMergeFingerprint>,
) -> HashSet<MessageReplayFingerprint> {
fingerprints
.iter()
.map(replay_fingerprint_from_merge)
.collect()
}
/// Filters `conv.messages` down to those not yet stored for an existing
/// conversation.
///
/// A message is skipped when its `idx` is already present in
/// `existing_messages` (counted as a collision when the stored fingerprint
/// differs) or when its position-independent fingerprint is already in
/// `existing_replay_fingerprints` (logged at debug level with
/// `replay_skip_log` as the message). Every accepted message is added to both
/// lookup structures, so duplicates later in the same batch are skipped too.
///
/// `conversation_id` is used only as a log field.
fn collect_new_messages_for_existing_conversation<'a>(
conversation_id: i64,
conv: &'a Conversation,
existing_messages: &mut HashMap<i64, MessageMergeFingerprint>,
existing_replay_fingerprints: &mut HashSet<MessageReplayFingerprint>,
replay_skip_log: &'static str,
) -> ExistingConversationNewMessages<'a> {
let mut idx_collision_count = 0usize;
let mut first_collision_idx: Option<i64> = None;
let mut new_chars: i64 = 0;
let mut messages = Vec::new();
for msg in &conv.messages {
let incoming_fingerprint = message_merge_fingerprint(msg);
// Same idx already stored: never re-insert; record a collision if the content differs.
if let Some(existing_fingerprint) = existing_messages.get(&msg.idx) {
if existing_fingerprint != &incoming_fingerprint {
idx_collision_count = idx_collision_count.saturating_add(1);
first_collision_idx.get_or_insert(msg.idx);
}
continue;
}
// Same message already stored under another idx.
let incoming_replay = replay_fingerprint_from_merge(&incoming_fingerprint);
if existing_replay_fingerprints.contains(&incoming_replay) {
tracing::debug!(
conversation_id,
idx = msg.idx,
source_path = %conv.source_path.display(),
"{replay_skip_log}"
);
continue;
}
// Accepted: remember it so later messages in this batch dedupe against it.
existing_messages.insert(msg.idx, incoming_fingerprint);
existing_replay_fingerprints.insert(incoming_replay);
// `len()` is the UTF-8 byte length of the content.
new_chars += msg.content.len() as i64;
messages.push(msg);
}
ExistingConversationNewMessages {
messages,
new_chars,
idx_collision_count,
first_collision_idx,
}
}
/// Resolves the tail state (last message index and timestamp, plus
/// `ended_at`) of an existing conversation, trying three sources in order:
///
/// 1. the `conversation_tail_state` row for the conversation;
/// 2. the tail columns on the `conversations` row, which are then written to
///    `conversation_tail_state` via `franken_insert_conversation_tail_state`;
/// 3. `MAX(idx)` / `MAX(created_at)` over the conversation's messages, which
///    are then passed to `franken_update_conversation_tail_state`.
///
/// A source is usable only when both the index and the timestamp are non-null.
/// Returns `Ok(None)` when none of the three yields both values.
///
/// # Errors
///
/// Propagates query and write failures. NOTE(review): the `conversations`
/// lookup is not `.optional()`, so a missing row presumably surfaces as an
/// error here — confirm `query_row_map`'s no-row behavior.
fn franken_existing_conversation_append_tail_state(
tx: &FrankenTransaction<'_>,
conversation_id: i64,
) -> Result<Option<ExistingConversationTailState>> {
// Source 1: dedicated tail-state table; the row may not exist.
let cached: Option<(Option<i64>, Option<i64>, Option<i64>)> = tx
.query_row_map(
"SELECT last_message_idx, last_message_created_at, ended_at
FROM conversation_tail_state
WHERE conversation_id = ?1",
fparams![conversation_id],
|row| Ok((row.get_typed(0)?, row.get_typed(1)?, row.get_typed(2)?)),
)
.optional()?;
if let Some(cached) = cached {
let (_, _, cached_ended_at) = cached;
if let Some(tail_state) =
existing_conversation_tail_state_from_cached(cached.0, cached.1, cached_ended_at)
{
return Ok(Some(tail_state));
}
}
// Source 2: tail columns stored on the conversation row itself.
let legacy_cached: (Option<i64>, Option<i64>, Option<i64>) = tx.query_row_map(
"SELECT last_message_idx, last_message_created_at, ended_at
FROM conversations
WHERE id = ?1",
fparams![conversation_id],
|row| Ok((row.get_typed(0)?, row.get_typed(1)?, row.get_typed(2)?)),
)?;
let (_, _, cached_ended_at) = legacy_cached;
if let Some(tail_state) = existing_conversation_tail_state_from_cached(
legacy_cached.0,
legacy_cached.1,
cached_ended_at,
) {
// Copy the values into the tail-state table.
franken_insert_conversation_tail_state(
tx,
conversation_id,
cached_ended_at,
Some(tail_state.last_message_idx),
Some(tail_state.last_message_created_at),
)?;
return Ok(Some(tail_state));
}
// Source 3: derive from the messages; either aggregate is NULL when no row contributes a value.
let (max_idx, max_created_at): (Option<i64>, Option<i64>) = tx.query_row_map(
"SELECT MAX(idx), MAX(created_at)
FROM messages
WHERE conversation_id = ?1",
fparams![conversation_id],
|row| Ok((row.get_typed(0)?, row.get_typed(1)?)),
)?;
if let Some((last_message_idx, last_message_created_at)) = max_idx.zip(max_created_at) {
// No `ended_at` candidate is passed along with the derived values.
franken_update_conversation_tail_state(
tx,
conversation_id,
None,
Some(last_message_idx),
Some(last_message_created_at),
)?;
// The returned `ended_at` is the one read from the `conversations` row above.
return Ok(Some(ExistingConversationTailState {
last_message_idx,
last_message_created_at,
ended_at: cached_ended_at,
}));
}
Ok(None)
}
/// Builds a tail state from cached column values.
///
/// Both `last_message_idx` and `last_message_created_at` must be present; if either
/// is `None` the result is `None`. `ended_at` is passed through unchanged.
fn existing_conversation_tail_state_from_cached(
    last_message_idx: Option<i64>,
    last_message_created_at: Option<i64>,
    ended_at: Option<i64>,
) -> Option<ExistingConversationTailState> {
    match (last_message_idx, last_message_created_at) {
        (Some(last_message_idx), Some(last_message_created_at)) => {
            Some(ExistingConversationTailState {
                last_message_idx,
                last_message_created_at,
                ended_at,
            })
        }
        _ => None,
    }
}
/// Looks up an existing conversation for `key` together with its append tail state.
///
/// `External` keys are resolved only through the external tail lookup; a miss there
/// yields `Ok(None)` without trying any other strategy. Every other key kind is
/// resolved via `franken_find_existing_conversation_by_key`, after which the tail
/// state is loaded for the id that was found.
fn franken_find_existing_conversation_with_tail_by_key(
    tx: &FrankenTransaction<'_>,
    key: &PendingConversationKey,
    conv: Option<&Conversation>,
) -> Result<Option<ExistingConversationWithTail>> {
    if let PendingConversationKey::External {
        source_id,
        agent_id,
        external_id,
    } = key
    {
        let lookup_key = conversation_external_lookup_key(source_id, *agent_id, external_id);
        return Ok(franken_find_external_conversation_tail_lookup(
            tx,
            &lookup_key,
        )?);
    }

    let found = match franken_find_existing_conversation_by_key(tx, key, conv)? {
        Some(id) => {
            let tail_state = franken_existing_conversation_append_tail_state(tx, id)?;
            Some(ExistingConversationWithTail { id, tail_state })
        }
        None => None,
    };
    Ok(found)
}
/// Writes (or replaces) the `conversation_tail_state` row for `conversation_id`.
///
/// Does nothing when all three values are `None`. Because the statement is
/// `INSERT OR REPLACE`, an existing row is overwritten wholesale with exactly the
/// values supplied here.
fn franken_insert_conversation_tail_state(
    tx: &FrankenTransaction<'_>,
    conversation_id: i64,
    ended_at: Option<i64>,
    last_message_idx: Option<i64>,
    last_message_created_at: Option<i64>,
) -> Result<()> {
    let nothing_to_store = [ended_at, last_message_idx, last_message_created_at]
        .iter()
        .all(Option::is_none);
    if nothing_to_store {
        return Ok(());
    }

    tx.execute_compat(
        "INSERT OR REPLACE INTO conversation_tail_state (
conversation_id, ended_at, last_message_idx, last_message_created_at
) VALUES (?1, ?2, ?3, ?4)",
        fparams![
            conversation_id,
            ended_at,
            last_message_idx,
            last_message_created_at
        ],
    )?;
    Ok(())
}
/// Raises the tail columns on the `conversations` row towards the given candidates.
///
/// Each column is handled independently: a `None` candidate leaves the column
/// untouched, otherwise the column is replaced only when it is currently `NULL` or
/// smaller than the candidate. Nothing is executed when every candidate is `None`.
fn franken_update_conversation_tail_columns(
    tx: &FrankenTransaction<'_>,
    conversation_id: i64,
    ended_at_candidate: Option<i64>,
    last_message_idx_candidate: Option<i64>,
    last_message_created_at_candidate: Option<i64>,
) -> Result<()> {
    if matches!(
        (
            ended_at_candidate,
            last_message_idx_candidate,
            last_message_created_at_candidate
        ),
        (None, None, None)
    ) {
        return Ok(());
    }

    tx.execute_compat(
        "UPDATE conversations
SET ended_at = CASE
WHEN ?1 IS NULL THEN ended_at
WHEN ended_at IS NULL OR ended_at < ?1 THEN ?1
ELSE ended_at
END,
last_message_idx = CASE
WHEN ?2 IS NULL THEN last_message_idx
WHEN last_message_idx IS NULL OR last_message_idx < ?2 THEN ?2
ELSE last_message_idx
END,
last_message_created_at = CASE
WHEN ?3 IS NULL THEN last_message_created_at
WHEN last_message_created_at IS NULL OR last_message_created_at < ?3 THEN ?3
ELSE last_message_created_at
END
WHERE id = ?4",
        fparams![
            ended_at_candidate,
            last_message_idx_candidate,
            last_message_created_at_candidate,
            conversation_id
        ],
    )?;
    Ok(())
}
/// Chooses the `ended_at` value to store when a tail-state row is first inserted.
///
/// Reads `conversations.ended_at` for the conversation (a missing row or a `NULL`
/// column both count as "no value") and returns the larger of that value and
/// `candidate`; when only one of the two is present, that one is returned.
fn franken_tail_state_insert_ended_at(
    tx: &FrankenTransaction<'_>,
    conversation_id: i64,
    candidate: Option<i64>,
) -> Result<Option<i64>> {
    let stored: Option<Option<i64>> = tx
        .query_row_map(
            "SELECT ended_at FROM conversations WHERE id = ?1",
            fparams![conversation_id],
            |row| row.get_typed(0),
        )
        .optional()?;
    let canonical = stored.flatten();

    let chosen = match (canonical, candidate) {
        (Some(existing), Some(proposed)) => Some(existing.max(proposed)),
        (existing, proposed) => existing.or(proposed),
    };
    Ok(chosen)
}
/// Raises the stored tail state of a conversation towards the given candidates,
/// in both `conversation_tail_state` and the mirror columns on `conversations`.
///
/// Each candidate is applied independently: `None` leaves the stored value alone,
/// otherwise the stored value is replaced only when it is `NULL` or smaller than the
/// candidate. When no `conversation_tail_state` row exists yet, one is inserted,
/// seeding `ended_at` from the larger of `conversations.ended_at` and the candidate.
/// Nothing is executed when every candidate is `None`.
///
/// The `ended_at` rule uses the same `CASE` form as the other columns (and as the
/// `conversations` update) so that a `NULL` stored value combined with a negative
/// candidate keeps the candidate instead of being clamped to `0`.
fn franken_update_conversation_tail_state(
    tx: &FrankenTransaction<'_>,
    conversation_id: i64,
    ended_at_candidate: Option<i64>,
    last_message_idx_candidate: Option<i64>,
    last_message_created_at_candidate: Option<i64>,
) -> Result<()> {
    if ended_at_candidate.is_none()
        && last_message_idx_candidate.is_none()
        && last_message_created_at_candidate.is_none()
    {
        return Ok(());
    }

    let changed = tx.execute_compat(
        "UPDATE conversation_tail_state
SET ended_at = CASE
WHEN ?1 IS NULL THEN ended_at
WHEN ended_at IS NULL OR ended_at < ?1 THEN ?1
ELSE ended_at
END,
last_message_idx = CASE
WHEN ?2 IS NULL THEN last_message_idx
WHEN last_message_idx IS NULL OR last_message_idx < ?2 THEN ?2
ELSE last_message_idx
END,
last_message_created_at = CASE
WHEN ?3 IS NULL THEN last_message_created_at
WHEN last_message_created_at IS NULL OR last_message_created_at < ?3 THEN ?3
ELSE last_message_created_at
END
WHERE conversation_id = ?4",
        fparams![
            ended_at_candidate,
            last_message_idx_candidate,
            last_message_created_at_candidate,
            conversation_id
        ],
    )?;

    // No row was updated, so the tail-state row does not exist yet: create it.
    if changed == 0 {
        let insert_ended_at =
            franken_tail_state_insert_ended_at(tx, conversation_id, ended_at_candidate)?;
        franken_insert_conversation_tail_state(
            tx,
            conversation_id,
            insert_ended_at,
            last_message_idx_candidate,
            last_message_created_at_candidate,
        )?;
    }

    // Keep the mirror columns on `conversations` in step.
    franken_update_conversation_tail_columns(
        tx,
        conversation_id,
        ended_at_candidate,
        last_message_idx_candidate,
        last_message_created_at_candidate,
    )?;
    Ok(())
}
/// Records the tail state that results from appending messages to a conversation.
///
/// Unlike `franken_update_conversation_tail_state`, the `conversation_tail_state`
/// row is overwritten unconditionally with the supplied values. If no such row
/// exists it is inserted, with `ended_at` seeded from the larger of
/// `conversations.ended_at` and the supplied `ended_at`. The mirror columns on
/// `conversations` are then raised (never lowered) towards the same values.
fn franken_set_conversation_tail_state_after_append(
    tx: &FrankenTransaction<'_>,
    conversation_id: i64,
    ended_at: i64,
    last_message_idx: i64,
    last_message_created_at: i64,
) -> Result<()> {
    let updated_rows = tx.execute_compat(
        "UPDATE conversation_tail_state
SET ended_at = ?1,
last_message_idx = ?2,
last_message_created_at = ?3
WHERE conversation_id = ?4",
        fparams![
            ended_at,
            last_message_idx,
            last_message_created_at,
            conversation_id
        ],
    )?;

    let new_ended_at = Some(ended_at);
    let new_last_idx = Some(last_message_idx);
    let new_last_created_at = Some(last_message_created_at);

    let tail_row_missing = updated_rows == 0;
    if tail_row_missing {
        let seeded_ended_at =
            franken_tail_state_insert_ended_at(tx, conversation_id, new_ended_at)?;
        franken_insert_conversation_tail_state(
            tx,
            conversation_id,
            seeded_ended_at,
            new_last_idx,
            new_last_created_at,
        )?;
    }

    franken_update_conversation_tail_columns(
        tx,
        conversation_id,
        new_ended_at,
        new_last_idx,
        new_last_created_at,
    )?;
    Ok(())
}
/// Extracts the messages of `conv` that form a pure append after the stored tail.
///
/// Returns `Some` with an empty list when `conv` has no messages at all. Otherwise
/// returns `None` (meaning "not a simple append") when any of these hold:
/// - message indices ever decrease within `conv.messages`;
/// - no message has an index greater than `existing_max_idx`;
/// - a tail message has no `created_at`, or its `created_at` is not strictly greater
///   than `existing_max_created_at`;
/// - two tail messages share an index or a replay fingerprint.
///
/// On success the tail starts at the first message whose index exceeds
/// `existing_max_idx`; `new_chars` is the summed byte length of the tail contents and
/// the collision fields are always zero / `None`.
fn collect_append_only_tail_messages<'a>(
    conv: &'a Conversation,
    existing_max_idx: i64,
    existing_max_created_at: i64,
) -> Option<ExistingConversationNewMessages<'a>> {
    let incoming = &conv.messages;
    if incoming.is_empty() {
        return Some(ExistingConversationNewMessages {
            messages: Vec::new(),
            new_chars: 0,
            idx_collision_count: 0,
            first_collision_idx: None,
        });
    }

    // Indices must never step backwards anywhere in the incoming list.
    let goes_backwards = incoming.windows(2).any(|pair| pair[1].idx < pair[0].idx);
    if goes_backwards {
        return None;
    }

    let tail_start = incoming
        .iter()
        .position(|candidate| candidate.idx > existing_max_idx)?;

    let mut tail_indices = HashSet::new();
    let mut tail_fingerprints = HashSet::new();
    let mut messages = Vec::new();
    let mut new_chars = 0i64;
    for candidate in incoming.iter().skip(tail_start) {
        let created_at = candidate.created_at?;
        let is_newer = created_at > existing_max_created_at;
        if !is_newer {
            return None;
        }
        let first_time_idx = tail_indices.insert(candidate.idx);
        if !first_time_idx {
            return None;
        }
        let first_time_fingerprint = tail_fingerprints.insert(message_replay_fingerprint(candidate));
        if !first_time_fingerprint {
            return None;
        }
        new_chars += candidate.content.len() as i64;
        messages.push(candidate);
    }

    Some(ExistingConversationNewMessages {
        messages,
        new_chars,
        idx_collision_count: 0,
        first_collision_idx: None,
    })
}
/// Absolute distance between two optional millisecond timestamps.
///
/// Returns `i64::MAX` when either side is missing or when the true distance does
/// not fit in an `i64`.
fn start_distance_ms(left: Option<i64>, right: Option<i64>) -> i64 {
    let (Some(left), Some(right)) = (left, right) else {
        return i64::MAX;
    };
    // `abs_diff` cannot overflow: the result is a `u64`.
    i64::try_from(left.abs_diff(right)).unwrap_or(i64::MAX)
}
/// Decides whether two conversations share enough messages to be merged, and if so
/// reports the evidence.
///
/// Overlap is counted separately on exact and replay fingerprints. With no overlap
/// at all the answer is `None`. Otherwise the threshold depends on whether the two
/// start timestamps lie within `SOURCE_PATH_MERGE_START_TOLERANCE_MS`:
/// - close starts: at least one exact match, or at least two replay matches;
/// - otherwise: at least two exact matches, or the smaller replay set (of size two
///   or more) is entirely contained in the other.
fn conversation_merge_evidence(
    incoming_exact: &HashSet<MessageMergeFingerprint>,
    incoming_replay: &HashSet<MessageReplayFingerprint>,
    existing_exact: &HashSet<MessageMergeFingerprint>,
    existing_replay: &HashSet<MessageReplayFingerprint>,
    incoming_started_at: Option<i64>,
    existing_started_at: Option<i64>,
) -> Option<ConversationMergeEvidence> {
    let exact_overlap = incoming_exact
        .iter()
        .filter(|fingerprint| existing_exact.contains(*fingerprint))
        .count();
    let replay_overlap = incoming_replay
        .iter()
        .filter(|fingerprint| existing_replay.contains(*fingerprint))
        .count();
    if (exact_overlap, replay_overlap) == (0, 0) {
        return None;
    }

    let smaller_replay_set = std::cmp::min(incoming_replay.len(), existing_replay.len());
    let started_close = timestamps_within_tolerance(
        incoming_started_at,
        existing_started_at,
        SOURCE_PATH_MERGE_START_TOLERANCE_MS,
    );

    let close_start_rule = exact_overlap >= 1 || replay_overlap >= 2;
    let distant_start_rule =
        exact_overlap >= 2 || (smaller_replay_set >= 2 && replay_overlap == smaller_replay_set);
    let merge_allowed = if started_close {
        close_start_rule
    } else {
        distant_start_rule
    };
    if !merge_allowed {
        return None;
    }

    Some(ConversationMergeEvidence {
        exact_overlap,
        replay_overlap,
        smaller_replay_set,
        started_close,
        start_distance_ms: start_distance_ms(incoming_started_at, existing_started_at),
    })
}
/// Reports whether two optional timestamps are both present and differ by at most
/// `tolerance_ms`. The difference is computed in 128-bit arithmetic, so extreme
/// inputs cannot overflow; a negative tolerance never matches.
fn timestamps_within_tolerance(left: Option<i64>, right: Option<i64>, tolerance_ms: i64) -> bool {
    left.zip(right).is_some_and(|(earlier, later)| {
        let gap = i128::from(earlier) - i128::from(later);
        gap.abs() <= i128::from(tolerance_ms)
    })
}
/// Builds the key under which `conv` is matched against already-stored conversations.
///
/// A conversation that carries an external id is keyed by (source, agent, external
/// id); otherwise it is keyed by (source, agent, source path, effective start time).
fn conversation_merge_key(agent_id: i64, conv: &Conversation) -> PendingConversationKey {
    let source_id = conv.source_id.clone();
    match conv.external_id.clone() {
        Some(external_id) => PendingConversationKey::External {
            source_id,
            agent_id,
            external_id,
        },
        None => PendingConversationKey::SourcePath {
            source_id,
            agent_id,
            source_path: path_to_string(&conv.source_path),
            started_at: conversation_effective_started_at(conv),
        },
    }
}
/// One message together with the metadata carried alongside it for embedding.
///
/// NOTE(review): the per-field descriptions below are inferred from field names and
/// types only; confirm them against the query that populates this struct.
pub struct MessageForEmbedding {
/// Identifier of the message (presumably the `messages.id` row id).
pub message_id: i64,
/// Message timestamp, if one was recorded (presumably epoch milliseconds).
pub created_at: Option<i64>,
/// Identifier of the agent that owns the conversation.
pub agent_id: i64,
/// Identifier of the workspace, when the conversation has one.
pub workspace_id: Option<i64>,
/// 32-bit hash standing in for the source id (hash function not visible here).
pub source_id_hash: u32,
/// Message role as stored text.
pub role: String,
/// Message body text.
pub content: String,
}
impl FrankenStorage {
/// Makes sure a row for `agent` exists in `agents` and returns its id.
///
/// A per-storage cache is consulted first. On a miss the agent is upserted by slug
/// (name, version, kind and `updated_at` are rewritten only when one of name,
/// version or kind actually differs), the id is read back, and the cache is
/// updated.
pub fn ensure_agent(&self, agent: &Agent) -> Result<i64> {
    let cache_key = EnsuredAgentKey::from_agent(agent);
    if let Some(cached_id) = self.cached_agent_id(&cache_key) {
        return Ok(cached_id);
    }

    let slug = agent.slug.as_str();
    let timestamp = Self::now_millis();
    self.conn.execute_compat(
        "INSERT INTO agents(slug, name, version, kind, created_at, updated_at)
VALUES(?1, ?2, ?3, ?4, ?5, ?6)
ON CONFLICT(slug) DO UPDATE SET
name = excluded.name,
version = excluded.version,
kind = excluded.kind,
updated_at = excluded.updated_at
WHERE NOT (
agents.name IS excluded.name
AND agents.version IS excluded.version
AND agents.kind IS excluded.kind
)",
        fparams![
            slug,
            agent.name.as_str(),
            agent.version.as_deref(),
            cache_key.kind.as_str(),
            timestamp,
            timestamp
        ],
    )?;

    let agent_id: i64 = self
        .conn
        .query_row_map(
            "SELECT id FROM agents WHERE slug = ?1 LIMIT 1",
            fparams![slug],
            |row| row.get_typed(0),
        )
        .with_context(|| format!("fetching agent id for {}", agent.slug))?;

    self.mark_agent_ensured(cache_key, agent_id);
    Ok(agent_id)
}
/// Makes sure a row for the workspace at `path` exists and returns its id.
///
/// A per-storage cache is consulted first. With a `display_name` the row is upserted
/// and the stored display name is rewritten only when it differs; without one the
/// row is inserted if missing and an existing display name is left untouched.
/// The path is stored in its lossy UTF-8 form.
pub fn ensure_workspace(&self, path: &Path, display_name: Option<&str>) -> Result<i64> {
    let path_str = path.to_string_lossy().into_owned();
    let cache_key = EnsuredWorkspaceKey::new(path_str.clone(), display_name);
    if let Some(cached_id) = self.cached_workspace_id(&cache_key) {
        return Ok(cached_id);
    }

    match display_name {
        Some(display_name) => {
            self.conn.execute_compat(
                "INSERT INTO workspaces(path, display_name)
VALUES(?1, ?2)
ON CONFLICT(path) DO UPDATE SET
display_name = excluded.display_name
WHERE NOT (workspaces.display_name IS excluded.display_name)",
                fparams![path_str.as_str(), display_name],
            )?;
        }
        None => {
            self.conn.execute_compat(
                "INSERT OR IGNORE INTO workspaces(path, display_name) VALUES(?1, NULL)",
                fparams![path_str.as_str()],
            )?;
        }
    }

    let workspace_id: i64 = self
        .conn
        .query_row_map(
            "SELECT id FROM workspaces WHERE path = ?1 LIMIT 1",
            fparams![path_str.as_str()],
            |row| row.get_typed(0),
        )
        .with_context(|| format!("fetching workspace id for {path_str}"))?;

    self.mark_workspace_ensured(cache_key, workspace_id);
    Ok(workspace_id)
}
/// Current wall-clock time as milliseconds since the Unix epoch.
///
/// Returns `0` if the system clock reports a time before the epoch, and `i64::MAX`
/// if the millisecond count does not fit in an `i64`.
pub fn now_millis() -> i64 {
    match SystemTime::now().duration_since(UNIX_EPOCH) {
        Ok(since_epoch) => i64::try_from(since_epoch.as_millis()).unwrap_or(i64::MAX),
        Err(_) => 0,
    }
}
/// Converts epoch milliseconds to a day id: whole days since 2020-01-01T00:00:00Z.
///
/// Uses Euclidean division, so instants before 2020 map to negative ids and a
/// partial day always rounds towards the earlier day.
pub fn day_id_from_millis(timestamp_ms: i64) -> i64 {
    const EPOCH_2020_SECS: i64 = 1_577_836_800;
    const SECS_PER_DAY: i64 = 86400;
    let unix_secs = timestamp_ms.div_euclid(1000);
    let secs_since_2020 = unix_secs - EPOCH_2020_SECS;
    secs_since_2020.div_euclid(SECS_PER_DAY)
}
/// Converts epoch milliseconds to an hour id: whole hours since 2020-01-01T00:00:00Z.
///
/// Uses Euclidean division, so instants before 2020 map to negative ids and a
/// partial hour always rounds towards the earlier hour.
pub fn hour_id_from_millis(timestamp_ms: i64) -> i64 {
    const EPOCH_2020_SECS: i64 = 1_577_836_800;
    const SECS_PER_HOUR: i64 = 3600;
    let unix_secs = timestamp_ms.div_euclid(1000);
    let secs_since_2020 = unix_secs - EPOCH_2020_SECS;
    secs_since_2020.div_euclid(SECS_PER_HOUR)
}
/// Converts a day id (whole days since 2020-01-01T00:00:00Z) back to the epoch
/// milliseconds at the start of that day.
///
/// The arithmetic saturates: ids so large or so small that the result would not fit
/// in an `i64` yield `i64::MAX` / `i64::MIN` instead of panicking (debug builds) or
/// silently wrapping (release builds).
pub fn millis_from_day_id(day_id: i64) -> i64 {
    const EPOCH_2020_SECS: i64 = 1_577_836_800;
    const SECS_PER_DAY: i64 = 86400;
    day_id
        .saturating_mul(SECS_PER_DAY)
        .saturating_add(EPOCH_2020_SECS)
        .saturating_mul(1000)
}
/// Converts an hour id (whole hours since 2020-01-01T00:00:00Z) back to the epoch
/// milliseconds at the start of that hour.
///
/// The arithmetic saturates: ids so large or so small that the result would not fit
/// in an `i64` yield `i64::MAX` / `i64::MIN` instead of panicking (debug builds) or
/// silently wrapping (release builds).
pub fn millis_from_hour_id(hour_id: i64) -> i64 {
    const EPOCH_2020_SECS: i64 = 1_577_836_800;
    const SECS_PER_HOUR: i64 = 3600;
    hour_id
        .saturating_mul(SECS_PER_HOUR)
        .saturating_add(EPOCH_2020_SECS)
        .saturating_mul(1000)
}
/// Reads the `last_scan_ts` entry from the `meta` table.
///
/// Returns `Ok(None)` when the entry is absent or when its stored text does not
/// parse as an `i64`; query failures are returned as errors.
pub fn get_last_scan_ts(&self) -> Result<Option<i64>> {
    let stored: Option<String> = self
        .conn
        .query_row_map(
            "SELECT value FROM meta WHERE key = 'last_scan_ts'",
            fparams![],
            |row| row.get_typed(0),
        )
        .optional()?;
    Ok(stored.and_then(|raw| raw.parse::<i64>().ok()))
}
/// Stores `ts` as the `last_scan_ts` entry in the `meta` table, replacing any
/// previous value. The timestamp is stored in its decimal text form.
pub fn set_last_scan_ts(&self, ts: i64) -> Result<()> {
    let encoded = ts.to_string();
    self.conn.execute_compat(
        "INSERT OR REPLACE INTO meta(key, value) VALUES('last_scan_ts', ?1)",
        fparams![encoded],
    )?;
    Ok(())
}
/// Reads the `last_indexed_at` entry from the `meta` table.
///
/// Returns `Ok(None)` when the entry is absent or when its stored text does not
/// parse as an `i64`; query failures are returned as errors.
pub fn get_last_indexed_at(&self) -> Result<Option<i64>> {
    let stored: Option<String> = self
        .conn
        .query_row_map(
            "SELECT value FROM meta WHERE key = 'last_indexed_at'",
            fparams![],
            |row| row.get_typed(0),
        )
        .optional()?;
    let parsed = match stored {
        Some(raw) => raw.parse::<i64>().ok(),
        None => None,
    };
    Ok(parsed)
}
/// Stores `ts` as the `last_indexed_at` entry in the `meta` table, replacing any
/// previous value. The timestamp is stored in its decimal text form.
pub fn set_last_indexed_at(&self, ts: i64) -> Result<()> {
    let encoded = ts.to_string();
    self.conn.execute_compat(
        "INSERT OR REPLACE INTO meta(key, value) VALUES('last_indexed_at', ?1)",
        fparams![encoded],
    )?;
    Ok(())
}
pub fn list_agents(&self) -> Result<Vec<Agent>> {
self.conn
.query_map_collect(
"SELECT id, slug, name, version, kind FROM agents ORDER BY slug",
fparams![],
|row| {
let kind: String = row.get_typed(4)?;
Ok(Agent {
id: Some(row.get_typed(0)?),
slug: row.get_typed(1)?,
name: row.get_typed(2)?,
version: row.get_typed(3)?,
kind: match kind.as_str() {
"cli" => AgentKind::Cli,
"vscode" => AgentKind::VsCode,
_ => AgentKind::Hybrid,
},
})
},
)
.with_context(|| "listing agents")
}
/// Number of rows in `conversations`. A negative count from the database is
/// reported as zero.
pub fn total_conversation_count(&self) -> Result<usize> {
    let row_count: i64 = self.conn.query_row_map(
        "SELECT COUNT(*) FROM conversations",
        fparams![],
        |row| row.get_typed(0),
    )?;
    let non_negative = if row_count < 0 { 0 } else { row_count };
    Ok(non_negative as usize)
}
/// Number of rows in `messages`. A negative count from the database is reported
/// as zero.
pub fn total_message_count(&self) -> Result<usize> {
    let row_count: i64 = self.conn.query_row_map(
        "SELECT COUNT(*) FROM messages",
        fparams![],
        |row| row.get_typed(0),
    )?;
    let non_negative = if row_count < 0 { 0 } else { row_count };
    Ok(non_negative as usize)
}
pub fn purge_agent_archive_data(&self, agent_slug: &str) -> Result<AgentArchivePurgeResult> {
let normalized = agent_slug.trim().to_ascii_lowercase();
if normalized.is_empty() {
return Err(anyhow!("agent slug cannot be empty"));
}
let Some(agent_id) = self
.conn
.query_row_map(
"SELECT id FROM agents WHERE slug = ?1",
fparams![normalized.as_str()],
|row| row.get_typed::<i64>(0),
)
.optional()?
else {
return Ok(AgentArchivePurgeResult::default());
};
let conversations_deleted: i64 = self.conn.query_row_map(
"SELECT COUNT(*) FROM conversations WHERE agent_id = ?1",
fparams![agent_id],
|row| row.get_typed(0),
)?;
if conversations_deleted == 0 {
return Ok(AgentArchivePurgeResult::default());
}
let messages_deleted: i64 = self.conn.query_row_map(
"SELECT COUNT(*)
FROM messages
WHERE conversation_id IN (
SELECT id FROM conversations WHERE agent_id = ?1
)",
fparams![agent_id],
|row| row.get_typed(0),
)?;
let mut tx = self.conn.transaction()?;
tx.execute_compat(
"DELETE FROM conversation_external_lookup
WHERE conversation_id IN (
SELECT id FROM conversations WHERE agent_id = ?1
)",
fparams![agent_id],
)?;
tx.execute_compat(
"DELETE FROM conversation_external_tail_lookup
WHERE conversation_id IN (
SELECT id FROM conversations WHERE agent_id = ?1
)",
fparams![agent_id],
)?;
tx.execute_compat(
"DELETE FROM conversations WHERE agent_id = ?1",
fparams![agent_id],
)?;
tx.execute_compat(
"DELETE FROM agents
WHERE id = ?1
AND NOT EXISTS (
SELECT 1 FROM conversations WHERE agent_id = ?1
)",
fparams![agent_id],
)?;
tx.commit()?;
Ok(AgentArchivePurgeResult {
conversations_deleted: conversations_deleted.max(0) as usize,
messages_deleted: messages_deleted.max(0) as usize,
})
}
pub fn list_workspaces(&self) -> Result<Vec<crate::model::types::Workspace>> {
self.conn
.query_map_collect(
"SELECT id, path, display_name FROM workspaces ORDER BY path",
fparams![],
|row| {
let path_str: String = row.get_typed(1)?;
Ok(crate::model::types::Workspace {
id: Some(row.get_typed(0)?),
path: Path::new(&path_str).to_path_buf(),
display_name: row.get_typed(2)?,
})
},
)
.with_context(|| "listing workspaces")
}
pub fn list_conversations(&self, limit: i64, offset: i64) -> Result<Vec<Conversation>> {
self.conn
.query_map_collect(
r"SELECT c.id,
COALESCE((SELECT a.slug FROM agents a WHERE a.id = c.agent_id), 'unknown'),
(SELECT w.path FROM workspaces w WHERE w.id = c.workspace_id),
c.external_id, c.title, c.source_path,
c.started_at,
COALESCE(
(SELECT ts.ended_at
FROM conversation_tail_state ts
WHERE ts.conversation_id = c.id),
c.ended_at
),
c.approx_tokens, c.metadata_json,
c.source_id, c.origin_host, c.metadata_bin
FROM conversations c
ORDER BY CASE WHEN c.started_at IS NULL THEN 1 ELSE 0 END, c.started_at DESC, c.id DESC
LIMIT ?1 OFFSET ?2",
fparams![limit, offset],
|row| {
let workspace_path: Option<String> = row.get_typed(2)?;
let source_path: String = row.get_typed(5)?;
let raw_source_id: Option<String> = row.get_typed(10)?;
let raw_origin_host: Option<String> = row.get_typed(11)?;
let (source_id, _, origin_host) = normalized_storage_source_parts(
raw_source_id.as_deref(),
None,
raw_origin_host.as_deref(),
);
Ok(Conversation {
id: Some(row.get_typed(0)?),
agent_slug: row.get_typed(1)?,
workspace: workspace_path.map(|p| Path::new(&p).to_path_buf()),
external_id: row.get_typed(3)?,
title: row.get_typed(4)?,
source_path: Path::new(&source_path).to_path_buf(),
started_at: row.get_typed(6)?,
ended_at: row.get_typed(7)?,
approx_tokens: row.get_typed(8)?,
metadata_json: franken_read_metadata_compat(row, 9, 12),
messages: Vec::new(),
source_id,
origin_host,
})
},
)
.with_context(|| "listing conversations")
}
/// Loads the two id lookups used during a lexical rebuild: agent id → slug and
/// workspace id → path.
pub fn build_lexical_rebuild_lookups(
    &self,
) -> Result<(HashMap<i64, String>, HashMap<i64, PathBuf>)> {
    let agent_rows = self
        .conn
        .query_map_collect("SELECT id, slug FROM agents", fparams![], |row| {
            Ok((row.get_typed::<i64>(0)?, row.get_typed::<String>(1)?))
        })
        .with_context(|| "loading agent lookup for lexical rebuild")?;
    let agents: HashMap<i64, String> = agent_rows.into_iter().collect();

    let workspace_rows = self
        .conn
        .query_map_collect("SELECT id, path FROM workspaces", fparams![], |row| {
            let stored_path: String = row.get_typed(1)?;
            Ok((row.get_typed::<i64>(0)?, PathBuf::from(stored_path)))
        })
        .with_context(|| "loading workspace lookup for lexical rebuild")?;
    let workspaces: HashMap<i64, PathBuf> = workspace_rows.into_iter().collect();

    Ok((agents, workspaces))
}
/// Produces one footprint row per conversation, in ascending id order, for planning
/// a lexical rebuild.
///
/// The message count of each conversation is first estimated from its last message
/// index, preferring the value in `conversation_tail_state` and falling back to
/// `conversations.last_message_idx`. Conversations with no usable tail index get a
/// zero placeholder that is filled in afterwards from `messages`. Finally, unless
/// every conversation needed that fallback, estimates are raised to the exact
/// per-conversation message counts.
///
/// A missing `conversation_tail_state` table or a missing
/// `conversations.last_message_idx` column is tolerated; other query failures are
/// returned as errors.
pub fn list_conversation_footprints_for_lexical_rebuild(
&self,
) -> Result<Vec<LexicalRebuildConversationFootprintRow>> {
// Tail indices from the dedicated table; a missing table counts as "no rows".
let tail_state_rows: Vec<(i64, Option<i64>)> = match self.conn.query_map_collect(
"SELECT conversation_id, last_message_idx
FROM conversation_tail_state
ORDER BY conversation_id ASC",
fparams![],
|row| Ok((row.get_typed::<i64>(0)?, row.get_typed::<Option<i64>>(1)?)),
) {
Ok(rows) => rows,
Err(err) if error_indicates_missing_table(&err) => Vec::new(),
Err(err) => {
return Err(err).with_context(|| "listing lexical rebuild tail-state estimates");
}
};
let tail_state_by_conversation: HashMap<i64, Option<i64>> =
tail_state_rows.into_iter().collect();
// Conversation ids with their own tail column; when that column does not exist,
// fall back to ids only and treat every tail index as unknown.
let rows: Vec<(i64, Option<i64>)> = match self.conn.query_map_collect(
"SELECT id, last_message_idx
FROM conversations
ORDER BY id ASC",
fparams![],
|row| Ok((row.get_typed::<i64>(0)?, row.get_typed::<Option<i64>>(1)?)),
) {
Ok(rows) => rows,
Err(err) if error_indicates_missing_column(&err) => self
.conn
.query_map_collect(
"SELECT id
FROM conversations
ORDER BY id ASC",
fparams![],
|row| Ok((row.get_typed::<i64>(0)?, None)),
)
.with_context(|| {
"listing lexical rebuild conversation ids after missing tail column fallback"
})?,
Err(err) => {
return Err(err)
.with_context(|| "listing lexical rebuild conversation footprint estimates");
}
};
let mut footprints = Vec::with_capacity(rows.len());
// conversation id -> position in `footprints` of its zero placeholder.
let mut missing_tail_positions = HashMap::new();
for (conversation_id, conversation_last_message_idx) in rows {
// Prefer the tail-state value; a NULL there falls through to the column value.
let last_message_idx = tail_state_by_conversation
.get(&conversation_id)
.copied()
.flatten()
.or(conversation_last_message_idx);
let Some(message_count) = lexical_rebuild_message_count_from_tail_idx(last_message_idx)
else {
missing_tail_positions.insert(conversation_id, footprints.len());
footprints.push(LexicalRebuildConversationFootprintRow {
conversation_id,
message_count: 0,
message_bytes: 0,
});
continue;
};
footprints.push(lexical_rebuild_conversation_footprint_from_count(
conversation_id,
message_count,
));
}
// Captured before the fill step; it decides whether the exact-count pass below runs.
let every_footprint_was_missing_tail = missing_tail_positions.len() == footprints.len();
if !missing_tail_positions.is_empty() {
self.fill_missing_lexical_rebuild_footprint_tails(
&mut footprints,
&missing_tail_positions,
)?;
}
// The exact-count pass is skipped when every footprint came from the fill step.
if !every_footprint_was_missing_tail {
self.raise_lexical_rebuild_footprints_to_exact_message_counts(&mut footprints)?;
}
Ok(footprints)
}
/// Reports whether enough conversations carry a stored last-message index for the
/// lexical rebuild planner to rely on tail metadata.
///
/// An empty `conversations` table counts as covered. If neither `conversations` nor
/// `conversation_tail_state` has a `last_message_idx` column the answer is `false`.
/// Otherwise the conversations with a non-null index in either place are counted and
/// the decision is delegated to
/// `lexical_rebuild_tail_metadata_coverage_is_sufficient`.
pub fn lexical_rebuild_has_tail_footprint_metadata(&self) -> Result<bool> {
    let conversation_rows: i64 = self
        .conn
        .query_row_map("SELECT COUNT(*) FROM conversations", fparams![], |row| {
            row.get_typed(0)
        })
        .with_context(|| "counting conversations for lexical rebuild tail metadata coverage")?;
    let total_conversations = usize::try_from(conversation_rows.max(0)).unwrap_or(usize::MAX);
    if total_conversations == 0 {
        return Ok(true);
    }

    let conversations_have_tail_column =
        franken_table_column_names(&self.conn, "conversations")?.contains("last_message_idx");
    // A missing tail-state table simply means it cannot contribute coverage.
    let tail_state_has_tail_column =
        match franken_table_column_names(&self.conn, "conversation_tail_state") {
            Ok(columns) => columns.contains("last_message_idx"),
            Err(err) if error_indicates_missing_table(&err) => false,
            Err(err) => {
                return Err(err)
                    .with_context(|| "reading lexical rebuild tail-state metadata columns");
            }
        };

    // Pick the counting query that only touches columns known to exist.
    let covered_sql = match (conversations_have_tail_column, tail_state_has_tail_column) {
        (false, false) => return Ok(false),
        (true, true) => {
            "SELECT COUNT(*)
FROM conversations c
LEFT JOIN conversation_tail_state ts ON ts.conversation_id = c.id
WHERE c.last_message_idx IS NOT NULL
OR ts.last_message_idx IS NOT NULL"
        }
        (true, false) => {
            "SELECT COUNT(*)
FROM conversations
WHERE last_message_idx IS NOT NULL"
        }
        (false, true) => {
            "SELECT COUNT(*)
FROM conversations c
WHERE EXISTS (
SELECT 1
FROM conversation_tail_state ts
WHERE ts.conversation_id = c.id
AND ts.last_message_idx IS NOT NULL
)"
        }
    };

    let covered_rows: i64 = self
        .conn
        .query_row_map(covered_sql, fparams![], |row| row.get_typed(0))
        .with_context(
            || "counting conversations covered by lexical rebuild tail footprint metadata",
        )?;
    let covered_conversations = usize::try_from(covered_rows.max(0)).unwrap_or(usize::MAX);

    Ok(lexical_rebuild_tail_metadata_coverage_is_sufficient(
        total_conversations,
        covered_conversations,
    ))
}
/// Raises footprint estimates to the exact per-conversation message counts.
///
/// Counts rows in `messages` grouped by conversation. A footprint is changed only
/// when the exact count is larger than its current `message_count`; in that case the
/// count is replaced and `message_bytes` is raised to at least the exact count times
/// the planner's per-message byte estimate. Counts for conversations that are not in
/// `footprints` are ignored.
fn raise_lexical_rebuild_footprints_to_exact_message_counts(
    &self,
    footprints: &mut [LexicalRebuildConversationFootprintRow],
) -> Result<()> {
    if footprints.is_empty() {
        return Ok(());
    }

    let mut slot_by_conversation: HashMap<i64, usize> = HashMap::with_capacity(footprints.len());
    for (slot, entry) in footprints.iter().enumerate() {
        slot_by_conversation.insert(entry.conversation_id, slot);
    }

    self.conn
        .query_with_params_for_each(
            "SELECT conversation_id, COUNT(*) AS message_count
FROM messages
GROUP BY conversation_id
ORDER BY conversation_id ASC",
            &[] as &[SqliteValue],
            |row| {
                let conversation_id: i64 = row.get_typed(0)?;
                let stored_count: i64 = row.get_typed(1)?;
                if let Some(&slot) = slot_by_conversation.get(&conversation_id) {
                    let exact_count = usize::try_from(stored_count.max(0)).unwrap_or(usize::MAX);
                    let entry = &mut footprints[slot];
                    if exact_count > entry.message_count {
                        let estimated_bytes = exact_count
                            .saturating_mul(LEXICAL_REBUILD_PLANNER_ESTIMATED_BYTES_PER_MESSAGE);
                        entry.message_count = exact_count;
                        entry.message_bytes = entry.message_bytes.max(estimated_bytes);
                    }
                }
                Ok(())
            },
        )
        .with_context(|| "raising lexical rebuild footprints to exact message counts")?;
    Ok(())
}
/// Fills in footprints for conversations whose tail index was not stored, using the
/// largest message index found in `messages`.
///
/// `missing_tail_positions` maps each such conversation id to its position in
/// `footprints`. For a small number of conversations (up to
/// `LEXICAL_REBUILD_FOOTPRINT_POINT_TAIL_FALLBACK_LIMIT`) one point query is issued
/// per conversation. For larger sets a single grouped scan is used, first with an
/// `INDEXED BY idx_messages_conv_idx` hint and, if that index does not exist, again
/// without the hint. Conversations with no messages keep their placeholder.
fn fill_missing_lexical_rebuild_footprint_tails(
    &self,
    footprints: &mut [LexicalRebuildConversationFootprintRow],
    missing_tail_positions: &HashMap<i64, usize>,
) -> Result<()> {
    if missing_tail_positions.len() <= LEXICAL_REBUILD_FOOTPRINT_POINT_TAIL_FALLBACK_LIMIT {
        for (conversation_id, position) in missing_tail_positions {
            let last_message_idx: Option<i64> = self
                .conn
                .query_row_map(
                    "SELECT MAX(idx) FROM messages WHERE conversation_id = ?1",
                    fparams![*conversation_id],
                    |row| row.get_typed(0),
                )
                .with_context(|| {
                    format!(
                        "looking up missing lexical rebuild tail estimate for conversation {conversation_id}"
                    )
                })?;
            if let Some(message_count) =
                lexical_rebuild_message_count_from_tail_idx(last_message_idx)
            {
                footprints[*position] = lexical_rebuild_conversation_footprint_from_count(
                    *conversation_id,
                    message_count,
                );
            }
        }
        return Ok(());
    }

    self.fill_missing_lexical_rebuild_footprint_tails_from_grouped_messages(
        footprints,
        missing_tail_positions,
        "SELECT conversation_id, MAX(idx) AS last_message_idx
FROM messages INDEXED BY idx_messages_conv_idx
GROUP BY conversation_id
ORDER BY conversation_id ASC",
    )
    .or_else(|err| {
        // The helper wraps the database error in a context message, and an
        // `anyhow::Error` displays only its outermost message. Walk the whole cause
        // chain so the underlying "no such index" text is actually seen.
        let hinted_index_missing = err.chain().any(|cause| {
            cause
                .to_string()
                .contains("no such index: idx_messages_conv_idx")
        });
        if !hinted_index_missing {
            return Err(err);
        }
        self.fill_missing_lexical_rebuild_footprint_tails_from_grouped_messages(
            footprints,
            missing_tail_positions,
            "SELECT conversation_id, MAX(idx) AS last_message_idx
FROM messages
GROUP BY conversation_id
ORDER BY conversation_id ASC",
        )
    })
    .with_context(|| "grouping missing lexical rebuild tail estimates from messages")?;
    Ok(())
}
/// Runs `sql` — expected to yield `(conversation_id, last_message_idx)` rows — and
/// replaces the placeholder footprint of every conversation listed in
/// `missing_tail_positions` for which a message count can be derived from the
/// returned index. Rows for other conversations are ignored.
fn fill_missing_lexical_rebuild_footprint_tails_from_grouped_messages(
    &self,
    footprints: &mut [LexicalRebuildConversationFootprintRow],
    missing_tail_positions: &HashMap<i64, usize>,
    sql: &str,
) -> Result<()> {
    self.conn
        .query_with_params_for_each(sql, &[] as &[SqliteValue], |row| {
            let conversation_id: i64 = row.get_typed(0)?;
            let last_message_idx: Option<i64> = row.get_typed(1)?;
            if let Some(&slot) = missing_tail_positions.get(&conversation_id) {
                let derived_count = lexical_rebuild_message_count_from_tail_idx(last_message_idx);
                if let Some(message_count) = derived_count {
                    footprints[slot] = lexical_rebuild_conversation_footprint_from_count(
                        conversation_id,
                        message_count,
                    );
                }
            }
            Ok(())
        })
        .with_context(|| "grouping lexical rebuild missing tail estimates")
}
/// Lists every conversation id in ascending order.
pub fn list_conversation_ids_for_lexical_rebuild(&self) -> Result<Vec<i64>> {
    let ids: Vec<i64> = self
        .conn
        .query_map_collect(
            "SELECT id FROM conversations ORDER BY id ASC",
            fparams![],
            |row| row.get_typed(0),
        )
        .with_context(|| "listing conversation ids for lexical rebuild")?;
    Ok(ids)
}
/// Lists one page of conversations for a lexical rebuild, ordered by ascending id
/// and paged with `limit` / `offset`.
///
/// Agent slugs and workspace paths are resolved through the supplied lookups rather
/// than joins: an agent id that is absent (or not in `agent_slugs`) becomes
/// `"unknown"`, and a workspace id that is absent (or not in `workspace_paths`)
/// becomes `None`. `ended_at` prefers the `conversation_tail_state` value.
pub fn list_conversations_for_lexical_rebuild_by_offset(
    &self,
    limit: i64,
    offset: i64,
    agent_slugs: &HashMap<i64, String>,
    workspace_paths: &HashMap<i64, PathBuf>,
) -> Result<Vec<LexicalRebuildConversationRow>> {
    self.conn
        .query_map_collect(
            r"SELECT id, agent_id, workspace_id, external_id, title, source_path,
started_at,
COALESCE(
(SELECT ts.ended_at
FROM conversation_tail_state ts
WHERE ts.conversation_id = conversations.id),
ended_at
),
source_id, origin_host
FROM conversations
ORDER BY id ASC
LIMIT ?1 OFFSET ?2",
            fparams![limit, offset],
            |row| {
                let agent_id: Option<i64> = row.get_typed(1)?;
                let workspace_id: Option<i64> = row.get_typed(2)?;
                let stored_source_path: String = row.get_typed(5)?;
                let stored_source_id: Option<String> = row.get_typed(8)?;
                let stored_origin_host: Option<String> = row.get_typed(9)?;
                let (source_id, _, origin_host) = normalized_storage_source_parts(
                    stored_source_id.as_deref(),
                    None,
                    stored_origin_host.as_deref(),
                );
                let agent_slug = match agent_id.and_then(|aid| agent_slugs.get(&aid)) {
                    Some(slug) => slug.clone(),
                    None => "unknown".to_string(),
                };
                let workspace = workspace_id
                    .and_then(|wid| workspace_paths.get(&wid))
                    .cloned();
                Ok(LexicalRebuildConversationRow {
                    id: Some(row.get_typed(0)?),
                    agent_slug,
                    workspace,
                    external_id: row.get_typed(3)?,
                    title: row.get_typed(4)?,
                    source_path: PathBuf::from(stored_source_path),
                    started_at: row.get_typed(6)?,
                    ended_at: row.get_typed(7)?,
                    source_id,
                    origin_host,
                })
            },
        )
        .with_context(|| "listing conversations for lexical rebuild")
}
/// Lists up to `limit` conversations whose id is greater than
/// `after_conversation_id`, ordered by ascending id (keyset pagination).
///
/// Agent slugs and workspace paths are resolved through the supplied lookups rather
/// than joins: an agent id that is absent (or not in `agent_slugs`) becomes
/// `"unknown"`, and a workspace id that is absent (or not in `workspace_paths`)
/// becomes `None`. `ended_at` prefers the `conversation_tail_state` value.
pub fn list_conversations_for_lexical_rebuild_after_id(
    &self,
    limit: i64,
    after_conversation_id: i64,
    agent_slugs: &HashMap<i64, String>,
    workspace_paths: &HashMap<i64, PathBuf>,
) -> Result<Vec<LexicalRebuildConversationRow>> {
    self.conn
        .query_map_collect(
            r"SELECT id, agent_id, workspace_id, external_id, title, source_path,
started_at,
COALESCE(
(SELECT ts.ended_at
FROM conversation_tail_state ts
WHERE ts.conversation_id = conversations.id),
ended_at
),
source_id, origin_host
FROM conversations
WHERE id > ?2
ORDER BY id ASC
LIMIT ?1",
            fparams![limit, after_conversation_id],
            |row| {
                let agent_id: Option<i64> = row.get_typed(1)?;
                let workspace_id: Option<i64> = row.get_typed(2)?;
                let stored_source_path: String = row.get_typed(5)?;
                let stored_source_id: Option<String> = row.get_typed(8)?;
                let stored_origin_host: Option<String> = row.get_typed(9)?;
                let (source_id, _, origin_host) = normalized_storage_source_parts(
                    stored_source_id.as_deref(),
                    None,
                    stored_origin_host.as_deref(),
                );
                let agent_slug = match agent_id.and_then(|aid| agent_slugs.get(&aid)) {
                    Some(slug) => slug.clone(),
                    None => "unknown".to_string(),
                };
                let workspace = workspace_id
                    .and_then(|wid| workspace_paths.get(&wid))
                    .cloned();
                Ok(LexicalRebuildConversationRow {
                    id: Some(row.get_typed(0)?),
                    agent_slug,
                    workspace,
                    external_id: row.get_typed(3)?,
                    title: row.get_typed(4)?,
                    source_path: PathBuf::from(stored_source_path),
                    started_at: row.get_typed(6)?,
                    ended_at: row.get_typed(7)?,
                    source_id,
                    origin_host,
                })
            },
        )
        .with_context(|| {
            format!(
                "listing conversations for lexical rebuild after id {after_conversation_id}"
            )
        })
}
/// Lists up to `limit` conversations whose id lies in the half-open range
/// `(after_conversation_id, through_conversation_id]`, ordered by ascending id.
///
/// An empty range (`through_conversation_id <= after_conversation_id`) returns an
/// empty list without querying. Agent slugs and workspace paths are resolved through
/// the supplied lookups: an agent id that is absent (or not in `agent_slugs`) becomes
/// `"unknown"`, and a workspace id that is absent (or not in `workspace_paths`)
/// becomes `None`. `ended_at` prefers the `conversation_tail_state` value.
pub fn list_conversations_for_lexical_rebuild_after_id_through_id(
    &self,
    limit: i64,
    after_conversation_id: i64,
    through_conversation_id: i64,
    agent_slugs: &HashMap<i64, String>,
    workspace_paths: &HashMap<i64, PathBuf>,
) -> Result<Vec<LexicalRebuildConversationRow>> {
    let range_is_empty = through_conversation_id <= after_conversation_id;
    if range_is_empty {
        return Ok(Vec::new());
    }

    self.conn
        .query_map_collect(
            r"SELECT id, agent_id, workspace_id, external_id, title, source_path,
started_at,
COALESCE(
(SELECT ts.ended_at
FROM conversation_tail_state ts
WHERE ts.conversation_id = conversations.id),
ended_at
),
source_id, origin_host
FROM conversations
WHERE id > ?2 AND id <= ?3
ORDER BY id ASC
LIMIT ?1",
            fparams![limit, after_conversation_id, through_conversation_id],
            |row| {
                let agent_id: Option<i64> = row.get_typed(1)?;
                let workspace_id: Option<i64> = row.get_typed(2)?;
                let stored_source_path: String = row.get_typed(5)?;
                let stored_source_id: Option<String> = row.get_typed(8)?;
                let stored_origin_host: Option<String> = row.get_typed(9)?;
                let (source_id, _, origin_host) = normalized_storage_source_parts(
                    stored_source_id.as_deref(),
                    None,
                    stored_origin_host.as_deref(),
                );
                let agent_slug = match agent_id.and_then(|aid| agent_slugs.get(&aid)) {
                    Some(slug) => slug.clone(),
                    None => "unknown".to_string(),
                };
                let workspace = workspace_id
                    .and_then(|wid| workspace_paths.get(&wid))
                    .cloned();
                Ok(LexicalRebuildConversationRow {
                    id: Some(row.get_typed(0)?),
                    agent_slug,
                    workspace,
                    external_id: row.get_typed(3)?,
                    title: row.get_typed(4)?,
                    source_path: PathBuf::from(stored_source_path),
                    started_at: row.get_typed(6)?,
                    ended_at: row.get_typed(7)?,
                    source_id,
                    origin_host,
                })
            },
        )
        .with_context(|| {
            format!(
                "listing conversations for lexical rebuild after id {after_conversation_id} through id {through_conversation_id}"
            )
        })
}
pub fn fetch_messages(&self, conversation_id: i64) -> Result<Vec<Message>> {
let hinted_sql = "SELECT id, idx, role, author, created_at, content, extra_json, extra_bin \
FROM messages INDEXED BY sqlite_autoindex_messages_1 \
WHERE conversation_id = ?1 ORDER BY idx";
let fallback_sql = "SELECT id, idx, role, author, created_at, content, extra_json, extra_bin \
FROM messages \
WHERE conversation_id = ?1 ORDER BY idx";
self.conn
.query_map_collect(hinted_sql, fparams![conversation_id], |row| {
let role: String = row.get_typed(2)?;
Ok(Message {
id: Some(row.get_typed(0)?),
idx: row.get_typed(1)?,
role: match role.as_str() {
"user" => MessageRole::User,
"agent" | "assistant" => MessageRole::Agent,
"tool" => MessageRole::Tool,
"system" => MessageRole::System,
other => MessageRole::Other(other.to_string()),
},
author: row.get_typed(3)?,
created_at: row.get_typed(4)?,
content: row.get_typed(5)?,
extra_json: franken_read_message_extra_compat(row, 6, 7),
snippets: Vec::new(),
})
})
.or_else(|err| {
if err
.to_string()
.contains("no such index: sqlite_autoindex_messages_1")
{
return self.conn.query_map_collect(
fallback_sql,
fparams![conversation_id],
|row| {
let role: String = row.get_typed(2)?;
Ok(Message {
id: Some(row.get_typed(0)?),
idx: row.get_typed(1)?,
role: match role.as_str() {
"user" => MessageRole::User,
"agent" | "assistant" => MessageRole::Agent,
"tool" => MessageRole::Tool,
"system" => MessageRole::System,
other => MessageRole::Other(other.to_string()),
},
author: row.get_typed(3)?,
created_at: row.get_typed(4)?,
content: row.get_typed(5)?,
extra_json: franken_read_message_extra_compat(row, 6, 7),
snippets: Vec::new(),
})
},
);
}
Err(err)
})
.with_context(|| format!("fetching messages for conversation {conversation_id}"))
}
pub fn fetch_messages_for_lexical_rebuild(&self, conversation_id: i64) -> Result<Vec<Message>> {
let hinted_sql = "SELECT id, idx, role, author, created_at, content \
FROM messages INDEXED BY sqlite_autoindex_messages_1 \
WHERE conversation_id = ?1 ORDER BY idx";
let fallback_sql = "SELECT id, idx, role, author, created_at, content \
FROM messages \
WHERE conversation_id = ?1 ORDER BY idx";
self.conn
.query_map_collect(hinted_sql, fparams![conversation_id], |row| {
let role: String = row.get_typed(2)?;
Ok(Message {
id: Some(row.get_typed(0)?),
idx: row.get_typed(1)?,
role: match role.as_str() {
"user" => MessageRole::User,
"agent" | "assistant" => MessageRole::Agent,
"tool" => MessageRole::Tool,
"system" => MessageRole::System,
other => MessageRole::Other(other.to_string()),
},
author: row.get_typed(3)?,
created_at: row.get_typed(4)?,
content: row.get_typed(5)?,
extra_json: serde_json::Value::Null,
snippets: Vec::new(),
})
})
.or_else(|err| {
if err
.to_string()
.contains("no such index: sqlite_autoindex_messages_1")
{
return self.conn.query_map_collect(
fallback_sql,
fparams![conversation_id],
|row| {
let role: String = row.get_typed(2)?;
Ok(Message {
id: Some(row.get_typed(0)?),
idx: row.get_typed(1)?,
role: match role.as_str() {
"user" => MessageRole::User,
"agent" | "assistant" => MessageRole::Agent,
"tool" => MessageRole::Tool,
"system" => MessageRole::System,
other => MessageRole::Other(other.to_string()),
},
author: row.get_typed(3)?,
created_at: row.get_typed(4)?,
content: row.get_typed(5)?,
extra_json: serde_json::Value::Null,
snippets: Vec::new(),
})
},
);
}
Err(err)
})
.with_context(|| {
format!("fetching messages for lexical rebuild of conversation {conversation_id}")
})
}
pub fn fetch_messages_for_lexical_rebuild_batch(
&self,
conversation_ids: &[i64],
max_messages: Option<usize>,
max_content_bytes: Option<usize>,
) -> Result<HashMap<i64, Vec<Message>>> {
if conversation_ids.is_empty() {
return Ok(HashMap::new());
}
let mut grouped: HashMap<i64, Vec<Message>> =
HashMap::with_capacity(conversation_ids.len());
let mut fetched_conversation_ids = HashSet::with_capacity(conversation_ids.len());
let mut total_messages = 0usize;
let mut total_content_bytes = 0usize;
for conversation_id in conversation_ids {
if !fetched_conversation_ids.insert(*conversation_id) {
continue;
}
let messages = self
.fetch_messages_for_lexical_rebuild(*conversation_id)
.with_context(|| {
format!("fetching lexical rebuild messages for conversation {conversation_id}")
})?;
total_messages = total_messages.saturating_add(messages.len());
if let Some(limit) = max_messages
&& total_messages > limit
{
return Err(anyhow!(
"lexical rebuild batch fetch exceeded message guardrail: messages={total_messages} limit={limit} conversations={}",
conversation_ids.len()
));
}
let message_bytes = messages
.iter()
.map(|message| message.content.len())
.sum::<usize>();
total_content_bytes = total_content_bytes.saturating_add(message_bytes);
if let Some(limit) = max_content_bytes
&& total_content_bytes > limit
{
return Err(anyhow!(
"lexical rebuild batch fetch exceeded content-byte guardrail: bytes={total_content_bytes} limit={limit} conversations={}",
conversation_ids.len()
));
}
if !messages.is_empty() {
grouped.insert(*conversation_id, messages);
}
}
Ok(grouped)
}
pub fn stream_messages_for_lexical_rebuild_between_conversation_ids<F>(
&self,
start_conversation_id: i64,
end_conversation_id: i64,
mut f: F,
) -> Result<()>
where
F: FnMut(LexicalRebuildMessageRow) -> Result<()>,
{
if end_conversation_id < start_conversation_id {
return Ok(());
}
let conversation_ids: Vec<i64> = self
.conn
.query_map_collect(
"SELECT id FROM conversations WHERE id >= ?1 AND id <= ?2 ORDER BY id ASC",
fparams![start_conversation_id, end_conversation_id],
|row| row.get_typed(0),
)
.with_context(|| "listing conversation ids for streamed lexical rebuild")?;
for conversation_id in conversation_ids {
let messages = self
.fetch_messages_for_lexical_rebuild(conversation_id)
.with_context(|| {
format!("streaming lexical rebuild messages for conversation {conversation_id}")
})?;
for message in messages {
let message_id = message.id.ok_or_else(|| {
anyhow!(
"lexical rebuild message missing id for conversation {conversation_id} idx {}",
message.idx
)
})?;
f(LexicalRebuildMessageRow {
conversation_id,
id: message_id,
idx: message.idx,
role: role_str(&message.role),
author: message.author,
created_at: message.created_at,
content: message.content,
})?;
}
}
Ok(())
}
/// Streams lexical-rebuild messages for the conversations in
/// `start_conversation_id..=end_conversation_id`, grouped per conversation.
///
/// Rows arrive from
/// [`Self::stream_messages_for_lexical_rebuild_between_conversation_ids`]; consecutive
/// rows with the same conversation id are buffered and `f` is called once per group
/// with `(conversation_id, rows, id_of_last_row_in_group)`. The final group is
/// emitted after the stream ends. An inverted range is a no-op.
///
/// # Errors
/// Propagates streaming failures and the first error returned by `f`.
pub fn stream_grouped_messages_for_lexical_rebuild_between_conversation_ids<F>(
    &self,
    start_conversation_id: i64,
    end_conversation_id: i64,
    mut f: F,
) -> Result<()>
where
    F: FnMut(i64, LexicalRebuildGroupedMessageRows, i64) -> Result<()>,
{
    if start_conversation_id > end_conversation_id {
        return Ok(());
    }

    // State of the group currently being assembled.
    let mut open_conversation: Option<i64> = None;
    let mut open_rows: LexicalRebuildGroupedMessageRows = SmallVec::new();
    let mut open_last_message_id: i64 = 0;

    // Hands the buffered group to `f` and resets the buffers. The state is passed in
    // explicitly so this closure only captures `f`.
    let mut emit_group = |conversation_slot: &mut Option<i64>,
                          rows_slot: &mut LexicalRebuildGroupedMessageRows,
                          last_id_slot: &mut i64|
     -> Result<()> {
        match conversation_slot.take() {
            None => Ok(()),
            Some(conversation_id) => {
                let rows = std::mem::take(rows_slot);
                let last_message_id = std::mem::replace(last_id_slot, 0);
                f(conversation_id, rows, last_message_id)
            }
        }
    };

    self.stream_messages_for_lexical_rebuild_between_conversation_ids(
        start_conversation_id,
        end_conversation_id,
        |row| {
            let starts_new_group = open_conversation != Some(row.conversation_id);
            if starts_new_group {
                emit_group(
                    &mut open_conversation,
                    &mut open_rows,
                    &mut open_last_message_id,
                )?;
                open_conversation = Some(row.conversation_id);
            }
            open_last_message_id = row.id;
            let grouped_row = LexicalRebuildGroupedMessageRow {
                idx: row.idx,
                is_tool_role: row.role == "tool",
                created_at: row.created_at,
                content: row.content,
            };
            open_rows.push(grouped_row);
            Ok(())
        },
    )
    .context("streaming grouped lexical rebuild messages")?;

    // Emit whatever is still buffered once the stream is exhausted.
    emit_group(
        &mut open_conversation,
        &mut open_rows,
        &mut open_last_message_id,
    )
    .context("flushing grouped lexical rebuild messages")
}
/// Open-ended variant of
/// [`Self::stream_grouped_messages_for_lexical_rebuild_between_conversation_ids`]:
/// streams grouped messages for every conversation with id `>= start_conversation_id`.
pub fn stream_grouped_messages_for_lexical_rebuild_from_conversation_id<F>(
    &self,
    start_conversation_id: i64,
    f: F,
) -> Result<()>
where
    F: FnMut(i64, LexicalRebuildGroupedMessageRows, i64) -> Result<()>,
{
    // `i64::MAX` as the inclusive upper bound means "no upper limit".
    let unbounded_end = i64::MAX;
    self.stream_grouped_messages_for_lexical_rebuild_between_conversation_ids(
        start_conversation_id,
        unbounded_end,
        f,
    )
}
/// Open-ended variant of
/// [`Self::stream_messages_for_lexical_rebuild_between_conversation_ids`]:
/// streams message rows for every conversation with id `>= start_conversation_id`.
pub fn stream_messages_for_lexical_rebuild_from_conversation_id<F>(
    &self,
    start_conversation_id: i64,
    f: F,
) -> Result<()>
where
    F: FnMut(LexicalRebuildMessageRow) -> Result<()>,
{
    // `i64::MAX` as the inclusive upper bound means "no upper limit".
    let unbounded_end = i64::MAX;
    self.stream_messages_for_lexical_rebuild_between_conversation_ids(
        start_conversation_id,
        unbounded_end,
        f,
    )
}
/// Reads the `sources` row whose primary key equals `id`.
///
/// The lookup result is passed through `optional()`, so an absent row is reported
/// as `Ok(None)` rather than as an error. An unparseable `kind` falls back to
/// `SourceKind::default()`, and a `config_json` value that is not valid JSON is
/// returned as `None`.
pub fn get_source(&self, id: &str) -> Result<Option<Source>> {
    let lookup = self.conn.query_row_map(
        "SELECT id, kind, host_label, machine_id, platform, config_json, created_at, updated_at FROM sources WHERE id = ?1",
        fparams![id],
        |row| {
            let kind_text: String = row.get_typed(1)?;
            let raw_config: Option<String> = row.get_typed(5)?;
            let kind = SourceKind::parse(&kind_text).unwrap_or_default();
            let config_json = match raw_config {
                Some(text) => serde_json::from_str(&text).ok(),
                None => None,
            };
            Ok(Source {
                id: row.get_typed(0)?,
                kind,
                host_label: row.get_typed(2)?,
                machine_id: row.get_typed(3)?,
                platform: row.get_typed(4)?,
                config_json,
                created_at: row.get_typed(6)?,
                updated_at: row.get_typed(7)?,
            })
        },
    );
    let found = lookup.optional()?;
    Ok(found)
}
/// Returns every row of the `sources` table, ordered by `id`.
///
/// An unparseable `kind` falls back to `SourceKind::default()`, and a `config_json`
/// value that is not valid JSON is returned as `None`.
pub fn list_sources(&self) -> Result<Vec<Source>> {
    let listed = self.conn.query_map_collect(
        "SELECT id, kind, host_label, machine_id, platform, config_json, created_at, updated_at FROM sources ORDER BY id",
        fparams![],
        |row| {
            let kind_text: String = row.get_typed(1)?;
            let raw_config: Option<String> = row.get_typed(5)?;
            let kind = SourceKind::parse(&kind_text).unwrap_or_default();
            let config_json = match raw_config {
                Some(text) => serde_json::from_str(&text).ok(),
                None => None,
            };
            Ok(Source {
                id: row.get_typed(0)?,
                kind,
                host_label: row.get_typed(2)?,
                machine_id: row.get_typed(3)?,
                platform: row.get_typed(4)?,
                config_json,
                created_at: row.get_typed(6)?,
                updated_at: row.get_typed(7)?,
            })
        },
    );
    listed.context("listing sources")
}
/// Lists the ids of all sources except the one with id `'local'`, ordered by id.
pub fn get_source_ids(&self) -> Result<Vec<String>> {
    let non_local_ids = self.conn.query_map_collect(
        "SELECT id FROM sources WHERE id != 'local' ORDER BY id",
        fparams![],
        |row| row.get_typed(0),
    );
    non_local_ids.context("listing source ids")
}
/// Inserts `source` into the `sources` table, or updates the existing row with the same id.
///
/// The cached entry for `source.id` is invalidated before the write. On conflict the
/// row is only rewritten (and `updated_at` bumped) when at least one of `kind`,
/// `host_label`, `machine_id`, `platform` or `config_json` actually differs;
/// `created_at` of an existing row is never touched. For a new row, `created_at`
/// defaults to the current time when `source.created_at` is `None`.
///
/// # Errors
/// Fails if `config_json` cannot be serialized or the statement fails.
pub fn upsert_source(&self, source: &Source) -> Result<()> {
    self.invalidate_conversation_source_cache(source.id.as_str());

    let now = Self::now_millis();
    let kind_text = source.kind.to_string();
    let serialized_config = match source.config_json.as_ref() {
        Some(config) => Some(serde_json::to_string(config)?),
        None => None,
    };
    let created_at = source.created_at.unwrap_or(now);

    self.conn.execute_compat(
        "INSERT INTO sources(id, kind, host_label, machine_id, platform, config_json, created_at, updated_at)
VALUES(?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8)
ON CONFLICT(id) DO UPDATE SET
kind = excluded.kind,
host_label = excluded.host_label,
machine_id = excluded.machine_id,
platform = excluded.platform,
config_json = excluded.config_json,
updated_at = excluded.updated_at
WHERE NOT (
sources.kind IS excluded.kind
AND sources.host_label IS excluded.host_label
AND sources.machine_id IS excluded.machine_id
AND sources.platform IS excluded.platform
AND sources.config_json IS excluded.config_json
)",
        fparams![
            source.id.as_str(),
            kind_text.as_str(),
            source.host_label.as_deref(),
            source.machine_id.as_deref(),
            source.platform.as_deref(),
            serialized_config.as_deref(),
            created_at,
            now
        ],
    )?;
    Ok(())
}
/// Computes the hex-encoded BLAKE3 hash used as the suffix of bundle `meta` keys.
///
/// The hashed signature is `version:root_path`, extended with
/// `:total_bytes:modified_at_ms` when `include_bundle_stats` is true.
fn historical_bundle_key_hash(
    version: u32,
    bundle: &HistoricalDatabaseBundle,
    include_bundle_stats: bool,
) -> String {
    let root = bundle.root_path.display();
    let signature = match include_bundle_stats {
        true => format!(
            "{}:{}:{}:{}",
            version, root, bundle.total_bytes, bundle.modified_at_ms
        ),
        false => format!("{}:{}", version, root),
    };
    let digest = blake3::hash(signature.as_bytes());
    digest.to_hex().to_string()
}
/// `meta` key under which a completed salvage of `bundle` is recorded.
///
/// The key hash covers the ledger version and the bundle root path only.
fn historical_bundle_meta_key(bundle: &HistoricalDatabaseBundle) -> String {
    let key_hash =
        Self::historical_bundle_key_hash(HISTORICAL_SALVAGE_LEDGER_VERSION, bundle, false);
    format!("historical_bundle_salvaged:{}", key_hash)
}
fn historical_bundle_legacy_meta_key(bundle: &HistoricalDatabaseBundle) -> String {
let signature = format!(
"{}:{}:{}:{}",
HISTORICAL_SALVAGE_LEDGER_VERSION,
bundle.root_path.display(),
bundle.total_bytes,
bundle.modified_at_ms
);
format!(
"historical_bundle_salvaged:{}",
blake3::hash(signature.as_bytes()).to_hex()
)
}
/// `meta` key under which the in-progress salvage checkpoint of `bundle` is stored.
///
/// The key hash covers the progress version and the bundle root path only.
fn historical_bundle_progress_key(bundle: &HistoricalDatabaseBundle) -> String {
    let key_hash =
        Self::historical_bundle_key_hash(HISTORICAL_SALVAGE_PROGRESS_VERSION, bundle, false);
    format!("historical_bundle_progress:{}", key_hash)
}
fn historical_bundle_legacy_progress_key(bundle: &HistoricalDatabaseBundle) -> String {
let signature = format!(
"{}:{}:{}:{}",
HISTORICAL_SALVAGE_PROGRESS_VERSION,
bundle.root_path.display(),
bundle.total_bytes,
bundle.modified_at_ms
);
format!(
"historical_bundle_progress:{}",
blake3::hash(signature.as_bytes()).to_hex()
)
}
/// Reports whether the `meta` table holds a salvage ledger entry for `bundle`,
/// under either the current or the legacy key.
fn historical_bundle_already_imported(
    &self,
    bundle: &HistoricalDatabaseBundle,
) -> Result<bool> {
    let candidate_keys = [
        Self::historical_bundle_meta_key(bundle),
        Self::historical_bundle_legacy_meta_key(bundle),
    ];
    for ledger_key in &candidate_keys {
        let recorded: Option<String> = self
            .conn
            .query_row_map(
                "SELECT value FROM meta WHERE key = ?1",
                fparams![ledger_key.as_str()],
                |row| row.get_typed(0),
            )
            .optional()?;
        if recorded.is_some() {
            return Ok(true);
        }
    }
    Ok(false)
}
/// Returns `true` as soon as one bundle discovered for `canonical_db_path` has no
/// salvage ledger entry yet; `false` when all discovered bundles are already imported.
pub(crate) fn has_pending_historical_bundles(&self, canonical_db_path: &Path) -> Result<bool> {
    let discovered = discover_historical_database_bundles(canonical_db_path);
    for bundle in discovered {
        let imported = self.historical_bundle_already_imported(&bundle)?;
        if !imported {
            return Ok(true);
        }
    }
    Ok(false)
}
/// Loads the stored salvage checkpoint for `bundle`, if any.
///
/// The current progress key is consulted first, then the legacy key. A stored
/// checkpoint whose `progress_version` differs from
/// `HISTORICAL_SALVAGE_PROGRESS_VERSION` is ignored and the next key is tried.
///
/// # Errors
/// Fails on query errors and when a stored checkpoint is not valid JSON for
/// `HistoricalBundleProgress`.
fn load_historical_bundle_progress(
    &self,
    bundle: &HistoricalDatabaseBundle,
) -> Result<Option<HistoricalBundleProgress>> {
    let candidate_keys = [
        Self::historical_bundle_progress_key(bundle),
        Self::historical_bundle_legacy_progress_key(bundle),
    ];
    for progress_key in candidate_keys {
        let stored: Option<String> = self
            .conn
            .query_row_map(
                "SELECT value FROM meta WHERE key = ?1",
                fparams![progress_key.as_str()],
                |row| row.get_typed(0),
            )
            .optional()?;
        let checkpoint_json = match stored {
            Some(text) => text,
            None => continue,
        };
        let checkpoint: HistoricalBundleProgress = serde_json::from_str(&checkpoint_json)
            .with_context(|| {
                format!(
                    "parsing historical salvage progress checkpoint for {}",
                    bundle.root_path.display()
                )
            })?;
        if checkpoint.progress_version == HISTORICAL_SALVAGE_PROGRESS_VERSION {
            return Ok(Some(checkpoint));
        }
    }
    Ok(None)
}
/// Persists a salvage checkpoint for `bundle` in the `meta` table.
///
/// The checkpoint is stored as JSON under the current progress key and replaces
/// any previous value for that key. It records the bundle's path and stats, the
/// salvage `method`, the last fully processed source row id and the running
/// import counters, stamped with the current time.
fn record_historical_bundle_progress(
    &self,
    bundle: &HistoricalDatabaseBundle,
    method: &str,
    last_completed_source_row_id: i64,
    conversations_imported: usize,
    messages_imported: usize,
) -> Result<()> {
    let checkpoint = HistoricalBundleProgress {
        progress_version: HISTORICAL_SALVAGE_PROGRESS_VERSION,
        path: bundle.root_path.display().to_string(),
        bytes: bundle.total_bytes,
        modified_at_ms: bundle.modified_at_ms,
        method: method.to_string(),
        last_completed_source_row_id,
        conversations_imported,
        messages_imported,
        updated_at_ms: Self::now_millis(),
    };
    let serialized = serde_json::to_string(&checkpoint)?;
    let progress_key = Self::historical_bundle_progress_key(bundle);
    self.conn.execute_compat(
        "INSERT OR REPLACE INTO meta(key, value) VALUES(?1, ?2)",
        fparams![progress_key.as_str(), serialized.as_str()],
    )?;
    Ok(())
}
/// Deletes the salvage checkpoint of `bundle` from the `meta` table, under both
/// the current and the legacy progress key.
fn clear_historical_bundle_progress(&self, bundle: &HistoricalDatabaseBundle) -> Result<()> {
    let progress_keys = [
        Self::historical_bundle_progress_key(bundle),
        Self::historical_bundle_legacy_progress_key(bundle),
    ];
    for progress_key in &progress_keys {
        self.conn.execute_compat(
            "DELETE FROM meta WHERE key = ?1",
            fparams![progress_key.as_str()],
        )?;
    }
    Ok(())
}
/// Writes the salvage ledger entry that marks `bundle` as fully imported.
///
/// The entry is a JSON object stored under the current ledger key (replacing any
/// previous value). It holds the ledger version, the bundle's path and stats, the
/// salvage `method`, the import counters and the current time.
fn record_historical_bundle_import(
    &self,
    bundle: &HistoricalDatabaseBundle,
    method: &str,
    conversations_imported: usize,
    messages_imported: usize,
) -> Result<()> {
    let ledger_entry = serde_json::json!({
        "salvage_version": HISTORICAL_SALVAGE_LEDGER_VERSION,
        "path": bundle.root_path.display().to_string(),
        "bytes": bundle.total_bytes,
        "modified_at_ms": bundle.modified_at_ms,
        "method": method,
        "conversations_imported": conversations_imported,
        "messages_imported": messages_imported,
        "recorded_at_ms": Self::now_millis(),
    });
    let serialized = serde_json::to_string(&ledger_entry)?;
    let ledger_key = Self::historical_bundle_meta_key(bundle);
    self.conn.execute_compat(
        "INSERT OR REPLACE INTO meta(key, value) VALUES(?1, ?2)",
        fparams![ledger_key.as_str(), serialized.as_str()],
    )?;
    Ok(())
}
/// Decides whether a failed historical import may succeed if retried with less data.
///
/// Returns `true` when any error in `err`'s cause chain renders (compared in ASCII
/// lowercase) to text containing one of the size/resource related patterns below.
fn historical_import_error_is_split_retryable(err: &anyhow::Error) -> bool {
    const RETRYABLE_PATTERNS: &[&str] = &[
        "out of memory",
        "string or blob too big",
        "too many sql variables",
    ];
    for cause in err.chain() {
        let rendered = cause.to_string().to_ascii_lowercase();
        for pattern in RETRYABLE_PATTERNS {
            if rendered.contains(*pattern) {
                return true;
            }
        }
    }
    false
}
/// Splits one batch entry into two entries that each carry half of its messages.
///
/// Both halves keep every other field of `entry` unchanged; the first half receives
/// `messages[..len / 2]` and the second `messages[len / 2..]`. Returns `None` when the
/// conversation has fewer than two messages and therefore cannot be split.
///
/// This runs on the retry path for size/memory related insert failures, so it copies
/// the message payload exactly once instead of deep-cloning the whole entry twice and
/// then copying each half again.
fn split_historical_batch_entry_messages(
    entry: &HistoricalBatchEntry,
) -> Option<(HistoricalBatchEntry, HistoricalBatchEntry)> {
    let message_count = entry.conversation.messages.len();
    if message_count < 2 {
        return None;
    }
    // With at least two messages, `1 <= split_at < message_count` always holds,
    // so both halves are non-empty.
    let split_at = message_count / 2;

    // The only deep copy of the message payload.
    let mut left = entry.clone();
    let tail = left.conversation.messages.split_off(split_at);
    let head = std::mem::take(&mut left.conversation.messages);
    // `left` holds no messages at this point, so this clone only copies the metadata.
    let mut right = left.clone();
    left.conversation.messages = head;
    right.conversation.messages = tail;
    Some((left, right))
}
/// Inserts `entries` through `insert_batch`, shrinking the unit of work on
/// split-retryable failures.
///
/// * On success the totals reported by `insert_batch` are returned as is.
/// * On a failure classified by [`Self::historical_import_error_is_split_retryable`]:
///   a batch of several entries is halved and each half retried recursively; a
///   single entry is split into two message slices (see
///   [`Self::split_historical_batch_entry_messages`]) which are retried in order.
///   For a split single entry, `inserted_source_rows` is 1 if either slice
///   reported an inserted row, else 0.
/// * Any other failure, or a retryable failure that cannot be split further, is
///   returned unchanged.
fn import_historical_batch_with_retry<F>(
    entries: &[HistoricalBatchEntry],
    insert_batch: &mut F,
) -> Result<HistoricalBatchImportTotals>
where
    F: FnMut(&[HistoricalBatchEntry]) -> Result<HistoricalBatchImportTotals>,
{
    let err = match insert_batch(entries) {
        Ok(totals) => return Ok(totals),
        Err(err) => err,
    };
    if !Self::historical_import_error_is_split_retryable(&err) {
        return Err(err);
    }

    let entry_count = entries.len();
    if entry_count > 1 {
        let mid = entry_count / 2;
        tracing::warn!(
            batch_entries = entry_count,
            split_left = mid,
            split_right = entry_count - mid,
            error = %err,
            "historical salvage batch failed; retrying in smaller sub-batches"
        );
        let (first_half, second_half) = entries.split_at(mid);
        let left = Self::import_historical_batch_with_retry(first_half, insert_batch)?;
        let right = Self::import_historical_batch_with_retry(second_half, insert_batch)?;
        return Ok(HistoricalBatchImportTotals {
            inserted_source_rows: left.inserted_source_rows + right.inserted_source_rows,
            inserted_messages: left.inserted_messages + right.inserted_messages,
        });
    }

    // Zero or one entry left: try to split the single conversation by messages.
    let message_split = entries.first().and_then(|entry| {
        Self::split_historical_batch_entry_messages(entry).map(|(left, right)| (entry, left, right))
    });
    let Some((entry, left, right)) = message_split else {
        return Err(err);
    };

    tracing::warn!(
        source_row_id = entry.source_row_id,
        message_count = entry.conversation.messages.len(),
        error = %err,
        "historical salvage conversation failed; retrying in smaller message slices"
    );
    let left_totals =
        Self::import_historical_batch_with_retry(std::slice::from_ref(&left), insert_batch)?;
    let right_totals =
        Self::import_historical_batch_with_retry(std::slice::from_ref(&right), insert_batch)?;

    // Both slices belong to the same source row, so it counts at most once.
    let any_row_inserted =
        left_totals.inserted_source_rows > 0 || right_totals.inserted_source_rows > 0;
    Ok(HistoricalBatchImportTotals {
        inserted_source_rows: usize::from(any_row_inserted),
        inserted_messages: left_totals
            .inserted_messages
            .saturating_add(right_totals.inserted_messages),
    })
}
/// Copies the rows of the `sources` table of a historical database into this storage.
///
/// Each row's id, kind and host label are passed through
/// `normalized_storage_source_parts` before being upserted. A `config_json` value
/// that is not valid JSON is imported as `None`. If the historical `sources` query
/// fails for any reason, a warning is logged and the import is skipped without error.
///
/// # Errors
/// Only upsert failures are propagated.
fn import_historical_sources(&self, source_conn: &FrankenConnection) -> Result<()> {
    let queried = source_conn.query_map_collect(
        "SELECT id, kind, host_label, machine_id, platform, config_json, created_at, updated_at
FROM sources",
        fparams![],
        |row| {
            let stored_id: String = row.get_typed(0)?;
            let stored_kind: String = row.get_typed(1)?;
            let stored_host_label: Option<String> = row.get_typed(2)?;
            let stored_config: Option<String> = row.get_typed(5)?;
            let (id, kind, host_label) = normalized_storage_source_parts(
                Some(stored_id.as_str()),
                Some(stored_kind.as_str()),
                stored_host_label.as_deref(),
            );
            let config_json = match stored_config {
                Some(text) => serde_json::from_str(&text).ok(),
                None => None,
            };
            Ok(Source {
                id,
                kind,
                host_label,
                machine_id: row.get_typed(3)?,
                platform: row.get_typed(4)?,
                config_json,
                created_at: row.get_typed(6)?,
                updated_at: row.get_typed(7)?,
            })
        },
    );

    let sources: Vec<Source> = match queried {
        Err(err) => {
            tracing::warn!(error = %err, "historical sources table unavailable; skipping source import");
            return Ok(());
        }
        Ok(rows) => rows,
    };

    sources
        .iter()
        .try_for_each(|source| self.upsert_source(source))
}
/// Imports the conversations (with their messages) of a historical database
/// into this storage, in batches, with a durable progress checkpoint.
///
/// * `bundle` identifies the historical bundle; it keys the progress checkpoint.
/// * `salvage_method` is stored verbatim in each checkpoint.
/// * `source_conn` is the connection to the historical database being read.
///
/// If a checkpoint with a positive `last_completed_source_row_id` exists, only
/// conversations with a larger id are read, and the returned counters start from
/// the counters stored in that checkpoint.
///
/// Returns `(conversations_imported, messages_imported)`.
///
/// # Errors
/// Propagates query failures on either database, insert failures that the retry
/// helper cannot resolve, and checkpoint write failures.
fn import_historical_conversations(
&self,
bundle: &HistoricalDatabaseBundle,
salvage_method: &str,
source_conn: &FrankenConnection,
) -> Result<(usize, usize)> {
let batch_limits = historical_import_batch_limits();
let cache_enabled = IndexingCache::is_enabled();
let mut indexing_cache = IndexingCache::new();
// Source ids already present in this storage; used to create placeholder
// source rows at most once per id.
let mut known_sources: HashSet<String> = self
.list_sources()?
.into_iter()
.map(|source| source.id)
.collect();
let resume_progress = self.load_historical_bundle_progress(bundle)?;
// A checkpoint row id of zero or less is treated as "nothing to resume from".
let resume_after_row_id = resume_progress
.as_ref()
.map(|progress| progress.last_completed_source_row_id)
.filter(|row_id| *row_id > 0);
tracing::info!(
target: "cass::historical_salvage",
batch_conversations = batch_limits.conversations,
batch_messages = batch_limits.messages,
batch_payload_chars = batch_limits.payload_chars,
cache_enabled,
resume_after_row_id,
"configured historical salvage batch limits"
);
if let Some(progress) = &resume_progress {
tracing::info!(
target: "cass::historical_salvage",
path = %bundle.root_path.display(),
resume_after_row_id = progress.last_completed_source_row_id,
prior_conversations_imported = progress.conversations_imported,
prior_messages_imported = progress.messages_imported,
"resuming historical salvage bundle from durable checkpoint"
);
}
// The two statements differ only in the `WHERE c.id > ?1` resume filter.
let conv_sql = if resume_after_row_id.is_some() {
"SELECT
c.id,
COALESCE(a.slug, 'unknown'),
w.path,
c.external_id,
c.title,
c.source_path,
c.started_at,
c.ended_at,
c.approx_tokens,
c.metadata_json,
c.source_id,
c.origin_host
FROM conversations c
LEFT JOIN agents a ON c.agent_id = a.id
LEFT JOIN workspaces w ON c.workspace_id = w.id
WHERE c.id > ?1
ORDER BY c.id"
} else {
"SELECT
c.id,
COALESCE(a.slug, 'unknown'),
w.path,
c.external_id,
c.title,
c.source_path,
c.started_at,
c.ended_at,
c.approx_tokens,
c.metadata_json,
c.source_id,
c.origin_host
FROM conversations c
LEFT JOIN agents a ON c.agent_id = a.id
LEFT JOIN workspaces w ON c.workspace_id = w.id
ORDER BY c.id"
};
let conv_params: &[ParamValue] =
if let Some(last_completed_source_row_id) = resume_after_row_id {
&[ParamValue::from(last_completed_source_row_id)]
} else {
&[]
};
// All matching conversation rows are collected into memory before any is
// processed. Tuple order mirrors the SELECT column order above.
#[allow(clippy::type_complexity)]
let conv_rows: Vec<(
i64,
String,
Option<String>,
Option<String>,
Option<String>,
String,
Option<i64>,
Option<i64>,
Option<i64>,
Option<String>,
Option<String>,
Option<String>,
)> = source_conn
.query_map_collect(conv_sql, conv_params, |row| {
Ok((
row.get_typed::<i64>(0)?,
row.get_typed::<String>(1)?,
row.get_typed::<Option<String>>(2)?,
row.get_typed::<Option<String>>(3)?,
row.get_typed::<Option<String>>(4)?,
row.get_typed::<String>(5)?,
row.get_typed::<Option<i64>>(6)?,
row.get_typed::<Option<i64>>(7)?,
row.get_typed::<Option<i64>>(8)?,
row.get_typed::<Option<String>>(9)?,
row.get_typed::<Option<String>>(10)?,
row.get_typed::<Option<String>>(11)?,
))
})
.context("querying historical conversations")?;
let msg_sql = "SELECT idx, role, author, created_at, content, extra_json
FROM messages
WHERE conversation_id = ?1
ORDER BY idx";
// Counters continue from the checkpoint when resuming, otherwise start at zero.
let mut imported_conversations = resume_progress
.as_ref()
.map(|progress| progress.conversations_imported)
.unwrap_or(0);
let mut imported_messages = resume_progress
.as_ref()
.map(|progress| progress.messages_imported)
.unwrap_or(0);
// State of the batch currently being accumulated.
let mut pending_batch: Vec<HistoricalBatchEntry> = Vec::new();
let mut pending_batch_messages = 0usize;
let mut pending_batch_chars = 0usize;
let mut pending_batch_first_row_id: Option<i64> = None;
let mut pending_batch_last_row_id: Option<i64> = None;
// Inserts the pending batch, updates the running counters, records a progress
// checkpoint at the batch's last source row id, and resets the batch state.
// All mutable state is passed in as arguments; an empty batch is a no-op.
let flush_batch = |storage: &FrankenStorage,
batch: &mut Vec<HistoricalBatchEntry>,
pending_messages: &mut usize,
pending_chars: &mut usize,
first_row_id: &mut Option<i64>,
last_row_id: &mut Option<i64>,
imported_conversations: &mut usize,
imported_messages: &mut usize|
-> Result<()> {
if batch.is_empty() {
return Ok(());
}
let batch_first_row_id = *first_row_id;
let batch_last_row_id = *last_row_id;
if historical_salvage_debug_enabled() {
eprintln!(
"[historical-salvage] flushing batch rows {:?}..{:?} conversations={} messages={} payload_chars={}",
batch_first_row_id,
batch_last_row_id,
batch.len(),
*pending_messages,
*pending_chars
);
}
tracing::info!(
target: "cass::historical_salvage",
batch_conversations = batch.len(),
batch_messages = *pending_messages,
batch_payload_chars = *pending_chars,
first_source_row_id = batch_first_row_id,
last_source_row_id = batch_last_row_id,
"flushing historical salvage batch"
);
// Insert callback handed to the retry helper: inserts the given entries and
// counts a source row (and its messages) only when at least one of its
// messages was reported as inserted.
let mut insert_batch =
|entries: &[HistoricalBatchEntry]| -> Result<HistoricalBatchImportTotals> {
let borrowed_batch: Vec<(i64, Option<i64>, &Conversation)> = entries
.iter()
.map(|entry| (entry.agent_id, entry.workspace_id, &entry.conversation))
.collect();
let outcomes = storage
.insert_conversations_batched(&borrowed_batch)
.with_context(|| {
let first_source_row_id =
entries.first().map(|entry| entry.source_row_id);
let last_source_row_id =
entries.last().map(|entry| entry.source_row_id);
format!(
"inserting historical salvage batch source rows {:?}..{:?}",
first_source_row_id, last_source_row_id
)
})?;
let mut totals = HistoricalBatchImportTotals::default();
for outcome in outcomes {
if !outcome.inserted_indices.is_empty() {
totals.inserted_source_rows += 1;
totals.inserted_messages += outcome.inserted_indices.len();
}
}
Ok(totals)
};
let totals =
Self::import_historical_batch_with_retry(batch.as_slice(), &mut insert_batch)?;
*imported_conversations =
(*imported_conversations).saturating_add(totals.inserted_source_rows);
*imported_messages = (*imported_messages).saturating_add(totals.inserted_messages);
// Checkpoint only after the batch insert has returned successfully.
if let Some(last_completed_row_id) = batch_last_row_id {
storage.record_historical_bundle_progress(
bundle,
salvage_method,
last_completed_row_id,
*imported_conversations,
*imported_messages,
)?;
}
tracing::info!(
target: "cass::historical_salvage",
batch_conversations = batch.len(),
batch_messages = *pending_messages,
imported_conversations = *imported_conversations,
imported_messages = *imported_messages,
first_source_row_id = batch_first_row_id,
last_source_row_id = batch_last_row_id,
"historical salvage batch committed"
);
if historical_salvage_debug_enabled() {
eprintln!(
"[historical-salvage] committed batch rows {:?}..{:?} imported_conversations={} imported_messages={}",
batch_first_row_id,
batch_last_row_id,
*imported_conversations,
*imported_messages
);
}
batch.clear();
*pending_messages = 0;
*pending_chars = 0;
*first_row_id = None;
*last_row_id = None;
Ok(())
};
for (
conversation_row_id,
agent_slug,
workspace_path,
external_id,
title,
source_path,
started_at,
ended_at,
approx_tokens,
metadata_json_raw,
raw_source_id,
raw_origin_host,
) in conv_rows
{
let source_id = crate::search::tantivy::normalized_index_source_id(
raw_source_id.as_deref(),
None,
raw_origin_host.as_deref(),
);
let origin_host =
crate::search::tantivy::normalized_index_origin_host(raw_origin_host.as_deref());
// One query per conversation; message ids are not carried over (`id: None`).
let messages: Vec<Message> = source_conn
.query_map_collect(msg_sql, fparams![conversation_row_id], |msg_row| {
let role: String = msg_row.get_typed(1)?;
Ok(Message {
id: None,
idx: msg_row.get_typed(0)?,
role: match role.as_str() {
"user" => MessageRole::User,
"agent" | "assistant" => MessageRole::Agent,
"tool" => MessageRole::Tool,
"system" => MessageRole::System,
other => MessageRole::Other(other.to_string()),
},
author: msg_row.get_typed(2)?,
created_at: msg_row.get_typed(3)?,
content: msg_row.get_typed(4)?,
extra_json: parse_historical_json_column(msg_row.get_typed(5)?),
snippets: Vec::new(),
})
})
.context("collecting historical message rows")?;
// Conversations without any message are not imported.
if messages.is_empty() {
continue;
}
let conversation_message_count = messages.len();
let conversation_chars = messages
.iter()
.map(message_payload_size_hint)
.sum::<usize>();
let conversation = Conversation {
id: None,
agent_slug: agent_slug.clone(),
workspace: workspace_path.map(PathBuf::from),
external_id,
title,
source_path: PathBuf::from(source_path),
started_at,
ended_at,
approx_tokens,
metadata_json: parse_json_column(metadata_json_raw),
messages,
source_id,
origin_host,
};
// Make sure a `sources` row exists for this conversation's source id. Unknown
// non-local ids get a placeholder of kind `Ssh` labelled with the origin host.
if !known_sources.contains(&conversation.source_id) {
let placeholder = if conversation.source_id == LOCAL_SOURCE_ID {
Source::local()
} else {
Source {
id: conversation.source_id.clone(),
kind: SourceKind::Ssh,
host_label: conversation.origin_host.clone(),
machine_id: None,
platform: None,
config_json: None,
created_at: None,
updated_at: None,
}
};
self.upsert_source(&placeholder)?;
known_sources.insert(conversation.source_id.clone());
}
// The historical database only provides the agent slug, so it doubles as the
// display name; the kind is always recorded as `Cli`.
let agent = Agent {
id: None,
slug: agent_slug.clone(),
name: agent_slug,
version: None,
kind: AgentKind::Cli,
};
let agent_id = if cache_enabled {
indexing_cache.get_or_insert_agent(self, &agent)?
} else {
self.ensure_agent(&agent)?
};
let workspace_id = if let Some(workspace) = &conversation.workspace {
if cache_enabled {
Some(indexing_cache.get_or_insert_workspace(self, workspace, None)?)
} else {
Some(self.ensure_workspace(workspace, None)?)
}
} else {
None
};
// Flush first when adding this conversation to a non-empty batch would push it
// past a limit, so the conversation starts a fresh batch instead.
let exceeds_pending_limits = !pending_batch.is_empty()
&& (pending_batch.len() >= batch_limits.conversations
|| pending_batch_messages.saturating_add(conversation_message_count)
> batch_limits.messages
|| pending_batch_chars.saturating_add(conversation_chars)
> batch_limits.payload_chars);
if exceeds_pending_limits {
flush_batch(
self,
&mut pending_batch,
&mut pending_batch_messages,
&mut pending_batch_chars,
&mut pending_batch_first_row_id,
&mut pending_batch_last_row_id,
&mut imported_conversations,
&mut imported_messages,
)?;
}
if pending_batch_first_row_id.is_none() {
pending_batch_first_row_id = Some(conversation_row_id);
}
pending_batch_last_row_id = Some(conversation_row_id);
pending_batch_messages =
pending_batch_messages.saturating_add(conversation_message_count);
pending_batch_chars = pending_batch_chars.saturating_add(conversation_chars);
pending_batch.push(HistoricalBatchEntry {
source_row_id: conversation_row_id,
agent_id,
workspace_id,
conversation,
});
// Flush right away once the batch has reached any limit (this also covers a
// single conversation that is by itself at or over a limit).
if pending_batch.len() >= batch_limits.conversations
|| pending_batch_messages >= batch_limits.messages
|| pending_batch_chars >= batch_limits.payload_chars
{
flush_batch(
self,
&mut pending_batch,
&mut pending_batch_messages,
&mut pending_batch_chars,
&mut pending_batch_first_row_id,
&mut pending_batch_last_row_id,
&mut imported_conversations,
&mut imported_messages,
)?;
}
}
// Flush the final, partially filled batch (no-op when empty).
flush_batch(
self,
&mut pending_batch,
&mut pending_batch_messages,
&mut pending_batch_chars,
&mut pending_batch_first_row_id,
&mut pending_batch_last_row_id,
&mut imported_conversations,
&mut imported_messages,
)?;
if cache_enabled {
let (hits, misses, hit_rate) = indexing_cache.stats();
tracing::info!(
target: "cass::historical_salvage",
hits,
misses,
hit_rate = format!("{:.1}%", hit_rate * 100.0),
agents = indexing_cache.agent_count(),
workspaces = indexing_cache.workspace_count(),
sources = known_sources.len(),
"historical salvage cache stats"
);
}
Ok((imported_conversations, imported_messages))
}
/// Imports every not-yet-salvaged historical database bundle discovered for
/// `canonical_db_path` into this storage.
///
/// Per bundle, in discovery order:
/// 1. If a salvage ledger entry already exists, any leftover progress checkpoint
///    is cleared and the bundle is skipped.
/// 2. If the bundle cannot be opened, a warning is logged, its progress checkpoint
///    is cleared and the bundle is skipped (not treated as an error).
/// 3. If a progress checkpoint already covers the bundle's highest conversation
///    id, the bundle is marked as salvaged using the checkpoint's counters and
///    skipped without re-reading its conversations.
/// 4. Otherwise sources and conversations are imported, the ledger entry is
///    written, the checkpoint is cleared and the outcome counters are updated.
///
/// The returned outcome reports the number of bundles considered and, for bundles
/// imported in step 4, the bundle/conversation/message totals.
///
/// # Errors
/// Propagates ledger/checkpoint read or write failures and import failures.
pub fn salvage_historical_databases(
&self,
canonical_db_path: &Path,
) -> Result<HistoricalSalvageOutcome> {
let ordered_bundles = discover_historical_database_bundles(canonical_db_path);
let mut outcome = HistoricalSalvageOutcome {
bundles_considered: ordered_bundles.len(),
..HistoricalSalvageOutcome::default()
};
for bundle in ordered_bundles {
if self.historical_bundle_already_imported(&bundle)? {
self.clear_historical_bundle_progress(&bundle)?;
continue;
}
let source = match open_historical_bundle_for_salvage(&bundle).with_context(|| {
format!(
"opening historical bundle {} for salvage",
bundle.root_path.display()
)
}) {
Ok(source) => source,
Err(err) => {
tracing::warn!(
path = %bundle.root_path.display(),
error = %err,
"skipping unreadable historical cass database bundle during salvage"
);
self.clear_historical_bundle_progress(&bundle)?;
continue;
}
};
if let Some(progress) = self.load_historical_bundle_progress(&bundle)? {
// A failing MAX(id) query is treated as 0, which disables the shortcut
// below and falls through to the regular import.
let backup_max_conversation_id: i64 = source
.conn
.query_row_map(
"SELECT COALESCE(MAX(id), 0) FROM conversations",
fparams![],
|row| row.get_typed(0),
)
.unwrap_or(0);
// The checkpoint already reached the last conversation row: only the ledger
// entry is missing, so write it and skip the import.
if backup_max_conversation_id > 0
&& progress.last_completed_source_row_id >= backup_max_conversation_id
{
self.record_historical_bundle_import(
&bundle,
source.method,
progress.conversations_imported,
progress.messages_imported,
)?;
self.clear_historical_bundle_progress(&bundle)?;
tracing::info!(
path = %bundle.root_path.display(),
last_completed_source_row_id = progress.last_completed_source_row_id,
backup_max_conversation_id,
conversations_imported = progress.conversations_imported,
messages_imported = progress.messages_imported,
"historical bundle already fully imported per checkpoint; marking salvaged and skipping O(n) re-scan"
);
continue;
}
}
self.import_historical_sources(&source.conn)?;
let (imported_conversations, imported_messages) =
self.import_historical_conversations(&bundle, source.method, &source.conn)?;
// Ledger entry first, then drop the checkpoint, so a crash in between leaves
// the bundle marked as imported.
self.record_historical_bundle_import(
&bundle,
source.method,
imported_conversations,
imported_messages,
)?;
self.clear_historical_bundle_progress(&bundle)?;
outcome.bundles_imported += 1;
outcome.conversations_imported += imported_conversations;
outcome.messages_imported += imported_messages;
tracing::info!(
path = %bundle.root_path.display(),
bytes = bundle.total_bytes,
method = source.method,
imported_conversations,
imported_messages,
"salvaged historical cass database bundle"
);
}
Ok(outcome)
}
/// Removes the source row identified by `id`.
///
/// The built-in local source (`LOCAL_SOURCE_ID`) can never be deleted; asking
/// for that returns an error. The `_cascade` flag is accepted but is not
/// consulted by this method.
///
/// Returns `Ok(true)` when the `DELETE` affected at least one row (the
/// conversation-source cache entry for `id` is invalidated in that case) and
/// `Ok(false)` when nothing matched.
pub fn delete_source(&self, id: &str, _cascade: bool) -> Result<bool> {
    if id == LOCAL_SOURCE_ID {
        anyhow::bail!("cannot delete the local source");
    }
    let deleted_rows = self
        .conn
        .execute_compat("DELETE FROM sources WHERE id = ?1", fparams![id])?;
    let removed = deleted_rows > 0;
    if removed {
        // Only drop the cached entry when a row was really removed.
        self.invalidate_conversation_source_cache(id);
    }
    Ok(removed)
}
/// Stores `conv` for `agent_id`, either as a brand-new conversation or by
/// merging its messages into a conversation that already exists.
///
/// All database work happens inside one transaction opened on `self.conn`,
/// which is committed on each of the three successful paths:
///
/// 1. The merge key already resolves to a stored conversation: the work is
///    delegated to `franken_append_messages_with_tail_in_tx`.
/// 2. The lookup missed, but the insert helper reports
///    `ConversationInsertStatus::Existing`: only messages not already
///    stored are inserted and the conversation tail state is refreshed.
/// 3. The conversation row was freshly inserted: incoming messages are
///    de-duplicated among themselves and batch inserted.
///
/// Lexical (FTS) and daily-stats updates are skipped when the respective
/// `defer_*_enabled()` switches return `true`.
///
/// Returns an `InsertOutcome` carrying the conversation id, whether a new
/// conversation row was created, and the `idx` of every inserted message.
///
/// # Errors
/// Any error from the helpers is propagated with `?` before `commit` is
/// reached. NOTE(review): the dropped transaction presumably rolls back —
/// confirm against `FrankenTransaction`.
pub fn insert_conversation_tree(
&self,
agent_id: i64,
workspace_id: Option<i64>,
conv: &Conversation,
) -> Result<InsertOutcome> {
// From here on `conv` refers to the storage-normalized conversation.
let normalized_conv = normalized_conversation_for_storage(conv);
let conv = normalized_conv.as_ref();
self.ensure_source_for_conversation(conv)?;
let defer_lexical_updates = defer_storage_lexical_updates_enabled();
let defer_analytics_updates = defer_analytics_updates_enabled();
let conversation_key = conversation_merge_key(agent_id, conv);
let mut tx = self.conn.transaction()?;
// Path 1: the conversation is already stored; append through the shared helper.
let existing = franken_find_existing_conversation_with_tail_by_key(
&tx,
&conversation_key,
Some(conv),
)?;
if let Some(existing) = existing {
let outcome = self.franken_append_messages_with_tail_in_tx(
&tx,
agent_id,
existing.id,
conv,
existing.tail_state,
defer_lexical_updates,
defer_analytics_updates,
)?;
tx.commit()?;
return Ok(outcome);
}
let conv_id = match franken_insert_conversation_or_get_existing_after_miss(
&tx,
agent_id,
workspace_id,
conv,
&conversation_key,
)? {
ConversationInsertStatus::Inserted(conv_id) => conv_id,
// Path 2: the insert helper found an existing row after the lookup
// missed; merge only the messages that are not stored yet.
ConversationInsertStatus::Existing(existing_id) => {
let ExistingMessageLookup {
by_idx: mut existing_messages,
replay: mut existing_replay_fingerprints,
} = franken_existing_message_lookup(&tx, existing_id, &conv.messages)?;
let ExistingConversationNewMessages {
messages: new_messages,
new_chars,
idx_collision_count,
first_collision_idx,
} = collect_new_messages_for_existing_conversation(
existing_id,
conv,
&mut existing_messages,
&mut existing_replay_fingerprints,
"skipping replay-equivalent recovered message with shifted idx",
);
// Captured before `new_messages` is consumed by the zip below.
let (inserted_last_idx, inserted_last_created_at) =
borrowed_messages_tail_state(&new_messages);
let mut inserted_indices = Vec::new();
let mut fts_entries = Vec::new();
let mut fts_pending_chars = 0usize;
let mut _fts_inserted_total = 0usize;
let inserted_message_ids =
franken_append_insert_new_messages(&tx, existing_id, &new_messages)?;
for (msg_id, msg) in inserted_message_ids.into_iter().zip(new_messages) {
franken_insert_snippets(&tx, msg_id, &msg.snippets)?;
if !defer_lexical_updates {
fts_entries.push(FtsEntry::from_message(msg_id, msg, conv));
fts_pending_chars = fts_pending_chars.saturating_add(msg.content.len());
// Flush once either the entry-count or the content-length budget is reached.
if fts_entries.len() >= FTS_ENTRY_BATCH_MAX_DOCS
|| fts_pending_chars >= FTS_ENTRY_BATCH_MAX_CHARS
{
flush_pending_fts_entries(
self,
&tx,
&mut fts_entries,
&mut fts_pending_chars,
&mut _fts_inserted_total,
)?;
}
}
inserted_indices.push(msg.idx);
}
if idx_collision_count > 0 {
tracing::warn!(
conversation_id = existing_id,
collision_count = idx_collision_count,
first_idx = first_collision_idx,
source_path = %conv.source_path.display(),
"message idx collisions encountered while merging recovered conversation; retaining canonical message variants"
);
}
// Flush whatever is still buffered after the loop.
if !defer_lexical_updates {
flush_pending_fts_entries(
self,
&tx,
&mut fts_entries,
&mut fts_pending_chars,
&mut _fts_inserted_total,
)?;
}
// Latest `created_at` across ALL incoming messages, not only the inserted ones.
let conv_last_ts = conv.messages.iter().filter_map(|m| m.created_at).max();
franken_update_conversation_tail_state(
&tx,
existing_id,
conv_last_ts,
inserted_last_idx,
inserted_last_created_at,
)?;
if let Some(lookup_key) = conversation_external_lookup_key_for_conv(agent_id, conv)
{
franken_update_external_conversation_tail_lookup_key(
&tx,
&lookup_key,
conv_last_ts,
inserted_last_idx,
inserted_last_created_at,
)?;
}
// Stats are only touched when something was inserted; no new session is counted.
if !defer_analytics_updates && !inserted_indices.is_empty() {
franken_update_daily_stats_in_tx(
self,
&tx,
&conv.agent_slug,
&conv.source_id,
conversation_effective_started_at(conv),
StatsDelta {
session_count_delta: 0,
message_count_delta: inserted_indices.len() as i64,
total_chars_delta: new_chars,
},
)?;
}
tx.commit()?;
return Ok(InsertOutcome {
conversation_id: existing_id,
conversation_inserted: false,
inserted_indices,
});
}
};
// Path 3: a new conversation row was created with id `conv_id`.
let mut fts_entries = Vec::new();
let mut fts_pending_chars = 0usize;
let mut _fts_inserted_total = 0usize;
let mut total_chars: i64 = 0;
let mut inserted_indices = Vec::new();
let mut pending_messages = HashMap::new();
let mut pending_replay_fingerprints = HashSet::new();
let mut idx_collision_count = 0usize;
let mut first_collision_idx: Option<i64> = None;
let mut new_messages = Vec::new();
// De-duplicate the incoming messages against each other before inserting.
for msg in &conv.messages {
let incoming_fingerprint = message_merge_fingerprint(msg);
// The first message seen for an idx wins. A later one with the same idx
// is dropped, and counted as a collision only if its fingerprint differs.
if let Some(existing_fingerprint) = pending_messages.get(&msg.idx) {
if existing_fingerprint != &incoming_fingerprint {
idx_collision_count = idx_collision_count.saturating_add(1);
first_collision_idx.get_or_insert(msg.idx);
}
continue;
}
// A different idx with an already-seen replay fingerprint is skipped as well.
let incoming_replay = message_replay_fingerprint(msg);
if pending_replay_fingerprints.contains(&incoming_replay) {
tracing::debug!(
conversation_id = conv_id,
idx = msg.idx,
source_path = %conv.source_path.display(),
"skipping replay-equivalent duplicate message within new conversation insert"
);
continue;
}
pending_messages.insert(msg.idx, incoming_fingerprint);
pending_replay_fingerprints.insert(incoming_replay);
new_messages.push(msg);
}
let inserted_message_ids = franken_batch_insert_new_messages(&tx, conv_id, &new_messages)?;
for (msg_id, msg) in inserted_message_ids.into_iter().zip(new_messages) {
franken_insert_snippets(&tx, msg_id, &msg.snippets)?;
if !defer_lexical_updates {
fts_entries.push(FtsEntry::from_message(msg_id, msg, conv));
fts_pending_chars = fts_pending_chars.saturating_add(msg.content.len());
// Flush once either the entry-count or the content-length budget is reached.
if fts_entries.len() >= FTS_ENTRY_BATCH_MAX_DOCS
|| fts_pending_chars >= FTS_ENTRY_BATCH_MAX_CHARS
{
flush_pending_fts_entries(
self,
&tx,
&mut fts_entries,
&mut fts_pending_chars,
&mut _fts_inserted_total,
)?;
}
}
total_chars += msg.content.len() as i64;
inserted_indices.push(msg.idx);
}
if idx_collision_count > 0 {
tracing::warn!(
conversation_id = conv_id,
collision_count = idx_collision_count,
first_idx = first_collision_idx,
source_path = %conv.source_path.display(),
"message idx collisions encountered while inserting a new conversation; retaining the first canonical variant per idx"
);
}
// Flush whatever is still buffered after the loop.
if !defer_lexical_updates {
flush_pending_fts_entries(
self,
&tx,
&mut fts_entries,
&mut fts_pending_chars,
&mut _fts_inserted_total,
)?;
}
// Unlike the merge paths, stats are recorded even when no message was
// inserted, because the new conversation itself counts as one session.
if !defer_analytics_updates {
franken_update_daily_stats_in_tx(
self,
&tx,
&conv.agent_slug,
&conv.source_id,
conversation_effective_started_at(conv),
StatsDelta {
session_count_delta: 1,
message_count_delta: inserted_indices.len() as i64,
total_chars_delta: total_chars,
},
)?;
}
tx.commit()?;
Ok(InsertOutcome {
conversation_id: conv_id,
conversation_inserted: true,
inserted_indices,
})
}
/// Test-only, instrumented variant of the new-conversation insert path.
///
/// Performs the same steps as the fresh-insert branch of
/// `insert_conversation_tree` while accumulating the wall-clock duration of
/// each phase into `profile`.
///
/// # Errors
/// Besides propagating helper failures, this returns an error when the
/// conversation already exists, either at the initial lookup or when the
/// insert helper reports `ConversationInsertStatus::Existing`.
///
/// The aggregate counters (`invocations`, `messages`, `inserted_messages`,
/// `total_duration`) are only updated after a successful commit.
#[cfg(test)]
fn insert_conversation_tree_with_profile(
&self,
agent_id: i64,
workspace_id: Option<i64>,
conv: &Conversation,
profile: &mut InsertConversationTreePerfProfile,
) -> Result<InsertOutcome> {
let total_start = Instant::now();
// From here on `conv` refers to the storage-normalized conversation.
let normalized_conv = normalized_conversation_for_storage(conv);
let conv = normalized_conv.as_ref();
let source_start = Instant::now();
self.ensure_source_for_conversation(conv)?;
profile.source_duration += source_start.elapsed();
let defer_lexical_updates = defer_storage_lexical_updates_enabled();
let defer_analytics_updates = defer_analytics_updates_enabled();
let conversation_key = conversation_merge_key(agent_id, conv);
let tx_open_start = Instant::now();
let mut tx = self.conn.transaction()?;
profile.tx_open_duration += tx_open_start.elapsed();
let existing_lookup_start = Instant::now();
let existing =
franken_find_existing_conversation_by_key(&tx, &conversation_key, Some(conv))?;
profile.existing_lookup_duration += existing_lookup_start.elapsed();
// This helper only profiles the fresh-insert path; an existing row is a misuse.
if let Some(existing_id) = existing {
return Err(anyhow!(
"profile helper expects new conversation path, found existing id {existing_id}"
));
}
let conversation_row_start = Instant::now();
let conv_id = match franken_insert_conversation_or_get_existing_after_miss(
&tx,
agent_id,
workspace_id,
conv,
&conversation_key,
)? {
ConversationInsertStatus::Inserted(conv_id) => conv_id,
ConversationInsertStatus::Existing(existing_id) => {
return Err(anyhow!(
"profile helper expected inserted conversation row, reused existing id {existing_id}"
));
}
};
profile.conversation_row_duration += conversation_row_start.elapsed();
let mut fts_entries = Vec::new();
let mut fts_pending_chars = 0usize;
let mut fts_inserted_total = 0usize;
let mut total_chars: i64 = 0;
let mut inserted_indices = Vec::new();
let mut pending_messages = HashMap::new();
let mut pending_replay_fingerprints = HashSet::new();
let mut idx_collision_count = 0usize;
let mut first_collision_idx: Option<i64> = None;
let mut new_messages = Vec::new();
// De-duplicate the incoming messages against each other (this phase is not timed).
for msg in &conv.messages {
let incoming_fingerprint = message_merge_fingerprint(msg);
// The first message seen for an idx wins. A later one with the same idx
// is dropped, and counted as a collision only if its fingerprint differs.
if let Some(existing_fingerprint) = pending_messages.get(&msg.idx) {
if existing_fingerprint != &incoming_fingerprint {
idx_collision_count = idx_collision_count.saturating_add(1);
first_collision_idx.get_or_insert(msg.idx);
}
continue;
}
// A different idx with an already-seen replay fingerprint is skipped as well.
let incoming_replay = message_replay_fingerprint(msg);
if pending_replay_fingerprints.contains(&incoming_replay) {
tracing::debug!(
conversation_id = conv_id,
idx = msg.idx,
source_path = %conv.source_path.display(),
"skipping replay-equivalent duplicate message within profiled new conversation insert"
);
continue;
}
pending_messages.insert(msg.idx, incoming_fingerprint);
pending_replay_fingerprints.insert(incoming_replay);
new_messages.push(msg);
}
let message_insert_start = Instant::now();
let inserted_message_ids = franken_batch_insert_new_messages_with_profile(
&tx,
conv_id,
&new_messages,
&mut profile.message_insert_breakdown,
)?;
profile.message_insert_duration += message_insert_start.elapsed();
for (msg_id, msg) in inserted_message_ids.into_iter().zip(new_messages) {
let snippet_insert_start = Instant::now();
franken_insert_snippets(&tx, msg_id, &msg.snippets)?;
profile.snippet_insert_duration += snippet_insert_start.elapsed();
if !defer_lexical_updates {
let fts_entry_start = Instant::now();
fts_entries.push(FtsEntry::from_message(msg_id, msg, conv));
fts_pending_chars = fts_pending_chars.saturating_add(msg.content.len());
profile.fts_entry_duration += fts_entry_start.elapsed();
// Flush once either the entry-count or the content-length budget is reached.
if fts_entries.len() >= FTS_ENTRY_BATCH_MAX_DOCS
|| fts_pending_chars >= FTS_ENTRY_BATCH_MAX_CHARS
{
let fts_flush_start = Instant::now();
flush_pending_fts_entries(
self,
&tx,
&mut fts_entries,
&mut fts_pending_chars,
&mut fts_inserted_total,
)?;
profile.fts_flush_duration += fts_flush_start.elapsed();
}
}
total_chars += msg.content.len() as i64;
inserted_indices.push(msg.idx);
}
if idx_collision_count > 0 {
tracing::warn!(
conversation_id = conv_id,
collision_count = idx_collision_count,
first_idx = first_collision_idx,
source_path = %conv.source_path.display(),
"message idx collisions encountered while profiling a new conversation insert; retaining the first canonical variant per idx"
);
}
// Final flush of whatever is still buffered; timed together with the in-loop flushes.
if !defer_lexical_updates {
let fts_flush_start = Instant::now();
flush_pending_fts_entries(
self,
&tx,
&mut fts_entries,
&mut fts_pending_chars,
&mut fts_inserted_total,
)?;
profile.fts_flush_duration += fts_flush_start.elapsed();
}
// The new conversation counts as one session even if no message was inserted.
if !defer_analytics_updates {
let analytics_start = Instant::now();
franken_update_daily_stats_in_tx(
self,
&tx,
&conv.agent_slug,
&conv.source_id,
conversation_effective_started_at(conv),
StatsDelta {
session_count_delta: 1,
message_count_delta: inserted_indices.len() as i64,
total_chars_delta: total_chars,
},
)?;
profile.analytics_duration += analytics_start.elapsed();
}
let commit_start = Instant::now();
tx.commit()?;
profile.commit_duration += commit_start.elapsed();
// `messages` counts every incoming message; `inserted_messages` only the survivors.
profile.invocations += 1;
profile.messages += conv.messages.len();
profile.inserted_messages += inserted_indices.len();
profile.total_duration += total_start.elapsed();
Ok(InsertOutcome {
conversation_id: conv_id,
conversation_inserted: true,
inserted_indices,
})
}
/// Test-only, instrumented variant of the append-to-existing path.
///
/// Follows the same steps as `franken_append_messages_with_tail_in_tx`, but
/// opens and commits its own transaction and accumulates the wall-clock
/// duration of each phase into `profile`. `_workspace_id` is accepted but
/// not used.
///
/// # Errors
/// Besides propagating helper failures, this returns an error when no
/// stored conversation matches the merge key.
///
/// The aggregate counters (`invocations`, `messages`, `inserted_messages`,
/// `total_duration`) are only updated after a successful commit.
#[cfg(test)]
fn append_existing_conversation_with_profile(
&self,
agent_id: i64,
_workspace_id: Option<i64>,
conv: &Conversation,
profile: &mut InsertConversationTreePerfProfile,
) -> Result<InsertOutcome> {
let total_start = Instant::now();
// From here on `conv` refers to the storage-normalized conversation.
let normalized_conv = normalized_conversation_for_storage(conv);
let conv = normalized_conv.as_ref();
let source_start = Instant::now();
self.ensure_source_for_conversation(conv)?;
profile.source_duration += source_start.elapsed();
let defer_lexical_updates = defer_storage_lexical_updates_enabled();
let defer_analytics_updates = defer_analytics_updates_enabled();
let conversation_key = conversation_merge_key(agent_id, conv);
let tx_open_start = Instant::now();
let mut tx = self.conn.transaction()?;
profile.tx_open_duration += tx_open_start.elapsed();
let existing_lookup_start = Instant::now();
let existing = franken_find_existing_conversation_with_tail_by_key(
&tx,
&conversation_key,
Some(conv),
)?;
profile.existing_lookup_duration += existing_lookup_start.elapsed();
// This helper only profiles the append path; a missing row is a misuse.
let existing = existing.ok_or_else(|| {
anyhow!("append profile helper expects existing conversation for {conversation_key:?}")
})?;
let existing_id = existing.id;
let existing_idx_lookup_start = Instant::now();
let append_tail_state = existing.tail_state;
let append_tail_ended_at = append_tail_state.and_then(|state| state.ended_at);
// Try to build an append-only plan from the stored tail state. `None`
// (no tail state, or the helper declined) forces the full lookup below.
let existing_plan = append_tail_state.as_ref().and_then(|state| {
collect_append_only_tail_messages(
conv,
state.last_message_idx,
state.last_message_created_at,
)
});
let used_append_tail_plan = existing_plan.is_some();
profile.existing_idx_lookup_duration += existing_idx_lookup_start.elapsed();
let dedupe_filter_start = Instant::now();
let ExistingConversationNewMessages {
messages: new_messages,
new_chars,
idx_collision_count,
first_collision_idx,
} = if let Some(existing_plan) = existing_plan {
existing_plan
} else {
// Slow path: load the stored messages and filter the incoming ones against them.
let ExistingMessageLookup {
by_idx: mut existing_messages,
replay: mut existing_replay_fingerprints,
} = franken_existing_message_lookup(&tx, existing_id, &conv.messages)?;
collect_new_messages_for_existing_conversation(
existing_id,
conv,
&mut existing_messages,
&mut existing_replay_fingerprints,
"skipping replay-equivalent profiled append message with shifted idx",
)
};
profile.dedupe_filter_duration += dedupe_filter_start.elapsed();
let mut inserted_indices = Vec::new();
let mut fts_entries = Vec::new();
let mut fts_pending_chars = 0usize;
let mut fts_inserted_total = 0usize;
// Captured before `new_messages` is consumed by the zip below.
let (inserted_last_idx, inserted_last_created_at) =
borrowed_messages_tail_state(&new_messages);
let message_insert_start = Instant::now();
let inserted_message_ids = franken_append_insert_new_messages_with_profile(
&tx,
existing_id,
&new_messages,
&mut profile.message_insert_breakdown,
)?;
profile.message_insert_duration += message_insert_start.elapsed();
for (msg_id, msg) in inserted_message_ids.into_iter().zip(new_messages) {
let snippet_insert_start = Instant::now();
franken_insert_snippets(&tx, msg_id, &msg.snippets)?;
profile.snippet_insert_duration += snippet_insert_start.elapsed();
if !defer_lexical_updates {
let fts_entry_start = Instant::now();
fts_entries.push(FtsEntry::from_message(msg_id, msg, conv));
fts_pending_chars = fts_pending_chars.saturating_add(msg.content.len());
profile.fts_entry_duration += fts_entry_start.elapsed();
// Flush once either the entry-count or the content-length budget is reached.
if fts_entries.len() >= FTS_ENTRY_BATCH_MAX_DOCS
|| fts_pending_chars >= FTS_ENTRY_BATCH_MAX_CHARS
{
let fts_flush_start = Instant::now();
flush_pending_fts_entries(
self,
&tx,
&mut fts_entries,
&mut fts_pending_chars,
&mut fts_inserted_total,
)?;
profile.fts_flush_duration += fts_flush_start.elapsed();
}
}
inserted_indices.push(msg.idx);
}
if idx_collision_count > 0 {
tracing::warn!(
conversation_id = existing_id,
collision_count = idx_collision_count,
first_idx = first_collision_idx,
source_path = %conv.source_path.display(),
"message idx collisions encountered while profiling append merge; retaining canonical message variants"
);
}
// Final flush of whatever is still buffered; timed together with the in-loop flushes.
if !defer_lexical_updates {
let fts_flush_start = Instant::now();
flush_pending_fts_entries(
self,
&tx,
&mut fts_entries,
&mut fts_pending_chars,
&mut fts_inserted_total,
)?;
profile.fts_flush_duration += fts_flush_start.elapsed();
}
let conversation_row_start = Instant::now();
let mut exact_append_tail_set = false;
if used_append_tail_plan {
// With an append plan the tail is only touched when something with a
// known idx and timestamp was inserted.
if let (Some(last_message_idx), Some(last_message_created_at)) =
(inserted_last_idx, inserted_last_created_at)
{
// No stored `ended_at`, or one not later than the new last message:
// set the tail directly. Otherwise use the general update helper.
if append_tail_ended_at.is_none_or(|ended_at| ended_at <= last_message_created_at) {
franken_set_conversation_tail_state_after_append(
&tx,
existing_id,
last_message_created_at,
last_message_idx,
last_message_created_at,
)?;
exact_append_tail_set = true;
} else {
franken_update_conversation_tail_state(
&tx,
existing_id,
Some(last_message_created_at),
inserted_last_idx,
inserted_last_created_at,
)?;
}
}
} else {
// Without a plan, use the latest `created_at` across all incoming messages.
let conv_last_ts = conv.messages.iter().filter_map(|m| m.created_at).max();
franken_update_conversation_tail_state(
&tx,
existing_id,
conv_last_ts,
inserted_last_idx,
inserted_last_created_at,
)?;
}
franken_update_external_conversation_tail_after_append(
&tx,
agent_id,
conv,
used_append_tail_plan,
exact_append_tail_set,
inserted_last_idx,
inserted_last_created_at,
)?;
profile.conversation_row_duration += conversation_row_start.elapsed();
// Stats are only touched when something was inserted; no new session is counted.
if !defer_analytics_updates && !inserted_indices.is_empty() {
let analytics_start = Instant::now();
franken_update_daily_stats_in_tx(
self,
&tx,
&conv.agent_slug,
&conv.source_id,
conversation_effective_started_at(conv),
StatsDelta {
session_count_delta: 0,
message_count_delta: inserted_indices.len() as i64,
total_chars_delta: new_chars,
},
)?;
profile.analytics_duration += analytics_start.elapsed();
}
let commit_start = Instant::now();
tx.commit()?;
profile.commit_duration += commit_start.elapsed();
// `messages` counts every incoming message; `inserted_messages` only the new ones.
profile.invocations += 1;
profile.messages += conv.messages.len();
profile.inserted_messages += inserted_indices.len();
profile.total_duration += total_start.elapsed();
Ok(InsertOutcome {
conversation_id: existing_id,
conversation_inserted: false,
inserted_indices,
})
}
/// Appends the not-yet-stored messages of `conv` to the existing
/// conversation `conversation_id`, using the caller's transaction `tx`.
///
/// This function never commits; the caller owns the transaction.
///
/// When `append_tail_state` is available and
/// `collect_append_only_tail_messages` yields a plan, that plan decides which
/// messages are new. Otherwise the stored messages are loaded and the
/// incoming ones are filtered against them.
///
/// After inserting messages and snippets it optionally feeds the FTS index
/// (unless `defer_lexical_updates`), refreshes the conversation tail state
/// and the external tail lookup, and optionally updates daily stats (unless
/// `defer_analytics_updates`, and only when at least one message was
/// inserted).
///
/// Returns an `InsertOutcome` with `conversation_inserted: false` and the
/// `idx` of every inserted message.
#[allow(clippy::too_many_arguments)]
fn franken_append_messages_with_tail_in_tx(
&self,
tx: &FrankenTransaction<'_>,
agent_id: i64,
conversation_id: i64,
conv: &Conversation,
append_tail_state: Option<ExistingConversationTailState>,
defer_lexical_updates: bool,
defer_analytics_updates: bool,
) -> Result<InsertOutcome> {
let append_tail_ended_at = append_tail_state.and_then(|state| state.ended_at);
// Try to build an append-only plan from the stored tail state. `None`
// (no tail state, or the helper declined) forces the full lookup below.
let append_plan = append_tail_state.as_ref().and_then(|state| {
collect_append_only_tail_messages(
conv,
state.last_message_idx,
state.last_message_created_at,
)
});
let used_append_tail_plan = append_plan.is_some();
let ExistingConversationNewMessages {
messages: new_messages,
new_chars,
idx_collision_count,
first_collision_idx,
} = if let Some(append_plan) = append_plan {
append_plan
} else {
// Slow path: load the stored messages and filter the incoming ones against them.
let ExistingMessageLookup {
by_idx: mut existing_messages,
replay: mut existing_replay_fingerprints,
} = franken_existing_message_lookup(tx, conversation_id, &conv.messages)?;
collect_new_messages_for_existing_conversation(
conversation_id,
conv,
&mut existing_messages,
&mut existing_replay_fingerprints,
"skipping replay-equivalent recovered message with shifted idx",
)
};
let mut inserted_indices = Vec::new();
let mut fts_entries = Vec::new();
let mut fts_pending_chars = 0usize;
let mut _fts_inserted_total = 0usize;
// Captured before `new_messages` is consumed by the zip below.
let (inserted_last_idx, inserted_last_created_at) =
borrowed_messages_tail_state(&new_messages);
let inserted_message_ids =
franken_append_insert_new_messages(tx, conversation_id, &new_messages)?;
for (msg_id, msg) in inserted_message_ids.into_iter().zip(new_messages) {
franken_insert_snippets(tx, msg_id, &msg.snippets)?;
if !defer_lexical_updates {
fts_entries.push(FtsEntry::from_message(msg_id, msg, conv));
fts_pending_chars = fts_pending_chars.saturating_add(msg.content.len());
// Flush once either the entry-count or the content-length budget is reached.
if fts_entries.len() >= FTS_ENTRY_BATCH_MAX_DOCS
|| fts_pending_chars >= FTS_ENTRY_BATCH_MAX_CHARS
{
flush_pending_fts_entries(
self,
tx,
&mut fts_entries,
&mut fts_pending_chars,
&mut _fts_inserted_total,
)?;
}
}
inserted_indices.push(msg.idx);
}
if idx_collision_count > 0 {
tracing::warn!(
conversation_id,
collision_count = idx_collision_count,
first_idx = first_collision_idx,
source_path = %conv.source_path.display(),
"message idx collisions encountered while appending to an existing conversation; retaining canonical message variants"
);
}
// Flush whatever is still buffered after the loop.
if !defer_lexical_updates {
flush_pending_fts_entries(
self,
tx,
&mut fts_entries,
&mut fts_pending_chars,
&mut _fts_inserted_total,
)?;
}
let mut exact_append_tail_set = false;
if used_append_tail_plan {
// With an append plan the tail is only touched when something with a
// known idx and timestamp was inserted.
if let (Some(last_message_idx), Some(last_message_created_at)) =
(inserted_last_idx, inserted_last_created_at)
{
// No stored `ended_at`, or one not later than the new last message:
// set the tail directly. Otherwise use the general update helper.
if append_tail_ended_at.is_none_or(|ended_at| ended_at <= last_message_created_at) {
franken_set_conversation_tail_state_after_append(
tx,
conversation_id,
last_message_created_at,
last_message_idx,
last_message_created_at,
)?;
exact_append_tail_set = true;
} else {
franken_update_conversation_tail_state(
tx,
conversation_id,
Some(last_message_created_at),
inserted_last_idx,
inserted_last_created_at,
)?;
}
}
} else {
// Without a plan, use the latest `created_at` across all incoming messages.
let conv_last_ts = conv.messages.iter().filter_map(|m| m.created_at).max();
franken_update_conversation_tail_state(
tx,
conversation_id,
conv_last_ts,
inserted_last_idx,
inserted_last_created_at,
)?;
}
franken_update_external_conversation_tail_after_append(
tx,
agent_id,
conv,
used_append_tail_plan,
exact_append_tail_set,
inserted_last_idx,
inserted_last_created_at,
)?;
// Stats are only touched when something was inserted; no new session is counted.
if !defer_analytics_updates && !inserted_indices.is_empty() {
let message_count = inserted_indices.len() as i64;
franken_update_daily_stats_in_tx(
self,
tx,
&conv.agent_slug,
&conv.source_id,
conversation_effective_started_at(conv),
StatsDelta {
session_count_delta: 0,
message_count_delta: message_count,
total_chars_delta: new_chars,
},
)?;
}
Ok(InsertOutcome {
conversation_id,
conversation_inserted: false,
inserted_indices,
})
}
/// Rebuilds the derived `fts_messages` table via
/// `rebuild_fts_via_frankensqlite`, discarding the inserted-row count.
pub fn rebuild_fts(&self) -> Result<()> {
    let _inserted_rows = self.rebuild_fts_via_frankensqlite()?;
    Ok(())
}
/// Crate-visible entry point that runs the FTS consistency check/repair and
/// reports which kind of repair, if any, was performed.
pub(crate) fn ensure_search_fallback_fts_consistency(&self) -> Result<FtsConsistencyRepair> {
    let repair = self.ensure_fts_consistency_via_frankensqlite()?;
    Ok(repair)
}
/// Reports whether the `meta` table marks the fallback FTS index as healthy
/// for `archive_fingerprint`.
///
/// Both conditions must hold: the stored rebuild generation equals
/// `FTS_FRANKEN_REBUILD_GENERATION`, and the stored archive fingerprint
/// equals `archive_fingerprint`. The fingerprint is only read when the
/// generation matches.
pub(crate) fn fallback_fts_is_known_healthy_for_archive_fingerprint(
    &self,
    archive_fingerprint: &str,
) -> Result<bool> {
    let stored_generation = self.read_fts_franken_rebuild_generation()?;
    if stored_generation != Some(FTS_FRANKEN_REBUILD_GENERATION) {
        return Ok(false);
    }
    let stored_fingerprint = self.read_fts_franken_rebuild_archive_fingerprint()?;
    Ok(stored_fingerprint.as_deref() == Some(archive_fingerprint))
}
/// Upserts `archive_fingerprint` into the `meta` table under
/// `FTS_FRANKEN_REBUILD_FINGERPRINT_META_KEY`.
pub(crate) fn record_search_fallback_fts_archive_fingerprint(
    &self,
    archive_fingerprint: &str,
) -> Result<()> {
    let fingerprint_value = archive_fingerprint.to_owned();
    self.conn
        .execute_compat(
            "INSERT OR REPLACE INTO meta(key, value) VALUES(?1, ?2)",
            fparams![FTS_FRANKEN_REBUILD_FINGERPRINT_META_KEY, fingerprint_value],
        )
        .context("recording frankensqlite FTS archive fingerprint")?;
    Ok(())
}
/// Reports whether the `meta` table marks daily stats as healthy for
/// `archive_fingerprint`.
///
/// Both conditions must hold: the stored health generation equals
/// `DAILY_STATS_HEALTH_GENERATION`, and the stored archive fingerprint equals
/// `archive_fingerprint`. The fingerprint is only read when the generation
/// matches.
pub(crate) fn daily_stats_is_known_healthy_for_archive_fingerprint(
    &self,
    archive_fingerprint: &str,
) -> Result<bool> {
    let stored_generation = self.read_daily_stats_health_generation()?;
    if stored_generation != Some(DAILY_STATS_HEALTH_GENERATION) {
        return Ok(false);
    }
    let stored_fingerprint = self.read_daily_stats_archive_fingerprint()?;
    Ok(stored_fingerprint.as_deref() == Some(archive_fingerprint))
}
pub(crate) fn record_daily_stats_archive_fingerprint(
&self,
archive_fingerprint: &str,
) -> Result<()> {
self.conn
.execute_compat(
"INSERT OR REPLACE INTO meta(key, value) VALUES(?1, ?2)",
fparams![
DAILY_STATS_HEALTH_GENERATION_META_KEY,
DAILY_STATS_HEALTH_GENERATION.to_string()
],
)
.with_context(|| "recording daily_stats health generation")?;
self.conn
.execute_compat(
"INSERT OR REPLACE INTO meta(key, value) VALUES(?1, ?2)",
fparams![DAILY_STATS_HEALTH_META_KEY, archive_fingerprint.to_string()],
)
.with_context(|| "recording daily_stats archive fingerprint")?;
Ok(())
}
/// Reads the FTS rebuild generation marker from the `meta` table.
///
/// Returns `Ok(None)` when the row is absent or when its value does not
/// parse as an `i64`.
fn read_fts_franken_rebuild_generation(&self) -> Result<Option<i64>> {
    let stored: Option<String> = self
        .conn
        .query_row_map(
            "SELECT value FROM meta WHERE key = ?1",
            fparams![FTS_FRANKEN_REBUILD_META_KEY],
            |row| row.get_typed(0),
        )
        .optional()?;
    let generation = match stored {
        Some(text) => text.parse::<i64>().ok(),
        None => None,
    };
    Ok(generation)
}
/// Reads the archive fingerprint recorded for the FTS rebuild from the
/// `meta` table, or `None` when no such row exists.
fn read_fts_franken_rebuild_archive_fingerprint(&self) -> Result<Option<String>> {
    let fingerprint: Option<String> = self
        .conn
        .query_row_map(
            "SELECT value FROM meta WHERE key = ?1",
            fparams![FTS_FRANKEN_REBUILD_FINGERPRINT_META_KEY],
            |row| row.get_typed(0),
        )
        .optional()?;
    Ok(fingerprint)
}
/// Reads the daily-stats health generation marker from the `meta` table.
///
/// Returns `Ok(None)` when the row is absent or when its value does not
/// parse as an `i64`.
fn read_daily_stats_health_generation(&self) -> Result<Option<i64>> {
    let stored: Option<String> = self
        .conn
        .query_row_map(
            "SELECT value FROM meta WHERE key = ?1",
            fparams![DAILY_STATS_HEALTH_GENERATION_META_KEY],
            |row| row.get_typed(0),
        )
        .optional()?;
    let generation = match stored {
        Some(text) => text.parse::<i64>().ok(),
        None => None,
    };
    Ok(generation)
}
/// Reads the archive fingerprint recorded for daily stats from the `meta`
/// table, or `None` when no such row exists.
fn read_daily_stats_archive_fingerprint(&self) -> Result<Option<String>> {
    let fingerprint: Option<String> = self
        .conn
        .query_row_map(
            "SELECT value FROM meta WHERE key = ?1",
            fparams![DAILY_STATS_HEALTH_META_KEY],
            |row| row.get_typed(0),
        )
        .optional()?;
    Ok(fingerprint)
}
fn record_fts_franken_rebuild_generation(&self) -> Result<()> {
self.conn
.execute_compat(
"INSERT OR REPLACE INTO meta(key, value) VALUES(?1, ?2)",
fparams![
FTS_FRANKEN_REBUILD_META_KEY,
FTS_FRANKEN_REBUILD_GENERATION.to_string()
],
)
.with_context(|| "recording frankensqlite FTS rebuild generation")?;
Ok(())
}
/// Checks that `fts_messages` is consistent with `messages` and repairs it
/// when it is not, reporting what was done.
///
/// Decision ladder, in order:
/// 1. If the stored generation marker differs from
///    `FTS_FRANKEN_REBUILD_GENERATION`: when the table exists, `messages` is
///    non-empty and the FTS row count covers at least 90% of it, only the
///    marker is recorded and the checks below still run; otherwise a full
///    rebuild is done and `Rebuilt` is returned.
/// 2. If the table is missing, duplicated in `sqlite_master`, not queryable,
///    or the probe itself fails: full rebuild.
/// 3. Equal row counts: `AlreadyHealthy`.
/// 4. More FTS rows than messages: full rebuild.
/// 5. Fewer FTS rows: incremental catch-up of the missing rows. If that
///    closes the gap, or inserts nothing at all, `IncrementalCatchUp` is
///    returned; otherwise a full rebuild is done.
///
/// Every full rebuild also records the generation marker.
fn ensure_fts_consistency_via_frankensqlite(&self) -> Result<FtsConsistencyRepair> {
if self.read_fts_franken_rebuild_generation()? != Some(FTS_FRANKEN_REBUILD_GENERATION) {
// Cheap probe to avoid a rebuild when only the marker is missing or stale.
let fts_already_healthy = (|| -> Result<bool> {
let fts_exists: i64 = self.conn.query_row_map(
"SELECT COUNT(*) FROM sqlite_master WHERE name = 'fts_messages'",
fparams![],
|row| row.get_typed(0),
)?;
if fts_exists != 1 {
return Ok(false);
}
let total: i64 = self.conn.query_row_map(
"SELECT COUNT(*) FROM messages",
fparams![],
|row| row.get_typed(0),
)?;
if total == 0 {
return Ok(false);
}
let indexed: i64 = self.conn.query_row_map(
"SELECT COUNT(*) FROM fts_messages",
fparams![],
|row| row.get_typed(0),
)?;
// "Healthy enough" means non-empty and at least 90% coverage.
Ok(indexed > 0 && indexed * 100 >= total * 90)
})()
// Any probe error is treated as "not healthy", which leads to a rebuild.
.unwrap_or(false);
if fts_already_healthy {
tracing::info!(
target: "cass::fts_rebuild",
"FTS already populated and consistent; setting generation marker without rebuild"
);
self.record_fts_franken_rebuild_generation()?;
self.set_fts_messages_present_cache(true);
// No return here: the exact-count checks below still run.
} else {
let inserted_rows = self.rebuild_fts_via_frankensqlite()?;
self.record_fts_franken_rebuild_generation()?;
return Ok(FtsConsistencyRepair::Rebuilt { inserted_rows });
}
}
// Structural probe: exactly one schema row, and the table answers a query.
let inspection = (|| -> Result<(i64, bool)> {
let fts_schema_rows = self.conn.query_row_map(
"SELECT COUNT(*) FROM sqlite_master WHERE name = 'fts_messages'",
fparams![],
|row| row.get_typed::<i64>(0),
)?;
let fts_queryable = fts_schema_rows == 1
&& self.conn.query("SELECT COUNT(*) FROM fts_messages").is_ok();
Ok((fts_schema_rows, fts_queryable))
})();
let (fts_schema_rows, fts_queryable) = match inspection {
Ok(result) => result,
Err(err) => {
tracing::warn!(
error = %err,
"frankensqlite FTS consistency probe failed; rebuilding authoritative FTS"
);
let inserted_rows = self.rebuild_fts_via_frankensqlite()?;
self.record_fts_franken_rebuild_generation()?;
return Ok(FtsConsistencyRepair::Rebuilt { inserted_rows });
}
};
if fts_schema_rows != 1 || !fts_queryable {
let inserted_rows = self.rebuild_fts_via_frankensqlite()?;
self.record_fts_franken_rebuild_generation()?;
return Ok(FtsConsistencyRepair::Rebuilt { inserted_rows });
}
// Exact row-count comparison between the source table and the index.
let total_messages =
self.conn
.query_row_map("SELECT COUNT(*) FROM messages", fparams![], |row| {
row.get_typed::<i64>(0)
})?;
let indexed_messages =
self.conn
.query_row_map("SELECT COUNT(*) FROM fts_messages", fparams![], |row| {
row.get_typed::<i64>(0)
})?;
if indexed_messages == total_messages {
self.set_fts_messages_present_cache(true);
return Ok(FtsConsistencyRepair::AlreadyHealthy {
rows: usize::try_from(total_messages.max(0)).unwrap_or(usize::MAX),
});
}
// Surplus FTS rows cannot be fixed by inserting; start over.
if indexed_messages > total_messages {
let inserted_rows = self.rebuild_fts_via_frankensqlite()?;
self.record_fts_franken_rebuild_generation()?;
return Ok(FtsConsistencyRepair::Rebuilt { inserted_rows });
}
// Fewer FTS rows than messages: insert only the missing ones.
let inserted_rows = self
.stream_fts_rows_via_frankensqlite(true)
.with_context(|| "incrementally repairing missing FTS rows via frankensqlite")?;
let repaired_rows =
self.conn
.query_row_map("SELECT COUNT(*) FROM fts_messages", fparams![], |row| {
row.get_typed::<i64>(0)
})?;
if repaired_rows == total_messages {
self.set_fts_messages_present_cache(true);
return Ok(FtsConsistencyRepair::IncrementalCatchUp {
inserted_rows,
total_rows: usize::try_from(repaired_rows.max(0)).unwrap_or(usize::MAX),
});
}
// Nothing could be inserted, so a rebuild would not close the gap either;
// accept the remaining difference instead of rebuilding.
if inserted_rows == 0 {
tracing::debug!(
target: "cass::fts_rebuild",
indexed_messages = repaired_rows,
total_messages,
un_indexable_gap = total_messages.saturating_sub(repaired_rows),
"FTS catch-up inserted 0 rows; remaining gap is un-indexable (likely orphaned messages with dangling conversation_id)"
);
self.set_fts_messages_present_cache(true);
return Ok(FtsConsistencyRepair::IncrementalCatchUp {
inserted_rows: 0,
total_rows: usize::try_from(repaired_rows.max(0)).unwrap_or(usize::MAX),
});
}
// Catch-up inserted rows but the counts still differ: fall back to a full rebuild.
let inserted_rows = self.rebuild_fts_via_frankensqlite()?;
self.record_fts_franken_rebuild_generation()?;
Ok(FtsConsistencyRepair::Rebuilt { inserted_rows })
}
/// Drops and recreates the derived `fts_messages` table, then repopulates it
/// from every row in `messages`.
///
/// The "FTS table present" cache is invalidated before the drop and set to
/// `true` again once the table has been recreated. Returns the number of
/// rows inserted by the repopulation pass.
pub(crate) fn rebuild_fts_via_frankensqlite(&self) -> Result<usize> {
    self.invalidate_fts_messages_present_cache();

    self.conn
        .execute("DROP TABLE IF EXISTS fts_messages;")
        .context("dropping derived fts_messages before frankensqlite rebuild")?;
    self.conn
        .execute_compat(FTS5_REGISTER_SQL, fparams![])
        .context("creating derived fts_messages via frankensqlite rebuild")?;

    self.set_fts_messages_present_cache(true);

    // `false`: index every message rather than only the missing ones.
    let inserted_rows = self.stream_fts_rows_via_frankensqlite(false)?;
    Ok(inserted_rows)
}
/// Streams rows from `messages` into `fts_messages` in pages and returns the
/// number of FTS rows inserted.
///
/// Messages are fetched page by page in ascending `rowid` order, resuming
/// after the last `rowid` seen. Conversation, agent-slug and workspace-path
/// lookups are loaded into memory once up front.
///
/// When `missing_only` is `true`, the set of existing FTS rowids is loaded
/// first and any message whose id is in that set is skipped. Messages whose
/// `conversation_id` has no matching conversation are skipped as orphans.
fn stream_fts_rows_via_frankensqlite(&self, missing_only: bool) -> Result<usize> {
// Guard against a zero batch size, which would make the paging loop misbehave.
let batch_size = fts_rebuild_batch_size().max(1);
let batch_limit = i64::try_from(batch_size).unwrap_or(i64::MAX);
let mut total_inserted: usize = 0;
let mut total_skipped_orphans: usize = 0;
let mut total_skipped_existing: usize = 0;
let mut last_rowid: i64 = 0;
let conversation_by_id = self.load_fts_conversation_projection_map()?;
let agent_slug_by_id = self.load_fts_agent_slug_map()?;
let workspace_path_by_id = self.load_fts_workspace_path_map()?;
let existing_fts_rowids = if missing_only {
Some(self.load_fts_message_rowid_set()?)
} else {
None
};
let mut entries = Vec::new();
let mut pending_chars = 0usize;
loop {
let rows = self.fetch_fts_rebuild_message_rows(last_rowid, batch_limit)?;
let fetched_count = rows.len();
if fetched_count == 0 {
break;
}
// Snapshots used only to compute the per-batch figures in the log line below.
let inserted_before_batch = total_inserted;
let skipped_before_batch = total_skipped_orphans;
let existing_before_batch = total_skipped_existing;
for row in rows {
// Advance the paging cursor even for rows that end up skipped.
last_rowid = row.rowid;
// NOTE(review): the existing-FTS-rowid set is probed with `message_id`,
// so FTS rowids are presumably message ids — confirm.
if existing_fts_rowids
.as_ref()
.is_some_and(|rowids| rowids.contains(&row.message_id))
{
total_skipped_existing = total_skipped_existing.saturating_add(1);
continue;
}
let Some(conversation) = conversation_by_id.get(&row.conversation_id) else {
total_skipped_orphans = total_skipped_orphans.saturating_add(1);
continue;
};
// A missing agent, an unknown agent id, or an empty slug all map to "unknown".
let agent = conversation
.agent_id
.and_then(|agent_id| agent_slug_by_id.get(&agent_id))
.filter(|slug| !slug.is_empty())
.cloned()
.unwrap_or_else(|| "unknown".to_string());
// A missing workspace, or an unknown workspace id, maps to an empty path.
let workspace = conversation
.workspace_id
.and_then(|workspace_id| workspace_path_by_id.get(&workspace_id))
.cloned()
.unwrap_or_default();
pending_chars = pending_chars.saturating_add(row.content.len());
entries.push(FtsEntry {
content: row.content,
title: conversation.title.clone(),
agent,
workspace,
source_path: conversation.source_path.clone(),
created_at: row.created_at,
message_id: row.message_id,
});
// Flush once either the entry-count or the content-length budget is reached.
if entries.len() >= FTS_ENTRY_BATCH_MAX_DOCS
|| pending_chars >= FTS_ENTRY_BATCH_MAX_CHARS
{
total_inserted = total_inserted.saturating_add(
franken_batch_insert_fts_on_connection(&self.conn, &entries)?,
);
entries.clear();
pending_chars = 0;
}
}
// Flush the remainder at the end of every page, so nothing carries over.
if !entries.is_empty() {
total_inserted = total_inserted.saturating_add(
franken_batch_insert_fts_on_connection(&self.conn, &entries)?,
);
entries.clear();
pending_chars = 0;
}
tracing::debug!(
target: "cass::fts_rebuild",
batch_rows = fetched_count,
batch_inserted = total_inserted.saturating_sub(inserted_before_batch),
batch_skipped_orphans = total_skipped_orphans.saturating_sub(skipped_before_batch),
batch_skipped_existing = total_skipped_existing.saturating_sub(existing_before_batch),
total_inserted,
total_skipped_orphans,
total_skipped_existing,
last_rowid,
missing_only,
"FTS streaming maintenance batch complete"
);
// A short page means the table is exhausted; skip the extra empty fetch.
if fetched_count < batch_size {
break;
}
}
Ok(total_inserted)
}
/// Fetches one page of `messages` rows for FTS maintenance.
///
/// Returns rows with `rowid > last_rowid`, ordered by rowid and capped at
/// `batch_limit`. A NULL `content` column is read as an empty string.
///
/// # Errors
/// Returns the query error annotated with the `last_rowid` cursor.
fn fetch_fts_rebuild_message_rows(
&self,
last_rowid: i64,
batch_limit: i64,
) -> Result<Vec<FtsRebuildMessageRow>> {
let page = self.conn.query_map_collect(
"SELECT m.rowid, m.id, m.conversation_id, m.content, m.created_at
FROM messages m
WHERE m.rowid > ?1
ORDER BY m.rowid
LIMIT ?2",
fparams![last_rowid, batch_limit],
|row| {
let rowid = row.get_typed(0)?;
let message_id = row.get_typed(1)?;
let conversation_id = row.get_typed(2)?;
let content = row.get_typed::<Option<String>>(3)?.unwrap_or_default();
let created_at = row.get_typed(4)?;
Ok(FtsRebuildMessageRow {
rowid,
message_id,
conversation_id,
content,
created_at,
})
},
);
page.with_context(|| format!("fetching FTS maintenance messages after rowid {last_rowid}"))
}
/// Loads the rowid of every row currently present in `fts_messages` into a set.
fn load_fts_message_rowid_set(&self) -> Result<HashSet<i64>> {
let indexed_rowids: Vec<i64> = self
.conn
.query_map_collect("SELECT rowid FROM fts_messages", fparams![], |row| {
row.get_typed(0)
})
.context("loading existing FTS message rowids")?;
let mut rowid_set = HashSet::with_capacity(indexed_rowids.len());
rowid_set.extend(indexed_rowids);
Ok(rowid_set)
}
/// Loads the columns of every conversation that FTS entries are built from,
/// keyed by conversation id.
///
/// NULL `title` and `source_path` values are read as empty strings.
fn load_fts_conversation_projection_map(
&self,
) -> Result<HashMap<i64, FtsConversationProjection>> {
let projections: Vec<(i64, FtsConversationProjection)> = self
.conn
.query_map_collect(
"SELECT id, title, agent_id, workspace_id, source_path
FROM conversations",
fparams![],
|row| {
let conversation_id = row.get_typed(0)?;
let title = row.get_typed::<Option<String>>(1)?.unwrap_or_default();
let agent_id = row.get_typed(2)?;
let workspace_id = row.get_typed(3)?;
let source_path = row.get_typed::<Option<String>>(4)?.unwrap_or_default();
let projection = FtsConversationProjection {
title,
agent_id,
workspace_id,
source_path,
};
Ok((conversation_id, projection))
},
)
.context("loading FTS conversation projection map")?;
let mut by_conversation_id = HashMap::with_capacity(projections.len());
by_conversation_id.extend(projections);
Ok(by_conversation_id)
}
/// Loads a map from agent id to agent slug.
///
/// A NULL slug is read as the literal `"unknown"`.
fn load_fts_agent_slug_map(&self) -> Result<HashMap<i64, String>> {
let agents: Vec<(i64, String)> = self
.conn
.query_map_collect("SELECT id, slug FROM agents", fparams![], |row| {
let agent_id = row.get_typed(0)?;
let slug = row
.get_typed::<Option<String>>(1)?
.unwrap_or_else(|| String::from("unknown"));
Ok((agent_id, slug))
})
.context("loading FTS agent slug map")?;
let mut slug_by_id = HashMap::with_capacity(agents.len());
slug_by_id.extend(agents);
Ok(slug_by_id)
}
/// Loads a map from workspace id to workspace path.
///
/// A NULL path is read as an empty string.
fn load_fts_workspace_path_map(&self) -> Result<HashMap<i64, String>> {
let workspaces: Vec<(i64, String)> = self
.conn
.query_map_collect("SELECT id, path FROM workspaces", fparams![], |row| {
let workspace_id = row.get_typed(0)?;
let path = row.get_typed::<Option<String>>(1)?.unwrap_or_default();
Ok((workspace_id, path))
})
.context("loading FTS workspace path map")?;
let mut path_by_id = HashMap::with_capacity(workspaces.len());
path_by_id.extend(workspaces);
Ok(path_by_id)
}
/// Returns every message joined with its conversation, ordered by message id,
/// in the shape consumed by the embedding pipeline.
///
/// A NULL `agent_id` is read as `0` (via `COALESCE`), and a NULL `source_id`
/// is treated as `"local"` before being reduced to a CRC32 hash.
pub fn fetch_messages_for_embedding(&self) -> Result<Vec<MessageForEmbedding>> {
let messages = self.conn.query_map_collect(
"SELECT m.id, m.created_at, COALESCE(c.agent_id, 0), c.workspace_id, c.source_id, m.role, m.content
FROM messages m
JOIN conversations c ON m.conversation_id = c.id
ORDER BY m.id",
fparams![],
|row| {
// The source id column is read first, matching the original read order.
let source_id: String = row
.get_typed::<Option<String>>(4)?
.unwrap_or_else(|| String::from("local"));
let message_id = row.get_typed(0)?;
let created_at = row.get_typed(1)?;
let agent_id = row.get_typed(2)?;
let workspace_id = row.get_typed(3)?;
let source_id_hash = crc32fast::hash(source_id.as_bytes());
let role = row.get_typed(5)?;
let content = row.get_typed(6)?;
Ok(MessageForEmbedding {
message_id,
created_at,
agent_id,
workspace_id,
source_id_hash,
role,
content,
})
},
);
messages.context("fetching messages for embedding")
}
/// Reads the `last_embedded_message_id` entry from the `meta` table.
///
/// Returns `Ok(None)` when the key is absent or when the stored value does
/// not parse as an integer; query failures are propagated.
pub fn get_last_embedded_message_id(&self) -> Result<Option<i64>> {
let lookup: Result<String, _> = self.conn.query_row_map(
"SELECT value FROM meta WHERE key = 'last_embedded_message_id'",
fparams![],
|row| row.get_typed(0),
);
let stored = lookup.optional()?;
// An unparseable value is deliberately treated the same as a missing one.
Ok(stored.and_then(|raw| raw.parse().ok()))
}
/// Stores `id` (as text) under the `last_embedded_message_id` key in `meta`,
/// replacing any previous value.
pub fn set_last_embedded_message_id(&self, id: i64) -> Result<()> {
const UPSERT_SQL: &str =
"INSERT OR REPLACE INTO meta(key, value) VALUES('last_embedded_message_id', ?1)";
let serialized_id = id.to_string();
self.conn.execute_compat(UPSERT_SQL, fparams![serialized_id])?;
Ok(())
}
/// Lists all embedding jobs recorded for `db_path`, newest (highest id) first.
///
/// # Errors
/// Returns the query error annotated with the requested `db_path`.
pub fn get_embedding_jobs(&self, db_path: &str) -> Result<Vec<EmbeddingJobRow>> {
let jobs = self.conn.query_map_collect(
"SELECT id, db_path, model_id, status, total_docs, completed_docs, error_message, created_at, started_at, completed_at
FROM embedding_jobs WHERE db_path = ?1 ORDER BY id DESC",
fparams![db_path],
|row| {
let id = row.get_typed(0)?;
let db_path = row.get_typed(1)?;
let model_id = row.get_typed(2)?;
let status = row.get_typed(3)?;
let total_docs = row.get_typed(4)?;
let completed_docs = row.get_typed(5)?;
let error_message = row.get_typed(6)?;
let created_at = row.get_typed(7)?;
let started_at = row.get_typed(8)?;
let completed_at = row.get_typed(9)?;
Ok(EmbeddingJobRow {
id,
db_path,
model_id,
status,
total_docs,
completed_docs,
error_message,
created_at,
started_at,
completed_at,
})
},
);
jobs.with_context(|| format!("fetching embedding jobs for {db_path}"))
}
/// Creates or refreshes the active embedding job for `(db_path, model_id)`
/// and returns its id.
///
/// An existing `pending`/`running` job has its `total_docs` overwritten. If no
/// such job was updated, a new row is inserted; a unique-constraint violation
/// on that insert is answered by re-running the update instead of failing.
/// The id returned is the highest-id active job for the pair.
///
/// # Errors
/// Propagates any database error other than the tolerated unique violation,
/// and fails if no active job can be resolved afterwards.
pub fn upsert_embedding_job(
&self,
db_path: &str,
model_id: &str,
total_docs: i64,
) -> Result<i64> {
const REFRESH_ACTIVE_JOB_SQL: &str = "UPDATE embedding_jobs
SET total_docs = ?3
WHERE db_path = ?1 AND model_id = ?2 AND status IN ('pending', 'running')";
let refreshed = self.conn.execute_compat(
REFRESH_ACTIVE_JOB_SQL,
fparams![db_path, model_id, total_docs],
)?;
if refreshed == 0 {
let insert_outcome = self.conn.execute_compat(
"INSERT INTO embedding_jobs(db_path, model_id, total_docs) VALUES(?1,?2,?3)",
fparams![db_path, model_id, total_docs],
);
match insert_outcome {
Ok(_) => {}
// The insert hit a unique constraint: fall back to refreshing the
// existing row rather than surfacing the violation.
Err(frankensqlite::FrankenError::UniqueViolation { .. }) => {
self.conn.execute_compat(
REFRESH_ACTIVE_JOB_SQL,
fparams![db_path, model_id, total_docs],
)?;
}
Err(other) => return Err(other.into()),
}
}
let job_id = self.conn.query_row_map(
"SELECT id FROM embedding_jobs
WHERE db_path = ?1 AND model_id = ?2 AND status IN ('pending', 'running')
ORDER BY id DESC
LIMIT 1",
fparams![db_path, model_id],
|row| row.get_typed(0),
);
job_id.context("resolving embedding job id after upsert")
}
/// Marks the job as `running` and stamps `started_at` with the database's
/// current time.
pub fn start_embedding_job(&self, job_id: i64) -> Result<()> {
const MARK_RUNNING_SQL: &str =
"UPDATE embedding_jobs SET status = 'running', started_at = datetime('now') WHERE id = ?1";
self.conn.execute_compat(MARK_RUNNING_SQL, fparams![job_id])?;
Ok(())
}
/// Marks the job as `completed` and stamps `completed_at` with the database's
/// current time.
pub fn complete_embedding_job(&self, job_id: i64) -> Result<()> {
const MARK_COMPLETED_SQL: &str =
"UPDATE embedding_jobs SET status = 'completed', completed_at = datetime('now') WHERE id = ?1";
self.conn.execute_compat(MARK_COMPLETED_SQL, fparams![job_id])?;
Ok(())
}
pub fn fail_embedding_job(&self, job_id: i64, error: &str) -> Result<()> {
self.conn.execute_compat(
"UPDATE embedding_jobs SET status = 'failed', error_message = ?2, completed_at = datetime('now') WHERE id = ?1",
fparams![job_id, error],
)?;
Ok(())
}
/// Cancels `pending`/`running` embedding jobs for `db_path`.
///
/// With `model_id` set, only that model's jobs are cancelled; otherwise every
/// active job for the database path is. Returns the count reported by the
/// UPDATE statement.
pub fn cancel_embedding_jobs(&self, db_path: &str, model_id: Option<&str>) -> Result<usize> {
let cancelled = match model_id {
Some(mid) => self.conn.execute_compat(
"UPDATE embedding_jobs SET status = 'cancelled' WHERE db_path = ?1 AND model_id = ?2 AND status IN ('pending', 'running')",
fparams![db_path, mid],
)?,
None => self.conn.execute_compat(
"UPDATE embedding_jobs SET status = 'cancelled' WHERE db_path = ?1 AND status IN ('pending', 'running')",
fparams![db_path],
)?,
};
Ok(cancelled)
}
/// Overwrites the `completed_docs` counter of the given embedding job.
pub fn update_job_progress(&self, job_id: i64, completed_docs: i64) -> Result<()> {
const SET_PROGRESS_SQL: &str = "UPDATE embedding_jobs SET completed_docs = ?2 WHERE id = ?1";
self.conn
.execute_compat(SET_PROGRESS_SQL, fparams![job_id, completed_docs])?;
Ok(())
}
/// Counts sessions in an optional time range, preferring the materialized
/// `daily_stats` table.
///
/// Returns `(count, from_materialized)`. When `daily_stats` has no rows (or
/// its row count cannot be read), the work is delegated to
/// `count_sessions_direct`. Otherwise the timestamps are converted to day ids
/// and `session_count` is summed for the requested agent/source, where a
/// missing filter selects the `"all"` row.
pub fn count_sessions_in_range(
&self,
start_ts_ms: Option<i64>,
end_ts_ms: Option<i64>,
agent_slug: Option<&str>,
source_id: Option<&str>,
) -> Result<(i64, bool)> {
// A failed COUNT(*) is treated like an empty table so the direct path is used.
let materialized_rows: i64 = self
.conn
.query_row_map("SELECT COUNT(*) FROM daily_stats", fparams![], |row| {
row.get_typed(0)
})
.unwrap_or(0);
if materialized_rows == 0 {
return self.count_sessions_direct(start_ts_ms, end_ts_ms, agent_slug, source_id);
}
let agent_key = agent_slug.unwrap_or("all");
let source_key = source_id.unwrap_or("all");
let day_bounds = (
start_ts_ms.map(Self::day_id_from_millis),
end_ts_ms.map(Self::day_id_from_millis),
);
let total: i64 = match day_bounds {
(Some(first_day), Some(last_day)) => self.conn.query_row_map(
"SELECT COALESCE(SUM(session_count), 0) FROM daily_stats
WHERE day_id BETWEEN ?1 AND ?2 AND agent_slug = ?3 AND source_id = ?4",
fparams![first_day, last_day, agent_key, source_key],
|row| row.get_typed(0),
)?,
(Some(first_day), None) => self.conn.query_row_map(
"SELECT COALESCE(SUM(session_count), 0) FROM daily_stats
WHERE day_id >= ?1 AND agent_slug = ?2 AND source_id = ?3",
fparams![first_day, agent_key, source_key],
|row| row.get_typed(0),
)?,
(None, Some(last_day)) => self.conn.query_row_map(
"SELECT COALESCE(SUM(session_count), 0) FROM daily_stats
WHERE day_id <= ?1 AND agent_slug = ?2 AND source_id = ?3",
fparams![last_day, agent_key, source_key],
|row| row.get_typed(0),
)?,
(None, None) => self.conn.query_row_map(
"SELECT COALESCE(SUM(session_count), 0) FROM daily_stats
WHERE agent_slug = ?1 AND source_id = ?2",
fparams![agent_key, source_key],
|row| row.get_typed(0),
)?,
};
Ok((total, true))
}
/// Counts conversations directly from the `conversations` table.
///
/// Each supplied filter appends one `AND` clause with its own numbered
/// placeholder; an agent or source filter equal to `"all"` is ignored.
/// Returns `(count, false)`, where `false` marks the result as not coming
/// from the materialized `daily_stats` table.
fn count_sessions_direct(
&self,
start_ts_ms: Option<i64>,
end_ts_ms: Option<i64>,
agent_slug: Option<&str>,
source_id: Option<&str>,
) -> Result<(i64, bool)> {
let mut sql = "SELECT COUNT(*) FROM conversations c WHERE 1=1".to_string();
let mut param_values: Vec<ParamValue> = Vec::new();
// Every clause pushes its value first, so `param_values.len()` is exactly the
// 1-based placeholder number for that value; no separate counter is needed.
if let Some(start) = start_ts_ms {
param_values.push(ParamValue::from(start));
sql.push_str(&format!(" AND c.started_at >= ?{}", param_values.len()));
}
if let Some(end) = end_ts_ms {
param_values.push(ParamValue::from(end));
sql.push_str(&format!(" AND c.started_at <= ?{}", param_values.len()));
}
if let Some(agent) = agent_slug
&& agent != "all"
{
param_values.push(ParamValue::from(agent));
sql.push_str(&format!(
" AND EXISTS (SELECT 1 FROM agents a WHERE a.id = c.agent_id AND a.slug = ?{})",
param_values.len()
));
}
if let Some(source) = source_id
&& source != "all"
{
param_values.push(ParamValue::from(source));
sql.push_str(&format!(" AND c.source_id = ?{}", param_values.len()));
}
let count: i64 = self
.conn
.query_row_map(&sql, &param_values, |row| row.get_typed(0))?;
Ok((count, false))
}
/// Returns the per-day rows of `daily_stats` between two timestamps
/// (converted to day ids, both inclusive), ordered by day.
///
/// A missing agent or source filter selects the `"all"` rows.
pub fn get_daily_histogram(
&self,
start_ts_ms: i64,
end_ts_ms: i64,
agent_slug: Option<&str>,
source_id: Option<&str>,
) -> Result<Vec<DailyCount>> {
let first_day = Self::day_id_from_millis(start_ts_ms);
let last_day = Self::day_id_from_millis(end_ts_ms);
let agent_key = agent_slug.unwrap_or("all");
let source_key = source_id.unwrap_or("all");
let histogram = self.conn.query_map_collect(
"SELECT day_id, session_count, message_count, total_chars
FROM daily_stats
WHERE day_id BETWEEN ?1 AND ?2 AND agent_slug = ?3 AND source_id = ?4
ORDER BY day_id",
fparams![first_day, last_day, agent_key, source_key],
|row| {
let day_id = row.get_typed(0)?;
let sessions = row.get_typed(1)?;
let messages = row.get_typed(2)?;
let chars = row.get_typed(3)?;
Ok(DailyCount {
day_id,
sessions,
messages,
chars,
})
},
)?;
Ok(histogram)
}
/// Summarizes how well `daily_stats` tracks the `conversations` table.
///
/// Reports the number of `daily_stats` rows, the smallest `last_updated`
/// value, the real conversation count, the session total recorded on the
/// `('all', 'all')` rows, and the absolute difference between the last two.
pub fn daily_stats_health(&self) -> Result<DailyStatsHealth> {
let row_count: i64 = self.conn.query_row_map(
"SELECT COUNT(*) FROM daily_stats",
fparams![],
|row| row.get_typed(0),
)?;
let oldest_update_ms: Option<i64> = self.conn.query_row_map(
"SELECT MIN(last_updated) FROM daily_stats",
fparams![],
|row| row.get_typed(0),
)?;
let conversation_count: i64 = self.conn.query_row_map(
"SELECT COUNT(*) FROM conversations",
fparams![],
|row| row.get_typed(0),
)?;
let materialized_total: i64 = self.conn.query_row_map(
"SELECT COALESCE(SUM(session_count), 0) FROM daily_stats
WHERE agent_slug = 'all' AND source_id = 'all'",
fparams![],
|row| row.get_typed(0),
)?;
let populated = row_count > 0;
let drift = (conversation_count - materialized_total).abs();
Ok(DailyStatsHealth {
populated,
row_count,
oldest_update_ms,
conversation_count,
materialized_total,
drift,
})
}
/// Inserts or merges a batch of conversations inside one transaction.
///
/// Each input tuple is `(agent_id, workspace_id, conversation)`. The result
/// holds one `InsertOutcome` per input, in input order, recording the
/// conversation id, whether a new conversation row was created, and the
/// `idx` of every message that was actually inserted.
///
/// Behavior visible in this function:
/// - An empty input returns an empty vector before any database access.
/// - Source rows are ensured before the transaction opens; agent, workspace
///   and source rows are ensured again inside it.
/// - Conversations already seen earlier in the same batch, or already present
///   in the database, are merged: only messages not yet known are appended.
/// - FTS entries are buffered and flushed in batches unless lexical updates
///   are deferred; all analytics bookkeeping is skipped when analytics updates
///   are deferred.
///
/// # Errors
/// Any failing step returns early through `?`, before `tx.commit()` is
/// reached. NOTE(review): presumably the dropped transaction rolls back;
/// confirm against the transaction type.
pub fn insert_conversations_batched(
&self,
conversations: &[(i64, Option<i64>, &Conversation)],
) -> Result<Vec<InsertOutcome>> {
if conversations.is_empty() {
return Ok(Vec::new());
}
self.ensure_sources_for_batch(conversations)?;
let defer_lexical_updates = defer_storage_lexical_updates_enabled();
let defer_analytics_updates = defer_analytics_updates_enabled();
// A pricing table that fails to load is logged and replaced by an empty one,
// so the batch still proceeds.
let pricing_table = PricingTable::franken_load(&self.conn).unwrap_or_else(|e| {
tracing::warn!(target: "cass::analytics::pricing", error = %e, "failed to load pricing table");
PricingTable { entries: Vec::new() }
});
let mut pricing_diag = PricingDiagnostics::default();
let mut tx = self.conn.transaction()?;
ensure_agents_in_tx(&tx, conversations)?;
ensure_workspaces_in_tx(&tx, conversations)?;
ensure_sources_in_tx(&tx, conversations)?;
let mut outcomes = Vec::with_capacity(conversations.len());
// FTS buffering state: pending entries, their accumulated content length,
// and running totals used only for the debug log at the end.
let mut fts_entries = Vec::new();
let mut fts_pending_chars = 0usize;
let mut fts_inserted_total = 0usize;
let mut fts_count_total = 0usize;
// Analytics accumulators, written to the database after the main loop.
let mut stats = StatsAggregator::new();
let mut token_stats = TokenStatsAggregator::new();
let mut token_entries: Vec<TokenUsageEntry> = Vec::new();
let mut metrics_entries: Vec<MessageMetricsEntry> = Vec::new();
let mut rollup_agg = AnalyticsRollupAggregator::new();
let mut conv_ids_to_summarize: Vec<i64> = Vec::new();
// Per-batch caches: conversation ids resolved so far, and the message
// fingerprints known for each of them. They are consulted before the
// database so repeated conversations within one batch merge with each other.
let mut pending_conversation_ids: HashMap<PendingConversationKey, i64> = HashMap::new();
let mut pending_message_fingerprints: HashMap<i64, HashMap<i64, MessageMergeFingerprint>> =
HashMap::new();
let mut pending_message_replay_fingerprints: HashMap<
i64,
HashSet<MessageReplayFingerprint>,
> = HashMap::new();
for &(agent_id, workspace_id, raw_conv) in conversations {
let normalized_conv = normalized_conversation_for_storage(raw_conv);
let conv = normalized_conv.as_ref();
let mut total_chars: i64 = 0;
let mut inserted_indices = Vec::with_capacity(conv.messages.len());
let mut inserted_messages: Vec<(i64, &Message)> =
Vec::with_capacity(conv.messages.len());
// Starts at 1 (a new session) and is reset to 0 on every path that finds
// the conversation already existing.
let mut session_count_delta = 1_i64;
let conversation_key = conversation_merge_key(agent_id, conv);
// Resolve an existing conversation id: batch cache first, then the database.
let existing_conv_id = if let Some(existing_id) =
pending_conversation_ids.get(&conversation_key)
{
Some(*existing_id)
} else {
let existing_id =
franken_find_existing_conversation_by_key(&tx, &conversation_key, Some(conv))?;
if let Some(existing_id) = existing_id {
pending_conversation_ids.insert(conversation_key.clone(), existing_id);
}
existing_id
};
let conv_id = if let Some(existing_id) = existing_conv_id {
// Merge path: the conversation is already known, append only new messages.
session_count_delta = 0;
let ExistingMessageLookup {
by_idx: mut existing_messages,
replay: mut existing_replay_fingerprints,
} = franken_existing_message_lookup_with_pending(
&tx,
existing_id,
&conv.messages,
&mut pending_message_fingerprints,
&mut pending_message_replay_fingerprints,
)?;
let ExistingConversationNewMessages {
messages: new_messages,
new_chars,
idx_collision_count,
first_collision_idx,
} = collect_new_messages_for_existing_conversation(
existing_id,
conv,
&mut existing_messages,
&mut existing_replay_fingerprints,
"skipping replay-equivalent recovered message with shifted idx during batched merge",
);
let (inserted_last_idx, inserted_last_created_at) =
borrowed_messages_tail_state(&new_messages);
let inserted_message_ids =
franken_append_insert_new_messages(&tx, existing_id, &new_messages)?;
total_chars += new_chars;
for (msg_id, msg) in inserted_message_ids.into_iter().zip(new_messages) {
franken_insert_snippets(&tx, msg_id, &msg.snippets)?;
if !defer_lexical_updates {
fts_entries.push(FtsEntry::from_message(msg_id, msg, conv));
fts_count_total += 1;
fts_pending_chars = fts_pending_chars.saturating_add(msg.content.len());
// Flush once either the document-count or the content-size threshold is hit.
if fts_entries.len() >= FTS_ENTRY_BATCH_MAX_DOCS
|| fts_pending_chars >= FTS_ENTRY_BATCH_MAX_CHARS
{
flush_pending_fts_entries(
self,
&tx,
&mut fts_entries,
&mut fts_pending_chars,
&mut fts_inserted_total,
)?;
}
}
inserted_indices.push(msg.idx);
inserted_messages.push((msg_id, msg));
}
if idx_collision_count > 0 {
tracing::warn!(
conversation_id = existing_id,
collision_count = idx_collision_count,
first_idx = first_collision_idx,
source_path = %conv.source_path.display(),
"message idx collisions encountered during batched conversation merge; retaining canonical message variants"
);
}
// Latest message timestamp in the incoming conversation (None if no
// message carries a timestamp).
let conv_last_ts = conv.messages.iter().filter_map(|m| m.created_at).max();
franken_update_conversation_tail_state(
&tx,
existing_id,
conv_last_ts,
inserted_last_idx,
inserted_last_created_at,
)?;
if let Some(lookup_key) = conversation_external_lookup_key_for_conv(agent_id, conv)
{
franken_update_external_conversation_tail_lookup_key(
&tx,
&lookup_key,
conv_last_ts,
inserted_last_idx,
inserted_last_created_at,
)?;
}
// Store the updated fingerprints back so later entries in this batch see them.
pending_message_fingerprints.insert(existing_id, existing_messages);
pending_message_replay_fingerprints
.insert(existing_id, existing_replay_fingerprints);
existing_id
} else {
// No known conversation: try to insert, but the helper may still report
// an existing row, in which case the merge logic is applied.
match franken_insert_conversation_or_get_existing(
&tx,
agent_id,
workspace_id,
conv,
)? {
ConversationInsertStatus::Inserted(new_conv_id) => {
pending_conversation_ids.insert(conversation_key.clone(), new_conv_id);
let pending_messages =
pending_message_fingerprints.entry(new_conv_id).or_default();
let pending_replay_fingerprints = pending_message_replay_fingerprints
.entry(new_conv_id)
.or_default();
let mut new_messages = Vec::new();
// Drop messages that repeat an idx or a replay fingerprint already
// recorded for this conversation within the batch.
for msg in &conv.messages {
let incoming_replay = message_replay_fingerprint(msg);
if pending_messages.contains_key(&msg.idx)
|| pending_replay_fingerprints.contains(&incoming_replay)
{
continue;
}
pending_messages.insert(msg.idx, message_merge_fingerprint(msg));
pending_replay_fingerprints.insert(incoming_replay);
new_messages.push(msg);
}
let inserted_message_ids =
franken_batch_insert_new_messages(&tx, new_conv_id, &new_messages)?;
for (msg_id, msg) in inserted_message_ids.into_iter().zip(new_messages) {
franken_insert_snippets(&tx, msg_id, &msg.snippets)?;
if !defer_lexical_updates {
fts_entries.push(FtsEntry::from_message(msg_id, msg, conv));
fts_count_total += 1;
fts_pending_chars =
fts_pending_chars.saturating_add(msg.content.len());
if fts_entries.len() >= FTS_ENTRY_BATCH_MAX_DOCS
|| fts_pending_chars >= FTS_ENTRY_BATCH_MAX_CHARS
{
flush_pending_fts_entries(
self,
&tx,
&mut fts_entries,
&mut fts_pending_chars,
&mut fts_inserted_total,
)?;
}
}
// `len()` is the byte length of the content string.
total_chars += msg.content.len() as i64;
inserted_indices.push(msg.idx);
inserted_messages.push((msg_id, msg));
}
new_conv_id
}
ConversationInsertStatus::Existing(existing_id) => {
// Same merge steps as the known-conversation path above, with
// different log messages.
session_count_delta = 0;
pending_conversation_ids.insert(conversation_key.clone(), existing_id);
let ExistingMessageLookup {
by_idx: mut existing_messages,
replay: mut existing_replay_fingerprints,
} = franken_existing_message_lookup_with_pending(
&tx,
existing_id,
&conv.messages,
&mut pending_message_fingerprints,
&mut pending_message_replay_fingerprints,
)?;
let ExistingConversationNewMessages {
messages: new_messages,
new_chars,
idx_collision_count,
first_collision_idx,
} = collect_new_messages_for_existing_conversation(
existing_id,
conv,
&mut existing_messages,
&mut existing_replay_fingerprints,
"skipping replay-equivalent recovered message with shifted idx after duplicate conversation recovery",
);
let (inserted_last_idx, inserted_last_created_at) =
borrowed_messages_tail_state(&new_messages);
let inserted_message_ids =
franken_append_insert_new_messages(&tx, existing_id, &new_messages)?;
total_chars += new_chars;
for (msg_id, msg) in inserted_message_ids.into_iter().zip(new_messages) {
franken_insert_snippets(&tx, msg_id, &msg.snippets)?;
if !defer_lexical_updates {
fts_entries.push(FtsEntry::from_message(msg_id, msg, conv));
fts_count_total += 1;
fts_pending_chars =
fts_pending_chars.saturating_add(msg.content.len());
if fts_entries.len() >= FTS_ENTRY_BATCH_MAX_DOCS
|| fts_pending_chars >= FTS_ENTRY_BATCH_MAX_CHARS
{
flush_pending_fts_entries(
self,
&tx,
&mut fts_entries,
&mut fts_pending_chars,
&mut fts_inserted_total,
)?;
}
}
inserted_indices.push(msg.idx);
inserted_messages.push((msg_id, msg));
}
if idx_collision_count > 0 {
tracing::warn!(
conversation_id = existing_id,
collision_count = idx_collision_count,
first_idx = first_collision_idx,
source_path = %conv.source_path.display(),
"message idx collisions encountered after duplicate conversation recovery; retaining canonical message variants"
);
}
let conv_last_ts = conv.messages.iter().filter_map(|m| m.created_at).max();
franken_update_conversation_tail_state(
&tx,
existing_id,
conv_last_ts,
inserted_last_idx,
inserted_last_created_at,
)?;
if let Some(lookup_key) =
conversation_external_lookup_key_for_conv(agent_id, conv)
{
franken_update_external_conversation_tail_lookup_key(
&tx,
&lookup_key,
conv_last_ts,
inserted_last_idx,
inserted_last_created_at,
)?;
}
pending_message_fingerprints.insert(existing_id, existing_messages);
pending_message_replay_fingerprints
.insert(existing_id, existing_replay_fingerprints);
existing_id
}
}
};
// Analytics bookkeeping for the messages inserted for this conversation.
if !defer_analytics_updates {
let delta = StatsDelta {
session_count_delta,
message_count_delta: inserted_messages.len() as i64,
total_chars_delta: total_chars,
};
let effective_started_at = conversation_effective_started_at(conv);
// Day id falls back to 0 when the conversation has no effective start time.
let day_id = effective_started_at
.map(FrankenStorage::day_id_from_millis)
.unwrap_or(0);
stats.record_delta(
&conv.agent_slug,
&conv.source_id,
day_id,
delta.session_count_delta,
delta.message_count_delta,
delta.total_chars_delta,
);
let conv_day_id = day_id;
// Holds the most recent non-"unknown" model family seen among this
// conversation's inserted messages.
let mut session_model_family = String::from("unknown");
let mut has_any_tokens = false;
for &(message_id, msg) in &inserted_messages {
let role_s = role_str(&msg.role);
// When `historical_raw_json` yields a value for this message, token
// extraction is given JSON null in place of the message's extra JSON.
let usage = if historical_raw_json(&msg.extra_json).is_some() {
crate::connectors::extract_tokens_for_agent(
&conv.agent_slug,
&serde_json::Value::Null,
&msg.content,
&role_s,
)
} else {
crate::connectors::extract_tokens_for_agent(
&conv.agent_slug,
&msg.extra_json,
&msg.content,
&role_s,
)
};
// Message timestamp, falling back to the conversation start and then to 0.
let msg_ts = msg
.created_at
.or(conversation_effective_started_at(conv))
.unwrap_or(0);
// A non-positive timestamp reuses the conversation's day id.
let msg_day_id = if msg_ts > 0 {
FrankenStorage::day_id_from_millis(msg_ts)
} else {
conv_day_id
};
let model_info = usage
.model_name
.as_deref()
.map(crate::connectors::normalize_model);
let model_family = model_info
.as_ref()
.map(|i| i.family.clone())
.unwrap_or_else(|| "unknown".into());
let model_tier = model_info
.as_ref()
.map(|i| i.tier.clone())
.unwrap_or_else(|| "unknown".into());
// Provider preference: explicit usage value, then the normalized model's
// provider, then "unknown".
let provider = usage
.provider
.clone()
.or_else(|| model_info.as_ref().map(|i| i.provider.clone()))
.unwrap_or_else(|| "unknown".into());
if model_family != "unknown" {
session_model_family = model_family.clone();
}
let estimated_cost = pricing_table.compute_cost(
usage.model_name.as_deref(),
msg_day_id,
usage.input_tokens,
usage.output_tokens,
usage.cache_read_tokens,
usage.cache_creation_tokens,
);
// Only messages that carry token data but got no price count as unpriced.
if estimated_cost.is_some() {
pricing_diag.record_priced();
} else if usage.has_token_data() {
pricing_diag.record_unpriced(usage.model_name.as_deref());
}
token_stats.record(
&conv.agent_slug,
&conv.source_id,
msg_day_id,
&model_family,
&role_s,
&usage,
msg.content.len() as i64,
estimated_cost.unwrap_or(0.0),
);
if usage.has_token_data() {
has_any_tokens = true;
}
let content_chars = msg.content.len() as i64;
// Token estimate is the content length divided by four.
let content_tokens_est = content_chars / 4;
let msg_hour_id = FrankenStorage::hour_id_from_millis(msg_ts);
let has_plan = has_plan_for_role(&role_s, &msg.content);
token_entries.push(TokenUsageEntry {
message_id,
conversation_id: conv_id,
agent_id,
workspace_id,
source_id: conv.source_id.clone(),
timestamp_ms: msg_ts,
day_id: msg_day_id,
model_name: usage.model_name.clone(),
model_family: Some(model_family.clone()),
model_tier: Some(model_tier.clone()),
service_tier: usage.service_tier.clone(),
provider: Some(provider.clone()),
input_tokens: usage.input_tokens,
output_tokens: usage.output_tokens,
cache_read_tokens: usage.cache_read_tokens,
cache_creation_tokens: usage.cache_creation_tokens,
thinking_tokens: usage.thinking_tokens,
total_tokens: usage.total_tokens(),
estimated_cost_usd: estimated_cost,
role: role_s.to_string(),
content_chars,
has_tool_calls: usage.has_tool_calls,
tool_call_count: usage.tool_call_count,
data_source: usage.data_source.as_str().to_string(),
});
let mm = MessageMetricsEntry {
message_id,
created_at_ms: msg_ts,
hour_id: msg_hour_id,
day_id: msg_day_id,
agent_slug: conv.agent_slug.clone(),
workspace_id: workspace_id.unwrap_or(0),
source_id: conv.source_id.clone(),
role: role_s.to_string(),
content_chars,
content_tokens_est,
model_name: usage.model_name.clone(),
model_family: model_family.clone(),
model_tier: model_tier.clone(),
provider,
api_input_tokens: usage.input_tokens,
api_output_tokens: usage.output_tokens,
api_cache_read_tokens: usage.cache_read_tokens,
api_cache_creation_tokens: usage.cache_creation_tokens,
api_thinking_tokens: usage.thinking_tokens,
api_service_tier: usage.service_tier.clone(),
api_data_source: usage.data_source.as_str().to_string(),
tool_call_count: usage.tool_call_count as i64,
has_tool_calls: usage.has_tool_calls,
has_plan,
};
rollup_agg.record(&mm);
metrics_entries.push(mm);
}
// A session is recorded in the token stats only for newly created conversations.
if session_count_delta > 0 {
token_stats.record_session(
&conv.agent_slug,
&conv.source_id,
conv_day_id,
&session_model_family,
);
}
if has_any_tokens {
conv_ids_to_summarize.push(conv_id);
}
}
outcomes.push(InsertOutcome {
conversation_id: conv_id,
conversation_inserted: session_count_delta > 0,
inserted_indices,
});
}
// Flush whatever FTS entries are still buffered after the last conversation.
if !defer_lexical_updates {
flush_pending_fts_entries(
self,
&tx,
&mut fts_entries,
&mut fts_pending_chars,
&mut fts_inserted_total,
)?;
}
if !defer_lexical_updates && fts_count_total > 0 {
tracing::debug!(
target: "cass::perf::fts5",
total = fts_count_total,
inserted = fts_inserted_total,
conversations = conversations.len(),
"franken_batch_fts_insert_complete"
);
}
// Write the accumulated analytics inside the same transaction; each step is
// skipped when analytics are deferred or its accumulator is empty.
if !defer_analytics_updates && !stats.is_empty() {
let entries = stats.expand();
let affected = franken_update_daily_stats_batched_in_tx(&tx, &entries)?;
tracing::debug!(
target: "cass::perf::daily_stats",
raw = stats.raw_entry_count(),
expanded = entries.len(),
affected = affected,
"franken_batched_stats_update_complete"
);
}
if !defer_analytics_updates && !token_entries.is_empty() {
let token_count = token_entries.len();
let inserted = franken_insert_token_usage_batched_in_tx(&tx, &token_entries)?;
tracing::debug!(
target: "cass::perf::token_usage",
total = token_count,
inserted = inserted,
"franken_batch_token_usage_insert_complete"
);
}
if !defer_analytics_updates && !token_stats.is_empty() {
let entries = token_stats.expand();
let affected = franken_update_token_daily_stats_batched_in_tx(&tx, &entries)?;
tracing::debug!(
target: "cass::perf::token_daily_stats",
raw = token_stats.raw_entry_count(),
expanded = entries.len(),
affected = affected,
"franken_batched_token_stats_update_complete"
);
}
if !defer_analytics_updates && !metrics_entries.is_empty() {
let mm_count = metrics_entries.len();
let inserted = franken_insert_message_metrics_batched_in_tx(&tx, &metrics_entries)?;
tracing::debug!(
target: "cass::perf::message_metrics",
total = mm_count,
inserted = inserted,
"franken_batch_message_metrics_insert_complete"
);
}
if !defer_analytics_updates && !rollup_agg.is_empty() {
let (hourly, daily, models_daily) =
franken_flush_analytics_rollups_in_tx(&tx, &rollup_agg)?;
tracing::debug!(
target: "cass::perf::usage_rollups",
hourly_buckets = rollup_agg.hourly_entry_count(),
daily_buckets = rollup_agg.daily_entry_count(),
models_daily_buckets = rollup_agg.models_daily_entry_count(),
hourly_affected = hourly,
daily_affected = daily,
models_daily_affected = models_daily,
"franken_batched_usage_rollups_complete"
);
}
if !defer_analytics_updates {
for conv_id in &conv_ids_to_summarize {
franken_update_conversation_token_summaries_in_tx(&tx, *conv_id)?;
}
}
tx.commit()?;
// The pricing summary is logged only after a successful commit.
pricing_diag.log_summary();
Ok(outcomes)
}
}
/// Normalizes raw source fields into `(source_id, kind, host_label)`.
///
/// The host label and source id are normalized by the shared index helpers.
/// A normalized id equal to `LOCAL_SOURCE_ID` yields `SourceKind::Local` with
/// no host label; any other id yields `SourceKind::Ssh` with the normalized
/// host label.
fn normalized_storage_source_parts(
source_id: Option<&str>,
origin_kind: Option<&str>,
origin_host: Option<&str>,
) -> (String, SourceKind, Option<String>) {
let normalized_host = crate::search::tantivy::normalized_index_origin_host(origin_host);
let normalized_id = crate::search::tantivy::normalized_index_source_id(
source_id,
origin_kind,
normalized_host.as_deref(),
);
if normalized_id == LOCAL_SOURCE_ID {
return (normalized_id, SourceKind::Local, None);
}
(normalized_id, SourceKind::Ssh, normalized_host)
}
/// Builds the `Source` record implied by a conversation's `source_id` and
/// `origin_host`, after normalization.
///
/// Only `id`, `kind` and `host_label` are populated; every other field is `None`.
fn normalized_source_for_conversation(conv: &Conversation) -> Source {
let raw_source_id = conv.source_id.as_str();
let raw_origin_host = conv.origin_host.as_deref();
let (id, kind, host_label) =
normalized_storage_source_parts(Some(raw_source_id), None, raw_origin_host);
Source {
id,
kind,
host_label,
machine_id: None,
platform: None,
config_json: None,
created_at: None,
updated_at: None,
}
}
/// Returns `true` when `source` is the bare local source: id equal to
/// `LOCAL_SOURCE_ID`, kind `Local`, and none of the optional fields set.
fn is_bootstrap_local_source(source: &Source) -> bool {
if source.id != LOCAL_SOURCE_ID || !matches!(source.kind, SourceKind::Local) {
return false;
}
let carries_metadata = source.host_label.is_some()
|| source.machine_id.is_some()
|| source.platform.is_some()
|| source.config_json.is_some()
|| source.created_at.is_some()
|| source.updated_at.is_some();
!carries_metadata
}
fn normalized_conversation_for_storage<'a>(conv: &'a Conversation) -> Cow<'a, Conversation> {
let normalized_source = normalized_source_for_conversation(conv);
if normalized_source.id == conv.source_id && normalized_source.host_label == conv.origin_host {
Cow::Borrowed(conv)
} else {
let mut normalized = conv.clone();
normalized.source_id = normalized_source.id;
normalized.origin_host = normalized_source.host_label;
Cow::Owned(normalized)
}
}
impl FrankenStorage {
/// Ensures the source row implied by `conv` exists.
///
/// Nothing is written for the bare local source, nor for a source already
/// marked as ensured on this storage handle. Otherwise the source is
/// upserted and then marked as ensured.
fn ensure_source_for_conversation(&self, conv: &Conversation) -> Result<()> {
let source = normalized_source_for_conversation(conv);
if !is_bootstrap_local_source(&source) {
let cache_key = EnsuredConversationSourceKey::from_source(&source);
if !self.conversation_source_already_ensured(&cache_key) {
self.upsert_source(&source)?;
self.mark_conversation_source_ensured(cache_key);
}
}
Ok(())
}
/// Ensures source rows for every distinct normalized source id in the batch.
///
/// Each source id is handled once per call. The bare local source is
/// skipped; every other source is upserted and then marked as ensured.
fn ensure_sources_for_batch(
&self,
conversations: &[(i64, Option<i64>, &Conversation)],
) -> Result<()> {
let mut visited_ids = HashSet::with_capacity(conversations.len());
for &(_, _, conv) in conversations {
let source = normalized_source_for_conversation(conv);
let first_sighting = visited_ids.insert(source.id.clone());
if !first_sighting || is_bootstrap_local_source(&source) {
continue;
}
self.upsert_source(&source)?;
let cache_key = EnsuredConversationSourceKey::from_source(&source);
self.mark_conversation_source_ensured(cache_key);
}
Ok(())
}
}
/// Returns the transaction's last insert rowid, requiring it to be positive.
///
/// # Errors
/// Fails when the rowid cannot be read or is not greater than zero.
fn franken_last_rowid(tx: &FrankenTransaction<'_>) -> Result<i64> {
let positive_rowid = tx.last_insert_rowid().ok().filter(|rowid| *rowid > 0);
positive_rowid.context("last_insert_rowid() returned NULL or 0 after INSERT")
}
/// Makes sure an `agents` row exists for every distinct agent id in the batch.
///
/// Each agent id is checked once. A missing row is created with
/// `INSERT OR IGNORE`, using the conversation's agent slug as both slug and
/// name, the kind `'cli'`, and one timestamp taken at the start of the call.
fn ensure_agents_in_tx(
tx: &FrankenTransaction<'_>,
conversations: &[(i64, Option<i64>, &Conversation)],
) -> Result<()> {
let timestamp_ms = FrankenStorage::now_millis();
let mut checked_agent_ids = HashSet::new();
for &(agent_id, _, conv) in conversations {
let first_sighting = checked_agent_ids.insert(agent_id);
if !first_sighting {
continue;
}
let matching_rows: i64 = tx.query_row_map(
"SELECT COUNT(*) FROM agents WHERE id = ?1",
fparams![agent_id],
|row| row.get_typed(0),
)?;
if matching_rows != 0 {
continue;
}
tracing::debug!(
target: "cass::fk_guard",
agent_id,
slug = %conv.agent_slug,
"inserting agent row inside transaction to satisfy FK constraint"
);
let slug = conv.agent_slug.as_str();
tx.execute_compat(
"INSERT OR IGNORE INTO agents(id, slug, name, kind, created_at, updated_at)
VALUES(?1, ?2, ?3, 'cli', ?4, ?5)",
fparams![agent_id, slug, slug, timestamp_ms, timestamp_ms],
)?;
}
Ok(())
}
/// Makes sure a `workspaces` row exists for every distinct workspace id in
/// the batch.
///
/// Entries without a workspace id are ignored and each id is checked once.
/// A missing row is created with `INSERT OR IGNORE`, using the conversation's
/// workspace path (lossily converted to a string), or an empty path when the
/// conversation has none.
fn ensure_workspaces_in_tx(
tx: &FrankenTransaction<'_>,
conversations: &[(i64, Option<i64>, &Conversation)],
) -> Result<()> {
let mut checked_workspace_ids = HashSet::new();
for &(_, workspace_id, conv) in conversations {
let Some(ws_id) = workspace_id else {
continue;
};
let first_sighting = checked_workspace_ids.insert(ws_id);
if !first_sighting {
continue;
}
let matching_rows: i64 = tx.query_row_map(
"SELECT COUNT(*) FROM workspaces WHERE id = ?1",
fparams![ws_id],
|row| row.get_typed(0),
)?;
if matching_rows != 0 {
continue;
}
let path_str = match conv.workspace.as_ref() {
Some(path) => path.to_string_lossy().to_string(),
None => String::default(),
};
tracing::debug!(
target: "cass::fk_guard",
workspace_id = ws_id,
path = %path_str,
"inserting workspace row inside transaction to satisfy FK constraint"
);
tx.execute_compat(
"INSERT OR IGNORE INTO workspaces(id, path) VALUES(?1, ?2)",
fparams![ws_id, path_str.as_str()],
)?;
}
Ok(())
}
/// Makes sure a `sources` row exists for every distinct normalized source id
/// in the batch.
///
/// Each normalized id is checked once. A missing row is created with
/// `INSERT OR IGNORE`, carrying the normalized kind and host label and a
/// timestamp taken at insert time.
fn ensure_sources_in_tx(
tx: &FrankenTransaction<'_>,
conversations: &[(i64, Option<i64>, &Conversation)],
) -> Result<()> {
let mut checked_source_ids = HashSet::new();
for &(_, _, conv) in conversations {
let (source_id, source_kind, host_label) = normalized_storage_source_parts(
Some(conv.source_id.as_str()),
None,
conv.origin_host.as_deref(),
);
let first_sighting = checked_source_ids.insert(source_id.clone());
if !first_sighting {
continue;
}
let matching_rows: i64 = tx.query_row_map(
"SELECT COUNT(*) FROM sources WHERE id = ?1",
fparams![source_id.as_str()],
|row| row.get_typed(0),
)?;
if matching_rows != 0 {
continue;
}
let kind_str = source_kind.to_string();
let timestamp_ms = FrankenStorage::now_millis();
tracing::debug!(
target: "cass::fk_guard",
source_id = %source_id,
kind = kind_str.as_str(),
"inserting source row inside transaction to satisfy FK constraint"
);
tx.execute_compat(
"INSERT OR IGNORE INTO sources(id, kind, host_label, created_at, updated_at)
VALUES(?1, ?2, ?3, ?4, ?5)",
fparams![
source_id.as_str(),
kind_str.as_str(),
host_label.as_deref(),
timestamp_ms,
timestamp_ms
],
)?;
}
Ok(())
}
/// Reports whether the flag `name` is switched on.
///
/// The value is read through `dotenvy::var`. A variable that cannot be read
/// counts as disabled; a readable value counts as enabled unless it is exactly
/// `"0"` or equals `"false"` ignoring ASCII case.
fn env_flag_enabled(name: &str) -> bool {
    match dotenvy::var(name) {
        Ok(value) => value != "0" && !value.eq_ignore_ascii_case("false"),
        Err(_) => false,
    }
}
/// Returns whether the `CASS_DEFER_LEXICAL_UPDATES` flag is enabled, as
/// decided by `env_flag_enabled`.
fn defer_storage_lexical_updates_enabled() -> bool {
env_flag_enabled("CASS_DEFER_LEXICAL_UPDATES")
}
/// Returns whether the `CASS_DEFER_ANALYTICS_UPDATES` flag is enabled, as
/// decided by `env_flag_enabled`.
fn defer_analytics_updates_enabled() -> bool {
env_flag_enabled("CASS_DEFER_ANALYTICS_UPDATES")
}
/// Outcome of trying to store a conversation: either a new row was written or
/// an already stored conversation was matched and reused.
enum ConversationInsertStatus {
/// A new conversation row was inserted; carries its row id.
Inserted(i64),
/// A matching conversation already existed; carries the existing row id.
Existing(i64),
}
/// Fetches the cached tail row for an externally identified conversation.
///
/// Reads `conversation_external_tail_lookup` by `lookup_key` and returns
/// `Ok(None)` when no row exists. Otherwise the conversation id is returned
/// together with the tail state rebuilt from the cached columns by
/// `existing_conversation_tail_state_from_cached`.
///
/// # Errors
/// Propagates query failures and column decoding failures.
fn franken_find_external_conversation_tail_lookup(
    tx: &FrankenTransaction<'_>,
    lookup_key: &str,
) -> Result<Option<ExistingConversationWithTail>> {
    let params = [SqliteValue::from(lookup_key)];
    let row = tx
        .query_row_with_params(
            "SELECT conversation_id, ended_at, last_message_idx, last_message_created_at
FROM conversation_external_tail_lookup
WHERE lookup_key = ?1",
            &params,
        )
        .optional()?;
    let Some(row) = row else {
        return Ok(None);
    };
    // Column positions mirror the SELECT list above.
    let id = row.get_typed(0)?;
    let ended_at = row.get_typed(1)?;
    let last_message_idx = row.get_typed(2)?;
    let last_message_created_at = row.get_typed(3)?;
    Ok(Some(ExistingConversationWithTail {
        id,
        tail_state: existing_conversation_tail_state_from_cached(
            last_message_idx,
            last_message_created_at,
            ended_at,
        ),
    }))
}
/// Resolves `lookup_key` to a conversation id through the external tail
/// lookup table, discarding the cached tail state.
///
/// Returns `Ok(None)` when the key has no cached row.
fn franken_find_external_conversation_lookup(
    tx: &FrankenTransaction<'_>,
    lookup_key: &str,
) -> Result<Option<i64>> {
    let cached = franken_find_external_conversation_tail_lookup(tx, lookup_key)?;
    Ok(cached.map(|entry| entry.id))
}
/// Writes (or overwrites) the cached tail row stored under `lookup_key`.
///
/// Uses `INSERT OR REPLACE`, so an existing row for the same key is replaced
/// wholesale with the supplied conversation id and tail columns.
///
/// # Errors
/// Propagates any failure of the insert statement.
fn franken_insert_external_conversation_tail_lookup_key(
    tx: &FrankenTransaction<'_>,
    lookup_key: &str,
    conversation_id: i64,
    ended_at: Option<i64>,
    last_message_idx: Option<i64>,
    last_message_created_at: Option<i64>,
) -> Result<()> {
    // Parameter order matches the column list in the statement below.
    let params = [
        SqliteValue::from(lookup_key),
        SqliteValue::from(conversation_id),
        SqliteValue::from(ended_at),
        SqliteValue::from(last_message_idx),
        SqliteValue::from(last_message_created_at),
    ];
    tx.execute_with_params(
        "INSERT OR REPLACE INTO conversation_external_tail_lookup(
lookup_key, conversation_id, ended_at, last_message_idx, last_message_created_at
) VALUES(?1, ?2, ?3, ?4, ?5)",
        &params,
    )?;
    Ok(())
}
/// Caches `existing` under the external lookup key derived from
/// `(source_id, agent_id, external_id)`.
///
/// When `existing` has no tail state, all three tail values are passed on as
/// `None`.
fn franken_insert_external_conversation_tail_lookup(
    tx: &FrankenTransaction<'_>,
    source_id: &str,
    agent_id: i64,
    external_id: &str,
    existing: ExistingConversationWithTail,
) -> Result<()> {
    let lookup_key = conversation_external_lookup_key(source_id, agent_id, external_id);
    // Unpack the optional tail state into the three nullable columns at once.
    let (ended_at, last_message_idx, last_message_created_at) = match existing.tail_state {
        Some(state) => (
            state.ended_at,
            Some(state.last_message_idx),
            Some(state.last_message_created_at),
        ),
        None => (None, None, None),
    };
    franken_insert_external_conversation_tail_lookup_key(
        tx,
        &lookup_key,
        existing.id,
        ended_at,
        last_message_idx,
        last_message_created_at,
    )
}
/// Advances the cached tail row for `lookup_key` using the given candidates.
///
/// Each candidate is optional: a `None` candidate leaves its column untouched,
/// and a present candidate only replaces the stored value when it is larger
/// (or, for the index and created-at columns, when the stored value is NULL).
/// When all three candidates are `None` no statement is executed.
///
/// # Errors
/// Propagates any failure of the update statement.
fn franken_update_external_conversation_tail_lookup_key(
    tx: &FrankenTransaction<'_>,
    lookup_key: &str,
    ended_at_candidate: Option<i64>,
    last_message_idx_candidate: Option<i64>,
    last_message_created_at_candidate: Option<i64>,
) -> Result<()> {
    let has_candidate = ended_at_candidate.is_some()
        || last_message_idx_candidate.is_some()
        || last_message_created_at_candidate.is_some();
    if !has_candidate {
        // Nothing could change, so skip the round trip entirely.
        return Ok(());
    }
    tx.execute_compat(
        "UPDATE conversation_external_tail_lookup
SET ended_at = CASE
WHEN ?1 IS NULL THEN ended_at
ELSE MAX(IFNULL(ended_at, 0), ?1)
END,
last_message_idx = CASE
WHEN ?2 IS NULL THEN last_message_idx
WHEN last_message_idx IS NULL OR last_message_idx < ?2 THEN ?2
ELSE last_message_idx
END,
last_message_created_at = CASE
WHEN ?3 IS NULL THEN last_message_created_at
WHEN last_message_created_at IS NULL OR last_message_created_at < ?3 THEN ?3
ELSE last_message_created_at
END
WHERE lookup_key = ?4",
        fparams![
            ended_at_candidate,
            last_message_idx_candidate,
            last_message_created_at_candidate,
            lookup_key
        ],
    )?;
    Ok(())
}
/// Unconditionally overwrites the cached tail columns stored under
/// `lookup_key` with the supplied values.
///
/// Unlike `franken_update_external_conversation_tail_lookup_key`, the
/// statement performs no comparison against the stored values.
///
/// # Errors
/// Propagates any failure of the update statement.
fn franken_set_external_conversation_tail_lookup_after_append(
    tx: &FrankenTransaction<'_>,
    lookup_key: &str,
    ended_at: i64,
    last_message_idx: i64,
    last_message_created_at: i64,
) -> Result<()> {
    const OVERWRITE_TAIL_SQL: &str = "UPDATE conversation_external_tail_lookup
SET ended_at = ?1,
last_message_idx = ?2,
last_message_created_at = ?3
WHERE lookup_key = ?4";
    tx.execute_compat(
        OVERWRITE_TAIL_SQL,
        fparams![
            ended_at,
            last_message_idx,
            last_message_created_at,
            lookup_key
        ],
    )?;
    Ok(())
}
/// Refreshes the cached external tail row after messages were appended to
/// `conv`.
///
/// Does nothing when `conversation_external_lookup_key_for_conv` yields no key
/// for the conversation. With `exact_append_set` and both inserted tail values
/// present, the cached row is overwritten directly, using the last inserted
/// created-at value as `ended_at` too. Otherwise the row is advanced
/// monotonically; the `ended_at` candidate is the last inserted created-at
/// when `used_append_tail_plan` is set, else the largest `created_at` among
/// the conversation's messages.
fn franken_update_external_conversation_tail_after_append(
    tx: &FrankenTransaction<'_>,
    agent_id: i64,
    conv: &Conversation,
    used_append_tail_plan: bool,
    exact_append_set: bool,
    inserted_last_idx: Option<i64>,
    inserted_last_created_at: Option<i64>,
) -> Result<()> {
    let Some(lookup_key) = conversation_external_lookup_key_for_conv(agent_id, conv) else {
        return Ok(());
    };
    // Fast path: the caller knows the exact new tail, so write it verbatim.
    if let (true, Some(last_message_idx), Some(last_message_created_at)) = (
        exact_append_set,
        inserted_last_idx,
        inserted_last_created_at,
    ) {
        return franken_set_external_conversation_tail_lookup_after_append(
            tx,
            &lookup_key,
            last_message_created_at,
            last_message_idx,
            last_message_created_at,
        );
    }
    let newest_message_created_at = || conv.messages.iter().filter_map(|m| m.created_at).max();
    let ended_at_candidate = if used_append_tail_plan {
        inserted_last_created_at
    } else {
        newest_message_created_at()
    };
    franken_update_external_conversation_tail_lookup_key(
        tx,
        &lookup_key,
        ended_at_candidate,
        inserted_last_idx,
        inserted_last_created_at,
    )
}
/// Looks up an already stored conversation for `key`.
///
/// Delegates to `franken_find_existing_conversation_by_key_impl` with the
/// legacy external scan disabled, so external keys are resolved through the
/// lookup table only.
fn franken_find_existing_conversation_by_key(
tx: &FrankenTransaction<'_>,
key: &PendingConversationKey,
conv: Option<&Conversation>,
) -> Result<Option<i64>> {
franken_find_existing_conversation_by_key_impl(tx, key, conv, false)
}
/// Looks up an already stored conversation for `key` with the legacy external
/// scan enabled.
///
/// Delegates to `franken_find_existing_conversation_by_key_impl`; for external
/// keys a lookup-table miss falls back to querying `conversations` directly.
fn franken_find_existing_conversation_by_key_after_conflict(
tx: &FrankenTransaction<'_>,
key: &PendingConversationKey,
conv: Option<&Conversation>,
) -> Result<Option<i64>> {
franken_find_existing_conversation_by_key_impl(tx, key, conv, true)
}
/// Finds the id of an already stored conversation that matches `key`.
///
/// * `External` keys are resolved through the external tail lookup table. On a
///   miss, and only when `allow_legacy_external_scan` is set, `conversations`
///   is queried directly and a hit is written back into the lookup table.
/// * `SourcePath` keys first try an exact match on source, agent, path and
///   effective start time. If that fails and `conv` is provided, every stored
///   conversation with the same source, agent and path is compared against the
///   incoming messages and the candidate with the best merge evidence wins.
///
/// Returns `Ok(None)` when nothing matches.
///
/// # Errors
/// Propagates query failures and failures of the helper lookups.
fn franken_find_existing_conversation_by_key_impl(
tx: &FrankenTransaction<'_>,
key: &PendingConversationKey,
conv: Option<&Conversation>,
allow_legacy_external_scan: bool,
) -> Result<Option<i64>> {
match key {
PendingConversationKey::External {
source_id,
agent_id,
external_id,
} => {
// Preferred path: the dedicated lookup table.
let lookup_key = conversation_external_lookup_key(source_id, *agent_id, external_id);
if let Some(existing_id) = franken_find_external_conversation_lookup(tx, &lookup_key)? {
return Ok(Some(existing_id));
}
if !allow_legacy_external_scan {
return Ok(None);
}
// Fallback: query the conversations table itself.
let existing_id = tx
.query_row_map(
"SELECT id
FROM conversations
WHERE source_id = ?1 AND agent_id = ?2 AND external_id = ?3",
fparams![source_id.as_str(), *agent_id, external_id.as_str()],
|row| row.get_typed(0),
)
.optional()?;
if let Some(existing_id) = existing_id {
// Backfill the lookup table so the next probe hits the preferred path.
let tail_state = franken_existing_conversation_append_tail_state(tx, existing_id)?;
franken_insert_external_conversation_tail_lookup_key(
tx,
&lookup_key,
existing_id,
tail_state.and_then(|state| state.ended_at),
tail_state.map(|state| state.last_message_idx),
tail_state.map(|state| state.last_message_created_at),
)?;
Ok(Some(existing_id))
} else {
Ok(None)
}
}
PendingConversationKey::SourcePath {
source_id,
agent_id,
source_path,
started_at,
} => {
// Exact match: same source/agent/path and the same effective start, where
// the effective start is `started_at` or, failing that, the earliest
// non-NULL message timestamp. Two NULL starts also count as equal.
// The lowest id wins when several rows qualify.
let exact_match = tx
.query_row_map(
"SELECT c.id
FROM conversations c
WHERE c.source_id = ?1
AND c.agent_id = ?2
AND c.source_path = ?3
AND ((
COALESCE(
c.started_at,
(SELECT MIN(created_at)
FROM messages
WHERE conversation_id = c.id
AND created_at IS NOT NULL)
) IS NULL
AND ?4 IS NULL
) OR COALESCE(
c.started_at,
(SELECT MIN(created_at)
FROM messages
WHERE conversation_id = c.id
AND created_at IS NOT NULL)
) = ?4)
ORDER BY c.id
LIMIT 1",
fparams![
source_id.as_str(),
*agent_id,
source_path.as_str(),
*started_at
],
|row| row.get_typed(0),
)
.optional()?;
if exact_match.is_some() {
return Ok(exact_match);
}
// Content-based matching needs the incoming conversation and at least one
// fingerprint to compare.
let Some(conv) = conv else {
return Ok(None);
};
let incoming_fingerprints = conversation_message_fingerprints(conv);
if incoming_fingerprints.is_empty() {
return Ok(None);
}
let incoming_replay_fingerprints = conversation_message_replay_fingerprints(conv);
// Every stored conversation sharing source/agent/path, in id order, paired
// with its effective start time.
let candidates: Vec<(i64, Option<i64>)> = tx.query_map_collect(
"SELECT
c.id,
COALESCE(
c.started_at,
(SELECT MIN(created_at)
FROM messages
WHERE conversation_id = c.id
AND created_at IS NOT NULL)
) AS effective_started_at
FROM conversations c
WHERE c.source_id = ?1
AND c.agent_id = ?2
AND c.source_path = ?3
ORDER BY c.id",
fparams![source_id.as_str(), *agent_id, source_path.as_str()],
|row| Ok((row.get_typed(0)?, row.get_typed(1)?)),
)?;
let mut best_candidate: Option<(i64, ConversationMergeEvidence)> = None;
for (candidate_id, candidate_started_at) in candidates {
let existing_fingerprints =
franken_existing_message_fingerprints(tx, candidate_id)?;
let existing_replay_fingerprints =
replay_fingerprints_from_merge_set(&existing_fingerprints);
// Candidates for which no evidence is produced are skipped.
let Some(evidence) = conversation_merge_evidence(
&incoming_fingerprints,
&incoming_replay_fingerprints,
&existing_fingerprints,
&existing_replay_fingerprints,
*started_at,
candidate_started_at,
) else {
continue;
};
// Ranking tuple, compared lexicographically; `Reverse` makes a smaller
// start distance rank higher.
let candidate_key = (
evidence.exact_overlap,
evidence.replay_overlap,
evidence.started_close,
evidence.smaller_replay_set,
std::cmp::Reverse(evidence.start_distance_ms),
);
// Strict `>` keeps the earlier (lower id) candidate on ties.
let should_replace = best_candidate
.as_ref()
.map(|(_, best_evidence)| {
candidate_key
> (
best_evidence.exact_overlap,
best_evidence.replay_overlap,
best_evidence.started_close,
best_evidence.smaller_replay_set,
std::cmp::Reverse(best_evidence.start_distance_ms),
)
})
.unwrap_or(true);
if should_replace {
best_candidate = Some((candidate_id, evidence));
}
}
Ok(best_candidate.map(|(candidate_id, _)| candidate_id))
}
}
}
/// Stores `conv`, reusing an already stored conversation when one matches.
///
/// The merge key from `conversation_merge_key` is looked up first; a hit is
/// reported as `Existing`. On a miss the insert path in
/// `franken_insert_conversation_or_get_existing_after_miss` takes over.
fn franken_insert_conversation_or_get_existing(
    tx: &FrankenTransaction<'_>,
    agent_id: i64,
    workspace_id: Option<i64>,
    conv: &Conversation,
) -> Result<ConversationInsertStatus> {
    let conversation_key = conversation_merge_key(agent_id, conv);
    match franken_find_existing_conversation_by_key(tx, &conversation_key, Some(conv))? {
        Some(existing_id) => Ok(ConversationInsertStatus::Existing(existing_id)),
        None => franken_insert_conversation_or_get_existing_after_miss(
            tx,
            agent_id,
            workspace_id,
            conv,
            &conversation_key,
        ),
    }
}
fn franken_insert_conversation_or_get_existing_after_miss(
tx: &FrankenTransaction<'_>,
agent_id: i64,
workspace_id: Option<i64>,
conv: &Conversation,
conversation_key: &PendingConversationKey,
) -> Result<ConversationInsertStatus> {
match franken_insert_conversation(tx, agent_id, workspace_id, conv) {
Ok(Some(conv_id)) => Ok(ConversationInsertStatus::Inserted(conv_id)),
Ok(None) => {
let existing_id =
franken_find_existing_conversation_by_key_after_conflict(
tx,
conversation_key,
Some(conv),
)?
.with_context(|| {
format!(
"conversation INSERT produced a duplicate conflict but existing row was not found for source_id={} agent_id={} external_id={:?} source_path={}",
conv.source_id,
agent_id,
conv.external_id,
conv.source_path.display()
)
})?;
tracing::warn!(
source_id = %conv.source_id,
agent_id,
external_id = ?conv.external_id,
existing_id,
source_path = %conv.source_path.display(),
"conversation INSERT: duplicate gracefully recovered, reusing existing row"
);
Ok(ConversationInsertStatus::Existing(existing_id))
}
Err(error) => {
tracing::error!(
source_id = %conv.source_id,
agent_id,
external_id = ?conv.external_id,
error = %error,
source_path = %conv.source_path.display(),
"franken_insert_conversation failed"
);
Err(error)
}
}
}
/// Inserts a new row into `conversations` and seeds its tail bookkeeping.
///
/// Returns `Ok(Some(id))` for a fresh row. After the insert the tail state is
/// recorded via `franken_insert_conversation_tail_state`, and, when the
/// conversation has an external id, the external tail lookup row is written.
/// A unique-constraint violation is logged at debug level and reported as
/// `Ok(None)` so the caller can fall back to the existing row.
///
/// # Errors
/// Propagates payload encoding failures, any other database error from the
/// insert, and failures of the follow-up bookkeeping writes.
fn franken_insert_conversation(
    tx: &FrankenTransaction<'_>,
    agent_id: i64,
    workspace_id: Option<i64>,
    conv: &Conversation,
) -> Result<Option<i64>> {
    let (metadata_json_str, metadata_bin) = franken_metadata_insert_payload(&conv.metadata_json)?;
    let (last_message_idx, last_message_created_at) = conversation_tail_state(conv);
    let insert_result = tx.execute_compat(
        "INSERT INTO conversations(
agent_id, workspace_id, source_id, external_id, title, source_path,
started_at, ended_at, approx_tokens, metadata_json, origin_host, metadata_bin,
last_message_idx, last_message_created_at
) VALUES(?1,?2,?3,?4,?5,?6,?7,?8,?9,?10,?11,?12,?13,?14)",
        fparams![
            agent_id,
            workspace_id,
            conv.source_id.as_str(),
            conv.external_id.as_deref(),
            conv.title.as_deref(),
            path_to_string(&conv.source_path),
            conv.started_at,
            conv.ended_at,
            conv.approx_tokens,
            metadata_json_str.as_deref(),
            conv.origin_host.as_deref(),
            metadata_bin.as_deref(),
            last_message_idx,
            last_message_created_at
        ],
    );
    // Deal with the failure cases first; the success path continues below.
    match insert_result {
        Ok(_) => {}
        Err(frankensqlite::FrankenError::UniqueViolation { .. }) => {
            tracing::debug!(
                source_id = %conv.source_id,
                agent_id,
                external_id = ?conv.external_id,
                source_path = %conv.source_path.display(),
                "conversation INSERT: duplicate provenance conflict"
            );
            return Ok(None);
        }
        Err(error) => return Err(error.into()),
    }
    let conv_id = franken_last_rowid(tx)?;
    franken_insert_conversation_tail_state(
        tx,
        conv_id,
        conv.ended_at,
        last_message_idx,
        last_message_created_at,
    )?;
    if let Some(external_id) = conv.external_id.as_deref() {
        let cached_tail = ExistingConversationWithTail {
            id: conv_id,
            tail_state: existing_conversation_tail_state_from_cached(
                last_message_idx,
                last_message_created_at,
                conv.ended_at,
            ),
        };
        franken_insert_external_conversation_tail_lookup(
            tx,
            conv.source_id.as_str(),
            agent_id,
            external_id,
            cached_tail,
        )?;
    }
    Ok(Some(conv_id))
}
/// Pair of `(metadata_json, metadata_bin)` values bound when inserting a
/// conversation; at most one of the two is `Some`.
type MetadataInsertPayload<'a> = (Option<Cow<'a, str>>, Option<Vec<u8>>);

/// Chooses how conversation metadata is stored.
///
/// In priority order:
/// 1. raw text returned by `historical_raw_json` is stored as JSON text as-is;
/// 2. a JSON `null` is stored as the text `"null"`;
/// 3. an empty object stores nothing in either column;
/// 4. otherwise the MessagePack encoding is used when
///    `serialize_json_to_msgpack` produces one;
/// 5. failing that, the value is serialized to JSON text.
///
/// # Errors
/// Propagates a `serde_json::to_string` failure from the last step.
fn franken_metadata_insert_payload(value: &serde_json::Value) -> Result<MetadataInsertPayload<'_>> {
    if let Some(raw) = historical_raw_json(value) {
        return Ok((Some(Cow::Borrowed(raw)), None));
    }
    if value.is_null() {
        return Ok((Some(Cow::Borrowed("null")), None));
    }
    if matches!(value.as_object(), Some(object) if object.is_empty()) {
        return Ok((None, None));
    }
    let payload = match serialize_json_to_msgpack(value) {
        Some(metadata_bin) => (None, Some(metadata_bin)),
        None => (Some(Cow::Owned(serde_json::to_string(value)?)), None),
    };
    Ok(payload)
}
/// Inserts a single message row for `conversation_id` and returns the row id
/// reported by `franken_last_rowid`.
///
/// # Errors
/// Propagates payload encoding failures and database errors.
fn franken_insert_new_message(
    tx: &FrankenTransaction<'_>,
    conversation_id: i64,
    msg: &Message,
) -> Result<i64> {
    const INSERT_MESSAGE_SQL: &str = "INSERT INTO messages(conversation_id, idx, role, author, created_at, content, extra_json, extra_bin)
VALUES(?1,?2,?3,?4,?5,?6,?7,?8)";
    let (extra_json_str, extra_bin) = franken_message_insert_payload(msg)?;
    tx.execute_compat(
        INSERT_MESSAGE_SQL,
        fparams![
            conversation_id,
            msg.idx,
            role_as_str(&msg.role),
            msg.author.as_deref(),
            msg.created_at,
            msg.content.as_str(),
            extra_json_str.as_deref(),
            extra_bin.as_deref()
        ],
    )?;
    franken_last_rowid(tx)
}
/// Pair of `(extra_json, extra_bin)` values bound when inserting a message; at
/// most one of the two is `Some`.
type MessageInsertPayload<'a> = (Option<Cow<'a, str>>, Option<Vec<u8>>);

/// Chooses how a message's `extra_json` is stored.
///
/// Raw text returned by `historical_raw_json` is stored as JSON text as-is. A
/// JSON `null` stores nothing. Any other value is stored as MessagePack when
/// `serialize_json_to_msgpack` produces an encoding, and as serialized JSON
/// text otherwise.
///
/// # Errors
/// Propagates a `serde_json::to_string` failure from the JSON-text fallback.
fn franken_message_insert_payload(msg: &Message) -> Result<MessageInsertPayload<'_>> {
    let extra = &msg.extra_json;
    if let Some(raw) = historical_raw_json(extra) {
        return Ok((Some(Cow::Borrowed(raw)), None));
    }
    if extra.is_null() {
        return Ok((None, None));
    }
    match serialize_json_to_msgpack(extra) {
        Some(extra_bin) => Ok((None, Some(extra_bin))),
        None => {
            let json_text = serde_json::to_string(extra)?;
            Ok((Some(Cow::Owned(json_text)), None))
        }
    }
}
/// Maximum number of rows per multi-row INSERT for regular message batches.
const MESSAGE_INSERT_BATCH_SIZE: usize = 100;
/// Maximum number of rows per multi-row INSERT on the append path.
const APPEND_MESSAGE_INSERT_BATCH_SIZE: usize = 50;

/// Returns the cached multi-row `INSERT INTO messages` statement for
/// `row_count` rows.
///
/// Statements for every row count from 1 up to the larger of the two batch
/// size constants are built once, on first use. Row `k` (zero-based) uses the
/// numbered placeholders `?(8k+1)` through `?(8k+8)`. Index 0 holds an empty
/// string.
///
/// # Panics
/// Panics when `row_count` exceeds the largest cached batch size.
fn message_insert_batch_sql(row_count: usize) -> &'static str {
    const COLUMNS_PER_ROW: usize = 8;
    static SQL_BY_ROW_COUNT: std::sync::OnceLock<Vec<String>> = std::sync::OnceLock::new();

    let table = SQL_BY_ROW_COUNT.get_or_init(|| {
        let largest_batch = MESSAGE_INSERT_BATCH_SIZE.max(APPEND_MESSAGE_INSERT_BATCH_SIZE);
        let mut statements = Vec::with_capacity(largest_batch + 1);
        // Slot 0 is a placeholder so that the vector index equals the row count.
        statements.push(String::new());
        statements.extend((1..=largest_batch).map(|rows| {
            let tuples: Vec<String> = (0..rows)
                .map(|row| {
                    let first = row * COLUMNS_PER_ROW + 1;
                    let slots: Vec<String> = (first..first + COLUMNS_PER_ROW)
                        .map(|slot| format!("?{slot}"))
                        .collect();
                    format!("({})", slots.join(","))
                })
                .collect();
            let placeholders = tuples.join(",");
            format!(
                "INSERT INTO messages(conversation_id, idx, role, author, created_at, content, extra_json, extra_bin) VALUES {placeholders}"
            )
        }));
        statements
    });

    table
        .get(row_count)
        .map(String::as_str)
        .expect("message insert batch size must be covered by the cached SQL table")
}
/// Inserts `messages` for `conversation_id` in batches of at most
/// `MESSAGE_INSERT_BATCH_SIZE` rows and returns the row ids in input order.
fn franken_batch_insert_new_messages(
tx: &FrankenTransaction<'_>,
conversation_id: i64,
messages: &[&Message],
) -> Result<Vec<i64>> {
franken_batch_insert_new_messages_with_batch_size(
tx,
conversation_id,
messages,
MESSAGE_INSERT_BATCH_SIZE,
)
}
/// Inserts `messages` for `conversation_id` in batches of at most
/// `APPEND_MESSAGE_INSERT_BATCH_SIZE` rows and returns the row ids in input
/// order.
fn franken_append_insert_new_messages(
tx: &FrankenTransaction<'_>,
conversation_id: i64,
messages: &[&Message],
) -> Result<Vec<i64>> {
franken_batch_insert_new_messages_with_batch_size(
tx,
conversation_id,
messages,
APPEND_MESSAGE_INSERT_BATCH_SIZE,
)
}
fn franken_batch_insert_new_messages_with_batch_size(
tx: &FrankenTransaction<'_>,
conversation_id: i64,
messages: &[&Message],
batch_size: usize,
) -> Result<Vec<i64>> {
let batch_size = batch_size.max(1);
let mut inserted_ids = Vec::with_capacity(messages.len());
for chunk in messages.chunks(batch_size) {
if chunk.len() == 1 {
inserted_ids.push(franken_insert_new_message(tx, conversation_id, chunk[0])?);
continue;
}
let sql = message_insert_batch_sql(chunk.len());
let mut param_values: Vec<SqliteValue> = Vec::with_capacity(chunk.len() * 8);
for msg in chunk {
let (extra_json_str, extra_bin) = franken_message_insert_payload(msg)?;
param_values.push(SqliteValue::from(conversation_id));
param_values.push(SqliteValue::from(msg.idx));
param_values.push(SqliteValue::from(role_as_str(&msg.role)));
param_values.push(SqliteValue::from(msg.author.as_deref()));
param_values.push(SqliteValue::from(msg.created_at));
param_values.push(SqliteValue::from(msg.content.as_str()));
param_values.push(SqliteValue::from(extra_json_str.as_deref()));
param_values.push(SqliteValue::from(extra_bin.as_deref()));
}
tx.execute_with_params(sql, ¶m_values)?;
let last_id = franken_last_rowid(tx)?;
let first_id = last_id
.checked_sub((chunk.len() - 1) as i64)
.with_context(|| {
format!(
"inferring rowid range for {}-row message batch ending at {last_id}",
chunk.len()
)
})?;
inserted_ids.extend((0..chunk.len()).map(|offset| first_id + offset as i64));
}
Ok(inserted_ids)
}
/// Test-only variant of `franken_insert_new_message` that records how long
/// each stage takes in `profile`.
///
/// Counts one single-row call and one row, then accumulates the payload
/// encoding, statement execution and row id lookup durations separately.
#[cfg(test)]
fn franken_insert_new_message_with_profile(
tx: &FrankenTransaction<'_>,
conversation_id: i64,
msg: &Message,
profile: &mut MessageInsertSubstageProfile,
) -> Result<i64> {
profile.single_row_calls += 1;
profile.batch_rows += 1;
// Stage 1: encode the extra payload.
let payload_start = Instant::now();
let (extra_json_str, extra_bin) = franken_message_insert_payload(msg)?;
profile.payload_duration += payload_start.elapsed();
let extra_bin_bytes = extra_bin.as_deref();
// Stage 2: run the INSERT.
let execute_start = Instant::now();
tx.execute_compat(
"INSERT INTO messages(conversation_id, idx, role, author, created_at, content, extra_json, extra_bin)
VALUES(?1,?2,?3,?4,?5,?6,?7,?8)",
fparams![
conversation_id,
msg.idx,
role_as_str(&msg.role),
msg.author.as_deref(),
msg.created_at,
msg.content.as_str(),
extra_json_str.as_deref(),
extra_bin_bytes
],
)?;
profile.execute_duration += execute_start.elapsed();
// Stage 3: fetch the row id of the inserted message.
let rowid_start = Instant::now();
let rowid = franken_last_rowid(tx)?;
profile.rowid_duration += rowid_start.elapsed();
Ok(rowid)
}
/// Test-only profiled counterpart of `franken_batch_insert_new_messages`:
/// batches of at most `MESSAGE_INSERT_BATCH_SIZE` rows, with stage timings
/// accumulated in `profile`.
#[cfg(test)]
fn franken_batch_insert_new_messages_with_profile(
tx: &FrankenTransaction<'_>,
conversation_id: i64,
messages: &[&Message],
profile: &mut MessageInsertSubstageProfile,
) -> Result<Vec<i64>> {
franken_batch_insert_new_messages_with_profile_batch_size(
tx,
conversation_id,
messages,
profile,
MESSAGE_INSERT_BATCH_SIZE,
)
}
/// Test-only profiled counterpart of `franken_append_insert_new_messages`:
/// batches of at most `APPEND_MESSAGE_INSERT_BATCH_SIZE` rows, with stage
/// timings accumulated in `profile`.
#[cfg(test)]
fn franken_append_insert_new_messages_with_profile(
tx: &FrankenTransaction<'_>,
conversation_id: i64,
messages: &[&Message],
profile: &mut MessageInsertSubstageProfile,
) -> Result<Vec<i64>> {
franken_batch_insert_new_messages_with_profile_batch_size(
tx,
conversation_id,
messages,
profile,
APPEND_MESSAGE_INSERT_BATCH_SIZE,
)
}
/// Test-only profiled counterpart of
/// `franken_batch_insert_new_messages_with_batch_size`.
///
/// Inserts `messages` in chunks of at most `batch_size` rows and returns one
/// row id per message in input order, while accumulating call counts and
/// per-stage durations (SQL lookup, payload encoding, parameter building,
/// execution, row id derivation) in `profile`. `batch_size` is clamped to
/// `1..=max(MESSAGE_INSERT_BATCH_SIZE, APPEND_MESSAGE_INSERT_BATCH_SIZE)`
/// because `message_insert_batch_sql` panics for larger row counts.
/// Single-message chunks go through `franken_insert_new_message_with_profile`.
///
/// # Errors
/// Propagates payload encoding failures, database errors, and an error when
/// the first row id of a batch cannot be derived from the last one.
#[cfg(test)]
fn franken_batch_insert_new_messages_with_profile_batch_size(
    tx: &FrankenTransaction<'_>,
    conversation_id: i64,
    messages: &[&Message],
    profile: &mut MessageInsertSubstageProfile,
    batch_size: usize,
) -> Result<Vec<i64>> {
    let largest_cached_batch = MESSAGE_INSERT_BATCH_SIZE.max(APPEND_MESSAGE_INSERT_BATCH_SIZE);
    let batch_size = batch_size.max(1).min(largest_cached_batch);
    let mut inserted_ids = Vec::with_capacity(messages.len());
    for chunk in messages.chunks(batch_size) {
        if chunk.len() == 1 {
            inserted_ids.push(franken_insert_new_message_with_profile(
                tx,
                conversation_id,
                chunk[0],
                profile,
            )?);
            continue;
        }
        profile.batch_calls += 1;
        profile.batch_rows += chunk.len();
        let sql_build_start = Instant::now();
        let sql = message_insert_batch_sql(chunk.len());
        profile.sql_build_duration += sql_build_start.elapsed();
        // Eight bound values per row, in the column order of the statement.
        let mut param_values: Vec<SqliteValue> = Vec::with_capacity(chunk.len() * 8);
        for msg in chunk {
            let payload_start = Instant::now();
            let (extra_json_str, extra_bin) = franken_message_insert_payload(msg)?;
            profile.payload_duration += payload_start.elapsed();
            let param_build_start = Instant::now();
            param_values.push(SqliteValue::from(conversation_id));
            param_values.push(SqliteValue::from(msg.idx));
            param_values.push(SqliteValue::from(role_as_str(&msg.role)));
            param_values.push(SqliteValue::from(msg.author.as_deref()));
            param_values.push(SqliteValue::from(msg.created_at));
            param_values.push(SqliteValue::from(msg.content.as_str()));
            param_values.push(SqliteValue::from(extra_json_str.as_deref()));
            param_values.push(SqliteValue::from(extra_bin.as_deref()));
            profile.param_build_duration += param_build_start.elapsed();
        }
        let execute_start = Instant::now();
        tx.execute_with_params(sql, &param_values)?;
        profile.execute_duration += execute_start.elapsed();
        // Only the last row id is available; the rest of the batch is derived
        // from it. NOTE(review): this assumes the rows of one multi-row INSERT
        // receive consecutive rowids — confirm for the messages table.
        let rowid_start = Instant::now();
        let last_id = franken_last_rowid(tx)?;
        let first_id = last_id
            .checked_sub((chunk.len() - 1) as i64)
            .with_context(|| {
                format!(
                    "inferring rowid range for {}-row message batch ending at {last_id}",
                    chunk.len()
                )
            })?;
        inserted_ids.extend((0..chunk.len()).map(|offset| first_id + offset as i64));
        profile.rowid_duration += rowid_start.elapsed();
    }
    Ok(inserted_ids)
}
/// Inserts one `snippets` row per entry of `snippets`, all attached to
/// `message_id`.
///
/// # Errors
/// Stops at, and returns, the first database error.
fn franken_insert_snippets(
    tx: &FrankenTransaction<'_>,
    message_id: i64,
    snippets: &[Snippet],
) -> Result<()> {
    const INSERT_SNIPPET_SQL: &str = "INSERT INTO snippets(message_id, file_path, start_line, end_line, language, snippet_text)
VALUES(?1,?2,?3,?4,?5,?6)";
    for snippet in snippets.iter() {
        // The optional path is converted to an owned string for binding.
        let file_path_text = snippet.file_path.as_ref().map(path_to_string);
        tx.execute_compat(
            INSERT_SNIPPET_SQL,
            fparams![
                message_id,
                file_path_text.as_deref(),
                snippet.start_line,
                snippet.end_line,
                snippet.language.as_deref(),
                snippet.snippet_text.as_deref()
            ],
        )?;
    }
    Ok(())
}
/// Loads every stored message of `conversation_id` and returns the set of
/// their merge fingerprints.
///
/// Message content is represented by its BLAKE3 hash rather than by the text
/// itself.
///
/// # Errors
/// Propagates query failures and column decoding failures.
fn franken_existing_message_fingerprints(
    tx: &FrankenTransaction<'_>,
    conversation_id: i64,
) -> Result<HashSet<MessageMergeFingerprint>> {
    let stored_rows = tx.query_params(
        "SELECT idx, role, author, created_at, content
FROM messages
WHERE conversation_id = ?1",
        fparams![conversation_id],
    )?;
    let mut fingerprints = HashSet::with_capacity(stored_rows.len());
    for row in stored_rows {
        let role_text: String = row.get_typed(1)?;
        let content_text: String = row.get_typed(4)?;
        let idx = row.get_typed(0)?;
        let created_at = row.get_typed(3)?;
        let role = role_from_str(&role_text);
        let author = row.get_typed(2)?;
        let content_hash = *blake3::hash(content_text.as_bytes()).as_bytes();
        fingerprints.insert(MessageMergeFingerprint {
            idx,
            created_at,
            role,
            author,
            content_hash,
        });
    }
    Ok(fingerprints)
}
/// Fingerprints of already stored messages, indexed two ways for duplicate
/// detection against incoming messages.
struct ExistingMessageLookup {
/// Merge fingerprints keyed by message `idx`.
by_idx: HashMap<i64, MessageMergeFingerprint>,
/// Replay fingerprints (created-at, role, author, content hash — no `idx`).
replay: HashSet<MessageReplayFingerprint>,
}
/// Builds the lookup of stored message fingerprints needed to decide which of
/// `incoming_messages` already exist in `conversation_id`.
///
/// Strategy:
/// 1. Empty input yields an empty lookup without touching the database.
/// 2. Each incoming `idx` is probed individually. If every probe finds a row
///    whose fingerprint equals the incoming message's, only those rows are
///    returned.
/// 3. Otherwise rows are loaded in bulk: the whole conversation when any
///    incoming message lacks `created_at`, else two bounded queries (by `idx`
///    range and by `created_at` range).
///
/// In the bulk result `by_idx` only contains rows whose `idx` lies within the
/// incoming `idx` range; `replay` contains every loaded row after a full scan
/// and only rows inside the incoming `created_at` range after bounded queries.
///
/// # Errors
/// Propagates query failures and column decoding failures.
fn franken_existing_message_lookup(
tx: &FrankenTransaction<'_>,
conversation_id: i64,
incoming_messages: &[Message],
) -> Result<ExistingMessageLookup> {
if incoming_messages.is_empty() {
return Ok(ExistingMessageLookup {
by_idx: HashMap::new(),
replay: HashSet::new(),
});
}
// Inclusive idx range covered by the incoming messages.
let min_idx = incoming_messages
.iter()
.map(|msg| msg.idx)
.min()
.unwrap_or(0);
let max_idx = incoming_messages
.iter()
.map(|msg| msg.idx)
.max()
.unwrap_or(min_idx);
// A message without a timestamp cannot be bounded by created_at.
let requires_full_scan = incoming_messages.iter().any(|msg| msg.created_at.is_none());
// Inclusive (min, max) created_at over the messages that have a timestamp.
let created_bounds = incoming_messages
.iter()
.filter_map(|msg| msg.created_at)
.fold(None, |bounds: Option<(i64, i64)>, created_at| {
Some(match bounds {
Some((min_created_at, max_created_at)) => (
min_created_at.min(created_at),
max_created_at.max(created_at),
),
None => (created_at, created_at),
})
});
let mut indexed_by_idx = HashMap::with_capacity(incoming_messages.len());
let mut indexed_replay = HashSet::with_capacity(incoming_messages.len());
let mut exact_idx_match = true;
// Fast path: one point lookup per incoming idx; abandoned at the first
// missing row or fingerprint mismatch.
for msg in incoming_messages {
record_message_lookup_exact_idx_probe();
let Some((role, author, created_at, content)) = tx
.query_row_map(
"SELECT role, author, created_at, content
FROM messages INDEXED BY sqlite_autoindex_messages_1
WHERE conversation_id = ?1 AND idx = ?2
LIMIT 1",
fparams![conversation_id, msg.idx],
|row| {
Ok((
row.get_typed::<String>(0)?,
row.get_typed::<Option<String>>(1)?,
row.get_typed::<Option<i64>>(2)?,
row.get_typed::<String>(3)?,
))
},
)
.optional()?
else {
exact_idx_match = false;
break;
};
let role = role_from_str(&role);
let content_hash = *blake3::hash(content.as_bytes()).as_bytes();
let fingerprint = MessageMergeFingerprint {
idx: msg.idx,
created_at,
role: role.clone(),
author: author.clone(),
content_hash,
};
if fingerprint != message_merge_fingerprint(msg) {
exact_idx_match = false;
break;
}
indexed_by_idx.insert(msg.idx, fingerprint);
indexed_replay.insert(MessageReplayFingerprint {
created_at,
role,
author,
content_hash,
});
}
if exact_idx_match {
return Ok(ExistingMessageLookup {
by_idx: indexed_by_idx,
replay: indexed_replay,
});
}
// Slow path: bulk-load rows. The boolean records whether the whole
// conversation was loaded.
let (rows, replay_full_scan) = if requires_full_scan {
let rows = tx.query_params(
"SELECT idx, role, author, created_at, content
FROM messages INDEXED BY sqlite_autoindex_messages_1
WHERE conversation_id = ?1",
fparams![conversation_id],
)?;
record_message_lookup_full_scan_query(rows.len());
(rows, true)
} else if let Some((min_created_at, max_created_at)) = created_bounds {
// Two bounded queries; a row may be returned by both, which the map and
// set inserts below tolerate.
let mut rows = tx.query_params(
"SELECT idx, role, author, created_at, content
FROM messages INDEXED BY sqlite_autoindex_messages_1
WHERE conversation_id = ?1
AND idx >= ?2
AND idx <= ?3",
fparams![conversation_id, min_idx, max_idx],
)?;
rows.extend(tx.query_params(
"SELECT idx, role, author, created_at, content
FROM messages INDEXED BY sqlite_autoindex_messages_1
WHERE conversation_id = ?1
AND created_at IS NOT NULL
AND created_at >= ?2
AND created_at <= ?3",
fparams![conversation_id, min_created_at, max_created_at],
)?);
record_message_lookup_bounded_queries(2, rows.len());
(rows, false)
} else {
// No created_at bounds available: fall back to loading everything.
let rows = tx.query_params(
"SELECT idx, role, author, created_at, content
FROM messages INDEXED BY sqlite_autoindex_messages_1
WHERE conversation_id = ?1",
fparams![conversation_id],
)?;
record_message_lookup_full_scan_query(rows.len());
(rows, true)
};
let mut by_idx = HashMap::with_capacity(rows.len());
let mut replay = HashSet::with_capacity(rows.len());
for row in rows {
let idx: i64 = row.get_typed(0)?;
let role: String = row.get_typed(1)?;
let author: Option<String> = row.get_typed(2)?;
let created_at: Option<i64> = row.get_typed(3)?;
let content: String = row.get_typed(4)?;
let role = role_from_str(&role);
let content_hash = *blake3::hash(content.as_bytes()).as_bytes();
// Only rows inside the incoming idx range are indexed by idx.
if idx >= min_idx && idx <= max_idx {
by_idx.insert(
idx,
MessageMergeFingerprint {
idx,
created_at,
role: role.clone(),
author: author.clone(),
content_hash,
},
);
}
// After bounded queries, rows fetched only because of their idx are kept
// out of the replay set unless their timestamp is inside the bounds.
let replay_matches = if replay_full_scan {
true
} else if let Some((min_created_at, max_created_at)) = created_bounds {
created_at.is_some_and(|ts| ts >= min_created_at && ts <= max_created_at)
} else {
true
};
if replay_matches {
replay.insert(MessageReplayFingerprint {
created_at,
role,
author,
content_hash,
});
}
}
Ok(ExistingMessageLookup { by_idx, replay })
}
/// Returns the message lookup for `conversation_id`, preferring fingerprints
/// cached in the two `pending_*` maps over hitting the database.
///
/// * No cached entry in both maps: query the database and cache the result.
/// * Cached entry that already covers every incoming message (by `idx` or by
///   replay fingerprint): return a copy of the cached entry.
/// * Cached entry with gaps: query the database, merge the fresh rows into the
///   cached entry, store the merged entry back and return it.
///
/// # Errors
/// Propagates failures of `franken_existing_message_lookup`.
fn franken_existing_message_lookup_with_pending(
    tx: &FrankenTransaction<'_>,
    conversation_id: i64,
    incoming_messages: &[Message],
    pending_message_fingerprints: &mut HashMap<i64, HashMap<i64, MessageMergeFingerprint>>,
    pending_message_replay_fingerprints: &mut HashMap<i64, HashSet<MessageReplayFingerprint>>,
) -> Result<ExistingMessageLookup> {
    let (Some(cached_by_idx), Some(cached_replay)) = (
        pending_message_fingerprints.get(&conversation_id),
        pending_message_replay_fingerprints.get(&conversation_id),
    ) else {
        // Nothing usable cached yet: load from the database and remember it.
        let lookup = franken_existing_message_lookup(tx, conversation_id, incoming_messages)?;
        pending_message_fingerprints.insert(conversation_id, lookup.by_idx.clone());
        pending_message_replay_fingerprints.insert(conversation_id, lookup.replay.clone());
        return Ok(lookup);
    };

    let cache_covers_all = incoming_messages.iter().all(|msg| {
        cached_by_idx.contains_key(&msg.idx)
            || cached_replay.contains(&message_replay_fingerprint(msg))
    });
    if cache_covers_all {
        return Ok(ExistingMessageLookup {
            by_idx: cached_by_idx.clone(),
            replay: cached_replay.clone(),
        });
    }

    // The cache has gaps: top it up with freshly loaded rows.
    let fresh = franken_existing_message_lookup(tx, conversation_id, incoming_messages)?;
    let mut by_idx = cached_by_idx.clone();
    let mut replay = cached_replay.clone();
    by_idx.extend(fresh.by_idx);
    replay.extend(fresh.replay);
    pending_message_fingerprints.insert(conversation_id, by_idx.clone());
    pending_message_replay_fingerprints.insert(conversation_id, replay.clone());
    Ok(ExistingMessageLookup { by_idx, replay })
}
/// Best-effort insertion of `entries` into `fts_messages` inside `tx`, in
/// chunks of at most `FTS5_BATCH_SIZE` rows.
///
/// Returns the number of rows written. A failing chunk is deliberately not
/// treated as an error: a warning is logged, the remaining chunks are skipped,
/// and the count written so far is returned.
fn franken_batch_insert_fts(tx: &FrankenTransaction<'_>, entries: &[FtsEntry]) -> Result<usize> {
    if entries.is_empty() {
        return Ok(0);
    }
    let mut inserted = 0;
    for chunk in entries.chunks(FTS5_BATCH_SIZE) {
        // Seven numbered placeholders per row: ?1..?7, ?8..?14, and so on.
        let placeholders: String = chunk
            .iter()
            .enumerate()
            .map(|(i, _)| {
                let base = i * 7 + 1;
                format!(
                    "(?{},?{},?{},?{},?{},?{},?{})",
                    base,
                    base + 1,
                    base + 2,
                    base + 3,
                    base + 4,
                    base + 5,
                    base + 6
                )
            })
            .collect::<Vec<_>>()
            .join(",");
        let sql = format!(
            "INSERT INTO fts_messages(rowid, content, title, agent, workspace, source_path, created_at) VALUES {placeholders}"
        );
        let mut param_values: Vec<SqliteValue> = Vec::with_capacity(chunk.len() * 7);
        for entry in chunk {
            param_values.push(SqliteValue::from(entry.message_id));
            param_values.push(SqliteValue::from(entry.content.as_str()));
            param_values.push(SqliteValue::from(entry.title.as_str()));
            param_values.push(SqliteValue::from(entry.agent.as_str()));
            param_values.push(SqliteValue::from(entry.workspace.as_str()));
            param_values.push(SqliteValue::from(entry.source_path.as_str()));
            param_values.push(SqliteValue::from(entry.created_at));
        }
        match tx.execute_with_params(&sql, &param_values) {
            Ok(_) => {
                inserted += chunk.len();
            }
            Err(err) => {
                // Intentional best-effort: give up on FTS maintenance instead
                // of failing the surrounding transaction.
                tracing::warn!(
                    error = %err,
                    chunk_docs = chunk.len(),
                    "frankensqlite FTS batch insert failed; skipping db-resident FTS maintenance because Tantivy is authoritative"
                );
                return Ok(inserted);
            }
        }
    }
    Ok(inserted)
}
/// Inserts `entries` into `fts_messages` directly on `conn`, in chunks of at
/// most `FTS5_BATCH_SIZE` rows, and returns the number of rows written.
///
/// Unlike `franken_batch_insert_fts`, a failing chunk aborts the whole call.
///
/// # Errors
/// Returns the database error of the first failing chunk, with context naming
/// the chunk size.
fn franken_batch_insert_fts_on_connection(
    conn: &FrankenConnection,
    entries: &[FtsEntry],
) -> Result<usize> {
    if entries.is_empty() {
        return Ok(0);
    }
    let mut inserted = 0;
    for chunk in entries.chunks(FTS5_BATCH_SIZE) {
        // Seven numbered placeholders per row: ?1..?7, ?8..?14, and so on.
        let placeholders: String = chunk
            .iter()
            .enumerate()
            .map(|(i, _)| {
                let base = i * 7 + 1;
                format!(
                    "(?{},?{},?{},?{},?{},?{},?{})",
                    base,
                    base + 1,
                    base + 2,
                    base + 3,
                    base + 4,
                    base + 5,
                    base + 6
                )
            })
            .collect::<Vec<_>>()
            .join(",");
        let sql = format!(
            "INSERT INTO fts_messages(rowid, content, title, agent, workspace, source_path, created_at) VALUES {placeholders}"
        );
        let mut param_values: Vec<SqliteValue> = Vec::with_capacity(chunk.len() * 7);
        for entry in chunk {
            param_values.push(SqliteValue::from(entry.message_id));
            param_values.push(SqliteValue::from(entry.content.as_str()));
            param_values.push(SqliteValue::from(entry.title.as_str()));
            param_values.push(SqliteValue::from(entry.agent.as_str()));
            param_values.push(SqliteValue::from(entry.workspace.as_str()));
            param_values.push(SqliteValue::from(entry.source_path.as_str()));
            param_values.push(SqliteValue::from(entry.created_at));
        }
        conn.execute_with_params(&sql, &param_values)
            .with_context(|| {
                format!(
                    "inserting {} rows into fts_messages during streaming FTS maintenance",
                    chunk.len()
                )
            })?;
        inserted += chunk.len();
    }
    Ok(inserted)
}
/// Applies `delta` to the four `daily_stats` rollups affected by one
/// conversation: the specific `(agent_slug, source_id)` pair plus the three
/// combinations in which either or both are replaced by `"all"`.
///
/// The day comes from `started_at`, with day id `0` when it is absent. When
/// neither `agent_slug` nor `source_id` is itself `"all"`, the combined update
/// in `franken_update_ensured_daily_stats_targets_in_tx` is tried first; if it
/// reports success nothing else is done. Otherwise each target is updated
/// individually.
///
/// # Errors
/// Propagates failures from either update path.
fn franken_update_daily_stats_in_tx(
    storage: &FrankenStorage,
    tx: &FrankenTransaction<'_>,
    agent_slug: &str,
    source_id: &str,
    started_at: Option<i64>,
    delta: StatsDelta,
) -> Result<()> {
    let day_id = match started_at {
        Some(millis) => FrankenStorage::day_id_from_millis(millis),
        None => 0,
    };
    let now = FrankenStorage::now_millis();
    // Order matters to the combined update, which treats the first target as
    // the specific (agent, source) pair.
    let targets = [
        (agent_slug, source_id),
        ("all", source_id),
        (agent_slug, "all"),
        ("all", "all"),
    ]
    .map(|(agent_slug, source_id)| DailyStatsTarget {
        day_id,
        agent_slug,
        source_id,
    });
    let is_specific_pair = agent_slug != "all" && source_id != "all";
    if is_specific_pair
        && franken_update_ensured_daily_stats_targets_in_tx(storage, tx, &targets, now, delta)?
    {
        return Ok(());
    }
    for target in targets {
        franken_apply_daily_stats_delta_in_tx(storage, tx, target, now, delta)?;
    }
    Ok(())
}
/// Identifies one `daily_stats` row by its `(day_id, agent_slug, source_id)` triple.
///
/// Borrowing the slug and source keeps the struct `Copy`, so it can be passed by value.
#[derive(Clone, Copy)]
struct DailyStatsTarget<'a> {
/// Day bucket of the row.
day_id: i64,
/// Agent slug of the row; the literal `"all"` is used for cross-agent rollup rows.
agent_slug: &'a str,
/// Source id of the row; the literal `"all"` is used for cross-source rollup rows.
source_id: &'a str,
}
/// Tries to apply `delta` to all four `targets` with a single `UPDATE`.
///
/// Returns `Ok(false)` without writing anything when `storage` does not report
/// every target key as already ensured. Otherwise returns `Ok(true)` after the
/// delta has been applied to all four rows.
///
/// The combined `UPDATE` is built from `targets[0]` only: it matches that
/// target's agent/source pair plus the three `'all'` combinations for the same
/// day. If it changes fewer rows than there are targets, each target is probed
/// and the missing rows are inserted with `delta` as their initial values, and
/// their keys are marked as ensured.
fn franken_update_ensured_daily_stats_targets_in_tx(
    storage: &FrankenStorage,
    tx: &FrankenTransaction<'_>,
    targets: &[DailyStatsTarget<'_>; 4],
    now: i64,
    delta: StatsDelta,
) -> Result<bool> {
    let cache_keys =
        targets.map(|t| EnsuredDailyStatsKey::new(t.day_id, t.agent_slug, t.source_id));
    let all_ensured = storage.daily_stats_keys_already_ensured(&cache_keys);
    if !all_ensured {
        return Ok(false);
    }

    let first = targets[0];
    let updated = tx.execute_compat(
        "UPDATE daily_stats
SET session_count = session_count + ?4,
message_count = message_count + ?5,
total_chars = total_chars + ?6,
last_updated = ?7
WHERE day_id = ?1
AND ((agent_slug = ?2 AND source_id = ?3)
OR (agent_slug = 'all' AND source_id = ?3)
OR (agent_slug = ?2 AND source_id = 'all')
OR (agent_slug = 'all' AND source_id = 'all'))",
        fparams![
            first.day_id,
            first.agent_slug,
            first.source_id,
            delta.session_count_delta,
            delta.message_count_delta,
            delta.total_chars_delta,
            now
        ],
    )?;
    if updated == targets.len() {
        return Ok(true);
    }

    // Some rows were missing: rows that exist were already bumped by the UPDATE
    // above, so only the absent ones are inserted here.
    for (target, cache_key) in targets.iter().zip(cache_keys) {
        let present = tx
            .query_row_map(
                "SELECT 1 FROM daily_stats
WHERE day_id = ?1 AND agent_slug = ?2 AND source_id = ?3
LIMIT 1",
                fparams![target.day_id, target.agent_slug, target.source_id],
                |row| row.get_typed::<i64>(0),
            )
            .optional()?;
        if present.is_none() {
            tx.execute_compat(
                "INSERT INTO daily_stats(day_id, agent_slug, source_id, session_count, message_count, total_chars, last_updated)
VALUES(?1,?2,?3,?4,?5,?6,?7)",
                fparams![
                    target.day_id,
                    target.agent_slug,
                    target.source_id,
                    delta.session_count_delta,
                    delta.message_count_delta,
                    delta.total_chars_delta,
                    now
                ],
            )?;
            storage.mark_daily_stats_key_ensured(cache_key);
        }
    }
    Ok(true)
}
fn franken_apply_daily_stats_delta_in_tx(
storage: &FrankenStorage,
tx: &FrankenTransaction<'_>,
target: DailyStatsTarget<'_>,
now: i64,
delta: StatsDelta,
) -> Result<()> {
let cache_key = EnsuredDailyStatsKey::new(target.day_id, target.agent_slug, target.source_id);
if storage.daily_stats_key_already_ensured(&cache_key) {
let rows_changed = tx.execute_compat(
"UPDATE daily_stats
SET session_count = session_count + ?4,
message_count = message_count + ?5,
total_chars = total_chars + ?6,
last_updated = ?7
WHERE day_id = ?1 AND agent_slug = ?2 AND source_id = ?3",
fparams![
target.day_id,
target.agent_slug,
target.source_id,
delta.session_count_delta,
delta.message_count_delta,
delta.total_chars_delta,
now
],
)?;
if rows_changed > 0 {
return Ok(());
}
}
tx.execute_compat(
"INSERT INTO daily_stats(day_id, agent_slug, source_id, session_count, message_count, total_chars, last_updated)
VALUES(?1,?2,?3,?4,?5,?6,?7)
ON CONFLICT(day_id, agent_slug, source_id) DO UPDATE SET
session_count = session_count + excluded.session_count,
message_count = message_count + excluded.message_count,
total_chars = total_chars + excluded.total_chars,
last_updated = excluded.last_updated",
fparams![
target.day_id,
target.agent_slug,
target.source_id,
delta.session_count_delta,
delta.message_count_delta,
delta.total_chars_delta,
now
],
)?;
storage.mark_daily_stats_key_ensured(cache_key);
Ok(())
}
/// Upserts every `(day_id, agent_slug, source_id, delta)` entry into `daily_stats`.
///
/// Existing rows have the delta added to their counters; missing rows are
/// created with the delta as their initial values. All rows written by one call
/// share the same `last_updated` timestamp.
///
/// Returns the sum of the per-statement results of `execute_compat`, or `Ok(0)`
/// for an empty `entries` slice (in which case nothing is executed).
fn franken_update_daily_stats_batched_in_tx(
    tx: &FrankenTransaction<'_>,
    entries: &[(i64, String, String, StatsDelta)],
) -> Result<usize> {
    const UPSERT_SQL: &str = "INSERT INTO daily_stats (day_id, agent_slug, source_id, session_count, message_count, total_chars, last_updated)
VALUES(?1,?2,?3,?4,?5,?6,?7)
ON CONFLICT(day_id, agent_slug, source_id) DO UPDATE SET
session_count = session_count + excluded.session_count,
message_count = message_count + excluded.message_count,
total_chars = total_chars + excluded.total_chars,
last_updated = excluded.last_updated";

    if entries.is_empty() {
        return Ok(0);
    }
    let now = FrankenStorage::now_millis();
    let mut affected = 0;
    for entry in entries {
        let (day_id, agent, source, delta) = entry;
        affected += tx.execute_compat(
            UPSERT_SQL,
            fparams![
                *day_id,
                agent.as_str(),
                source.as_str(),
                delta.session_count_delta,
                delta.message_count_delta,
                delta.total_chars_delta,
                now
            ],
        )?;
    }
    Ok(affected)
}
/// Inserts each entry into `token_usage` with `INSERT OR IGNORE`, one statement per entry.
///
/// Returns the sum of the per-statement results of `execute_with_params`, or
/// `Ok(0)` for an empty `entries` slice (in which case nothing is executed).
/// NOTE(review): rows skipped by `OR IGNORE` presumably contribute 0 to the
/// total — confirm against the engine's reported change count.
///
/// # Errors
///
/// Returns the first statement error encountered; later entries are not attempted.
fn franken_insert_token_usage_batched_in_tx(
    tx: &FrankenTransaction<'_>,
    entries: &[TokenUsageEntry],
) -> Result<usize> {
    if entries.is_empty() {
        return Ok(0);
    }
    let mut total_inserted = 0;
    for e in entries {
        // Bind order must match the column list in the INSERT below (?1..?24).
        let params_vec: Vec<ParamValue> = vec![
            ParamValue::from(e.message_id),
            ParamValue::from(e.conversation_id),
            ParamValue::from(e.agent_id),
            ParamValue::from(e.workspace_id),
            ParamValue::from(e.source_id.clone()),
            ParamValue::from(e.timestamp_ms),
            ParamValue::from(e.day_id),
            ParamValue::from(e.model_name.clone()),
            ParamValue::from(e.model_family.clone()),
            ParamValue::from(e.model_tier.clone()),
            ParamValue::from(e.service_tier.clone()),
            ParamValue::from(e.provider.clone()),
            ParamValue::from(e.input_tokens),
            ParamValue::from(e.output_tokens),
            ParamValue::from(e.cache_read_tokens),
            ParamValue::from(e.cache_creation_tokens),
            ParamValue::from(e.thinking_tokens),
            ParamValue::from(e.total_tokens),
            ParamValue::from(e.estimated_cost_usd),
            ParamValue::from(e.role.clone()),
            ParamValue::from(e.content_chars),
            ParamValue::from(e.has_tool_calls as i64),
            ParamValue::from(e.tool_call_count as i64),
            ParamValue::from(e.data_source.clone()),
        ];
        let values = param_slice_to_values(&params_vec);
        total_inserted += tx.execute_with_params(
            "INSERT OR IGNORE INTO token_usage (
message_id, conversation_id, agent_id, workspace_id, source_id,
timestamp_ms, day_id,
model_name, model_family, model_tier, service_tier, provider,
input_tokens, output_tokens, cache_read_tokens, cache_creation_tokens,
thinking_tokens, total_tokens, estimated_cost_usd,
role, content_chars, has_tool_calls, tool_call_count, data_source
)
VALUES(?1,?2,?3,?4,?5,?6,?7,?8,?9,?10,?11,?12,?13,?14,?15,?16,?17,?18,?19,?20,?21,?22,?23,?24)",
            &values,
        )?;
    }
    Ok(total_inserted)
}
/// Upserts every `(day_id, agent_slug, source_id, model_family, delta)` entry
/// into `token_daily_stats`.
///
/// Existing rows have each counter of the delta added to them; missing rows are
/// created with the delta as their initial values. All rows written by one call
/// share the same `last_updated` timestamp.
///
/// Returns the sum of the per-statement results of `execute_compat`, or `Ok(0)`
/// for an empty `entries` slice (in which case nothing is executed).
fn franken_update_token_daily_stats_batched_in_tx(
    tx: &FrankenTransaction<'_>,
    entries: &[(i64, String, String, String, TokenStatsDelta)],
) -> Result<usize> {
    const UPSERT_SQL: &str = "INSERT INTO token_daily_stats (
day_id, agent_slug, source_id, model_family,
api_call_count, user_message_count, assistant_message_count, tool_message_count,
total_input_tokens, total_output_tokens, total_cache_read_tokens,
total_cache_creation_tokens, total_thinking_tokens, grand_total_tokens,
total_content_chars, total_tool_calls, estimated_cost_usd, session_count,
last_updated
)
VALUES(?1,?2,?3,?4,?5,?6,?7,?8,?9,?10,?11,?12,?13,?14,?15,?16,?17,?18,?19)
ON CONFLICT(day_id, agent_slug, source_id, model_family) DO UPDATE SET
api_call_count = api_call_count + excluded.api_call_count,
user_message_count = user_message_count + excluded.user_message_count,
assistant_message_count = assistant_message_count + excluded.assistant_message_count,
tool_message_count = tool_message_count + excluded.tool_message_count,
total_input_tokens = total_input_tokens + excluded.total_input_tokens,
total_output_tokens = total_output_tokens + excluded.total_output_tokens,
total_cache_read_tokens = total_cache_read_tokens + excluded.total_cache_read_tokens,
total_cache_creation_tokens = total_cache_creation_tokens + excluded.total_cache_creation_tokens,
total_thinking_tokens = total_thinking_tokens + excluded.total_thinking_tokens,
grand_total_tokens = grand_total_tokens + excluded.grand_total_tokens,
total_content_chars = total_content_chars + excluded.total_content_chars,
total_tool_calls = total_tool_calls + excluded.total_tool_calls,
estimated_cost_usd = estimated_cost_usd + excluded.estimated_cost_usd,
session_count = session_count + excluded.session_count,
last_updated = excluded.last_updated";

    if entries.is_empty() {
        return Ok(0);
    }
    let now = FrankenStorage::now_millis();
    let mut affected = 0;
    for entry in entries {
        let (day_id, agent, source, model, stats) = entry;
        affected += tx.execute_compat(
            UPSERT_SQL,
            fparams![
                *day_id,
                agent.as_str(),
                source.as_str(),
                model.as_str(),
                stats.api_call_count,
                stats.user_message_count,
                stats.assistant_message_count,
                stats.tool_message_count,
                stats.total_input_tokens,
                stats.total_output_tokens,
                stats.total_cache_read_tokens,
                stats.total_cache_creation_tokens,
                stats.total_thinking_tokens,
                stats.grand_total_tokens,
                stats.total_content_chars,
                stats.total_tool_calls,
                stats.estimated_cost_usd,
                stats.session_count,
                now
            ],
        )?;
    }
    Ok(affected)
}
/// Inserts each entry into `message_metrics` with `INSERT OR IGNORE`, one statement per entry.
///
/// Returns the sum of the per-statement results of `execute_with_params`, or
/// `Ok(0)` for an empty `entries` slice (in which case nothing is executed).
/// NOTE(review): rows skipped by `OR IGNORE` presumably contribute 0 to the
/// total — confirm against the engine's reported change count.
///
/// # Errors
///
/// Returns the first statement error encountered; later entries are not attempted.
fn franken_insert_message_metrics_batched_in_tx(
    tx: &FrankenTransaction<'_>,
    entries: &[MessageMetricsEntry],
) -> Result<usize> {
    if entries.is_empty() {
        return Ok(0);
    }
    let mut total_inserted = 0;
    for e in entries {
        // Bind order must match the column list in the INSERT below (?1..?24).
        let params_vec: Vec<ParamValue> = vec![
            ParamValue::from(e.message_id),
            ParamValue::from(e.created_at_ms),
            ParamValue::from(e.hour_id),
            ParamValue::from(e.day_id),
            ParamValue::from(e.agent_slug.clone()),
            ParamValue::from(e.workspace_id),
            ParamValue::from(e.source_id.clone()),
            ParamValue::from(e.role.clone()),
            ParamValue::from(e.content_chars),
            ParamValue::from(e.content_tokens_est),
            ParamValue::from(e.model_name.clone()),
            ParamValue::from(e.model_family.clone()),
            ParamValue::from(e.model_tier.clone()),
            ParamValue::from(e.provider.clone()),
            ParamValue::from(e.api_input_tokens),
            ParamValue::from(e.api_output_tokens),
            ParamValue::from(e.api_cache_read_tokens),
            ParamValue::from(e.api_cache_creation_tokens),
            ParamValue::from(e.api_thinking_tokens),
            ParamValue::from(e.api_service_tier.clone()),
            ParamValue::from(e.api_data_source.clone()),
            ParamValue::from(e.tool_call_count),
            ParamValue::from(e.has_tool_calls as i64),
            ParamValue::from(e.has_plan as i64),
        ];
        let values = param_slice_to_values(&params_vec);
        total_inserted += tx.execute_with_params(
            "INSERT OR IGNORE INTO message_metrics (
message_id, created_at_ms, hour_id, day_id,
agent_slug, workspace_id, source_id, role,
content_chars, content_tokens_est,
model_name, model_family, model_tier, provider,
api_input_tokens, api_output_tokens, api_cache_read_tokens,
api_cache_creation_tokens, api_thinking_tokens,
api_service_tier, api_data_source,
tool_call_count, has_tool_calls, has_plan
)
VALUES(?1,?2,?3,?4,?5,?6,?7,?8,?9,?10,?11,?12,?13,?14,?15,?16,?17,?18,?19,?20,?21,?22,?23,?24)",
            &values,
        )?;
    }
    Ok(total_inserted)
}
/// Upserts every accumulated rollup delta into `table`, whose time-bucket column
/// is named `bucket_col`.
///
/// `deltas` is keyed by `(bucket_id, agent_slug, workspace_id, source_id)`.
/// Existing rows have each counter of the delta added to them; missing rows are
/// created with the delta as their initial values. `now` is written to
/// `last_updated` on every row touched.
///
/// `table` and `bucket_col` are interpolated directly into the SQL text, so
/// callers must only pass trusted identifiers.
///
/// Returns the sum of the per-statement results of `execute_compat`, or `Ok(0)`
/// for an empty map (in which case nothing is executed).
fn franken_flush_rollup_table(
    tx: &FrankenTransaction<'_>,
    table: &str,
    bucket_col: &str,
    deltas: &HashMap<(i64, String, i64, String), UsageRollupDelta>,
    now: i64,
) -> Result<usize> {
    if deltas.is_empty() {
        return Ok(0);
    }
    // The statement text only depends on `table` and `bucket_col`, so it is
    // built once rather than once per delta.
    let sql = format!(
        "INSERT INTO {table} (
{bucket_col}, agent_slug, workspace_id, source_id,
message_count, user_message_count, assistant_message_count,
tool_call_count, plan_message_count, plan_content_tokens_est_total,
plan_api_tokens_total, api_coverage_message_count,
content_tokens_est_total, content_tokens_est_user, content_tokens_est_assistant,
api_tokens_total, api_input_tokens_total, api_output_tokens_total,
api_cache_read_tokens_total, api_cache_creation_tokens_total,
api_thinking_tokens_total, last_updated
)
VALUES(?1,?2,?3,?4,?5,?6,?7,?8,?9,?10,?11,?12,?13,?14,?15,?16,?17,?18,?19,?20,?21,?22)
ON CONFLICT({bucket_col}, agent_slug, workspace_id, source_id) DO UPDATE SET
message_count = message_count + excluded.message_count,
user_message_count = user_message_count + excluded.user_message_count,
assistant_message_count = assistant_message_count + excluded.assistant_message_count,
tool_call_count = tool_call_count + excluded.tool_call_count,
plan_message_count = plan_message_count + excluded.plan_message_count,
plan_content_tokens_est_total = plan_content_tokens_est_total + excluded.plan_content_tokens_est_total,
plan_api_tokens_total = plan_api_tokens_total + excluded.plan_api_tokens_total,
api_coverage_message_count = api_coverage_message_count + excluded.api_coverage_message_count,
content_tokens_est_total = content_tokens_est_total + excluded.content_tokens_est_total,
content_tokens_est_user = content_tokens_est_user + excluded.content_tokens_est_user,
content_tokens_est_assistant = content_tokens_est_assistant + excluded.content_tokens_est_assistant,
api_tokens_total = api_tokens_total + excluded.api_tokens_total,
api_input_tokens_total = api_input_tokens_total + excluded.api_input_tokens_total,
api_output_tokens_total = api_output_tokens_total + excluded.api_output_tokens_total,
api_cache_read_tokens_total = api_cache_read_tokens_total + excluded.api_cache_read_tokens_total,
api_cache_creation_tokens_total = api_cache_creation_tokens_total + excluded.api_cache_creation_tokens_total,
api_thinking_tokens_total = api_thinking_tokens_total + excluded.api_thinking_tokens_total,
last_updated = excluded.last_updated"
    );
    let mut total_affected = 0;
    for ((bucket_id, agent, workspace_id, source), d) in deltas {
        total_affected += tx.execute_compat(
            &sql,
            fparams![
                *bucket_id,
                agent.as_str(),
                *workspace_id,
                source.as_str(),
                d.message_count,
                d.user_message_count,
                d.assistant_message_count,
                d.tool_call_count,
                d.plan_message_count,
                d.plan_content_tokens_est_total,
                d.plan_api_tokens_total,
                d.api_coverage_message_count,
                d.content_tokens_est_total,
                d.content_tokens_est_user,
                d.content_tokens_est_assistant,
                d.api_tokens_total,
                d.api_input_tokens_total,
                d.api_output_tokens_total,
                d.api_cache_read_tokens_total,
                d.api_cache_creation_tokens_total,
                d.api_thinking_tokens_total,
                now
            ],
        )?;
    }
    Ok(total_affected)
}
/// Upserts every accumulated per-model rollup delta into `usage_models_daily`.
///
/// `deltas` is keyed by
/// `(day_id, agent_slug, workspace_id, source_id, model_family, model_tier)`.
/// Existing rows have each counter of the delta added to them; missing rows are
/// created with the delta as their initial values. `now` is written to
/// `last_updated` on every row touched.
///
/// Returns the sum of the per-statement results of `execute_compat`, or `Ok(0)`
/// for an empty map (in which case nothing is executed).
fn franken_flush_model_daily_rollup_table(
    tx: &FrankenTransaction<'_>,
    deltas: &HashMap<(i64, String, i64, String, String, String), UsageRollupDelta>,
    now: i64,
) -> Result<usize> {
    const UPSERT_SQL: &str = "INSERT INTO usage_models_daily (
day_id, agent_slug, workspace_id, source_id, model_family, model_tier,
message_count, user_message_count, assistant_message_count,
tool_call_count, plan_message_count, api_coverage_message_count,
content_tokens_est_total, content_tokens_est_user, content_tokens_est_assistant,
api_tokens_total, api_input_tokens_total, api_output_tokens_total,
api_cache_read_tokens_total, api_cache_creation_tokens_total,
api_thinking_tokens_total, last_updated
)
VALUES(?1,?2,?3,?4,?5,?6,?7,?8,?9,?10,?11,?12,?13,?14,?15,?16,?17,?18,?19,?20,?21,?22)
ON CONFLICT(day_id, agent_slug, workspace_id, source_id, model_family, model_tier) DO UPDATE SET
message_count = message_count + excluded.message_count,
user_message_count = user_message_count + excluded.user_message_count,
assistant_message_count = assistant_message_count + excluded.assistant_message_count,
tool_call_count = tool_call_count + excluded.tool_call_count,
plan_message_count = plan_message_count + excluded.plan_message_count,
api_coverage_message_count = api_coverage_message_count + excluded.api_coverage_message_count,
content_tokens_est_total = content_tokens_est_total + excluded.content_tokens_est_total,
content_tokens_est_user = content_tokens_est_user + excluded.content_tokens_est_user,
content_tokens_est_assistant = content_tokens_est_assistant + excluded.content_tokens_est_assistant,
api_tokens_total = api_tokens_total + excluded.api_tokens_total,
api_input_tokens_total = api_input_tokens_total + excluded.api_input_tokens_total,
api_output_tokens_total = api_output_tokens_total + excluded.api_output_tokens_total,
api_cache_read_tokens_total = api_cache_read_tokens_total + excluded.api_cache_read_tokens_total,
api_cache_creation_tokens_total = api_cache_creation_tokens_total + excluded.api_cache_creation_tokens_total,
api_thinking_tokens_total = api_thinking_tokens_total + excluded.api_thinking_tokens_total,
last_updated = excluded.last_updated";

    if deltas.is_empty() {
        return Ok(0);
    }
    let mut affected = 0;
    for (key, rollup) in deltas {
        let (day_id, agent, workspace_id, source, model_family, model_tier) = key;
        affected += tx.execute_compat(
            UPSERT_SQL,
            fparams![
                *day_id,
                agent.as_str(),
                *workspace_id,
                source.as_str(),
                model_family.as_str(),
                model_tier.as_str(),
                rollup.message_count,
                rollup.user_message_count,
                rollup.assistant_message_count,
                rollup.tool_call_count,
                rollup.plan_message_count,
                rollup.api_coverage_message_count,
                rollup.content_tokens_est_total,
                rollup.content_tokens_est_user,
                rollup.content_tokens_est_assistant,
                rollup.api_tokens_total,
                rollup.api_input_tokens_total,
                rollup.api_output_tokens_total,
                rollup.api_cache_read_tokens_total,
                rollup.api_cache_creation_tokens_total,
                rollup.api_thinking_tokens_total,
                now
            ],
        )?;
    }
    Ok(affected)
}
/// Flushes the three rollup maps held by `agg` into `usage_hourly`,
/// `usage_daily` and `usage_models_daily`, in that order, stamping every row
/// with the same `last_updated` time.
///
/// Returns the affected counts as `(hourly, daily, models_daily)`.
fn franken_flush_analytics_rollups_in_tx(
    tx: &FrankenTransaction<'_>,
    agg: &AnalyticsRollupAggregator,
) -> Result<(usize, usize, usize)> {
    let now = FrankenStorage::now_millis();
    // Tuple fields are evaluated left to right, so the flush order is preserved.
    Ok((
        franken_flush_rollup_table(tx, "usage_hourly", "hour_id", &agg.hourly, now)?,
        franken_flush_rollup_table(tx, "usage_daily", "day_id", &agg.daily, now)?,
        franken_flush_model_daily_rollup_table(tx, &agg.models_daily, now)?,
    ))
}
/// Recomputes the token summary columns of one `conversations` row from its
/// `token_usage` rows.
///
/// Token and cost totals are `SUM`s over the conversation's usage rows;
/// `primary_model` is the non-NULL `model_name` that occurs most often;
/// `api_call_count` counts rows with `data_source = 'api'`; the user and
/// assistant message counts are taken from the `role` column (`'agent'` counts
/// as assistant).
fn franken_update_conversation_token_summaries_in_tx(
    tx: &FrankenTransaction<'_>,
    conversation_id: i64,
) -> Result<()> {
    const REFRESH_SUMMARIES_SQL: &str = "UPDATE conversations SET
total_input_tokens = (SELECT SUM(input_tokens) FROM token_usage WHERE conversation_id = ?1),
total_output_tokens = (SELECT SUM(output_tokens) FROM token_usage WHERE conversation_id = ?1),
total_cache_read_tokens = (SELECT SUM(cache_read_tokens) FROM token_usage WHERE conversation_id = ?1),
total_cache_creation_tokens = (SELECT SUM(cache_creation_tokens) FROM token_usage WHERE conversation_id = ?1),
grand_total_tokens = (SELECT SUM(total_tokens) FROM token_usage WHERE conversation_id = ?1),
estimated_cost_usd = (SELECT SUM(estimated_cost_usd) FROM token_usage WHERE conversation_id = ?1),
primary_model = (SELECT model_name FROM token_usage WHERE conversation_id = ?1
AND model_name IS NOT NULL
GROUP BY model_name ORDER BY COUNT(*) DESC LIMIT 1),
api_call_count = (SELECT COUNT(*) FROM token_usage WHERE conversation_id = ?1
AND data_source = 'api'),
tool_call_count = (SELECT SUM(tool_call_count) FROM token_usage WHERE conversation_id = ?1),
user_message_count = (SELECT COUNT(*) FROM token_usage WHERE conversation_id = ?1
AND role = 'user'),
assistant_message_count = (SELECT COUNT(*) FROM token_usage WHERE conversation_id = ?1
AND role IN ('assistant', 'agent'))
WHERE id = ?1";

    // The affected-row count is intentionally discarded.
    let _rows_changed = tx.execute_compat(REFRESH_SUMMARIES_SQL, fparams![conversation_id])?;
    Ok(())
}
impl FrankenStorage {
/// Rebuilds `token_daily_stats` from scratch out of the rows in `token_usage`.
///
/// Everything runs inside one transaction: the table is emptied, then
/// conversations are walked in ascending-id batches of
/// `CONVERSATION_BATCH_SIZE`. For each conversation its `token_usage` rows are
/// paged in ascending-id batches of `TOKEN_USAGE_BATCH_SIZE` and recorded into
/// a per-batch aggregator, which is upserted once per conversation batch.
///
/// Each conversation also records one session on the day of its `started_at`
/// (day `0` when missing), attributed to the last non-`'unknown'` model family
/// seen among its usage rows, or `"unknown"` when there is none.
///
/// Returns the number of rows in `token_daily_stats` after the rebuild. This is
/// counted from the table itself because the same key can be upserted by more
/// than one conversation batch, so summing per-batch entry counts would
/// over-report.
pub fn rebuild_token_daily_stats(&self) -> Result<usize> {
    const CONVERSATION_BATCH_SIZE: usize = 1_000;
    const TOKEN_USAGE_BATCH_SIZE: usize = 10_000;
    let total_usage_rows: i64 =
        self.conn
            .query_row_map("SELECT COUNT(*) FROM token_usage", fparams![], |row| {
                row.get_typed(0)
            })?;
    tracing::info!(
        target: "cass::analytics",
        total_usage_rows,
        "token_daily_stats_rebuild_start"
    );
    let mut tx = self.conn.transaction()?;
    tx.execute("DELETE FROM token_daily_stats")?;
    // Keyset cursor over conversations.id.
    let mut last_conversation_id = 0_i64;
    loop {
        let conversation_rows = tx.query_map_collect(
            "SELECT c.id, c.started_at, c.source_id,
COALESCE((SELECT a.slug FROM agents a WHERE a.id = c.agent_id), 'unknown')
FROM conversations c
WHERE c.id > ?1
ORDER BY c.id
LIMIT ?2",
            fparams![last_conversation_id, CONVERSATION_BATCH_SIZE as i64],
            |row| {
                Ok((
                    row.get_typed::<i64>(0)?,
                    row.get_typed::<Option<i64>>(1)?,
                    row.get_typed::<String>(2)?,
                    row.get_typed::<String>(3)?,
                ))
            },
        )?;
        if conversation_rows.is_empty() {
            break;
        }
        let mut aggregate = TokenStatsAggregator::new();
        for (conversation_id, started_at, source_id, agent_slug) in conversation_rows {
            last_conversation_id = conversation_id;
            let conversation_day_id = started_at.map(Self::day_id_from_millis).unwrap_or(0);
            // Keyset cursor over token_usage.id within this conversation.
            let mut last_token_usage_id = 0_i64;
            let mut session_model_family = String::from("unknown");
            loop {
                let usage_rows = tx.query_map_collect(
                    "SELECT id, day_id, role,
COALESCE(model_family, 'unknown'),
input_tokens, output_tokens, cache_read_tokens,
cache_creation_tokens, thinking_tokens,
has_tool_calls, tool_call_count,
content_chars, estimated_cost_usd
FROM token_usage
WHERE conversation_id = ?1
AND id > ?2
ORDER BY id
LIMIT ?3",
                    fparams![
                        conversation_id,
                        last_token_usage_id,
                        TOKEN_USAGE_BATCH_SIZE as i64
                    ],
                    |row| {
                        Ok((
                            row.get_typed::<i64>(0)?,
                            row.get_typed::<i64>(1)?,
                            row.get_typed::<String>(2)?,
                            row.get_typed::<String>(3)?,
                            row.get_typed::<Option<i64>>(4)?,
                            row.get_typed::<Option<i64>>(5)?,
                            row.get_typed::<Option<i64>>(6)?,
                            row.get_typed::<Option<i64>>(7)?,
                            row.get_typed::<Option<i64>>(8)?,
                            row.get_typed::<i64>(9)?,
                            row.get_typed::<i64>(10)?,
                            row.get_typed::<i64>(11)?,
                            row.get_typed::<Option<f64>>(12)?,
                        ))
                    },
                )?;
                if usage_rows.is_empty() {
                    break;
                }
                for (
                    token_usage_id,
                    day_id,
                    role,
                    model_family,
                    input_tokens,
                    output_tokens,
                    cache_read_tokens,
                    cache_creation_tokens,
                    thinking_tokens,
                    has_tool_calls,
                    tool_call_count,
                    content_chars,
                    estimated_cost_usd,
                ) in usage_rows
                {
                    last_token_usage_id = token_usage_id;
                    if model_family != "unknown" {
                        session_model_family = model_family.clone();
                    }
                    let usage = crate::connectors::ExtractedTokenUsage {
                        model_name: None,
                        provider: None,
                        input_tokens,
                        output_tokens,
                        cache_read_tokens,
                        cache_creation_tokens,
                        thinking_tokens,
                        service_tier: None,
                        has_tool_calls: has_tool_calls != 0,
                        // Negative counts clamp to 0; counts beyond u32 saturate
                        // instead of silently collapsing to 0.
                        tool_call_count: u32::try_from(tool_call_count.max(0))
                            .unwrap_or(u32::MAX),
                        data_source: franken_agent_detection::TokenDataSource::Api,
                    };
                    aggregate.record(
                        &agent_slug,
                        &source_id,
                        day_id,
                        &model_family,
                        &role,
                        &usage,
                        content_chars,
                        estimated_cost_usd.unwrap_or(0.0),
                    );
                }
            }
            aggregate.record_session(
                &agent_slug,
                &source_id,
                conversation_day_id,
                &session_model_family,
            );
        }
        let entries = aggregate.expand();
        franken_update_token_daily_stats_batched_in_tx(&tx, &entries)?;
    }
    // Count the rebuilt table directly; see the doc comment for why.
    let table_rows: i64 = tx.query_row_map(
        "SELECT COUNT(*) FROM token_daily_stats",
        fparams![],
        |row| row.get_typed(0),
    )?;
    tx.commit()?;
    let rows_created = usize::try_from(table_rows).unwrap_or(usize::MAX);
    tracing::info!(
        target: "cass::analytics",
        rows_created,
        "token_daily_stats_rebuild_complete"
    );
    Ok(rows_created)
}
/// Rebuilds `message_metrics`, `usage_hourly`, `usage_daily` and
/// `usage_models_daily` from the `messages` and `conversations` tables.
///
/// Everything runs inside one transaction: the four tables are emptied, then
/// messages are scanned in ascending-id chunks of `CHUNK_SIZE`. Each chunk is
/// turned into `MessageMetricsEntry` values, inserted, and folded into the
/// rollup tables.
///
/// Chunks are fetched with a keyset cursor (`m.id > last id seen`) rather than
/// `OFFSET`, so the cost of a chunk does not grow with the number of rows
/// already processed. The cursor starts at 0, which assumes message ids are
/// positive, as the other rebuild scans in this impl also do.
///
/// A message's timestamp falls back to its conversation's `started_at`, then to
/// `0`. `content_chars` is the byte length of the content, and the token
/// estimate is that length divided by 4.
///
/// Returns the affected-row counters together with elapsed time and throughput.
pub fn rebuild_analytics(&self) -> Result<AnalyticsRebuildResult> {
    const CHUNK_SIZE: i64 = 10_000;
    let start = Instant::now();
    let total_messages: i64 =
        self.conn
            .query_row_map("SELECT COUNT(*) FROM messages", fparams![], |row| {
                row.get_typed(0)
            })?;
    tracing::info!(
        target: "cass::analytics",
        total_messages,
        "analytics_rebuild_start"
    );
    let mut tx = self.conn.transaction()?;
    tx.execute("DELETE FROM message_metrics")?;
    tx.execute("DELETE FROM usage_hourly")?;
    tx.execute("DELETE FROM usage_daily")?;
    tx.execute("DELETE FROM usage_models_daily")?;
    // Keyset cursor over messages.id.
    let mut last_message_id = 0_i64;
    // Cumulative number of joined rows processed; only used for progress logging.
    let mut rows_scanned: i64 = 0;
    let mut total_inserted: usize = 0;
    let mut usage_hourly_rows: usize = 0;
    let mut usage_daily_rows: usize = 0;
    let mut usage_models_daily_rows: usize = 0;
    // Shared fallback so messages without extra data do not allocate or clone.
    let null_extra = serde_json::Value::Null;
    loop {
        // (message id, role, content, extra, effective timestamp, workspace id,
        //  source id, agent slug). A tuple keeps the row closure simple.
        #[allow(clippy::type_complexity)]
        let rows: Vec<(
            i64,
            String,
            String,
            Option<serde_json::Value>,
            i64,
            Option<i64>,
            String,
            String,
        )> = tx.query_map_collect(
            "SELECT m.id, m.idx, m.role, m.content, m.extra_json, m.extra_bin,
m.created_at,
c.id AS conv_id, c.started_at AS conv_started_at,
c.source_id, c.workspace_id,
COALESCE((SELECT a.slug FROM agents a WHERE a.id = c.agent_id), 'unknown') AS agent_slug
FROM messages m
JOIN conversations c ON m.conversation_id = c.id
WHERE m.id > ?2
ORDER BY m.id
LIMIT ?1",
            fparams![CHUNK_SIZE, last_message_id],
            |row| {
                let msg_id: i64 = row.get_typed(0)?;
                let role: String = row.get_typed(2)?;
                let content: String = row.get_typed(3)?;
                // Prefer the JSON column; fall back to the MessagePack blob.
                // Undecodable data in either column is treated as absent.
                let extra_json = row
                    .get_typed::<Option<String>>(4)?
                    .and_then(|s| serde_json::from_str(&s).ok())
                    .or_else(|| {
                        row.get_typed::<Option<Vec<u8>>>(5)
                            .ok()
                            .flatten()
                            .and_then(|b| rmp_serde::from_slice(&b).ok())
                    });
                let msg_ts: Option<i64> = row.get_typed(6)?;
                let conv_started_at: Option<i64> = row.get_typed(8)?;
                let source_id: String = row.get_typed(9)?;
                let workspace_id: Option<i64> = row.get_typed(10)?;
                let agent_slug: String = row.get_typed(11)?;
                let effective_ts = msg_ts.or(conv_started_at).unwrap_or(0);
                Ok((
                    msg_id,
                    role,
                    content,
                    extra_json,
                    effective_ts,
                    workspace_id,
                    source_id,
                    agent_slug,
                ))
            },
        )?;
        // Rows are ordered by id, so the last one carries the next cursor value.
        let Some(last_row) = rows.last() else {
            break;
        };
        last_message_id = last_row.0;
        let chunk_len = rows.len();
        let mut entries = Vec::with_capacity(chunk_len);
        let mut rollup_agg = AnalyticsRollupAggregator::new();
        for (
            msg_id,
            role,
            content,
            extra_json,
            effective_ts,
            workspace_id,
            source_id,
            agent_slug,
        ) in &rows
        {
            let ts = *effective_ts;
            let day_id = Self::day_id_from_millis(ts);
            let hour_id = Self::hour_id_from_millis(ts);
            let content_chars = content.len() as i64;
            let content_tokens_est = content_chars / 4;
            let extra = extra_json.as_ref().unwrap_or(&null_extra);
            let usage =
                crate::connectors::extract_tokens_for_agent(agent_slug, extra, content, role);
            let model_info = usage
                .model_name
                .as_deref()
                .map(crate::connectors::normalize_model);
            let model_family = model_info
                .as_ref()
                .map(|i| i.family.clone())
                .unwrap_or_else(|| "unknown".into());
            let model_tier = model_info
                .as_ref()
                .map(|i| i.tier.clone())
                .unwrap_or_else(|| "unknown".into());
            let provider = usage
                .provider
                .clone()
                .or_else(|| model_info.as_ref().map(|i| i.provider.clone()))
                .unwrap_or_else(|| "unknown".into());
            let entry = MessageMetricsEntry {
                message_id: *msg_id,
                created_at_ms: ts,
                hour_id,
                day_id,
                agent_slug: agent_slug.clone(),
                workspace_id: workspace_id.unwrap_or(0),
                source_id: source_id.clone(),
                role: role.clone(),
                content_chars,
                content_tokens_est,
                model_name: usage.model_name.clone(),
                model_family,
                model_tier,
                provider,
                api_input_tokens: usage.input_tokens,
                api_output_tokens: usage.output_tokens,
                api_cache_read_tokens: usage.cache_read_tokens,
                api_cache_creation_tokens: usage.cache_creation_tokens,
                api_thinking_tokens: usage.thinking_tokens,
                api_service_tier: usage.service_tier,
                api_data_source: usage.data_source.as_str().to_string(),
                tool_call_count: usage.tool_call_count as i64,
                has_tool_calls: usage.has_tool_calls,
                has_plan: has_plan_for_role(role, content),
            };
            rollup_agg.record(&entry);
            entries.push(entry);
        }
        total_inserted += franken_insert_message_metrics_batched_in_tx(&tx, &entries)?;
        let (hourly, daily, models_daily) =
            franken_flush_analytics_rollups_in_tx(&tx, &rollup_agg)?;
        usage_hourly_rows += hourly;
        usage_daily_rows += daily;
        usage_models_daily_rows += models_daily;
        rows_scanned += chunk_len as i64;
        tracing::debug!(
            target: "cass::analytics",
            offset = rows_scanned,
            chunk = chunk_len,
            inserted = entries.len(),
            total = total_inserted,
            "analytics_rebuild_chunk"
        );
        // A short chunk means the scan reached the end; skip the empty follow-up query.
        if (chunk_len as i64) < CHUNK_SIZE {
            break;
        }
    }
    tx.commit()?;
    let elapsed = start.elapsed();
    let elapsed_ms = elapsed.as_millis() as u64;
    let msgs_per_sec = if elapsed_ms > 0 {
        (total_inserted as f64) / (elapsed_ms as f64 / 1000.0)
    } else {
        0.0
    };
    tracing::info!(
        target: "cass::analytics",
        message_metrics_rows = total_inserted,
        usage_hourly_rows,
        usage_daily_rows,
        usage_models_daily_rows,
        elapsed_ms,
        messages_per_sec = format!("{:.0}", msgs_per_sec),
        "analytics_rebuild_complete"
    );
    Ok(AnalyticsRebuildResult {
        message_metrics_rows: total_inserted,
        usage_hourly_rows,
        usage_daily_rows,
        usage_models_daily_rows,
        elapsed_ms,
        messages_per_sec: msgs_per_sec,
    })
}
pub fn rebuild_daily_stats(&self) -> Result<DailyStatsRebuildResult> {
const DAILY_STATS_REBUILD_CONVERSATION_BATCH_SIZE: usize = 1_000;
const DAILY_STATS_REBUILD_MESSAGE_BATCH_SIZE: usize = 10_000;
let mut conversation_batch_size = rebuild_batch_size_env(
"CASS_DAILY_STATS_REBUILD_CONVERSATION_BATCH_SIZE",
DAILY_STATS_REBUILD_CONVERSATION_BATCH_SIZE,
);
let mut message_batch_size = rebuild_batch_size_env(
"CASS_DAILY_STATS_REBUILD_MESSAGE_BATCH_SIZE",
DAILY_STATS_REBUILD_MESSAGE_BATCH_SIZE,
);
let total_messages: i64 =
self.conn
.query_row_map("SELECT COUNT(*) FROM messages", fparams![], |row| {
row.get_typed(0)
})?;
let message_metrics_rows: i64 =
self.conn
.query_row_map("SELECT COUNT(*) FROM message_metrics", fparams![], |row| {
row.get_typed(0)
})?;
let use_message_metrics = total_messages > 0 && total_messages == message_metrics_rows;
tracing::info!(
target: "cass::perf::daily_stats",
total_messages,
message_metrics_rows,
use_message_metrics,
"daily_stats rebuild selected message source"
);
let mut tx = self.conn.transaction()?;
tx.execute("DELETE FROM daily_stats")?;
let mut last_conversation_id = 0_i64;
let mut conversation_batch_count = 0_usize;
let mut conversations_processed = 0_usize;
let mut messages_processed = 0_usize;
let mut message_batch_count = 0_usize;
let mut raw_entries_flushed = 0_usize;
let mut expanded_entries_flushed = 0_usize;
let message_scan_sql = if use_message_metrics {
"SELECT m.idx, mm.content_chars
FROM messages m
JOIN message_metrics mm ON mm.message_id = m.id
WHERE m.conversation_id = ?1
AND m.idx > ?2
ORDER BY m.conversation_id, m.idx
LIMIT ?3"
} else {
"SELECT m.idx, COALESCE(LENGTH(CAST(m.content AS BLOB)), 0)
FROM messages m
WHERE m.conversation_id = ?1
AND m.idx > ?2
ORDER BY m.conversation_id, m.idx
LIMIT ?3"
};
loop {
let conversation_rows = match self.conn.query_with_params(
"SELECT c.id, c.started_at,
COALESCE((SELECT a.slug FROM agents a WHERE a.id = c.agent_id), 'unknown'),
c.source_id
FROM conversations c
WHERE c.id > ?1
ORDER BY c.id
LIMIT ?2",
¶ms_from_iter([
ParamValue::from(last_conversation_id),
ParamValue::from(conversation_batch_size as i64),
]),
) {
Ok(rows) => rows,
Err(err) if is_out_of_memory_error(&err) && conversation_batch_size > 1 => {
let previous_batch_size = conversation_batch_size;
conversation_batch_size = (conversation_batch_size / 2).max(1);
tracing::warn!(
previous_batch_size,
conversation_batch_size,
last_conversation_id,
"daily_stats conversation scan ran out of memory; retrying with smaller batch"
);
continue;
}
Err(err) => return Err(err.into()),
};
if conversation_rows.is_empty() {
break;
}
let mut aggregate = StatsAggregator::new();
let mut conversation_batch_meta: Vec<(i64, i64, String, String)> =
Vec::with_capacity(conversation_rows.len());
for row in &conversation_rows {
let conversation_id: i64 = row.get_typed(0)?;
let started_at: Option<i64> = row.get_typed(1)?;
let agent_slug: String = row.get_typed(2)?;
let source_id: String = row.get_typed(3)?;
last_conversation_id = conversation_id;
let day_id = started_at.map(Self::day_id_from_millis).unwrap_or(0);
aggregate.record_delta(&agent_slug, &source_id, day_id, 1, 0, 0);
conversation_batch_meta.push((conversation_id, day_id, agent_slug, source_id));
conversations_processed += 1;
}
conversation_batch_count += 1;
raw_entries_flushed += aggregate.raw_entry_count();
let entries = aggregate.expand();
expanded_entries_flushed += entries.len();
if !entries.is_empty() {
franken_update_daily_stats_batched_in_tx(&tx, &entries)?;
}
if conversation_batch_count.is_multiple_of(25) {
tracing::info!(
target: "cass::perf::daily_stats",
conversations_processed,
batches = conversation_batch_count,
batch_size = conversation_batch_size,
last_conversation_id,
"daily_stats rebuild conversation scan progress"
);
}
if conversation_batch_meta.is_empty() {
continue;
}
for (conversation_id, day_id, agent_slug, source_id) in conversation_batch_meta {
let mut cursor_message_idx = -1_i64;
loop {
let message_rows = match self.conn.query_with_params(
message_scan_sql,
¶ms_from_iter([
ParamValue::from(conversation_id),
ParamValue::from(cursor_message_idx),
ParamValue::from(message_batch_size as i64),
]),
) {
Ok(rows) => rows,
Err(err) if is_out_of_memory_error(&err) && message_batch_size > 1 => {
let previous_batch_size = message_batch_size;
message_batch_size = (message_batch_size / 2).max(1);
tracing::warn!(
previous_batch_size,
message_batch_size,
conversation_id,
cursor_message_idx,
"daily_stats message scan ran out of memory; retrying with smaller batch"
);
continue;
}
Err(err) => return Err(err.into()),
};
if message_rows.is_empty() {
break;
}
let mut aggregate = StatsAggregator::new();
for row in &message_rows {
let message_idx: i64 = row.get_typed(0)?;
let content_len: i64 = row.get_typed(1)?;
cursor_message_idx = message_idx;
aggregate.record_delta(&agent_slug, &source_id, day_id, 0, 1, content_len);
messages_processed += 1;
}
message_batch_count += 1;
raw_entries_flushed += aggregate.raw_entry_count();
let entries = aggregate.expand();
expanded_entries_flushed += entries.len();
if !entries.is_empty() {
franken_update_daily_stats_batched_in_tx(&tx, &entries)?;
}
if message_batch_count.is_multiple_of(50) {
tracing::info!(
target: "cass::perf::daily_stats",
messages_processed,
batches = message_batch_count,
batch_size = message_batch_size,
source = if use_message_metrics {
"message_metrics"
} else {
"messages"
},
conversation_id,
cursor_message_idx,
"daily_stats rebuild message scan progress"
);
}
}
}
}
let rows_created: i64 =
tx.query_row_map("SELECT COUNT(*) FROM daily_stats", fparams![], |row| {
row.get_typed(0)
})?;
let total_sessions: i64 = tx.query_row_map(
"SELECT COALESCE(SUM(session_count), 0) FROM daily_stats WHERE agent_slug = 'all' AND source_id = 'all'",
fparams![],
|row| row.get_typed(0),
)?;
tx.commit()?;
tracing::info!(
target: "cass::perf::daily_stats",
rows_created,
total_sessions,
conversations_processed,
conversation_batches = conversation_batch_count,
conversation_batch_size,
message_batches = message_batch_count,
message_batch_size,
messages_processed,
use_message_metrics,
raw_entries_flushed,
expanded_entries_flushed,
"Daily stats rebuilt from conversations"
);
Ok(DailyStatsRebuildResult {
rows_created,
total_sessions,
})
}
}
/// In-memory memo of agent and workspace ids, with hit/miss counters.
///
/// Entries are filled by `get_or_insert_agent` / `get_or_insert_workspace`,
/// which consult an [`IndexingCacheStorage`] only when the key is not cached.
#[derive(Debug, Default)]
pub struct IndexingCache {
/// Agent id keyed by agent slug.
agent_ids: HashMap<String, i64>,
/// Workspace id keyed by workspace path.
workspace_ids: HashMap<PathBuf, i64>,
/// Number of lookups answered from the maps above.
hits: u64,
/// Number of lookups that had to go to storage.
misses: u64,
}
/// Storage operations that [`IndexingCache`] falls back to on a cache miss.
pub trait IndexingCacheStorage {
/// Returns the id for `agent`.
///
/// NOTE(review): the name suggests the row is created when absent; that is
/// up to the implementor and not visible here.
fn ensure_indexing_agent(&self, agent: &Agent) -> Result<i64>;
/// Returns the id for the workspace at `path`, passing `display_name` through
/// to the implementor.
fn ensure_indexing_workspace(&self, path: &Path, display_name: Option<&str>) -> Result<i64>;
}
/// Adapts `FrankenStorage` to the cache trait by delegating to its own
/// `ensure_agent` / `ensure_workspace` methods.
impl IndexingCacheStorage for FrankenStorage {
// Straight delegation; no extra caching or validation happens here.
fn ensure_indexing_agent(&self, agent: &Agent) -> Result<i64> {
self.ensure_agent(agent)
}
// Straight delegation; `display_name` is forwarded unchanged.
fn ensure_indexing_workspace(&self, path: &Path, display_name: Option<&str>) -> Result<i64> {
self.ensure_workspace(path, display_name)
}
}
impl IndexingCache {
pub fn new() -> Self {
Self {
agent_ids: HashMap::new(),
workspace_ids: HashMap::new(),
hits: 0,
misses: 0,
}
}
pub fn is_enabled() -> bool {
dotenvy::var("CASS_SQLITE_CACHE")
.map(|v| v != "0" && v.to_lowercase() != "false")
.unwrap_or(true)
}
pub fn get_or_insert_agent<S>(&mut self, storage: &S, agent: &Agent) -> Result<i64>
where
S: IndexingCacheStorage + ?Sized,
{
if let Some(&cached) = self.agent_ids.get(&agent.slug) {
self.hits += 1;
return Ok(cached);
}
self.misses += 1;
let id = storage.ensure_indexing_agent(agent)?;
self.agent_ids.insert(agent.slug.clone(), id);
Ok(id)
}
pub fn get_or_insert_workspace(
&mut self,
storage: &(impl IndexingCacheStorage + ?Sized),
path: &Path,
display_name: Option<&str>,
) -> Result<i64> {
if let Some(&cached) = self.workspace_ids.get(path) {
self.hits += 1;
return Ok(cached);
}
self.misses += 1;
let id = storage.ensure_indexing_workspace(path, display_name)?;
self.workspace_ids.insert(path.to_path_buf(), id);
Ok(id)
}
pub fn stats(&self) -> (u64, u64, f64) {
let total = self.hits + self.misses;
let hit_rate = if total > 0 {
self.hits as f64 / total as f64
} else {
0.0
};
(self.hits, self.misses, hit_rate)
}
pub fn clear(&mut self) {
self.agent_ids.clear();
self.workspace_ids.clear();
self.hits = 0;
self.misses = 0;
}
pub fn agent_count(&self) -> usize {
self.agent_ids.len()
}
pub fn workspace_count(&self) -> usize {
self.workspace_ids.len()
}
}
/// Additive change to one stats bucket.
#[derive(Clone, Copy, Debug, Default)]
pub struct StatsDelta {
    /// Sessions to add.
    pub session_count_delta: i64,
    /// Messages to add.
    pub message_count_delta: i64,
    /// Content characters to add.
    pub total_chars_delta: i64,
}

/// Accumulates [`StatsDelta`]s per `(day_id, agent_slug, source_id)` so they
/// can be flushed in one batch.
#[derive(Debug, Default)]
pub struct StatsAggregator {
    deltas: HashMap<(i64, String, String), StatsDelta>,
}

impl StatsAggregator {
    /// Creates an aggregator with no pending deltas.
    pub fn new() -> Self {
        Self::default()
    }

    /// Records one session together with its message and character totals.
    pub fn record(
        &mut self,
        agent_slug: &str,
        source_id: &str,
        day_id: i64,
        message_count: i64,
        total_chars: i64,
    ) {
        self.record_delta(agent_slug, source_id, day_id, 1, message_count, total_chars);
    }

    /// Adds the given deltas to the bucket for `(day_id, agent_slug, source_id)`.
    ///
    /// A call where all three deltas are zero is ignored and creates no bucket.
    pub fn record_delta(
        &mut self,
        agent_slug: &str,
        source_id: &str,
        day_id: i64,
        session_count_delta: i64,
        message_count_delta: i64,
        total_chars_delta: i64,
    ) {
        let nothing_to_add = [session_count_delta, message_count_delta, total_chars_delta]
            .iter()
            .all(|&amount| amount == 0);
        if nothing_to_add {
            return;
        }
        let bucket = self
            .deltas
            .entry((day_id, agent_slug.to_owned(), source_id.to_owned()))
            .or_default();
        bucket.session_count_delta += session_count_delta;
        bucket.message_count_delta += message_count_delta;
        bucket.total_chars_delta += total_chars_delta;
    }

    /// Expands every raw bucket into its roll-up rows and returns them sorted
    /// by `(day_id, agent, source)`.
    ///
    /// Each raw bucket contributes to `(agent, source)`, `("all", source)`,
    /// `(agent, "all")` and `("all", "all")`. When the raw agent or source is
    /// itself `"all"`, the coinciding targets are counted only once.
    pub fn expand(&self) -> Vec<(i64, String, String, StatsDelta)> {
        let mut rolled_up: HashMap<(i64, String, String), StatsDelta> = HashMap::new();
        for ((day, agent, source), delta) in &self.deltas {
            let (agent, source) = (agent.as_str(), source.as_str());
            let targets = [(agent, source), ("all", source), (agent, "all"), ("all", "all")];
            for (position, &(target_agent, target_source)) in targets.iter().enumerate() {
                // Skip a target already visited for this raw bucket.
                if targets[..position].contains(&(target_agent, target_source)) {
                    continue;
                }
                let slot = rolled_up
                    .entry((*day, target_agent.to_owned(), target_source.to_owned()))
                    .or_default();
                slot.session_count_delta += delta.session_count_delta;
                slot.message_count_delta += delta.message_count_delta;
                slot.total_chars_delta += delta.total_chars_delta;
            }
        }

        let mut rows: Vec<(i64, String, String, StatsDelta)> = rolled_up
            .into_iter()
            .map(|((day, agent, source), delta)| (day, agent, source, delta))
            .collect();
        rows.sort_by(|left, right| {
            (left.0, &left.1, &left.2).cmp(&(right.0, &right.1, &right.2))
        });
        rows
    }

    /// True when no non-zero delta has been recorded.
    pub fn is_empty(&self) -> bool {
        self.deltas.is_empty()
    }

    /// Number of raw (unexpanded) buckets.
    pub fn raw_entry_count(&self) -> usize {
        self.deltas.len()
    }
}
/// Additive token/usage totals for one `TokenStatsAggregator` bucket.
///
/// All counters start at zero (`Default`) and are only ever added to by the
/// aggregator's `record`, `record_session` and `expand`.
#[derive(Clone, Debug, Default)]
pub struct TokenStatsDelta {
/// Incremented once per `TokenStatsAggregator::record` call.
pub api_call_count: i64,
/// Records whose role was `"user"`.
pub user_message_count: i64,
/// Records whose role was `"assistant"` or `"agent"`.
pub assistant_message_count: i64,
/// Records whose role was `"tool"`.
pub tool_message_count: i64,
// The token totals below sum the matching optional usage fields, treating a
// missing value as 0.
pub total_input_tokens: i64,
pub total_output_tokens: i64,
pub total_cache_read_tokens: i64,
pub total_cache_creation_tokens: i64,
pub total_thinking_tokens: i64,
/// Sum of `usage.total_tokens()` (missing treated as 0).
pub grand_total_tokens: i64,
/// Sum of the `content_chars` passed to `record`.
pub total_content_chars: i64,
/// Sum of `usage.tool_call_count`.
pub total_tool_calls: i64,
/// Sum of the `estimated_cost_usd` passed to `record`.
pub estimated_cost_usd: f64,
/// Incremented once per `TokenStatsAggregator::record_session` call.
pub session_count: i64,
}
/// Accumulates [`TokenStatsDelta`]s in memory before they are expanded.
#[derive(Debug, Default)]
pub struct TokenStatsAggregator {
/// Pending totals keyed by `(day_id, agent_slug, source_id, model_family)`.
deltas: HashMap<(i64, String, String, String), TokenStatsDelta>,
}
impl TokenStatsAggregator {
    /// Creates an aggregator with no pending deltas.
    pub fn new() -> Self {
        Self::default()
    }

    /// Adds one message's usage to the bucket for
    /// `(day_id, agent_slug, source_id, model_family)`.
    ///
    /// `role` selects which per-role message counter is bumped (`"user"`,
    /// `"assistant"`/`"agent"`, `"tool"`); any other role only counts towards
    /// `api_call_count`. Missing token fields in `usage` contribute 0.
    #[allow(clippy::too_many_arguments)]
    pub fn record(
        &mut self,
        agent_slug: &str,
        source_id: &str,
        day_id: i64,
        model_family: &str,
        role: &str,
        usage: &crate::connectors::ExtractedTokenUsage,
        content_chars: i64,
        estimated_cost_usd: f64,
    ) {
        let bucket = self
            .deltas
            .entry((
                day_id,
                agent_slug.to_owned(),
                source_id.to_owned(),
                model_family.to_owned(),
            ))
            .or_default();

        bucket.api_call_count += 1;
        if role == "user" {
            bucket.user_message_count += 1;
        } else if role == "assistant" || role == "agent" {
            bucket.assistant_message_count += 1;
        } else if role == "tool" {
            bucket.tool_message_count += 1;
        }

        bucket.total_input_tokens += usage.input_tokens.unwrap_or(0);
        bucket.total_output_tokens += usage.output_tokens.unwrap_or(0);
        bucket.total_cache_read_tokens += usage.cache_read_tokens.unwrap_or(0);
        bucket.total_cache_creation_tokens += usage.cache_creation_tokens.unwrap_or(0);
        bucket.total_thinking_tokens += usage.thinking_tokens.unwrap_or(0);
        bucket.grand_total_tokens += usage.total_tokens().unwrap_or(0);
        bucket.total_content_chars += content_chars;
        bucket.total_tool_calls += usage.tool_call_count as i64;
        bucket.estimated_cost_usd += estimated_cost_usd;
    }

    /// Counts one session in the bucket for
    /// `(day_id, agent_slug, source_id, model_family)`.
    pub fn record_session(
        &mut self,
        agent_slug: &str,
        source_id: &str,
        day_id: i64,
        model_family: &str,
    ) {
        let bucket = self
            .deltas
            .entry((
                day_id,
                agent_slug.to_owned(),
                source_id.to_owned(),
                model_family.to_owned(),
            ))
            .or_default();
        bucket.session_count += 1;
    }

    /// Expands every raw bucket into its roll-up rows, sorted by
    /// `(day_id, agent, source, model)`.
    ///
    /// Each raw bucket contributes to five targets: the exact key, each of the
    /// three dimensions individually replaced by `"all"`, and
    /// `("all", "all", "all")`. Targets that coincide for a given raw bucket
    /// are counted once.
    pub fn expand(&self) -> Vec<(i64, String, String, String, TokenStatsDelta)> {
        /// Adds every counter of `part` onto `total`.
        fn absorb(total: &mut TokenStatsDelta, part: &TokenStatsDelta) {
            total.api_call_count += part.api_call_count;
            total.user_message_count += part.user_message_count;
            total.assistant_message_count += part.assistant_message_count;
            total.tool_message_count += part.tool_message_count;
            total.total_input_tokens += part.total_input_tokens;
            total.total_output_tokens += part.total_output_tokens;
            total.total_cache_read_tokens += part.total_cache_read_tokens;
            total.total_cache_creation_tokens += part.total_cache_creation_tokens;
            total.total_thinking_tokens += part.total_thinking_tokens;
            total.grand_total_tokens += part.grand_total_tokens;
            total.total_content_chars += part.total_content_chars;
            total.total_tool_calls += part.total_tool_calls;
            total.estimated_cost_usd += part.estimated_cost_usd;
            total.session_count += part.session_count;
        }

        let mut rolled_up: HashMap<(i64, String, String, String), TokenStatsDelta> =
            HashMap::new();
        for ((day_id, agent, source, model), delta) in &self.deltas {
            let (agent, source, model) = (agent.as_str(), source.as_str(), model.as_str());
            let targets = [
                (agent, source, model),
                ("all", source, model),
                (agent, "all", model),
                (agent, source, "all"),
                ("all", "all", "all"),
            ];
            for (position, &(a, s, m)) in targets.iter().enumerate() {
                // Skip a target already visited for this raw bucket.
                if targets[..position].contains(&(a, s, m)) {
                    continue;
                }
                let slot = rolled_up
                    .entry((*day_id, a.to_owned(), s.to_owned(), m.to_owned()))
                    .or_default();
                absorb(slot, delta);
            }
        }

        let mut rows: Vec<(i64, String, String, String, TokenStatsDelta)> = rolled_up
            .into_iter()
            .map(|((day, agent, source, model), delta)| (day, agent, source, model, delta))
            .collect();
        rows.sort_by(|left, right| {
            (left.0, &left.1, &left.2, &left.3).cmp(&(right.0, &right.1, &right.2, &right.3))
        });
        rows
    }

    /// True when nothing has been recorded.
    pub fn is_empty(&self) -> bool {
        self.deltas.is_empty()
    }

    /// Number of raw (unexpanded) buckets.
    pub fn raw_entry_count(&self) -> usize {
        self.deltas.len()
    }
}
/// Additive usage totals for one rollup bucket.
///
/// Every field is a running sum maintained by
/// [`AnalyticsRollupAggregator::record`].
#[derive(Clone, Debug, Default)]
pub struct UsageRollupDelta {
    pub message_count: i64,
    pub user_message_count: i64,
    pub assistant_message_count: i64,
    pub tool_call_count: i64,
    pub plan_message_count: i64,
    pub plan_content_tokens_est_total: i64,
    pub plan_api_tokens_total: i64,
    /// Messages whose `api_data_source` was `"api"`.
    pub api_coverage_message_count: i64,
    pub content_tokens_est_total: i64,
    pub content_tokens_est_user: i64,
    pub content_tokens_est_assistant: i64,
    pub api_tokens_total: i64,
    pub api_input_tokens_total: i64,
    pub api_output_tokens_total: i64,
    pub api_cache_read_tokens_total: i64,
    pub api_cache_creation_tokens_total: i64,
    pub api_thinking_tokens_total: i64,
}

/// Per-message metrics fed into [`AnalyticsRollupAggregator::record`].
#[derive(Debug, Clone)]
pub struct MessageMetricsEntry {
    pub message_id: i64,
    pub created_at_ms: i64,
    /// Bucket id used for the hourly rollup.
    pub hour_id: i64,
    /// Bucket id used for the daily and per-model rollups.
    pub day_id: i64,
    pub agent_slug: String,
    pub workspace_id: i64,
    pub source_id: String,
    /// `"user"` and `"assistant"`/`"agent"` drive the per-role counters.
    pub role: String,
    pub content_chars: i64,
    pub content_tokens_est: i64,
    pub model_name: Option<String>,
    pub model_family: String,
    pub model_tier: String,
    pub provider: String,
    pub api_input_tokens: Option<i64>,
    pub api_output_tokens: Option<i64>,
    pub api_cache_read_tokens: Option<i64>,
    pub api_cache_creation_tokens: Option<i64>,
    pub api_thinking_tokens: Option<i64>,
    pub api_service_tier: Option<String>,
    /// API token totals are only counted when this equals `"api"`.
    pub api_data_source: String,
    pub tool_call_count: i64,
    pub has_tool_calls: bool,
    pub has_plan: bool,
}

/// Accumulates hourly, daily and per-model daily usage rollups in memory.
#[derive(Debug, Default)]
pub struct AnalyticsRollupAggregator {
    /// Keyed by `(hour_id, agent_slug, workspace_id, source_id)`.
    hourly: HashMap<(i64, String, i64, String), UsageRollupDelta>,
    /// Keyed by `(day_id, agent_slug, workspace_id, source_id)`.
    daily: HashMap<(i64, String, i64, String), UsageRollupDelta>,
    /// Keyed by `(day_id, agent_slug, workspace_id, source_id, model_family, model_tier)`.
    models_daily: HashMap<(i64, String, i64, String, String, String), UsageRollupDelta>,
}

impl AnalyticsRollupAggregator {
    /// Creates an aggregator with all three rollups empty.
    pub fn new() -> Self {
        Self::default()
    }

    /// Folds one message into the hourly, daily and per-model daily buckets.
    ///
    /// The same contribution is applied to each of the three buckets. API
    /// token totals are only added when `entry.api_data_source == "api"`;
    /// missing API token fields count as 0.
    pub fn record(&mut self, entry: &MessageMetricsEntry) {
        let content_est = entry.content_tokens_est;
        let api_input = entry.api_input_tokens.unwrap_or(0);
        let api_output = entry.api_output_tokens.unwrap_or(0);
        let api_cache_read = entry.api_cache_read_tokens.unwrap_or(0);
        let api_cache_creation = entry.api_cache_creation_tokens.unwrap_or(0);
        let api_thinking = entry.api_thinking_tokens.unwrap_or(0);
        let api_total =
            api_input + api_output + api_cache_read + api_cache_creation + api_thinking;

        let counts_as_api = entry.api_data_source == "api";
        let from_user = entry.role == "user";
        let from_assistant = matches!(entry.role.as_str(), "assistant" | "agent");

        // One message's contribution; identical for every bucket it lands in.
        let apply = |bucket: &mut UsageRollupDelta| {
            bucket.message_count += 1;
            if from_user {
                bucket.user_message_count += 1;
                bucket.content_tokens_est_user += content_est;
            }
            if from_assistant {
                bucket.assistant_message_count += 1;
                bucket.content_tokens_est_assistant += content_est;
            }
            bucket.tool_call_count += entry.tool_call_count;
            if entry.has_plan {
                bucket.plan_message_count += 1;
                bucket.plan_content_tokens_est_total += content_est;
                if counts_as_api {
                    bucket.plan_api_tokens_total += api_total;
                }
            }
            if counts_as_api {
                bucket.api_coverage_message_count += 1;
                bucket.api_tokens_total += api_total;
                bucket.api_input_tokens_total += api_input;
                bucket.api_output_tokens_total += api_output;
                bucket.api_cache_read_tokens_total += api_cache_read;
                bucket.api_cache_creation_tokens_total += api_cache_creation;
                bucket.api_thinking_tokens_total += api_thinking;
            }
            bucket.content_tokens_est_total += content_est;
        };

        let time_bucket_key = |bucket_id: i64| {
            (
                bucket_id,
                entry.agent_slug.clone(),
                entry.workspace_id,
                entry.source_id.clone(),
            )
        };

        apply(self.hourly.entry(time_bucket_key(entry.hour_id)).or_default());
        apply(self.daily.entry(time_bucket_key(entry.day_id)).or_default());
        apply(
            self.models_daily
                .entry((
                    entry.day_id,
                    entry.agent_slug.clone(),
                    entry.workspace_id,
                    entry.source_id.clone(),
                    entry.model_family.clone(),
                    entry.model_tier.clone(),
                ))
                .or_default(),
        );
    }

    /// True when none of the three rollups holds a bucket.
    pub fn is_empty(&self) -> bool {
        self.hourly.is_empty() && self.daily.is_empty() && self.models_daily.is_empty()
    }

    /// Number of hourly buckets.
    pub fn hourly_entry_count(&self) -> usize {
        self.hourly.len()
    }

    /// Number of daily buckets.
    pub fn daily_entry_count(&self) -> usize {
        self.daily.len()
    }

    /// Number of per-model daily buckets.
    pub fn models_daily_entry_count(&self) -> usize {
        self.models_daily.len()
    }
}
/// Returns true when `role` is an assistant/agent role (ASCII case-insensitive,
/// surrounding whitespace ignored) and `content` passes [`has_plan_heuristic`].
fn has_plan_for_role(role: &str, content: &str) -> bool {
    let normalized = role.trim();
    let speaker_can_plan = ["assistant", "agent"]
        .iter()
        .any(|candidate| normalized.eq_ignore_ascii_case(candidate));
    speaker_can_plan && has_plan_heuristic(content)
}

/// Heuristically decides whether `content` looks like a written plan.
///
/// The text is lowercased, fenced code blocks and blank lines are dropped, and
/// at most the first 60 remaining lines are examined. A plan needs:
/// * a plan header within the first seven kept lines, or `plan:` anywhere in
///   the first eight kept lines;
/// * an actual header line if the text also looks like tool output;
/// * enough step lines: two numbered steps, one numbered plus one bullet, or
///   three bullets.
fn has_plan_heuristic(content: &str) -> bool {
    const MIN_CONTENT_BYTES: usize = 24;
    const MAX_SCANNED_LINES: usize = 60;
    const TOOL_BLOB_MARKERS: [&str; 5] = ["```", "\"tool\"", "stdout:", "stderr:", "exit code:"];
    const HEADER_PREFIXES: [&str; 6] = [
        "## plan",
        "# plan",
        "plan:",
        "implementation plan",
        "next steps:",
        "action plan:",
    ];
    // Checkbox items ("- [ ] ", "- [x] ") already start with "- ".
    const BULLET_PREFIXES: [&str; 3] = ["- ", "* ", "+ "];

    if content.len() < MIN_CONTENT_BYTES {
        return false;
    }

    let lowered = content.to_lowercase();
    let looks_like_tool_blob = TOOL_BLOB_MARKERS
        .iter()
        .any(|&marker| lowered.contains(marker));

    // Keep the first non-blank lines that sit outside fenced code blocks.
    let mut kept: Vec<&str> = Vec::with_capacity(MAX_SCANNED_LINES);
    let mut inside_fence = false;
    for line in lowered.lines().map(str::trim) {
        if line.starts_with("```") {
            inside_fence = !inside_fence;
            continue;
        }
        if inside_fence || line.is_empty() {
            continue;
        }
        kept.push(line);
        if kept.len() >= MAX_SCANNED_LINES {
            break;
        }
    }

    let header_pos = kept.iter().position(|line| {
        HEADER_PREFIXES
            .iter()
            .any(|&prefix| line.starts_with(prefix))
    });
    let header_near_top = header_pos.is_some_and(|idx| idx <= 6)
        || kept.iter().take(8).any(|line| line.contains("plan:"));
    if !header_near_top {
        return false;
    }
    // Tool output that merely mentions "plan:" does not count without a header line.
    if looks_like_tool_blob && header_pos.is_none() {
        return false;
    }

    let numbered_steps = kept
        .iter()
        .filter(|line| is_numbered_step_line(line))
        .count();
    let bullet_steps = kept
        .iter()
        .filter(|line| {
            BULLET_PREFIXES
                .iter()
                .any(|&prefix| line.starts_with(prefix))
        })
        .count();

    match numbered_steps {
        0 => bullet_steps >= 3,
        1 => bullet_steps >= 1,
        _ => true,
    }
}

/// True for lines such as `1. foo` or `12) bar`: optional leading whitespace,
/// one to three ASCII digits, then `". "` or `") "`.
fn is_numbered_step_line(line: &str) -> bool {
    let candidate = line.trim_start();
    let after_digits = candidate.trim_start_matches(|c: char| c.is_ascii_digit());
    // ASCII digits are one byte each, so the byte difference is the digit count.
    let digit_count = candidate.len() - after_digits.len();
    (1..=3).contains(&digit_count)
        && (after_digits.starts_with(". ") || after_digits.starts_with(") "))
}
/// Token usage captured for a single message.
///
/// NOTE(review): the field meanings below are read off the names; the code that
/// fills and persists this struct is outside this part of the file.
#[derive(Debug, Clone)]
pub struct TokenUsageEntry {
pub message_id: i64,
pub conversation_id: i64,
pub agent_id: i64,
pub workspace_id: Option<i64>,
pub source_id: String,
pub timestamp_ms: i64,
pub day_id: i64,
// Model identification; every part is optional.
pub model_name: Option<String>,
pub model_family: Option<String>,
pub model_tier: Option<String>,
pub service_tier: Option<String>,
pub provider: Option<String>,
// Token counts; `None` presumably means the source did not report the value.
pub input_tokens: Option<i64>,
pub output_tokens: Option<i64>,
pub cache_read_tokens: Option<i64>,
pub cache_creation_tokens: Option<i64>,
pub thinking_tokens: Option<i64>,
pub total_tokens: Option<i64>,
pub estimated_cost_usd: Option<f64>,
pub role: String,
pub content_chars: i64,
pub has_tool_calls: bool,
pub tool_call_count: u32,
pub data_source: String,
}
/// One row of the `model_pricing` table as loaded by `PricingTable::franken_load`.
#[derive(Debug, Clone)]
pub struct PricingEntry {
/// SQL `LIKE`-style pattern matched against the model name in `PricingTable::lookup`.
pub model_pattern: String,
pub provider: String,
/// Price per one million input tokens (`compute_cost` divides by 1_000_000).
pub input_cost_per_mtok: f64,
/// Price per one million output tokens.
pub output_cost_per_mtok: f64,
/// Price per one million cache-read tokens; `None` means those tokens add no cost.
pub cache_read_cost_per_mtok: Option<f64>,
/// Price per one million cache-creation tokens; `None` means those tokens add no cost.
pub cache_creation_cost_per_mtok: Option<f64>,
/// `effective_date` converted to days since 2020-01-01; the entry applies to
/// messages on or after this day.
pub effective_day_id: i64,
}
/// Counters describing how many cost lookups found a price.
#[derive(Debug, Clone, Default)]
pub struct PricingDiagnostics {
/// Incremented by `record_priced`.
pub priced_count: u64,
/// Incremented by `record_unpriced`.
pub unpriced_count: u64,
/// Unpriced occurrences per model name; a missing name is stored as `"(none)"`.
pub unknown_models: HashMap<String, u64>,
}
impl PricingDiagnostics {
    /// Counts one message for which a price was found.
    fn record_priced(&mut self) {
        self.priced_count += 1;
    }

    /// Counts one message without a price and tallies it under its model name
    /// (`"(none)"` when the name is missing).
    fn record_unpriced(&mut self, model_name: Option<&str>) {
        self.unpriced_count += 1;
        let label = match model_name {
            Some(name) => name,
            None => "(none)",
        };
        *self.unknown_models.entry(label.to_string()).or_default() += 1;
    }

    /// Logs pricing coverage at info level and up to five of the most frequent
    /// unknown models at debug level. Logs nothing when no message was counted.
    pub fn log_summary(&self) {
        let total = self.priced_count + self.unpriced_count;
        if total == 0 {
            return;
        }

        let pct = (self.priced_count as f64 / total as f64) * 100.0;
        tracing::info!(
            target: "cass::analytics::pricing",
            priced = self.priced_count,
            unpriced = self.unpriced_count,
            total = total,
            coverage_pct = format!("{pct:.1}%"),
            "pricing coverage"
        );

        if self.unknown_models.is_empty() {
            return;
        }
        // Most frequent first; ties keep the map's iteration order.
        let mut by_frequency: Vec<_> = self.unknown_models.iter().collect();
        by_frequency.sort_by(|left, right| right.1.cmp(left.1));
        for (model, count) in by_frequency.iter().take(5) {
            tracing::debug!(
                target: "cass::analytics::pricing",
                model = model.as_str(),
                count = count,
                "unknown model (no pricing)"
            );
        }
    }
}
/// In-memory copy of the `model_pricing` table used for cost estimation.
#[derive(Debug, Clone)]
pub struct PricingTable {
/// Entries in the order returned by the load query (`effective_date` descending).
entries: Vec<PricingEntry>,
}
impl PricingTable {
    /// Loads the pricing table; alias for [`PricingTable::franken_load`].
    pub fn load(conn: &FrankenConnection) -> Result<Self> {
        Self::franken_load(conn)
    }

    /// Reads every `model_pricing` row, newest `effective_date` first.
    ///
    /// # Errors
    /// Fails when the query fails, a column cannot be read as the expected
    /// type, or an `effective_date` is not a valid `YYYY-MM-DD` date.
    pub fn franken_load(conn: &FrankenConnection) -> Result<Self> {
        // The SQL literal is kept verbatim, including its line breaks.
        let rows = conn.query(
            "SELECT model_pattern, provider, input_cost_per_mtok, output_cost_per_mtok,
cache_read_cost_per_mtok, cache_creation_cost_per_mtok, effective_date
FROM model_pricing
ORDER BY effective_date DESC",
        )?;

        let mut entries = Vec::with_capacity(rows.len());
        for row in &rows {
            // The date column is validated first, before the other columns are read.
            let raw_date: String = row.get_typed(6)?;
            let effective_day_id = date_str_to_day_id(&raw_date)?;
            let entry = PricingEntry {
                model_pattern: row.get_typed(0)?,
                provider: row.get_typed(1)?,
                input_cost_per_mtok: row.get_typed(2)?,
                output_cost_per_mtok: row.get_typed(3)?,
                cache_read_cost_per_mtok: row.get_typed(4)?,
                cache_creation_cost_per_mtok: row.get_typed(5)?,
                effective_day_id,
            };
            entries.push(entry);
        }
        Ok(Self { entries })
    }

    /// Finds the pricing entry that applies to `model_name` on `message_day_id`.
    ///
    /// Only entries already in effect on that day whose pattern matches the
    /// model are considered. Among those the latest effective day wins; on the
    /// same day the longer pattern wins; otherwise the earlier entry in load
    /// order is kept.
    pub fn lookup(&self, model_name: &str, message_day_id: i64) -> Option<&PricingEntry> {
        self.entries
            .iter()
            .filter(|entry| {
                entry.effective_day_id <= message_day_id
                    && sql_like_match(model_name, &entry.model_pattern)
            })
            .reduce(|current, candidate| {
                let newer = candidate.effective_day_id > current.effective_day_id;
                let more_specific = candidate.effective_day_id == current.effective_day_id
                    && candidate.model_pattern.len() > current.model_pattern.len();
                if newer || more_specific {
                    candidate
                } else {
                    current
                }
            })
    }

    /// Estimates the cost of one message.
    ///
    /// Returns `None` when the model name is missing, no pricing entry
    /// applies, or neither input nor output tokens are known. Cache-read and
    /// cache-creation tokens are subtracted from the input tokens (clamped at
    /// zero) and billed at their own rates when the entry defines them.
    pub fn compute_cost(
        &self,
        model_name: Option<&str>,
        message_day_id: i64,
        input_tokens: Option<i64>,
        output_tokens: Option<i64>,
        cache_read_tokens: Option<i64>,
        cache_creation_tokens: Option<i64>,
    ) -> Option<f64> {
        let pricing = self.lookup(model_name?, message_day_id)?;
        if let (None, None) = (input_tokens, output_tokens) {
            return None;
        }

        let per_mtok = |tokens: i64, rate: f64| tokens as f64 * rate / 1_000_000.0;

        let cache_read = cache_read_tokens.unwrap_or(0);
        let cache_creation = cache_creation_tokens.unwrap_or(0);
        let non_cache_input = input_tokens
            .unwrap_or(0)
            .saturating_sub(cache_read)
            .saturating_sub(cache_creation)
            .max(0);

        let mut cost = 0.0;
        cost += per_mtok(non_cache_input, pricing.input_cost_per_mtok);
        cost += per_mtok(output_tokens.unwrap_or(0), pricing.output_cost_per_mtok);
        if let Some(rate) = pricing.cache_read_cost_per_mtok {
            cost += per_mtok(cache_read, rate);
        }
        if let Some(rate) = pricing.cache_creation_cost_per_mtok {
            cost += per_mtok(cache_creation, rate);
        }
        Some(cost)
    }

    /// True when no pricing rows were loaded.
    pub fn is_empty(&self) -> bool {
        self.entries.is_empty()
    }
}
/// Converts a `YYYY-MM-DD` date string into the number of days since 2020-01-01.
///
/// Dates before 2020 yield negative ids.
///
/// # Errors
/// Returns an error naming the offending value when `s` is not a valid date.
fn date_str_to_day_id(s: &str) -> Result<i64> {
    use chrono::NaiveDate;
    // Evaluated at compile time; 2020-01-01 is always a valid date.
    const EPOCH_2020: NaiveDate = match NaiveDate::from_ymd_opt(2020, 1, 1) {
        Some(d) => d,
        None => unreachable!(),
    };
    let parsed = NaiveDate::parse_from_str(s, "%Y-%m-%d")
        .with_context(|| format!("invalid effective_date '{s}'"))?;
    Ok(parsed.signed_duration_since(EPOCH_2020).num_days())
}
/// Matches `value` against a SQL `LIKE`-style `pattern`.
///
/// `%` matches any run of characters (including none), `_` matches exactly one
/// character, and everything else must match literally. ASCII letters compare
/// case-insensitively; there is no escape character.
fn sql_like_match(value: &str, pattern: &str) -> bool {
    sql_like_match_bytes(
        value.to_ascii_lowercase().as_bytes(),
        pattern.to_ascii_lowercase().as_bytes(),
    )
}

/// Length in bytes of the UTF-8 sequence introduced by lead byte `b`.
///
/// Only meaningful for lead bytes; a continuation byte is reported as 2.
fn utf8_char_len(b: u8) -> usize {
    match b {
        0x00..=0x7F => 1,
        0x80..=0xDF => 2,
        0xE0..=0xEF => 3,
        _ => 4,
    }
}

/// Byte-level `LIKE` matcher behind [`sql_like_match`].
///
/// Both slices are expected to hold valid UTF-8 that has already been
/// case-normalised by the caller. Wildcards advance by whole UTF-8 characters;
/// literal bytes are compared one by one.
///
/// The match runs iteratively and, on a mismatch, only revisits the most recent
/// `%`. That bounds the work at roughly `val.len() * pat.len()` steps and keeps
/// stack use constant, whereas a naive recursive matcher can take exponential
/// time on patterns with several `%`.
fn sql_like_match_bytes(val: &[u8], pat: &[u8]) -> bool {
    let mut v = 0usize;
    let mut p = 0usize;
    // (pattern index just past the latest `%` run, value index where that
    // wildcard's current attempt begins)
    let mut retry: Option<(usize, usize)> = None;

    while v < val.len() {
        match pat.get(p).copied() {
            Some(b'%') => {
                // Consecutive `%` are equivalent to a single one.
                while pat.get(p) == Some(&b'%') {
                    p += 1;
                }
                retry = Some((p, v));
                continue;
            }
            Some(b'_') => {
                let width = utf8_char_len(val[v]);
                // A truncated trailing sequence cannot satisfy `_`.
                if width <= val.len() - v {
                    v += width;
                    p += 1;
                    continue;
                }
            }
            Some(literal) if literal == val[v] => {
                v += 1;
                p += 1;
                continue;
            }
            _ => {}
        }

        // Mismatch: let the latest `%` swallow one more character and retry.
        let Some((after_wildcard, anchor)) = retry else {
            return false;
        };
        let next_anchor = anchor + utf8_char_len(val[anchor]);
        if next_anchor > val.len() {
            return false;
        }
        retry = Some((after_wildcard, next_anchor));
        p = after_wildcard;
        v = next_anchor;
    }

    // Value exhausted: whatever is left of the pattern must be able to match nothing.
    pat[p..].iter().all(|&b| b == b'%')
}
/// Reads a batch size from the environment variable `var`.
///
/// Falls back to `default` when the variable is unreadable, does not parse as
/// `usize`, or is zero.
fn rebuild_batch_size_env(var: &str, default: usize) -> usize {
    match dotenvy::var(var) {
        Ok(raw) => match raw.parse::<usize>() {
            Ok(parsed) if parsed > 0 => parsed,
            _ => default,
        },
        Err(_) => default,
    }
}
/// Returns true when `err` reports an out-of-memory condition, as decided by
/// its [`OutOfMemoryProbe`] implementation.
fn is_out_of_memory_error<E: OutOfMemoryProbe + ?Sized>(err: &E) -> bool {
err.is_out_of_memory()
}
/// Lets different error types answer "is this an out-of-memory failure?".
trait OutOfMemoryProbe {
/// True when the error represents an out-of-memory condition.
fn is_out_of_memory(&self) -> bool;
}
impl OutOfMemoryProbe for anyhow::Error {
    /// Walks the error chain and reports out-of-memory when any cause is a
    /// `FrankenError::OutOfMemory` or renders to an exact out-of-memory message.
    fn is_out_of_memory(&self) -> bool {
        for cause in self.chain() {
            let typed_oom = matches!(
                cause.downcast_ref::<frankensqlite::FrankenError>(),
                Some(frankensqlite::FrankenError::OutOfMemory)
            );
            // The rendered message is only inspected when the typed check fails.
            if typed_oom || is_exact_out_of_memory_message(&cause.to_string()) {
                return true;
            }
        }
        false
    }
}
impl OutOfMemoryProbe for frankensqlite::FrankenError {
/// True only for the `OutOfMemory` variant; no message inspection is done.
fn is_out_of_memory(&self) -> bool {
matches!(self, frankensqlite::FrankenError::OutOfMemory)
}
}
/// True when `message`, ignoring surrounding whitespace and ASCII case, is
/// exactly `"out of memory"` or `"not enough memory"`.
///
/// Messages that merely contain one of these phrases do not match.
fn is_exact_out_of_memory_message(message: &str) -> bool {
    let trimmed = message.trim();
    ["out of memory", "not enough memory"]
        .iter()
        .any(|phrase| trimmed.eq_ignore_ascii_case(phrase))
}
/// Session, message and character totals for one day.
///
/// NOTE(review): producers and consumers of this struct are outside this part
/// of the file; the field meanings are taken from the names.
#[derive(Debug, Clone)]
pub struct DailyCount {
pub day_id: i64,
pub sessions: i64,
pub messages: i64,
pub chars: i64,
}
/// Summary returned by an analytics rebuild.
///
/// NOTE(review): the rebuild itself is outside this part of the file; the fields
/// presumably count rows written per table plus timing — confirm at the
/// call site.
#[derive(Debug, Clone)]
pub struct AnalyticsRebuildResult {
pub message_metrics_rows: usize,
pub usage_hourly_rows: usize,
pub usage_daily_rows: usize,
pub usage_models_daily_rows: usize,
pub elapsed_ms: u64,
pub messages_per_sec: f64,
}
/// Outcome of rebuilding `daily_stats` from conversations.
#[derive(Debug, Clone)]
pub struct DailyStatsRebuildResult {
/// `COUNT(*)` of `daily_stats` after the rebuild.
pub rows_created: i64,
/// Sum of `session_count` over the rows with `agent_slug = 'all'` and
/// `source_id = 'all'`.
pub total_sessions: i64,
}
/// Counts reported by an agent archive purge.
///
/// NOTE(review): the purge routine is outside this part of the file; the
/// meanings are taken from the field names.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub struct AgentArchivePurgeResult {
pub conversations_deleted: usize,
pub messages_deleted: usize,
}
/// Health snapshot of the materialized daily stats.
///
/// NOTE(review): computed outside this part of the file; `drift` presumably
/// relates `conversation_count` to `materialized_total` — confirm the sign
/// and definition at the producer.
#[derive(Debug, Clone)]
pub struct DailyStatsHealth {
pub populated: bool,
pub row_count: i64,
pub oldest_update_ms: Option<i64>,
pub conversation_count: i64,
pub materialized_total: i64,
pub drift: i64,
}
// NOTE(review): presumably the number of rows per FTS insert batch; its users
// are outside this part of the file — confirm before changing.
const FTS5_BATCH_SIZE: usize = 100;
/// Message columns carried through an FTS rebuild.
///
/// NOTE(review): populated outside this part of the file; `rowid` and
/// `message_id` are kept as separate fields, so they are presumably not
/// guaranteed to be equal.
#[derive(Debug, Clone)]
struct FtsRebuildMessageRow {
rowid: i64,
message_id: i64,
conversation_id: i64,
content: String,
created_at: Option<i64>,
}
/// Conversation fields needed to build FTS entries.
///
/// NOTE(review): populated outside this part of the file; presumably a narrow
/// projection of the conversations table.
#[derive(Debug, Clone)]
struct FtsConversationProjection {
title: String,
agent_id: Option<i64>,
workspace_id: Option<i64>,
source_path: String,
}
/// One document destined for the full-text index; see `FtsEntry::from_message`
/// for how each field is derived from a message and its conversation.
#[derive(Debug, Clone)]
pub struct FtsEntry {
/// Message content.
pub content: String,
/// Conversation title, empty when the conversation has none.
pub title: String,
/// Agent slug of the conversation.
pub agent: String,
/// Workspace path as a (lossy) string, empty when there is no workspace.
pub workspace: String,
/// Conversation source path as a (lossy) string.
pub source_path: String,
/// Message timestamp, falling back to the conversation start.
pub created_at: Option<i64>,
pub message_id: i64,
}
impl FtsEntry {
    /// Builds the FTS entry for `msg` within `conv`.
    ///
    /// A missing title or workspace becomes an empty string, and a missing
    /// message timestamp falls back to the conversation's `started_at`.
    pub fn from_message(message_id: i64, msg: &Message, conv: &Conversation) -> Self {
        let title = conv.title.clone().unwrap_or_default();
        let workspace = match conv.workspace.as_ref() {
            Some(dir) => dir.to_string_lossy().into_owned(),
            None => String::new(),
        };
        Self {
            content: msg.content.clone(),
            title,
            agent: conv.agent_slug.clone(),
            workspace,
            source_path: path_to_string(&conv.source_path),
            created_at: msg.created_at.or(conv.started_at),
            message_id,
        }
    }
}
// NOTE(review): presumably the flush thresholds (documents / characters) for
// pending FTS entries; the code applying them is outside this part of the file.
const FTS_ENTRY_BATCH_MAX_DOCS: usize = 512;
const FTS_ENTRY_BATCH_MAX_CHARS: usize = 1024 * 1024;
// Fallback used by `fts_rebuild_batch_size` when CASS_FTS_REBUILD_BATCH_SIZE is
// unset or invalid.
const FTS_REBUILD_BATCH_SIZE_DEFAULT: usize = 5_000;
fn fts_rebuild_batch_size() -> usize {
dotenvy::var("CASS_FTS_REBUILD_BATCH_SIZE")
.ok()
.and_then(|v| v.parse::<usize>().ok())
.filter(|&n| n > 0)
.unwrap_or(FTS_REBUILD_BATCH_SIZE_DEFAULT)
}
/// Flushes the buffered FTS entries inside `tx` and resets the buffer.
///
/// Nothing happens for an empty buffer. Otherwise the entries are inserted
/// only if `storage` reports the FTS messages table as present (the count
/// inserted is added to `inserted_total`); either way the buffer is cleared
/// and `pending_chars` is reset to zero.
///
/// # Errors
/// Propagates a failure of the batch insert; the buffer is left untouched in
/// that case.
fn flush_pending_fts_entries(
    storage: &FrankenStorage,
    tx: &FrankenTransaction<'_>,
    entries: &mut Vec<FtsEntry>,
    pending_chars: &mut usize,
    inserted_total: &mut usize,
) -> Result<()> {
    if !entries.is_empty() {
        if storage.fts_messages_present_cached(tx) {
            let inserted = franken_batch_insert_fts(tx, entries)?;
            *inserted_total += inserted;
        }
        entries.clear();
        *pending_chars = 0;
    }
    Ok(())
}
/// Renders a path as an owned `String`, replacing any non-UTF-8 parts lossily.
fn path_to_string<P: AsRef<Path>>(p: P) -> String {
    let path: &Path = p.as_ref();
    String::from(path.to_string_lossy())
}
/// Owned form of [`role_as_str`].
fn role_str(role: &MessageRole) -> String {
    String::from(role_as_str(role))
}
/// Maps a message role to its string form.
///
/// The known variants map to fixed lowercase names; `Other` returns the string
/// it carries, unchanged.
fn role_as_str(role: &MessageRole) -> &str {
match role {
MessageRole::User => "user",
MessageRole::Agent => "agent",
MessageRole::Tool => "tool",
MessageRole::System => "system",
// Custom roles are passed through verbatim.
MessageRole::Other(v) => v.as_str(),
}
}
/// Maps an agent kind to its lowercase string form.
fn agent_kind_str(kind: AgentKind) -> String {
    let label = match kind {
        AgentKind::Cli => "cli",
        AgentKind::VsCode => "vscode",
        AgentKind::Hybrid => "hybrid",
    };
    label.to_owned()
}
#[cfg(test)]
mod tests {
use super::*;
use serial_test::serial;
use tempfile::TempDir;
/// Restores an environment variable to its previous state when dropped.
struct EnvGuard {
// Name of the variable being guarded.
key: &'static str,
// Value to restore on drop; `None` means the variable is removed instead.
previous: Option<String>,
}
impl Drop for EnvGuard {
    /// Puts the variable back to its recorded value, or removes it when there
    /// was none.
    fn drop(&mut self) {
        // NOTE(review): mutating the process environment is only sound while no
        // other thread accesses it — confirm tests using this guard are serialized.
        match self.previous.as_ref() {
            Some(value) => unsafe {
                std::env::set_var(self.key, value);
            },
            None => unsafe {
                std::env::remove_var(self.key);
            },
        }
    }
}
/// Sets `key` to `value` and returns a guard that restores the prior state.
///
/// The prior value is captured (via `dotenvy::var`) before the variable is
/// overwritten.
fn set_env_var(key: &'static str, value: impl AsRef<str>) -> EnvGuard {
    let guard = EnvGuard {
        key,
        previous: dotenvy::var(key).ok(),
    };
    // NOTE(review): mutating the process environment is only sound while no
    // other thread accesses it — confirm callers are serialized.
    unsafe {
        std::env::set_var(key, value.as_ref());
    }
    guard
}
/// Only `agent_search.db` maps to a doctor mutation lock path; any other
/// database file in the same directory maps to none.
#[test]
fn doctor_mutation_open_guard_only_targets_canonical_archive_db() {
    let dir = TempDir::new().unwrap();
    let root = dir.path();

    let canonical_db = root.join("agent_search.db");
    let expected_lock = root.join("doctor/locks/doctor-repair.lock");
    assert_eq!(
        doctor_mutation_lock_path_for_db_open(&canonical_db),
        Some(expected_lock)
    );

    let scratch_db = root.join("scratch.db");
    assert_eq!(doctor_mutation_lock_path_for_db_open(&scratch_db), None);
}
/// The pid check accepts metadata naming this process and rejects both a
/// non-numeric pid and a different pid.
#[test]
fn doctor_lock_metadata_pid_detection_is_exact() {
    let own_pid = std::process::id();

    let own_metadata = format!("schema_version=1\npid={own_pid}\nmode=safe_auto_run\n");
    assert!(doctor_lock_metadata_pid_is_current_process(&own_metadata));

    assert!(!doctor_lock_metadata_pid_is_current_process(
        "schema_version=1\npid=not-a-pid\n"
    ));

    let other_metadata = format!("pid={}\n", own_pid.saturating_add(1));
    assert!(!doctor_lock_metadata_pid_is_current_process(&other_metadata));
}
// Opening the canonical DB must fail while the doctor mutation lock is held and
// its metadata names a pid other than this process.
#[test]
fn doctor_storage_open_refuses_active_doctor_mutation_lock_from_other_process() {
use std::io::Write as _;
let dir = TempDir::new().unwrap();
let db_path = dir.path().join("agent_search.db");
// Create the database first, then close it so only the lock is in play.
{
let storage = FrankenStorage::open(&db_path).unwrap();
storage.close().unwrap();
}
let lock_path = doctor_mutation_lock_path_for_db_open(&db_path).unwrap();
let mut lock_file = fs::OpenOptions::new()
.create(true)
.truncate(false)
.read(true)
.write(true)
.open(&lock_path)
.unwrap();
// Hold the exclusive lock and write metadata naming pid 1 (not this process).
fs2::FileExt::try_lock_exclusive(&lock_file).unwrap();
lock_file.set_len(0).unwrap();
lock_file.write_all(b"schema_version=1\npid=1\n").unwrap();
lock_file.sync_all().unwrap();
// A short timeout keeps the test fast; the open is expected to give up.
let err =
open_franken_raw_readonly_connection_with_timeout(&db_path, Duration::from_millis(25))
.expect_err("active doctor mutation lock must block canonical DB opens");
let message = err.to_string();
assert!(
message.contains("doctor mutation lock") && message.contains("active"),
"error should identify the active doctor mutation lock: {message}"
);
fs2::FileExt::unlock(&lock_file).unwrap();
}
// Opening the canonical DB must succeed while the doctor mutation lock is held
// when its metadata names this very process.
#[test]
fn doctor_storage_open_allows_current_doctor_process_probe() {
use std::io::Write as _;
let dir = TempDir::new().unwrap();
let db_path = dir.path().join("agent_search.db");
// Create the database first, then close it so only the lock is in play.
{
let storage = FrankenStorage::open(&db_path).unwrap();
storage.close().unwrap();
}
let lock_path = doctor_mutation_lock_path_for_db_open(&db_path).unwrap();
let mut lock_file = fs::OpenOptions::new()
.create(true)
.truncate(false)
.read(true)
.write(true)
.open(&lock_path)
.unwrap();
// Hold the exclusive lock and record the current pid as the lock owner.
fs2::FileExt::try_lock_exclusive(&lock_file).unwrap();
lock_file.set_len(0).unwrap();
write!(lock_file, "schema_version=1\npid={}\n", std::process::id()).unwrap();
lock_file.sync_all().unwrap();
let conn =
open_franken_raw_readonly_connection_with_timeout(&db_path, Duration::from_millis(25))
.expect(
"doctor process must be able to run post-repair read probes under its own lock",
);
// Close the connection before releasing the lock.
drop(conn);
fs2::FileExt::unlock(&lock_file).unwrap();
}
// When the `fsqlite.`-namespaced pragma is rejected, the helper must fall back
// to the plain pragma and report that one as selected.
#[test]
fn autocommit_retain_disable_tries_compat_then_canonical_pragma() {
let mut attempts = Vec::new();
let selected = disable_autocommit_retain(|pragma| {
// Record every pragma the helper tries, in order.
attempts.push(pragma);
if pragma == "PRAGMA fsqlite.autocommit_retain = OFF;" {
Err("compat namespace unavailable")
} else {
Ok(())
}
})
.expect("canonical pragma should disable autocommit retain");
assert_eq!(selected, "PRAGMA autocommit_retain = OFF;");
// Both candidates were attempted, in the order listed by the constant.
assert_eq!(attempts, AUTOCOMMIT_RETAIN_OFF_PRAGMAS);
}
// When every pragma is rejected, the helper must return an error that lists the
// attempted pragmas rather than silently continuing.
#[test]
fn autocommit_retain_disable_fails_closed_when_no_pragma_works() {
let mut attempts = Vec::new();
let err = disable_autocommit_retain(|pragma| {
// Reject everything while recording what was tried.
attempts.push(pragma);
Err("unsupported pragma")
})
.expect_err("unsupported autocommit retain controls should fail closed");
assert_eq!(attempts, AUTOCOMMIT_RETAIN_OFF_PRAGMAS);
let message = err.to_string();
assert!(
message.contains("refusing to keep a long-lived MVCC connection"),
"error should force callers away from unbounded snapshot retention: {message}"
);
assert!(
message.contains("PRAGMA fsqlite.autocommit_retain = OFF;")
&& message.contains("PRAGMA autocommit_retain = OFF;"),
"error should preserve attempted PRAGMAs for diagnostics: {message}"
);
}
// Opens `db_path` with rusqlite for test fixtures; panics if the open fails.
fn rusqlite_test_fixture_conn(db_path: &Path) -> rusqlite::Connection {
rusqlite::Connection::open(db_path).expect("open rusqlite test fixture connection")
}
// Test helper: writes `conversations` straight into a fresh database at
// `db_path` using the historical recovery schema, bypassing the normal storage
// layer. Every conversation is attributed to one fixed "codex" agent (id 1).
// Panics on any failure.
fn seed_historical_db_direct(
db_path: &Path,
conversations: &[crate::model::types::Conversation],
) {
if let Some(parent) = db_path.parent() {
fs::create_dir_all(parent).unwrap();
}
let conn = FrankenConnection::open(db_path.to_string_lossy().into_owned()).unwrap();
conn.execute_batch(HISTORICAL_RECOVERY_CORE_SCHEMA).unwrap();
// Single shared agent row referenced by every seeded conversation.
conn.execute_compat(
"INSERT INTO agents(id, slug, name, version, kind, created_at, updated_at)
VALUES(?1, ?2, ?3, ?4, ?5, ?6, ?7)",
fparams![1_i64, "codex", "Codex", "0.2.3", "cli", 0_i64, 0_i64],
)
.unwrap();
// Message ids are assigned sequentially across all conversations.
let mut next_message_id = 1_i64;
for (conv_index, conv) in conversations.iter().enumerate() {
// Conversation ids are 1-based positions in the input slice.
let conversation_id = i64::try_from(conv_index + 1).unwrap();
// A conversation with a workspace gets its own workspace row, reusing the
// conversation id as the workspace id.
let workspace_id = conv.workspace.as_ref().map(|workspace| {
let workspace_id = conversation_id;
let workspace_path = workspace.to_string_lossy().into_owned();
conn.execute_compat(
"INSERT INTO workspaces(id, path, display_name) VALUES(?1, ?2, ?3)",
fparams![
workspace_id,
workspace_path.as_str(),
workspace_path.as_str()
],
)
.unwrap();
workspace_id
});
let source_path = conv.source_path.to_string_lossy().into_owned();
let metadata_json = conv.metadata_json.to_string();
conn.execute_compat(
"INSERT INTO conversations (
id, agent_id, workspace_id, source_id, external_id, title, source_path,
started_at, ended_at, approx_tokens, metadata_json, origin_host
) VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9, ?10, ?11, ?12)",
fparams![
conversation_id,
1_i64,
workspace_id,
conv.source_id.as_str(),
conv.external_id.as_deref(),
conv.title.as_deref(),
source_path.as_str(),
conv.started_at,
conv.ended_at,
conv.approx_tokens,
metadata_json.as_str(),
conv.origin_host.as_deref()
],
)
.unwrap();
for msg in &conv.messages {
let extra_json = msg.extra_json.to_string();
let role = role_str(&msg.role);
conn.execute_compat(
"INSERT INTO messages(
id, conversation_id, idx, role, author, created_at, content, extra_json
) VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8)",
fparams![
next_message_id,
conversation_id,
msg.idx,
role.as_str(),
msg.author.as_deref(),
msg.created_at,
msg.content.as_str(),
extra_json.as_str()
],
)
.unwrap();
next_message_id += 1;
}
}
}
/// `bookmarks.db` counts as user data both with and without a directory prefix.
#[test]
fn is_user_data_file_detects_bookmarks() {
    for candidate in ["/data/bookmarks.db", "bookmarks.db"] {
        assert!(is_user_data_file(Path::new(candidate)));
    }
}
/// `tui_state.json` is classified as user data.
#[test]
fn is_user_data_file_detects_tui_state() {
    let tui_state = Path::new("/data/tui_state.json");
    assert!(is_user_data_file(tui_state));
}
/// `sources.toml` is classified as user data.
#[test]
fn is_user_data_file_detects_sources_toml() {
    let sources_config = Path::new("/config/sources.toml");
    assert!(is_user_data_file(sources_config));
}
/// A bare `.env` file is classified as user data.
#[test]
fn is_user_data_file_detects_env() {
    let dotenv = Path::new(".env");
    assert!(is_user_data_file(dotenv));
}
/// File names outside the user-data set are rejected.
#[test]
fn is_user_data_file_rejects_other_files() {
    for candidate in ["index.db", "conversations.db", "random.txt"] {
        assert!(!is_user_data_file(Path::new(candidate)));
    }
}
/// Backing up a path that does not exist succeeds and yields `None`.
#[test]
fn create_backup_returns_none_for_nonexistent() {
    let scratch = TempDir::new().unwrap();
    let missing_db = scratch.path().join("nonexistent.db");
    let outcome = create_backup(&missing_db).unwrap();
    assert!(outcome.is_none());
}
/// Backing up an existing file produces a file on disk whose name contains `backup`.
#[test]
fn create_backup_creates_named_file() {
    let scratch = TempDir::new().unwrap();
    let db_path = scratch.path().join("test.db");
    std::fs::write(&db_path, b"test data").unwrap();

    let produced = create_backup(&db_path).unwrap();
    assert!(produced.is_some());
    let backup = produced.unwrap();
    assert!(backup.exists());

    let backup_name = backup.file_name().unwrap().to_str().unwrap();
    assert!(backup_name.contains("backup"));
}
/// Two consecutive backups of the same file land at distinct paths and both exist.
#[test]
fn create_backup_paths_are_unique() {
    let scratch = TempDir::new().unwrap();
    let db_path = scratch.path().join("test.db");
    std::fs::write(&db_path, b"test data").unwrap();

    let earlier = create_backup(&db_path).unwrap().unwrap();
    let later = create_backup(&db_path).unwrap().unwrap();

    assert_ne!(earlier, later);
    assert!(earlier.exists());
    assert!(later.exists());
}
/// Inspects the `EXPLAIN` output of the per-conversation message fetch used by lexical rebuild:
/// it must contain a `SeekGE` opcode and must not contain `SorterOpen`.
#[test]
fn lexical_rebuild_messages_query_uses_conversation_idx_access_path() {
    use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
    use std::path::PathBuf;

    let scratch = TempDir::new().unwrap();
    let db_path = scratch.path().join("agent_search.db");
    let storage = SqliteStorage::open(&db_path).unwrap();

    let agent_id = storage
        .ensure_agent(&Agent {
            id: None,
            slug: "claude_code".into(),
            name: "Claude Code".into(),
            version: None,
            kind: AgentKind::Cli,
        })
        .unwrap();

    // Two messages so the ORDER BY idx clause has something to order.
    let user_message = Message {
        id: None,
        idx: 0,
        role: MessageRole::User,
        author: Some("user".into()),
        created_at: Some(1_700_000_000_010),
        content: "first".into(),
        extra_json: serde_json::Value::Null,
        snippets: Vec::new(),
    };
    let agent_message = Message {
        id: None,
        idx: 1,
        role: MessageRole::Agent,
        author: Some("assistant".into()),
        created_at: Some(1_700_000_000_020),
        content: "second".into(),
        extra_json: serde_json::Value::Null,
        snippets: Vec::new(),
    };
    let conversation = Conversation {
        id: None,
        agent_slug: "claude_code".into(),
        workspace: Some(PathBuf::from("/tmp/workspace")),
        external_id: Some("conv-1".into()),
        title: Some("Lexical rebuild".into()),
        source_path: PathBuf::from("/tmp/conv-1.jsonl"),
        started_at: Some(1_700_000_000_000),
        ended_at: Some(1_700_000_000_100),
        approx_tokens: None,
        metadata_json: serde_json::Value::Null,
        messages: vec![user_message, agent_message],
        source_id: LOCAL_SOURCE_ID.into(),
        origin_host: None,
    };
    storage
        .insert_conversation_tree(agent_id, None, &conversation)
        .unwrap();

    let conversation_id = storage
        .conn
        .query_row_map(
            "SELECT id FROM conversations WHERE external_id = ?1",
            fparams!["conv-1"],
            |row| row.get_typed::<i64>(0),
        )
        .unwrap();

    // Column 1 of each EXPLAIN row is read as the opcode name.
    let opcodes: Vec<String> = storage
        .conn
        .query_map_collect(
            "EXPLAIN \
             SELECT id, idx, role, author, created_at, content \
             FROM messages \
             WHERE conversation_id = ?1 ORDER BY idx",
            fparams![conversation_id],
            |row| row.get_typed(1),
        )
        .unwrap();

    let seeks_into_index = opcodes.iter().any(|opcode| opcode == "SeekGE");
    assert!(
        seeks_into_index,
        "expected lexical rebuild message fetch to seek into the conversation_id/idx access path, got {opcodes:?}"
    );
    let opens_sorter = opcodes.iter().any(|opcode| opcode == "SorterOpen");
    assert!(
        !opens_sorter,
        "expected lexical rebuild message fetch to avoid sorter temp b-trees, got {opcodes:?}"
    );
}
/// Busy, locked, cannot-open and I/O errors must not be classified as requiring a rebuild.
#[test]
fn schema_check_rebuild_classification_ignores_transient_errors() {
    let busy = frankensqlite::FrankenError::Busy;
    assert!(!schema_check_error_requires_rebuild(&busy));

    let locked = frankensqlite::FrankenError::DatabaseLocked {
        path: PathBuf::from("/tmp/test.db"),
    };
    assert!(!schema_check_error_requires_rebuild(&locked));

    let cannot_open = frankensqlite::FrankenError::CannotOpen {
        path: PathBuf::from("/tmp/test.db"),
    };
    assert!(!schema_check_error_requires_rebuild(&cannot_open));

    let io_failure = frankensqlite::FrankenError::Io(std::io::Error::other("disk hiccup"));
    assert!(!schema_check_error_requires_rebuild(&io_failure));
}
/// Corrupt database, corrupt WAL, not-a-database and short-read errors must all be classified
/// as requiring a rebuild.
#[test]
fn schema_check_rebuild_classification_keeps_corruption_errors() {
    let corrupt_db = frankensqlite::FrankenError::DatabaseCorrupt {
        detail: "bad header".to_string(),
    };
    assert!(schema_check_error_requires_rebuild(&corrupt_db));

    let corrupt_wal = frankensqlite::FrankenError::WalCorrupt {
        detail: "bad wal".to_string(),
    };
    assert!(schema_check_error_requires_rebuild(&corrupt_wal));

    let not_a_database = frankensqlite::FrankenError::NotADatabase {
        path: PathBuf::from("/tmp/test.db"),
    };
    assert!(schema_check_error_requires_rebuild(&not_a_database));

    let short_read = frankensqlite::FrankenError::ShortRead {
        expected: 4096,
        actual: 64,
    };
    assert!(schema_check_error_requires_rebuild(&short_read));
}
/// Contention-style VACUUM failures must be reported as needing a consistent retry, while
/// not-a-database and corruption errors must not.
#[test]
fn create_backup_refuses_raw_copy_after_retryable_vacuum_errors() {
    let must_retry = [
        frankensqlite::FrankenError::Busy,
        frankensqlite::FrankenError::BusyRecovery,
        frankensqlite::FrankenError::BusySnapshot {
            conflicting_pages: "1,2".to_string(),
        },
        frankensqlite::FrankenError::DatabaseLocked {
            path: PathBuf::from("/tmp/test.db"),
        },
        frankensqlite::FrankenError::LockFailed {
            detail: "fcntl lock still held".to_string(),
        },
        frankensqlite::FrankenError::WriteConflict { page: 7, holder: 9 },
        frankensqlite::FrankenError::SerializationFailure { page: 11 },
        frankensqlite::FrankenError::Internal("database is locked".to_string()),
    ];
    must_retry.iter().for_each(|err| {
        assert!(
            backup_vacuum_error_requires_consistent_retry(err),
            "retryable VACUUM failure must not fall back to raw bundle copy: {err}"
        );
    });

    // Errors that describe the file itself are not retryable.
    let not_a_database = frankensqlite::FrankenError::NotADatabase {
        path: PathBuf::from("/tmp/test.db"),
    };
    assert!(!backup_vacuum_error_requires_consistent_retry(
        &not_a_database
    ));

    let corrupt = frankensqlite::FrankenError::DatabaseCorrupt {
        detail: "bad header".to_string(),
    };
    assert!(!backup_vacuum_error_requires_consistent_retry(&corrupt));
}
/// The VACUUM staging path for a backup has a dot-prefixed name ending in
/// `.vacuum-in-progress`, and that name is not recognised as a backup root.
#[test]
fn create_backup_uses_hidden_vacuum_stage_path() {
    let final_backup = PathBuf::from("/tmp/test.db.backup.123.456.0");
    let staged = vacuum_stage_backup_path(&final_backup);
    // A missing or non-UTF-8 file name collapses to "" and fails the checks below.
    let staged_name = staged
        .file_name()
        .and_then(|name| name.to_str())
        .unwrap_or_default();

    assert!(staged_name.starts_with('.'));
    assert!(staged_name.ends_with(".vacuum-in-progress"));
    let discoverable = is_backup_root_name(staged_name, "test.db.backup.");
    assert!(
        !discoverable,
        "incomplete VACUUM output must not be discoverable as a backup root"
    );
}
/// The backup of a plain (non-SQLite) file contains exactly the original bytes.
#[test]
fn create_backup_preserves_content() {
    let scratch = TempDir::new().unwrap();
    let db_path = scratch.path().join("test.db");
    let payload = b"test database content 12345";
    std::fs::write(&db_path, payload).unwrap();

    let backup = create_backup(&db_path).unwrap().unwrap();
    let copied = std::fs::read(&backup).unwrap();
    assert_eq!(copied, payload);
}
/// `-wal` and `-shm` sidecars next to the database are copied alongside the backup.
#[test]
fn create_backup_copies_sidecars_when_present() {
    let scratch = TempDir::new().unwrap();
    let db_path = scratch.path().join("test.db");
    std::fs::write(&db_path, b"db").unwrap();
    std::fs::write(database_sidecar_path(&db_path, "-wal"), b"wal").unwrap();
    std::fs::write(database_sidecar_path(&db_path, "-shm"), b"shm").unwrap();

    let backup = create_backup(&db_path).unwrap().unwrap();

    let wal_copy = std::fs::read(database_sidecar_path(&backup, "-wal")).unwrap();
    assert_eq!(wal_copy, b"wal");
    let shm_copy = std::fs::read(database_sidecar_path(&backup, "-shm")).unwrap();
    assert_eq!(shm_copy, b"shm");
}
/// A database path that is a symlink to a non-SQLite file is rejected with a "bundle symlink"
/// error; the link target is left untouched and no `test.db.backup.*` entry appears.
#[test]
#[cfg(unix)]
fn create_backup_rejects_symlink_root_during_raw_fallback() {
    use std::os::unix::fs::symlink;

    let scratch = TempDir::new().unwrap();
    let db_path = scratch.path().join("test.db");
    let link_target = scratch.path().join("outside.db");
    std::fs::write(&link_target, b"not sqlite").unwrap();
    symlink(&link_target, &db_path).unwrap();

    let err = create_backup(&db_path).unwrap_err();
    let mentions_symlink = err.to_string().contains("bundle symlink");
    assert!(mentions_symlink, "unexpected error: {err:#}");
    assert_eq!(std::fs::read(&link_target).unwrap(), b"not sqlite");

    let mut backup_roots = Vec::new();
    for entry in std::fs::read_dir(scratch.path()).unwrap().flatten() {
        let name = entry.file_name().to_string_lossy().into_owned();
        if name.starts_with("test.db.backup.") {
            backup_roots.push(name);
        }
    }
    assert!(
        backup_roots.is_empty(),
        "symlinked backup source must not publish backup roots: {backup_roots:?}"
    );
}
/// A symlinked `-wal` sidecar makes the backup fail with a "bundle symlink" error; the link
/// target is left untouched and no `test.db.backup.*` entry is left behind.
#[test]
#[cfg(unix)]
fn create_backup_rejects_symlink_sidecar_without_partial_backup() {
    use std::os::unix::fs::symlink;

    let scratch = TempDir::new().unwrap();
    let db_path = scratch.path().join("test.db");
    let wal_link = database_sidecar_path(&db_path, "-wal");
    let wal_target = scratch.path().join("outside.wal");
    std::fs::write(&db_path, b"not sqlite").unwrap();
    std::fs::write(&wal_target, b"outside wal").unwrap();
    symlink(&wal_target, &wal_link).unwrap();

    let err = create_backup(&db_path).unwrap_err();
    let mentions_symlink = err.to_string().contains("bundle symlink");
    assert!(mentions_symlink, "unexpected error: {err:#}");
    assert_eq!(std::fs::read(&wal_target).unwrap(), b"outside wal");

    let mut backup_roots = Vec::new();
    for entry in std::fs::read_dir(scratch.path()).unwrap().flatten() {
        let name = entry.file_name().to_string_lossy().into_owned();
        if name.starts_with("test.db.backup.") {
            backup_roots.push(name);
        }
    }
    assert!(
        backup_roots.is_empty(),
        "sidecar preflight failure must not leave a partial backup root: {backup_roots:?}"
    );
}
/// With five backup files present and a retention of three, exactly three entries whose name
/// contains `backup` remain after cleanup.
#[test]
fn cleanup_old_backups_keeps_recent() {
    let scratch = TempDir::new().unwrap();
    let db_path = scratch.path().join("test.db");
    for i in 0..5 {
        let file_name = format!("test.db.backup.{}", 1000 + i);
        std::fs::write(scratch.path().join(&file_name), format!("backup {i}")).unwrap();
    }

    cleanup_old_backups(&db_path, 3).unwrap();

    let remaining = std::fs::read_dir(scratch.path())
        .unwrap()
        .flatten()
        .filter(|entry| {
            entry
                .file_name()
                .to_str()
                .is_some_and(|name| name.contains("backup"))
        })
        .count();
    assert_eq!(remaining, 3);
}
/// Three backups, each with `-wal` and `-shm` sidecars, pruned to a retention of two: two
/// roots, two WAL sidecars and two SHM sidecars must remain.
#[test]
fn cleanup_old_backups_ignores_wal_and_shm_sidecars() {
    let scratch = TempDir::new().unwrap();
    let db_path = scratch.path().join("test.db");
    for i in 0..3 {
        let root = scratch.path().join(format!("test.db.backup.{}", 1000 + i));
        std::fs::write(&root, format!("backup {i}")).unwrap();
        std::fs::write(format!("{}-wal", root.display()), b"wal").unwrap();
        std::fs::write(format!("{}-shm", root.display()), b"shm").unwrap();
        // Pause between bundles so they are not all written at the same instant.
        std::thread::sleep(std::time::Duration::from_millis(20));
    }

    cleanup_old_backups(&db_path, 2).unwrap();

    let names: Vec<String> = std::fs::read_dir(scratch.path())
        .unwrap()
        .flatten()
        .map(|entry| entry.file_name().to_string_lossy().into_owned())
        .collect();
    // Classify in the same precedence order: WAL, then SHM, then anything containing "backup".
    let wal_count = names.iter().filter(|name| name.ends_with("-wal")).count();
    let shm_count = names
        .iter()
        .filter(|name| !name.ends_with("-wal") && name.ends_with("-shm"))
        .count();
    let root_count = names
        .iter()
        .filter(|name| {
            !name.ends_with("-wal") && !name.ends_with("-shm") && name.contains("backup")
        })
        .count();

    assert_eq!(root_count, 2, "should keep two backup roots");
    assert_eq!(
        wal_count, 2,
        "should keep WAL sidecars only for retained backups"
    );
    assert_eq!(
        shm_count, 2,
        "should keep SHM sidecars only for retained backups"
    );
}
/// Moving a full bundle reports all three parts moved, removes them from the source location
/// and leaves their contents intact at the destination.
#[test]
fn move_database_bundle_moves_database_and_sidecars() {
    let scratch = TempDir::new().unwrap();
    let db_path = scratch.path().join("test.db");
    let dest_path = scratch.path().join("test.db.corrupt");
    let src_wal = database_sidecar_path(&db_path, "-wal");
    let src_shm = database_sidecar_path(&db_path, "-shm");
    std::fs::write(&db_path, b"db").unwrap();
    std::fs::write(&src_wal, b"wal").unwrap();
    std::fs::write(&src_shm, b"shm").unwrap();

    let moved = move_database_bundle(&db_path, &dest_path).unwrap();

    let expected = DatabaseBundleMoveResult {
        database: true,
        wal: true,
        shm: true,
    };
    assert_eq!(moved, expected);
    assert!(moved.moved_any());

    assert!(!db_path.exists());
    assert!(!src_wal.exists());
    assert!(!src_shm.exists());

    assert_eq!(std::fs::read(&dest_path).unwrap(), b"db");
    let dest_wal = std::fs::read(database_sidecar_path(&dest_path, "-wal")).unwrap();
    assert_eq!(dest_wal, b"wal");
    let dest_shm = std::fs::read(database_sidecar_path(&dest_path, "-shm")).unwrap();
    assert_eq!(dest_shm, b"shm");
}
/// Sidecars with no main database file are still moved, and the result reports only the
/// sidecars as moved.
#[test]
fn move_database_bundle_preserves_orphan_sidecars_without_main_db() {
    let scratch = TempDir::new().unwrap();
    let db_path = scratch.path().join("test.db");
    let dest_path = scratch.path().join("test.db.corrupt");
    let src_wal = database_sidecar_path(&db_path, "-wal");
    let src_shm = database_sidecar_path(&db_path, "-shm");
    std::fs::write(&src_wal, b"wal").unwrap();
    std::fs::write(&src_shm, b"shm").unwrap();

    let moved = move_database_bundle(&db_path, &dest_path).unwrap();

    let expected = DatabaseBundleMoveResult {
        database: false,
        wal: true,
        shm: true,
    };
    assert_eq!(moved, expected);
    assert!(moved.moved_any());

    assert!(!db_path.exists());
    assert!(!src_wal.exists());
    assert!(!src_shm.exists());

    let dest_wal = std::fs::read(database_sidecar_path(&dest_path, "-wal")).unwrap();
    assert_eq!(dest_wal, b"wal");
    let dest_shm = std::fs::read(database_sidecar_path(&dest_path, "-shm")).unwrap();
    assert_eq!(dest_shm, b"shm");
}
/// A dangling symlink at the database path is moved: afterwards nothing exists at the source
/// path, the destination is a symlink, and the link target has not been created.
#[test]
#[cfg(unix)]
fn move_database_bundle_moves_dangling_symlink_database_root() {
    use std::os::unix::fs::symlink;

    let scratch = TempDir::new().unwrap();
    let db_path = scratch.path().join("test.db");
    let dest_path = scratch.path().join("test.db.corrupt");
    let absent_target = scratch.path().join("missing-target.db");
    symlink(&absent_target, &db_path).unwrap();

    let moved = move_database_bundle(&db_path, &dest_path).unwrap();

    let expected = DatabaseBundleMoveResult {
        database: true,
        wal: false,
        shm: false,
    };
    assert_eq!(moved, expected);

    // symlink_metadata does not follow links, so it sees the link itself.
    assert!(std::fs::symlink_metadata(&db_path).is_err());
    let dest_kind = std::fs::symlink_metadata(&dest_path).unwrap().file_type();
    assert!(dest_kind.is_symlink());
    assert!(!absent_target.exists());
}
/// Dangling symlink sidecars without a main database are moved: the source links disappear,
/// the destination sidecars are symlinks, and neither link target has been created.
#[test]
#[cfg(unix)]
fn move_database_bundle_moves_dangling_symlink_sidecars_without_main_db() {
    use std::os::unix::fs::symlink;

    let scratch = TempDir::new().unwrap();
    let db_path = scratch.path().join("test.db");
    let dest_path = scratch.path().join("test.db.corrupt");
    let absent_wal_target = scratch.path().join("missing-wal");
    let absent_shm_target = scratch.path().join("missing-shm");
    let src_wal = database_sidecar_path(&db_path, "-wal");
    let src_shm = database_sidecar_path(&db_path, "-shm");
    symlink(&absent_wal_target, &src_wal).unwrap();
    symlink(&absent_shm_target, &src_shm).unwrap();

    let moved = move_database_bundle(&db_path, &dest_path).unwrap();

    let expected = DatabaseBundleMoveResult {
        database: false,
        wal: true,
        shm: true,
    };
    assert_eq!(moved, expected);

    // symlink_metadata does not follow links, so it sees the links themselves.
    assert!(std::fs::symlink_metadata(&src_wal).is_err());
    assert!(std::fs::symlink_metadata(&src_shm).is_err());

    let dest_wal_kind = std::fs::symlink_metadata(database_sidecar_path(&dest_path, "-wal"))
        .unwrap()
        .file_type();
    assert!(dest_wal_kind.is_symlink());
    let dest_shm_kind = std::fs::symlink_metadata(database_sidecar_path(&dest_path, "-shm"))
        .unwrap()
        .file_type();
    assert!(dest_shm_kind.is_symlink());

    assert!(!absent_wal_target.exists());
    assert!(!absent_shm_target.exists());
}
/// Copying a bundle duplicates the database and both sidecars and leaves the source in place.
#[test]
fn copy_database_bundle_copies_database_and_sidecars() {
    let scratch = TempDir::new().unwrap();
    let db_path = scratch.path().join("test.db");
    let dest_path = scratch.path().join("copy.db");
    std::fs::write(&db_path, b"db").unwrap();
    std::fs::write(database_sidecar_path(&db_path, "-wal"), b"wal").unwrap();
    std::fs::write(database_sidecar_path(&db_path, "-shm"), b"shm").unwrap();

    copy_database_bundle(&db_path, &dest_path).unwrap();

    assert_eq!(std::fs::read(&dest_path).unwrap(), b"db");
    let dest_wal = std::fs::read(database_sidecar_path(&dest_path, "-wal")).unwrap();
    assert_eq!(dest_wal, b"wal");
    let dest_shm = std::fs::read(database_sidecar_path(&dest_path, "-shm")).unwrap();
    assert_eq!(dest_shm, b"shm");
    // The source database must still be readable after the copy.
    assert_eq!(std::fs::read(&db_path).unwrap(), b"db");
}
/// Copying into a destination whose parent directories do not exist creates them.
#[test]
fn copy_database_bundle_creates_destination_parent() {
    let scratch = TempDir::new().unwrap();
    let db_path = scratch.path().join("test.db");
    let dest_path = scratch.path().join("nested/copies/copy.db");
    std::fs::write(&db_path, b"db").unwrap();
    std::fs::write(database_sidecar_path(&db_path, "-wal"), b"wal").unwrap();

    copy_database_bundle(&db_path, &dest_path).unwrap();

    let dest_parent = dest_path.parent().unwrap();
    assert!(dest_parent.is_dir());
    assert_eq!(std::fs::read(&dest_path).unwrap(), b"db");
    let dest_wal = std::fs::read(database_sidecar_path(&dest_path, "-wal")).unwrap();
    assert_eq!(dest_wal, b"wal");
}
/// A symlinked source database is refused with a "bundle symlink" error; nothing is written
/// at the destination and the link target keeps its contents.
#[test]
#[cfg(unix)]
fn copy_database_bundle_rejects_symlink_source_root() {
    use std::os::unix::fs::symlink;

    let scratch = TempDir::new().unwrap();
    let db_path = scratch.path().join("test.db");
    let dest_path = scratch.path().join("copy.db");
    let link_target = scratch.path().join("outside.db");
    std::fs::write(&link_target, b"outside").unwrap();
    symlink(&link_target, &db_path).unwrap();

    let err = copy_database_bundle(&db_path, &dest_path).unwrap_err();
    let mentions_symlink = err.to_string().contains("bundle symlink");
    assert!(mentions_symlink, "unexpected error: {err:#}");

    assert!(!dest_path.exists());
    assert_eq!(std::fs::read(&link_target).unwrap(), b"outside");
}
/// A symlinked `-wal` sidecar is refused with a "bundle symlink" error; neither the database
/// nor the sidecar is written at the destination and the link target keeps its contents.
#[test]
#[cfg(unix)]
fn copy_database_bundle_rejects_symlink_sidecar() {
    use std::os::unix::fs::symlink;

    let scratch = TempDir::new().unwrap();
    let db_path = scratch.path().join("test.db");
    let dest_path = scratch.path().join("copy.db");
    let wal_target = scratch.path().join("outside.wal");
    let wal_link = database_sidecar_path(&db_path, "-wal");
    std::fs::write(&db_path, b"db").unwrap();
    std::fs::write(&wal_target, b"outside wal").unwrap();
    symlink(&wal_target, &wal_link).unwrap();

    let err = copy_database_bundle(&db_path, &dest_path).unwrap_err();
    let mentions_symlink = err.to_string().contains("bundle symlink");
    assert!(mentions_symlink, "unexpected error: {err:#}");

    assert_eq!(std::fs::read(&wal_target).unwrap(), b"outside wal");
    assert!(!dest_path.exists());
    assert!(!database_sidecar_path(&dest_path, "-wal").exists());
}
/// Moving into a destination whose parent directories do not exist creates them and moves the
/// database together with both sidecars.
#[test]
fn move_database_bundle_creates_destination_parent_and_moves_sidecars() {
    let scratch = TempDir::new().unwrap();
    let db_path = scratch.path().join("test.db");
    let dest_path = scratch.path().join("nested/backups/test.db.corrupt");
    std::fs::write(&db_path, b"db").unwrap();
    std::fs::write(database_sidecar_path(&db_path, "-wal"), b"wal").unwrap();
    std::fs::write(database_sidecar_path(&db_path, "-shm"), b"shm").unwrap();

    let moved = move_database_bundle(&db_path, &dest_path).unwrap();

    let expected = DatabaseBundleMoveResult {
        database: true,
        wal: true,
        shm: true,
    };
    assert_eq!(moved, expected);

    let dest_parent = dest_path.parent().unwrap();
    assert!(dest_parent.is_dir());
    assert_eq!(std::fs::read(&dest_path).unwrap(), b"db");
    let dest_wal = std::fs::read(database_sidecar_path(&dest_path, "-wal")).unwrap();
    assert_eq!(dest_wal, b"wal");
    let dest_shm = std::fs::read(database_sidecar_path(&dest_path, "-shm")).unwrap();
    assert_eq!(dest_shm, b"shm");
}
/// Sidecars are removed even when the main database file is absent.
#[test]
fn remove_database_files_removes_orphan_sidecars_without_main_db() {
    let scratch = TempDir::new().unwrap();
    let db_path = scratch.path().join("test.db");
    let wal_path = database_sidecar_path(&db_path, "-wal");
    let shm_path = database_sidecar_path(&db_path, "-shm");
    std::fs::write(&wal_path, b"wal").unwrap();
    std::fs::write(&shm_path, b"shm").unwrap();

    remove_database_files(&db_path).unwrap();

    assert!(!db_path.exists());
    assert!(!wal_path.exists());
    assert!(!shm_path.exists());
}
/// A directory whose name looks like a backup is neither removed nor counted toward retention:
/// after pruning three files to a retention of two, two files and the one directory remain.
#[test]
fn cleanup_old_backups_ignores_backup_named_directories() {
    let scratch = TempDir::new().unwrap();
    let db_path = scratch.path().join("test.db");
    for i in 0..3 {
        let file_name = format!("test.db.backup.{}", 1000 + i);
        std::fs::write(scratch.path().join(&file_name), format!("backup {i}")).unwrap();
    }
    std::fs::create_dir(scratch.path().join("test.db.backup.directory")).unwrap();

    cleanup_old_backups(&db_path, 2).unwrap();

    // Split the surviving backup-prefixed entries into directories and everything else.
    let (backup_dirs, backup_files): (Vec<_>, Vec<_>) = std::fs::read_dir(scratch.path())
        .unwrap()
        .flatten()
        .filter(|entry| {
            entry
                .file_name()
                .to_string_lossy()
                .starts_with("test.db.backup.")
        })
        .partition(|entry| entry.path().is_dir());

    assert_eq!(
        backup_files.len(),
        2,
        "only real backup files count toward retention"
    );
    assert_eq!(
        backup_dirs.len(),
        1,
        "backup-named directories should be ignored"
    );
}
/// Opening a path with no file creates the database file, and the handle closes cleanly.
#[test]
fn open_creates_new_database() {
    let scratch = TempDir::new().unwrap();
    let db_path = scratch.path().join("new.db");
    assert!(!db_path.exists());

    let opened = SqliteStorage::open(&db_path).unwrap();
    assert!(db_path.exists());
    opened.close().unwrap();
}
/// A read-only open of a missing file returns an error.
#[test]
fn open_readonly_fails_for_nonexistent() {
    let scratch = TempDir::new().unwrap();
    let missing_db = scratch.path().join("nonexistent.db");
    assert!(SqliteStorage::open_readonly(&missing_db).is_err());
}
/// A database created by a normal open can be reopened read-only and queried for its schema
/// version.
#[test]
fn open_readonly_succeeds_for_existing() {
    let scratch = TempDir::new().unwrap();
    let db_path = scratch.path().join("existing.db");
    // Create the database, then release the writable handle before reopening.
    drop(SqliteStorage::open(&db_path).unwrap());

    let readonly = SqliteStorage::open_readonly(&db_path).unwrap();
    assert!(readonly.schema_version().is_ok());
}
/// Opening a database twice reports `CURRENT_SCHEMA_VERSION` both times.
#[test]
fn reopen_existing_current_schema_is_idempotent() {
    let scratch = TempDir::new().unwrap();
    let db_path = scratch.path().join("existing.db");

    let first = SqliteStorage::open(&db_path).unwrap();
    assert_eq!(first.schema_version().unwrap(), CURRENT_SCHEMA_VERSION);
    // Release the first handle before the second open.
    drop(first);

    let second = SqliteStorage::open(&db_path).unwrap();
    let reopened_version = second.schema_version().unwrap();
    assert_eq!(
        reopened_version, CURRENT_SCHEMA_VERSION,
        "reopening current schema DB should be idempotent"
    );
}
/// `open_or_rebuild` on a database already at the current schema succeeds and still reports
/// `CURRENT_SCHEMA_VERSION`.
#[test]
fn open_or_rebuild_current_schema_does_not_trigger_rebuild() {
    let scratch = TempDir::new().unwrap();
    let db_path = scratch.path().join("existing.db");

    let first = SqliteStorage::open(&db_path).unwrap();
    assert_eq!(first.schema_version().unwrap(), CURRENT_SCHEMA_VERSION);
    // Release the first handle before reopening through the rebuild-capable path.
    drop(first);

    let second = SqliteStorage::open_or_rebuild(&db_path)
        .expect("current schema DB should open without rebuild");
    assert_eq!(second.schema_version().unwrap(), CURRENT_SCHEMA_VERSION);
}
/// Pointing `open_or_rebuild` at a directory must fail with a database or I/O error (not a
/// rebuild request or a generic `Other`), and the directory must still be there afterwards.
#[test]
fn open_or_rebuild_does_not_treat_non_database_paths_as_corruption() {
    let scratch = TempDir::new().unwrap();
    let dir_as_db = scratch.path().join("db_dir");
    std::fs::create_dir(&dir_as_db).unwrap();

    match SqliteStorage::open_or_rebuild(&dir_as_db) {
        Ok(_) => panic!("directory path must not open as a database"),
        // The only acceptable outcomes: the underlying open failure is surfaced as-is.
        Err(MigrationError::Database(_) | MigrationError::Io(_)) => {}
        Err(MigrationError::RebuildRequired { reason, .. }) => {
            panic!("should not rebuild non-database path: {reason}")
        }
        Err(MigrationError::Other(msg)) => {
            panic!("should preserve underlying open error, got Other: {msg}")
        }
    }

    let still_a_directory = dir_as_db.is_dir();
    assert!(
        still_a_directory,
        "non-database directory must be left in place"
    );
}
/// A freshly opened database must report exactly `CURRENT_SCHEMA_VERSION`.
///
/// The check is an equality rather than a lower bound so that a fresh database left at any
/// older schema version fails this test.
#[test]
fn schema_version_returns_current() {
    let dir = TempDir::new().unwrap();
    let db_path = dir.path().join("test.db");
    let storage = SqliteStorage::open(&db_path).unwrap();
    let version = storage.schema_version().unwrap();
    assert_eq!(
        version, CURRENT_SCHEMA_VERSION,
        "freshly opened database should report CURRENT_SCHEMA_VERSION"
    );
}
/// Checks the schema of a freshly opened database: the version matches
/// `CURRENT_SCHEMA_VERSION`, the analytics tables expose the expected columns and indexes,
/// `conversations` carries the tail columns, no `fts_messages` object exists, and
/// `PRAGMA integrity_check` returns a single `ok` row.
#[test]
fn migration_v13_creates_analytics_tables() {
    /// Collects column 1 of `PRAGMA table_info(<table>)`.
    fn col_names(conn: &FrankenConnection, table: &str) -> Vec<String> {
        let sql = format!("PRAGMA table_info({table})");
        conn.query_map_collect(&sql, fparams![], |row: &FrankenRow| row.get_typed(1))
            .unwrap()
    }

    /// Collects column 1 of `PRAGMA index_list(<table>)`.
    fn idx_names(conn: &FrankenConnection, table: &str) -> Vec<String> {
        let sql = format!("PRAGMA index_list({table})");
        conn.query_map_collect(&sql, fparams![], |row: &FrankenRow| row.get_typed(1))
            .unwrap()
    }

    /// Asserts that every name in `expected` appears in `actual`.
    fn assert_has_columns(table: &str, actual: &[String], expected: &[&str]) {
        for &expected in expected {
            assert!(
                actual.iter().any(|col| col.as_str() == expected),
                "{table} missing column: {expected}"
            );
        }
    }

    /// True when any index name contains `fragment`.
    fn has_index(names: &[String], fragment: &str) -> bool {
        names.iter().any(|name| name.contains(fragment))
    }

    let scratch = TempDir::new().unwrap();
    let db_path = scratch.path().join("test.db");
    let storage = SqliteStorage::open(&db_path).unwrap();
    let version = storage.schema_version().unwrap();
    assert_eq!(
        version, CURRENT_SCHEMA_VERSION,
        "Schema version must match CURRENT_SCHEMA_VERSION after migration"
    );
    let conn = storage.raw();

    // Column checks for the four analytics tables.
    assert_has_columns(
        "message_metrics",
        &col_names(conn, "message_metrics"),
        &[
            "message_id",
            "hour_id",
            "day_id",
            "content_tokens_est",
            "model_name",
            "model_family",
            "model_tier",
            "provider",
            "api_input_tokens",
            "has_plan",
            "agent_slug",
            "role",
            "api_data_source",
        ],
    );
    assert_has_columns(
        "usage_hourly",
        &col_names(conn, "usage_hourly"),
        &[
            "hour_id",
            "plan_message_count",
            "plan_content_tokens_est_total",
            "plan_api_tokens_total",
            "api_coverage_message_count",
            "content_tokens_est_user",
            "api_thinking_tokens_total",
        ],
    );
    assert_has_columns(
        "usage_daily",
        &col_names(conn, "usage_daily"),
        &[
            "day_id",
            "plan_content_tokens_est_total",
            "plan_api_tokens_total",
            "api_thinking_tokens_total",
            "content_tokens_est_assistant",
            "message_count",
        ],
    );
    assert_has_columns(
        "usage_models_daily",
        &col_names(conn, "usage_models_daily"),
        &[
            "day_id",
            "model_family",
            "model_tier",
            "message_count",
            "api_tokens_total",
            "api_coverage_message_count",
        ],
    );

    // Index checks.
    let mm_idxs = idx_names(conn, "message_metrics");
    assert!(
        has_index(&mm_idxs, "idx_mm_hour"),
        "message_metrics must have hour index"
    );
    assert!(
        has_index(&mm_idxs, "idx_mm_agent_day"),
        "message_metrics must have agent+day index"
    );
    assert!(
        has_index(&mm_idxs, "idx_mm_model_family_day"),
        "message_metrics must have model_family+day index"
    );
    let uh_idxs = idx_names(conn, "usage_hourly");
    assert!(
        has_index(&uh_idxs, "idx_uh_agent"),
        "usage_hourly must have agent index"
    );
    let ud_idxs = idx_names(conn, "usage_daily");
    assert!(
        has_index(&ud_idxs, "idx_ud_agent"),
        "usage_daily must have agent index"
    );
    let umd_idxs = idx_names(conn, "usage_models_daily");
    assert!(
        has_index(&umd_idxs, "idx_umd_model_day"),
        "usage_models_daily must have model+day index"
    );

    // Tail columns on conversations.
    let conversation_cols = col_names(conn, "conversations");
    let has_tail_idx = conversation_cols.contains(&"last_message_idx".to_string());
    let has_tail_created_at = conversation_cols.contains(&"last_message_created_at".to_string());
    assert!(
        has_tail_idx && has_tail_created_at,
        "fresh schema must include V15 tail columns without ALTER TABLE on conversations"
    );

    // No schema object named fts_messages may exist.
    let fts_schema_rows: i64 = conn
        .query_row_map(
            "SELECT COUNT(*) FROM sqlite_master WHERE name = 'fts_messages'",
            fparams![],
            |row: &FrankenRow| row.get_typed(0),
        )
        .unwrap();
    assert_eq!(
        fts_schema_rows, 0,
        "fresh schema should not create and immediately drop derived fts_messages"
    );

    let integrity: Vec<String> = conn
        .query_map_collect("PRAGMA integrity_check;", fparams![], |row: &FrankenRow| {
            row.get_typed(0)
        })
        .unwrap();
    assert_eq!(
        integrity,
        vec!["ok".to_string()],
        "fresh schema must pass SQLite integrity_check"
    );
}
/// For a fixed timestamp, `hour_id / 24` equals the day id, and converting the hour id back to
/// milliseconds lands at or before the timestamp and less than one hour away from it.
#[test]
fn hour_id_round_trip() {
    let ts_ms = 1_770_508_800_000_i64;
    let day_id = SqliteStorage::day_id_from_millis(ts_ms);
    let hour_id = SqliteStorage::hour_id_from_millis(ts_ms);
    assert_eq!(hour_id / 24, day_id, "hour_id/24 should equal day_id");

    let back = SqliteStorage::millis_from_hour_id(hour_id);
    let not_after_input = back <= ts_ms;
    assert!(
        not_after_input && ts_ms - back < 3_600_000,
        "Round-trip should land within the same hour"
    );
}
/// A timestamp of -1 ms is treated as second -1 (floored, not truncated toward zero), so the
/// day and hour ids equal the Euclidean division of its offset from the 2020 epoch constant.
#[test]
fn day_and_hour_ids_floor_negative_millis() {
    let ts_ms = -1_i64;
    let expected_secs = -1_i64;
    let epoch_2020_secs = 1_577_836_800_i64;
    let offset_secs = expected_secs - epoch_2020_secs;

    let day_id = SqliteStorage::day_id_from_millis(ts_ms);
    assert_eq!(day_id, offset_secs.div_euclid(86_400));

    let hour_id = SqliteStorage::hour_id_from_millis(ts_ms);
    assert_eq!(hour_id, offset_secs.div_euclid(3_600));
}
/// Hand-builds a database stamped as schema version 10, then opens it through `SqliteStorage`
/// and checks that it reaches `CURRENT_SCHEMA_VERSION` with all four analytics tables present.
#[test]
fn migration_v13_from_v10() {
    let scratch = TempDir::new().unwrap();
    let db_path = scratch.path().join("test.db");

    // Build the v10 fixture on its own connection, released before the storage layer opens it.
    let legacy = FrankenConnection::open(db_path.to_string_lossy().into_owned()).unwrap();
    legacy.execute_batch("PRAGMA journal_mode=WAL;").unwrap();
    legacy
        .execute_batch("CREATE TABLE IF NOT EXISTS meta (key TEXT PRIMARY KEY, value TEXT);")
        .unwrap();
    legacy
        .execute("INSERT OR REPLACE INTO meta(key, value) VALUES('schema_version', '10')")
        .unwrap();
    // Migration scripts are applied in this exact order; MIGRATION_V3 is not among them.
    legacy.execute_batch(MIGRATION_V1).unwrap();
    legacy.execute_batch(MIGRATION_V2).unwrap();
    legacy.execute_batch(MIGRATION_V4).unwrap();
    legacy.execute_batch(MIGRATION_V5).unwrap();
    legacy.execute_batch(MIGRATION_V6).unwrap();
    legacy.execute_batch(MIGRATION_V7).unwrap();
    legacy.execute_batch(MIGRATION_V8).unwrap();
    legacy.execute_batch(MIGRATION_V9).unwrap();
    legacy.execute_batch(MIGRATION_V10).unwrap();
    // Stamp the version again after the scripts have run.
    legacy
        .execute("UPDATE meta SET value = '10' WHERE key = 'schema_version'")
        .unwrap();
    drop(legacy);

    materialize_fresh_fts_schema_via_rusqlite(&db_path).unwrap();

    let storage = SqliteStorage::open(&db_path).unwrap();
    let migrated_version = storage.schema_version().unwrap();
    assert_eq!(
        migrated_version, CURRENT_SCHEMA_VERSION,
        "Should have migrated from v10 to the current schema"
    );

    let analytics_tables: i64 = storage
        .raw()
        .query_row_map(
            "SELECT COUNT(*) FROM sqlite_master WHERE type='table' AND name IN ('message_metrics', 'usage_hourly', 'usage_daily', 'usage_models_daily')",
            &[],
            |row: &FrankenRow| row.get_typed::<i64>(0),
        )
        .unwrap();
    assert_eq!(analytics_tables, 4, "All 4 analytics tables should exist");
}
#[test]
fn analytics_ingest_populates_metrics_and_rollups() {
use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
use std::path::PathBuf;
let dir = TempDir::new().unwrap();
let db_path = dir.path().join("test.db");
let storage = SqliteStorage::open(&db_path).unwrap();
let agent = Agent {
id: None,
slug: "claude_code".into(),
name: "Claude Code".into(),
version: Some("1.0".into()),
kind: AgentKind::Cli,
};
let agent_id = storage.ensure_agent(&agent).unwrap();
let ts_ms = 1_770_551_400_000_i64;
let expected_day = SqliteStorage::day_id_from_millis(ts_ms);
let expected_hour = SqliteStorage::hour_id_from_millis(ts_ms);
let usage_json = serde_json::json!({
"message": {
"model": "claude-opus-4-6",
"usage": {
"input_tokens": 100,
"output_tokens": 50,
"cache_read_input_tokens": 200,
"cache_creation_input_tokens": 30,
"service_tier": "standard"
}
}
});
let conv = Conversation {
id: None,
agent_slug: "claude_code".into(),
workspace: None,
external_id: Some("test-conv-1".into()),
title: Some("Test conversation".into()),
source_path: PathBuf::from("/tmp/test.jsonl"),
started_at: Some(ts_ms),
ended_at: Some(ts_ms + 60_000),
approx_tokens: None,
metadata_json: serde_json::Value::Null,
messages: vec![
Message {
id: None,
idx: 0,
role: MessageRole::User,
author: None,
created_at: Some(ts_ms),
content: "Hello, can you help me with a plan?".into(),
extra_json: serde_json::Value::Null,
snippets: vec![],
},
Message {
id: None,
idx: 1,
role: MessageRole::Agent,
author: None,
created_at: Some(ts_ms + 30_000),
content: "## Plan\n\n1. First step\n2. Second step\n3. Third step".into(),
extra_json: usage_json,
snippets: vec![],
},
Message {
id: None,
idx: 2,
role: MessageRole::User,
author: None,
created_at: Some(ts_ms + 60_000),
content: "Great, let's proceed!".into(),
extra_json: serde_json::Value::Null,
snippets: vec![],
},
],
source_id: "local".into(),
origin_host: None,
};
let outcomes = storage
.insert_conversations_batched(&[(agent_id, None, &conv)])
.unwrap();
assert_eq!(outcomes.len(), 1);
assert_eq!(outcomes[0].inserted_indices.len(), 3);
let conn = storage.raw();
let mm_count: i64 = conn
.query_row_map("SELECT COUNT(*) FROM message_metrics", &[], |row| {
row.get_typed::<i64>(0)
})
.unwrap();
assert_eq!(mm_count, 3, "Should have 3 message_metrics rows");
#[allow(clippy::type_complexity)]
let rows: Vec<(i64, i64, String, i64, i64, String, String, String, String)> = conn
.query_map_collect(
"SELECT hour_id, day_id, role, content_tokens_est, has_plan, api_data_source, model_family, model_tier, provider FROM message_metrics ORDER BY message_id",
fparams![],
|row: &FrankenRow| {
Ok((
row.get_typed(0)?,
row.get_typed(1)?,
row.get_typed(2)?,
row.get_typed(3)?,
row.get_typed(4)?,
row.get_typed(5)?,
row.get_typed(6)?,
row.get_typed(7)?,
row.get_typed(8)?,
))
},
)
.unwrap();
assert_eq!(rows.len(), 3);
assert_eq!(rows[0].0, expected_hour);
assert_eq!(rows[0].1, expected_day);
assert_eq!(rows[0].2, "user");
assert_eq!(
rows[1].4, 1,
"Assistant message with plan should have has_plan=1"
);
assert_eq!(
rows[1].5, "api",
"Claude Code assistant message should have api data source"
);
assert_eq!(rows[0].5, "estimated");
assert_eq!(rows[2].5, "estimated");
assert_eq!(rows[1].6, "claude");
assert_eq!(rows[1].7, "opus");
assert_eq!(rows[1].8, "anthropic");
assert_eq!(rows[0].6, "unknown");
let user_chars = "Hello, can you help me with a plan?".len() as i64;
assert_eq!(rows[0].3, user_chars / 4);
let (uh_msg, uh_user, uh_asst, uh_plan, uh_plan_content, uh_plan_api, uh_api_cov): (
i64,
i64,
i64,
i64,
i64,
i64,
i64,
) = conn
.query_row_map(
"SELECT message_count, user_message_count, assistant_message_count, plan_message_count,
plan_content_tokens_est_total, plan_api_tokens_total, api_coverage_message_count
FROM usage_hourly WHERE hour_id = ?",
fparams![expected_hour],
|row: &FrankenRow| {
Ok((
row.get_typed(0)?,
row.get_typed(1)?,
row.get_typed(2)?,
row.get_typed(3)?,
row.get_typed(4)?,
row.get_typed(5)?,
row.get_typed(6)?,
))
},
)
.unwrap();
assert_eq!(uh_msg, 3, "Hourly rollup should have 3 messages");
assert_eq!(uh_user, 2, "Hourly rollup should have 2 user messages");
assert_eq!(uh_asst, 1, "Hourly rollup should have 1 assistant message");
assert_eq!(uh_plan, 1, "Hourly rollup should have 1 plan message");
assert!(
uh_plan_content > 0,
"Hourly rollup should include plan content tokens"
);
assert!(
uh_plan_api > 0,
"Hourly rollup should include plan API tokens"
);
assert_eq!(
uh_api_cov, 1,
"Hourly rollup should have 1 API-covered message"
);
let (ud_msg, ud_api_cov): (i64, i64) = conn
.query_row_map(
"SELECT message_count, api_coverage_message_count FROM usage_daily WHERE day_id = ?",
fparams![expected_day],
|row: &FrankenRow| Ok((row.get_typed(0)?, row.get_typed(1)?)),
)
.unwrap();
assert_eq!(ud_msg, 3, "Daily rollup should match hourly");
assert_eq!(
ud_api_cov, 1,
"Daily api_coverage should be 1 (only assistant msg has real API data)"
);
let api_only_input: i64 = conn
.query_row_map(
"SELECT COALESCE(SUM(api_input_tokens), 0) FROM message_metrics WHERE day_id = ? AND api_data_source = 'api'",
fparams![expected_day],
|row: &FrankenRow| row.get_typed::<i64>(0),
)
.unwrap();
assert_eq!(
api_only_input, 100,
"Only API-sourced input tokens should be 100"
);
let mm_total_content_est: i64 = conn
.query_row_map(
"SELECT SUM(content_tokens_est) FROM message_metrics WHERE day_id = ?",
fparams![expected_day],
|row| row.get_typed::<i64>(0),
)
.unwrap();
let mm_plan_content_est: i64 = conn
.query_row_map(
"SELECT COALESCE(SUM(content_tokens_est), 0) FROM message_metrics WHERE day_id = ? AND has_plan = 1",
fparams![expected_day],
|row: &FrankenRow| row.get_typed::<i64>(0),
)
.unwrap();
let mm_plan_api_total: i64 = conn
.query_row_map(
"SELECT COALESCE(SUM(COALESCE(api_input_tokens, 0) + COALESCE(api_output_tokens, 0) + COALESCE(api_cache_read_tokens, 0) + COALESCE(api_cache_creation_tokens, 0) + COALESCE(api_thinking_tokens, 0)), 0)
FROM message_metrics WHERE day_id = ? AND has_plan = 1 AND api_data_source = 'api'",
fparams![expected_day],
|row: &FrankenRow| row.get_typed::<i64>(0),
)
.unwrap();
let ud_content_est: i64 = conn
.query_row_map(
"SELECT content_tokens_est_total FROM usage_daily WHERE day_id = ?",
fparams![expected_day],
|row| row.get_typed::<i64>(0),
)
.unwrap();
let (ud_plan_content_est, ud_plan_api_total): (i64, i64) = conn
.query_row_map(
"SELECT plan_content_tokens_est_total, plan_api_tokens_total FROM usage_daily WHERE day_id = ?",
fparams![expected_day],
|row: &FrankenRow| Ok((row.get_typed(0)?, row.get_typed(1)?)),
)
.unwrap();
assert_eq!(
mm_total_content_est, ud_content_est,
"Daily rollup content_tokens_est_total must equal SUM of message_metrics"
);
assert_eq!(
mm_plan_content_est, ud_plan_content_est,
"Daily rollup plan_content_tokens_est_total must equal planned message_metrics content sum"
);
assert_eq!(
mm_plan_api_total, ud_plan_api_total,
"Daily rollup plan_api_tokens_total must equal planned message_metrics API token sum"
);
let (claude_msg, claude_user, claude_asst, claude_api_total, claude_api_cov): (
i64,
i64,
i64,
i64,
i64,
) = conn
.query_row_map(
"SELECT message_count, user_message_count, assistant_message_count, api_tokens_total, api_coverage_message_count
FROM usage_models_daily
WHERE day_id = ? AND model_family = 'claude' AND model_tier = 'opus'",
fparams![expected_day],
|row: &FrankenRow| Ok((row.get_typed(0)?, row.get_typed(1)?, row.get_typed(2)?, row.get_typed(3)?, row.get_typed(4)?)),
)
.unwrap();
assert_eq!(claude_msg, 1);
assert_eq!(claude_user, 0);
assert_eq!(claude_asst, 1);
assert_eq!(claude_api_total, 380);
assert_eq!(claude_api_cov, 1);
let unknown_msg: i64 = conn
.query_row_map(
"SELECT message_count FROM usage_models_daily
WHERE day_id = ? AND model_family = 'unknown' AND model_tier = 'unknown'",
fparams![expected_day],
|row| row.get_typed(0),
)
.unwrap();
assert_eq!(
unknown_msg, 2,
"user messages should land in unknown model bucket"
);
}
/// Spot-checks `has_plan_heuristic` against hand-picked plan and non-plan messages.
#[test]
fn has_plan_heuristic_detects_plans() {
    // Messages that open with a plan-style header followed by list items.
    let h2_plan = "## Plan\n\n1. First step\n2. Second step";
    let h1_plan = "# Plan\nHere is what we will do:\n1. Step one\n2. Step two";
    let colon_plan = "Plan:\n- Gather baseline\n- Implement changes\n- Validate with tests";
    let next_steps = "Next steps:\n1. Update schema\n2. Rebuild rollups";

    assert!(has_plan_heuristic(h2_plan));
    assert!(has_plan_heuristic(h1_plan));
    assert!(has_plan_heuristic(colon_plan));
    assert!(has_plan_heuristic(next_steps));

    // Plain prose, very short text, and fenced JSON tool output that merely
    // contains numbered lines must not be flagged.
    let greeting = "Hello world";
    let single_word = "Short";
    let plain_prose = "This is a regular message without plans";
    let fenced_tool_output =
        "```json\n{\"tool\":\"shell\",\"stdout\":\"1. install\\n2. run\"}\n```";

    assert!(!has_plan_heuristic(greeting));
    assert!(!has_plan_heuristic(single_word));
    assert!(!has_plan_heuristic(plain_prose));
    assert!(!has_plan_heuristic(fenced_tool_output));
}
/// Checks that `has_plan_for_role` flags plan text for assistant-style roles
/// but not for user or tool roles.
#[test]
fn has_plan_for_role_only_counts_assistant_messages() {
    let sample = "## Plan\n1. First\n2. Second";
    // Same plan text every time; only the role label varies.
    let flagged_for = |role: &str| has_plan_for_role(role, sample);

    // Assistant-style labels, including a capitalized spelling.
    assert!(flagged_for("assistant"));
    assert!(flagged_for("agent"));
    assert!(flagged_for("Assistant"));

    // Identical content coming from a user or a tool is not counted.
    assert!(!flagged_for("user"));
    assert!(!flagged_for("tool"));
}
/// Records two plan-bearing assistant entries that share every rollup key and
/// differ only in size and `api_data_source` ("estimated" vs "api"), then
/// checks that plan counts and estimated content tokens include both entries
/// while API token totals and API coverage include only the "api" entry.
#[test]
fn api_rollups_require_api_data_source() {
    // Both entries are identical apart from the six values passed in here.
    let plan_entry = |message_id,
                      content_chars,
                      content_tokens_est,
                      input_tokens,
                      output_tokens,
                      data_source: &str| MessageMetricsEntry {
        message_id,
        created_at_ms: 0,
        hour_id: 1,
        day_id: 1,
        agent_slug: "codex".into(),
        workspace_id: 0,
        source_id: "local".into(),
        role: "assistant".into(),
        content_chars,
        content_tokens_est,
        model_name: None,
        model_family: "unknown".into(),
        model_tier: "unknown".into(),
        provider: "unknown".into(),
        api_input_tokens: Some(input_tokens),
        api_output_tokens: Some(output_tokens),
        api_cache_read_tokens: Some(0),
        api_cache_creation_tokens: Some(0),
        api_thinking_tokens: Some(0),
        api_service_tier: None,
        api_data_source: data_source.into(),
        tool_call_count: 0,
        has_tool_calls: false,
        has_plan: true,
    };

    let mut agg = AnalyticsRollupAggregator::new();
    // Entry 1 carries token numbers but is marked "estimated".
    agg.record(&plan_entry(1, 120, 30, 100, 50, "estimated"));
    // Entry 2 is marked "api"; its 40 + 10 tokens are the expected API totals.
    agg.record(&plan_entry(2, 80, 20, 40, 10, "api"));

    let rollup_key = (1_i64, String::from("codex"), 0_i64, String::from("local"));
    let model_rollup_key = (
        1_i64,
        String::from("codex"),
        0_i64,
        String::from("local"),
        String::from("unknown"),
        String::from("unknown"),
    );
    let hourly = agg
        .hourly
        .get(&rollup_key)
        .expect("hourly rollup key must exist");
    let daily = agg
        .daily
        .get(&rollup_key)
        .expect("daily rollup key must exist");
    let models_daily = agg
        .models_daily
        .get(&model_rollup_key)
        .expect("model rollup key must exist");

    // Hourly rollup: both plans counted, only the "api" entry's tokens summed.
    assert_eq!(hourly.plan_message_count, 2);
    assert_eq!(hourly.plan_content_tokens_est_total, 50);
    assert_eq!(hourly.plan_api_tokens_total, 50);
    assert_eq!(hourly.api_tokens_total, 50);
    assert_eq!(hourly.api_input_tokens_total, 40);
    assert_eq!(hourly.api_output_tokens_total, 10);
    assert_eq!(hourly.api_coverage_message_count, 1);

    // Daily rollup.
    assert_eq!(daily.plan_api_tokens_total, 50);
    assert_eq!(daily.api_tokens_total, 50);

    // Per-model daily rollup.
    assert_eq!(models_daily.plan_api_tokens_total, 50);
    assert_eq!(models_daily.api_tokens_total, 50);
}
/// Runs `has_plan_heuristic` over a small curated corpus and requires a
/// recall of at least 0.80 on the positives and a false-positive rate of at
/// most 0.20 on the negatives.
#[test]
fn has_plan_heuristic_curated_corpus_thresholds() {
    let positives = [
        "## Plan\n1. Inspect current schema\n2. Add migration\n3. Verify rebuild",
        "Plan:\n1) Reproduce\n2) Patch\n3) Add tests",
        "Implementation plan:\n- Parse inputs\n- Update rollups\n- Run checks",
        "Next steps:\n1. Reserve file\n2. Implement\n3. Report status",
        "# Plan\n1. Gather requirements\n2. Ship changes",
        "Action plan:\n- Identify root cause\n- Fix it\n- Validate",
    ];
    let negatives = [
        "The plan is to move fast and fix things later.",
        "```json\n{\"tool\":\"shell\",\"stdout\":\"1. ls\\n2. cat\"}\n```",
        "stdout:\n1. Build started\n2. Build finished\nexit code: 0",
        "I can help with that request. Let me know if you want details.",
        "Here is a list:\n- apples\n- oranges",
        "Status update: completed tasks and blockers below.",
    ];

    // Fraction of the given samples that the heuristic flags as a plan.
    let flagged_fraction = |samples: &[&str]| -> f64 {
        let flagged = samples
            .iter()
            .filter(|text| has_plan_heuristic(text))
            .count();
        flagged as f64 / samples.len() as f64
    };

    let recall = flagged_fraction(&positives);
    let false_positive_rate = flagged_fraction(&negatives);

    assert!(
        recall >= 0.80,
        "plan heuristic recall too low: got {recall:.2}"
    );
    assert!(
        false_positive_rate <= 0.20,
        "plan heuristic false-positive rate too high: got {false_positive_rate:.2}"
    );
}
/// Verifies that `rebuild_analytics` regenerates `message_metrics` and the
/// usage rollup tables after they have been emptied by hand.
///
/// Flow: insert one three-message conversation, snapshot the analytics row
/// counts, delete every analytics row, rebuild, then compare the rebuilt
/// tables against the snapshot and against the expected rollup values.
#[test]
fn rebuild_analytics_repopulates_from_messages() {
use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
use std::path::PathBuf;
let dir = TempDir::new().unwrap();
let db_path = dir.path().join("test.db");
let storage = SqliteStorage::open(&db_path).unwrap();
let agent = Agent {
id: None,
slug: "claude_code".into(),
name: "Claude Code".into(),
version: Some("1.0".into()),
kind: AgentKind::Cli,
};
let agent_id = storage.ensure_agent(&agent).unwrap();
// Fixed timestamp; the hour/day bucket ids used in later queries derive from it.
let ts_ms = 1_770_551_400_000_i64;
let expected_day = SqliteStorage::day_id_from_millis(ts_ms);
let expected_hour = SqliteStorage::hour_id_from_millis(ts_ms);
// Usage payload attached only to the agent message below.
let usage_json = serde_json::json!({
"message": {
"model": "claude-opus-4-6",
"usage": {
"input_tokens": 100,
"output_tokens": 50,
"cache_read_input_tokens": 200,
"cache_creation_input_tokens": 30,
"service_tier": "standard"
}
}
});
// Conversation: user question, agent reply containing a plan, user follow-up.
let conv = Conversation {
id: None,
agent_slug: "claude_code".into(),
workspace: None,
external_id: Some("test-rebuild-1".into()),
title: Some("Test conversation".into()),
source_path: PathBuf::from("/tmp/test.jsonl"),
started_at: Some(ts_ms),
ended_at: Some(ts_ms + 60_000),
approx_tokens: None,
metadata_json: serde_json::Value::Null,
messages: vec![
Message {
id: None,
idx: 0,
role: MessageRole::User,
author: None,
created_at: Some(ts_ms),
content: "Hello, can you help me with a plan?".into(),
extra_json: serde_json::Value::Null,
snippets: vec![],
},
Message {
id: None,
idx: 1,
role: MessageRole::Agent,
author: None,
created_at: Some(ts_ms + 30_000),
content: "## Plan\n\n1. First step\n2. Second step\n3. Third step".into(),
extra_json: usage_json,
snippets: vec![],
},
Message {
id: None,
idx: 2,
role: MessageRole::User,
author: None,
created_at: Some(ts_ms + 60_000),
content: "Great, let's proceed!".into(),
extra_json: serde_json::Value::Null,
snippets: vec![],
},
],
source_id: "local".into(),
origin_host: None,
};
storage
.insert_conversations_batched(&[(agent_id, None, &conv)])
.unwrap();
let conn = storage.raw();
// Snapshot the analytics tables as populated by the insert path.
let orig_mm: i64 = conn
.query_row_map("SELECT COUNT(*) FROM message_metrics", &[], |row| {
row.get_typed(0)
})
.unwrap();
let orig_hourly: i64 = conn
.query_row_map("SELECT COUNT(*) FROM usage_hourly", &[], |row| {
row.get_typed(0)
})
.unwrap();
let orig_daily: i64 = conn
.query_row_map("SELECT COUNT(*) FROM usage_daily", &[], |row| {
row.get_typed(0)
})
.unwrap();
let orig_models_daily: i64 = conn
.query_row_map("SELECT COUNT(*) FROM usage_models_daily", &[], |row| {
row.get_typed(0)
})
.unwrap();
let orig_api_input: i64 = conn
.query_row_map(
"SELECT COALESCE(SUM(api_input_tokens), 0) FROM message_metrics WHERE api_data_source = 'api'",
&[],
|row: &FrankenRow| row.get_typed(0),
)
.unwrap();
assert_eq!(orig_mm, 3);
assert!(orig_hourly > 0);
assert!(orig_daily > 0);
assert!(orig_models_daily > 0);
// Wipe every analytics table so the rebuild has to start from nothing.
conn.execute("DELETE FROM message_metrics").unwrap();
conn.execute("DELETE FROM usage_hourly").unwrap();
conn.execute("DELETE FROM usage_daily").unwrap();
conn.execute("DELETE FROM usage_models_daily").unwrap();
let zero: i64 = conn
.query_row_map("SELECT COUNT(*) FROM message_metrics", &[], |row| {
row.get_typed(0)
})
.unwrap();
assert_eq!(zero, 0);
// Rebuild and check the counts reported by the rebuild itself.
let result = storage.rebuild_analytics().unwrap();
assert_eq!(result.message_metrics_rows, 3);
assert!(result.usage_hourly_rows > 0);
assert!(result.usage_daily_rows > 0);
assert!(result.usage_models_daily_rows > 0);
// Generous wall-clock bound for a three-message rebuild.
assert!(
result.elapsed_ms < 10_000,
"Rebuild should be fast for 3 msgs"
);
let conn = storage.raw();
// The rebuilt tables must match the pre-delete snapshot row for row.
let rebuilt_mm: i64 = conn
.query_row_map("SELECT COUNT(*) FROM message_metrics", &[], |row| {
row.get_typed(0)
})
.unwrap();
assert_eq!(
rebuilt_mm, orig_mm,
"Rebuilt message_metrics count should match"
);
let rebuilt_hourly: i64 = conn
.query_row_map("SELECT COUNT(*) FROM usage_hourly", &[], |row| {
row.get_typed(0)
})
.unwrap();
assert_eq!(
rebuilt_hourly, orig_hourly,
"Rebuilt hourly rows should match"
);
let rebuilt_daily: i64 = conn
.query_row_map("SELECT COUNT(*) FROM usage_daily", &[], |row| {
row.get_typed(0)
})
.unwrap();
assert_eq!(rebuilt_daily, orig_daily, "Rebuilt daily rows should match");
let rebuilt_models_daily: i64 = conn
.query_row_map("SELECT COUNT(*) FROM usage_models_daily", &[], |row| {
row.get_typed(0)
})
.unwrap();
assert_eq!(
rebuilt_models_daily, orig_models_daily,
"Rebuilt model rollup rows should match"
);
let rebuilt_api_input: i64 = conn
.query_row_map(
"SELECT COALESCE(SUM(api_input_tokens), 0) FROM message_metrics WHERE api_data_source = 'api'",
&[],
|row: &FrankenRow| row.get_typed(0),
)
.unwrap();
assert_eq!(
rebuilt_api_input, orig_api_input,
"Rebuilt API input tokens should match original"
);
// Rebuilt hourly rollup for the expected hour bucket: 3 messages
// (2 user, 1 assistant), one of which is a plan with non-zero token totals.
let (uh_msg, uh_user, uh_asst, uh_plan, uh_plan_content, uh_plan_api): (
i64,
i64,
i64,
i64,
i64,
i64,
) = conn
.query_row_map(
"SELECT message_count, user_message_count, assistant_message_count, plan_message_count,
plan_content_tokens_est_total, plan_api_tokens_total
FROM usage_hourly WHERE hour_id = ?",
fparams![expected_hour],
|row: &FrankenRow| {
Ok((
row.get_typed(0)?,
row.get_typed(1)?,
row.get_typed(2)?,
row.get_typed(3)?,
row.get_typed(4)?,
row.get_typed(5)?,
))
},
)
.unwrap();
assert_eq!(uh_msg, 3);
assert_eq!(uh_user, 2);
assert_eq!(uh_asst, 1);
assert_eq!(uh_plan, 1);
assert!(uh_plan_content > 0);
assert!(uh_plan_api > 0);
// The daily rollup for the expected day must also see all three messages.
let ud_msg: i64 = conn
.query_row_map(
"SELECT message_count FROM usage_daily WHERE day_id = ?",
fparams![expected_day],
|row| row.get_typed(0),
)
.unwrap();
assert_eq!(ud_msg, 3);
}
/// Inserts a conversation whose two messages are each longer than half of
/// `FTS_ENTRY_BATCH_MAX_CHARS` and checks that both the `messages` table and
/// the `fts_messages` table end up with one row per message.
#[test]
fn insert_conversations_batched_flushes_large_fts_batches() {
    use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
    use std::path::PathBuf;

    let tmp = TempDir::new().unwrap();
    let storage = SqliteStorage::open(&tmp.path().join("test.db")).unwrap();
    storage
        .ensure_search_fallback_fts_consistency()
        .expect("ensure FTS consistency before insert");

    let codex = Agent {
        id: None,
        slug: "codex".into(),
        name: "Codex".into(),
        version: Some("0.2.3".into()),
        kind: AgentKind::Cli,
    };
    let agent_id = storage.ensure_agent(&codex).unwrap();

    // Each message body is just over half the batch limit, so the two
    // together exceed FTS_ENTRY_BATCH_MAX_CHARS.
    let content = "y".repeat((FTS_ENTRY_BATCH_MAX_CHARS / 2) + 1);
    let mut messages = Vec::with_capacity(2);
    for i in 0_i64..2 {
        messages.push(Message {
            id: None,
            idx: i,
            role: MessageRole::Agent,
            author: None,
            created_at: Some(1_700_000_000_000 + i),
            content: format!("{i}-{content}"),
            extra_json: serde_json::Value::Null,
            snippets: Vec::new(),
        });
    }

    let conv = Conversation {
        id: None,
        agent_slug: "codex".into(),
        workspace: Some(PathBuf::from("/tmp/workspace")),
        external_id: Some("fts-large-batch".into()),
        title: Some("FTS Large Batch".into()),
        source_path: PathBuf::from("/tmp/rollout.jsonl"),
        started_at: Some(1_700_000_000_000),
        ended_at: Some(1_700_000_000_999),
        approx_tokens: None,
        metadata_json: serde_json::Value::Null,
        messages,
        source_id: "local".into(),
        origin_host: None,
    };
    let expected_rows = conv.messages.len();

    let outcomes = storage
        .insert_conversations_batched(&[(agent_id, None, &conv)])
        .unwrap();
    assert_eq!(outcomes.len(), 1);
    assert_eq!(outcomes[0].inserted_indices.len(), expected_rows);

    // Runs a parameterless single-value query and returns the integer result.
    let count_rows = |sql: &str| -> i64 {
        storage
            .conn
            .query_row_map(sql, fparams![], |row| row.get_typed(0))
            .unwrap()
    };
    let message_count = count_rows("SELECT COUNT(*) FROM messages");
    let fts_count = count_rows("SELECT COUNT(*) FROM fts_messages");
    assert_eq!(message_count, expected_rows as i64);
    assert_eq!(fts_count, expected_rows as i64);
}
/// Builds a synthetic remote-source `Conversation` fixture with `msg_count`
/// messages.
///
/// `external_id` is embedded in the external id, title and source path, and
/// offsets every timestamp by `external_id * 100`. Message roles alternate,
/// starting with `User` at index 0.
fn make_profiled_storage_remote_conversation(
    external_id: i64,
    msg_count: usize,
) -> Conversation {
    // Per-conversation timestamp bases shared by the fields below.
    let conversation_start = 10_000 + external_id * 100;
    let message_clock = 20_000 + external_id * 100;

    let mut messages = Vec::with_capacity(msg_count);
    for idx in 0..msg_count {
        let role = match idx % 2 {
            0 => MessageRole::User,
            _ => MessageRole::Agent,
        };
        messages.push(Message {
            id: None,
            idx: idx as i64,
            role,
            author: Some("tester".into()),
            created_at: Some(message_clock + idx as i64),
            content: format!(
                "profiled storage remote content ext={external_id} idx={idx} {}",
                "x".repeat(64)
            ),
            extra_json: serde_json::json!({ "idx": idx }),
            snippets: Vec::new(),
        });
    }

    Conversation {
        id: None,
        agent_slug: "codex".into(),
        workspace: Some(PathBuf::from("/ws/profiled-storage-remote")),
        external_id: Some(format!("profiled-storage-remote-{external_id}")),
        title: Some(format!(
            "Profiled storage remote conversation {external_id}"
        )),
        source_path: PathBuf::from(format!("/log/profiled-storage-remote-{external_id}.jsonl")),
        started_at: Some(conversation_start),
        ended_at: Some(conversation_start + msg_count as i64),
        approx_tokens: Some(msg_count as i64 * 32),
        metadata_json: serde_json::json!({ "bench": true }),
        messages,
        source_id: "profiled-storage-remote-source".into(),
        origin_host: Some("builder-profile".into()),
    }
}
/// Builds a synthetic remote-source `Conversation` fixture for the append
/// profiling test.
///
/// All timestamps start at `100_000 + external_id * 1_000`. Message roles
/// alternate starting with `User`, and the author label cycles through
/// `model-0` .. `model-4` based on `external_id % 5`.
fn make_profiled_append_remote_merge_conversation(
    external_id: i64,
    msg_count: usize,
) -> Conversation {
    let base_ts = 100_000 + external_id * 1_000;

    let mut messages = Vec::with_capacity(msg_count);
    for idx in 0..msg_count {
        let role = match idx % 2 {
            0 => MessageRole::User,
            _ => MessageRole::Agent,
        };
        messages.push(Message {
            id: None,
            idx: idx as i64,
            role,
            author: Some(format!("model-{}", external_id % 5)),
            created_at: Some(base_ts + idx as i64),
            content: format!(
                "Profiled append remote conversation {} message {}: Lorem ipsum dolor sit amet, consectetur adipiscing elit.",
                external_id, idx
            ),
            extra_json: serde_json::json!({ "bench": true }),
            snippets: Vec::new(),
        });
    }

    Conversation {
        id: None,
        agent_slug: "codex".into(),
        workspace: Some(PathBuf::from("/ws/profiled-append-remote")),
        external_id: Some(format!("profiled-append-remote-{external_id}")),
        title: Some(format!("Profiled append remote conversation {external_id}")),
        source_path: PathBuf::from(format!("/log/profiled-append-remote-{external_id}.jsonl")),
        started_at: Some(base_ts),
        ended_at: Some(base_ts + msg_count as i64),
        approx_tokens: Some(msg_count as i64 * 50),
        metadata_json: serde_json::json!({ "bench": true }),
        messages,
        source_id: "profiled-append-remote-source".into(),
        origin_host: Some("builder-profile".into()),
    }
}
/// Inserts a new five-message conversation with one snippet per message and
/// checks that every snippet row joins back to a message of that
/// conversation.
#[test]
fn insert_conversation_tree_batched_new_message_ids_match_snippet_rows() {
    let tmp = TempDir::new().unwrap();
    let storage = SqliteStorage::open(&tmp.path().join("batched-message-ids.db")).unwrap();
    let codex = Agent {
        id: None,
        slug: "codex".into(),
        name: "Codex".into(),
        version: None,
        kind: AgentKind::Cli,
    };
    let agent_id = storage.ensure_agent(&codex).unwrap();
    let workspace_id = storage
        .ensure_workspace(&PathBuf::from("/ws/profiled-storage-remote"), None)
        .unwrap();

    // Attach exactly one snippet to each message.
    let mut conv = make_profiled_storage_remote_conversation(42, 5);
    conv.messages
        .iter_mut()
        .enumerate()
        .for_each(|(idx, msg)| {
            msg.snippets.push(Snippet {
                id: None,
                file_path: Some(PathBuf::from(format!("src/file_{idx}.rs"))),
                start_line: Some((idx + 1) as i64),
                end_line: Some((idx + 2) as i64),
                language: Some("rust".into()),
                snippet_text: Some(format!("fn snippet_{idx}() {{}}")),
            });
        });
    let expected_rows = conv.messages.len() as i64;

    let outcome = storage
        .insert_conversation_tree(agent_id, Some(workspace_id), &conv)
        .unwrap();
    let conversation_id = outcome.conversation_id;

    let message_count: i64 = storage
        .conn
        .query_row_map(
            "SELECT COUNT(*) FROM messages WHERE conversation_id = ?1",
            fparams![conversation_id],
            |row| row.get_typed(0),
        )
        .unwrap();
    // Counts only snippets whose message_id resolves to a message of this
    // conversation.
    let joined_snippet_count: i64 = storage
        .conn
        .query_row_map(
            "SELECT COUNT(*)
FROM snippets s
JOIN messages m ON s.message_id = m.id
WHERE m.conversation_id = ?1",
            fparams![conversation_id],
            |row| row.get_typed(0),
        )
        .unwrap();

    assert_eq!(message_count, expected_rows);
    assert_eq!(joined_snippet_count, expected_rows);
}
/// Inserts a two-message conversation, then re-inserts the same external id
/// with five messages, and checks that only indices 2..=4 are appended and
/// that each message keeps exactly one snippet: the initial one for the first
/// two messages, the re-insert's one for the appended three.
#[test]
fn insert_conversation_tree_batched_appended_message_ids_match_snippet_rows() {
    let tmp = TempDir::new().unwrap();
    let storage =
        SqliteStorage::open(&tmp.path().join("batched-append-message-ids.db")).unwrap();
    let codex = Agent {
        id: None,
        slug: "codex".into(),
        name: "Codex".into(),
        version: None,
        kind: AgentKind::Cli,
    };
    let agent_id = storage.ensure_agent(&codex).unwrap();
    let workspace_id = storage
        .ensure_workspace(&PathBuf::from("/ws/profiled-storage-remote"), None)
        .unwrap();

    // First pass: two messages, each with an "append_initial" snippet.
    let mut initial = make_profiled_storage_remote_conversation(77, 2);
    initial
        .messages
        .iter_mut()
        .enumerate()
        .for_each(|(idx, msg)| {
            msg.snippets.push(Snippet {
                id: None,
                file_path: Some(PathBuf::from(format!("src/append_initial_{idx}.rs"))),
                start_line: Some((idx + 1) as i64),
                end_line: Some((idx + 2) as i64),
                language: Some("rust".into()),
                snippet_text: Some(format!("fn append_initial_{idx}() {{}}")),
            });
        });
    let first = storage
        .insert_conversation_tree(agent_id, Some(workspace_id), &initial)
        .unwrap();
    assert_eq!(first.inserted_indices, vec![0, 1]);

    // Second pass: same external id, five messages, each with an
    // "append_full" snippet.
    let mut appended = make_profiled_storage_remote_conversation(77, 5);
    appended
        .messages
        .iter_mut()
        .enumerate()
        .for_each(|(idx, msg)| {
            msg.snippets.push(Snippet {
                id: None,
                file_path: Some(PathBuf::from(format!("src/append_full_{idx}.rs"))),
                start_line: Some((idx + 10) as i64),
                end_line: Some((idx + 11) as i64),
                language: Some("rust".into()),
                snippet_text: Some(format!("fn append_full_{idx}() {{}}")),
            });
        });
    let second = storage
        .insert_conversation_tree(agent_id, Some(workspace_id), &appended)
        .unwrap();
    assert_eq!(second.conversation_id, first.conversation_id);
    assert_eq!(second.inserted_indices, vec![2, 3, 4]);

    let conversation_id = first.conversation_id;
    let message_count: i64 = storage
        .conn
        .query_row_map(
            "SELECT COUNT(*) FROM messages WHERE conversation_id = ?1",
            fparams![conversation_id],
            |row| row.get_typed(0),
        )
        .unwrap();
    let joined_snippets: Vec<(i64, String)> = storage
        .conn
        .query_map_collect(
            "SELECT m.idx, s.file_path
FROM snippets s
JOIN messages m ON s.message_id = m.id
WHERE m.conversation_id = ?1
ORDER BY m.idx, s.id",
            fparams![conversation_id],
            |row| Ok((row.get_typed(0)?, row.get_typed(1)?)),
        )
        .unwrap();

    let expected_snippets: Vec<(i64, String)> = [
        (0, "src/append_initial_0.rs"),
        (1, "src/append_initial_1.rs"),
        (2, "src/append_full_2.rs"),
        (3, "src/append_full_3.rs"),
        (4, "src/append_full_4.rs"),
    ]
    .iter()
    .map(|&(idx, path)| (idx, path.to_string()))
    .collect();

    assert_eq!(message_count, 5);
    assert_eq!(joined_snippets, expected_snippets);
}
/// Verifies that `insert_conversation_tree` still appends to the existing
/// conversation, and restores the `conversation_external_tail_lookup` row,
/// after that row has been deleted by hand.
#[test]
fn insert_conversation_tree_rehydrates_external_lookup_after_manual_clear() {
let dir = TempDir::new().unwrap();
let db_path = dir.path().join("external-lookup-rehydrate.db");
let storage = SqliteStorage::open(&db_path).unwrap();
let agent_id = storage
.ensure_agent(&Agent {
id: None,
slug: "codex".into(),
name: "Codex".into(),
version: None,
kind: AgentKind::Cli,
})
.unwrap();
let workspace_id = storage
.ensure_workspace(&PathBuf::from("/ws/profiled-storage-remote"), None)
.unwrap();
// Seed a two-message conversation; the insert is expected to create the lookup row.
let initial = make_profiled_storage_remote_conversation(88, 2);
let first = storage
.insert_conversation_tree(agent_id, Some(workspace_id), &initial)
.unwrap();
let external_id = initial.external_id.as_deref().unwrap();
let lookup_key =
conversation_external_lookup_key(&initial.source_id, agent_id, external_id);
// Sanity check: the lookup row exists and points at the new conversation.
let lookup_id: i64 = storage
.conn
.query_row_map(
"SELECT conversation_id
FROM conversation_external_tail_lookup
WHERE lookup_key = ?1",
fparams![lookup_key.as_str()],
|row| row.get_typed(0),
)
.unwrap();
assert_eq!(lookup_id, first.conversation_id);
// Remove the lookup row by hand to simulate a missing lookup entry.
storage
.conn
.execute_compat(
"DELETE FROM conversation_external_tail_lookup WHERE lookup_key = ?1",
fparams![lookup_key.as_str()],
)
.unwrap();
// Re-insert the same external id with four messages; only indices 2 and 3
// should be appended, and to the same conversation.
let appended = make_profiled_storage_remote_conversation(88, 4);
let second = storage
.insert_conversation_tree(agent_id, Some(workspace_id), &appended)
.unwrap();
assert_eq!(second.conversation_id, first.conversation_id);
assert_eq!(second.inserted_indices, vec![2, 3]);
// There must still be exactly one conversation row for this external id.
let conversation_count: i64 = storage
.conn
.query_row_map(
"SELECT COUNT(*)
FROM conversations
WHERE source_id = ?1 AND agent_id = ?2 AND external_id = ?3",
fparams![initial.source_id.as_str(), agent_id, external_id],
|row| row.get_typed(0),
)
.unwrap();
// The lookup row must be back after the second insert.
let restored_lookup: (i64, Option<i64>, Option<i64>, Option<i64>) = storage
.conn
.query_row_map(
"SELECT conversation_id, ended_at, last_message_idx, last_message_created_at
FROM conversation_external_tail_lookup
WHERE lookup_key = ?1",
fparams![lookup_key.as_str()],
|row| {
Ok((
row.get_typed(0)?,
row.get_typed(1)?,
row.get_typed(2)?,
row.get_typed(3)?,
))
},
)
.unwrap();
let tail_state: (Option<i64>, Option<i64>, Option<i64>) = storage
.conn
.query_row_map(
"SELECT ended_at, last_message_idx, last_message_created_at
FROM conversation_tail_state
WHERE conversation_id = ?1",
fparams![first.conversation_id],
|row| Ok((row.get_typed(0)?, row.get_typed(1)?, row.get_typed(2)?)),
)
.unwrap();
assert_eq!(conversation_count, 1);
// The restored lookup row must mirror conversation_tail_state exactly.
assert_eq!(
restored_lookup,
(
first.conversation_id,
tail_state.0,
tail_state.1,
tail_state.2
)
);
// Tail state must describe the last appended message (index 3); its
// created_at is expected for both ended_at and last_message_created_at.
assert_eq!(
tail_state,
(
appended.messages[3].created_at,
Some(3),
appended.messages[3].created_at
)
);
}
/// Clears `daily_stats` after a first insert and checks that a second insert
/// recreates the rows: four rows in total, with the `all`/`all` aggregate
/// reflecting only the second conversation (1 session, 2 messages).
#[test]
fn insert_conversation_tree_recreates_daily_stats_after_manual_clear() {
    let tmp = TempDir::new().unwrap();
    let storage = SqliteStorage::open(&tmp.path().join("test.db")).unwrap();
    let codex = Agent {
        id: None,
        slug: "codex".into(),
        name: "Codex".into(),
        version: None,
        kind: AgentKind::Cli,
    };
    let agent_id = storage.ensure_agent(&codex).unwrap();
    let workspace_id = storage
        .ensure_workspace(&PathBuf::from("/ws/profiled-storage-remote"), None)
        .unwrap();

    // First conversation populates daily_stats; the table is then emptied.
    let seeded = make_profiled_storage_remote_conversation(0, 3);
    storage
        .insert_conversation_tree(agent_id, Some(workspace_id), &seeded)
        .unwrap();
    storage.conn.execute("DELETE FROM daily_stats").unwrap();

    // Second conversation must bring the stats rows back.
    let after_clear = make_profiled_storage_remote_conversation(1, 2);
    storage
        .insert_conversation_tree(agent_id, Some(workspace_id), &after_clear)
        .unwrap();

    let row_count: i64 = storage
        .conn
        .query_row_map("SELECT COUNT(*) FROM daily_stats", fparams![], |row| {
            row.get_typed(0)
        })
        .unwrap();
    let aggregate: (i64, i64) = storage
        .conn
        .query_row_map(
            "SELECT session_count, message_count
FROM daily_stats
WHERE agent_slug = 'all' AND source_id = 'all'",
            fparams![],
            |row| Ok((row.get_typed(0)?, row.get_typed(1)?)),
        )
        .unwrap();
    let (session_count, message_count) = aggregate;

    assert_eq!(row_count, 4);
    assert_eq!(session_count, 1);
    assert_eq!(message_count, 2);
}
/// Profiles repeated inserts of brand-new remote conversations at three
/// message-count sizes and checks the profile's counters, plus the invariant
/// that the summed per-stage durations never exceed the total duration.
#[test]
#[serial]
fn insert_conversation_tree_stage_profile_tracks_steady_state_remote_reuse() {
    // NOTE(review): presumably the returned guard restores the variable when
    // dropped — confirm against `set_env_var`.
    let _defer_guard = set_env_var("CASS_DEFER_LEXICAL_UPDATES", "0");

    // (messages per conversation, number of profiled inserts)
    let scenarios = [(5usize, 80usize), (20, 50), (50, 24)];
    for (msg_count, iterations) in scenarios.iter().copied() {
        let tmp = TempDir::new().unwrap();
        let storage =
            SqliteStorage::open(&tmp.path().join(format!("profile-{msg_count}.db"))).unwrap();
        let codex = Agent {
            id: None,
            slug: "codex".into(),
            name: "Codex".into(),
            version: None,
            kind: AgentKind::Cli,
        };
        let agent_id = storage.ensure_agent(&codex).unwrap();
        let workspace_id = storage
            .ensure_workspace(&PathBuf::from("/ws/profiled-storage-remote"), None)
            .unwrap();

        // One unprofiled insert first, so the profiled loop starts from a
        // database that already holds a conversation.
        let warmup = make_profiled_storage_remote_conversation(0, msg_count);
        storage
            .insert_conversation_tree(agent_id, Some(workspace_id), &warmup)
            .unwrap();

        let mut profile = InsertConversationTreePerfProfile::default();
        for seq in 1..=iterations {
            let conv = make_profiled_storage_remote_conversation(seq as i64, msg_count);
            storage
                .insert_conversation_tree_with_profile(
                    agent_id,
                    Some(workspace_id),
                    &conv,
                    &mut profile,
                )
                .unwrap();
        }

        // Add the stage timings in the same left-to-right order as a plain
        // `a + b + c + ...` chain.
        let mut accounted_duration = profile.source_duration;
        for stage in [
            profile.tx_open_duration,
            profile.existing_lookup_duration,
            profile.conversation_row_duration,
            profile.message_insert_duration,
            profile.snippet_insert_duration,
            profile.fts_entry_duration,
            profile.fts_flush_duration,
            profile.analytics_duration,
            profile.commit_duration,
        ] {
            accounted_duration = accounted_duration + stage;
        }

        let expected_messages = iterations * msg_count;
        assert_eq!(profile.invocations, iterations);
        assert_eq!(profile.messages, expected_messages);
        assert_eq!(profile.inserted_messages, expected_messages);
        assert!(
            profile.total_duration >= accounted_duration,
            "accounted stage durations cannot exceed total duration"
        );
        profile.log_summary(&format!("remote_reuse_{msg_count}_msgs"));
    }
}
/// Profiles appends onto existing remote conversations: each conversation is
/// first inserted with `msg_count` messages, then re-submitted with twice as
/// many, so only half of the submitted messages should be newly inserted.
/// Also checks that the summed per-stage durations never exceed the total.
#[test]
#[serial]
fn insert_conversation_tree_stage_profile_tracks_append_remote_source_merge() {
    // NOTE(review): presumably the returned guard restores the variable when
    // dropped — confirm against `set_env_var`.
    let _defer_guard = set_env_var("CASS_DEFER_LEXICAL_UPDATES", "0");

    // (initial messages per conversation, number of conversations)
    let scenarios = [(5usize, 80usize), (20, 50), (50, 24)];
    for (msg_count, iterations) in scenarios.iter().copied() {
        let tmp = TempDir::new().unwrap();
        let storage =
            SqliteStorage::open(&tmp.path().join(format!("append-profile-{msg_count}.db")))
                .unwrap();
        let codex = Agent {
            id: None,
            slug: "codex".into(),
            name: "Codex".into(),
            version: None,
            kind: AgentKind::Cli,
        };
        let agent_id = storage.ensure_agent(&codex).unwrap();
        let workspace_id = storage
            .ensure_workspace(&PathBuf::from("/ws/profiled-append-remote"), None)
            .unwrap();

        // Unprofiled seeding pass: every conversation at its initial length.
        for seq in 0..iterations {
            let seeded = make_profiled_append_remote_merge_conversation(seq as i64, msg_count);
            storage
                .insert_conversation_tree(agent_id, Some(workspace_id), &seeded)
                .unwrap();
        }

        // Profiled pass: the same conversations again, now twice as long.
        let mut profile = InsertConversationTreePerfProfile::default();
        for seq in 0..iterations {
            let grown =
                make_profiled_append_remote_merge_conversation(seq as i64, msg_count * 2);
            storage
                .append_existing_conversation_with_profile(
                    agent_id,
                    Some(workspace_id),
                    &grown,
                    &mut profile,
                )
                .unwrap();
        }

        // Add the stage timings in the same left-to-right order as a plain
        // `a + b + c + ...` chain.
        let mut accounted_duration = profile.source_duration;
        for stage in [
            profile.tx_open_duration,
            profile.existing_lookup_duration,
            profile.existing_idx_lookup_duration,
            profile.existing_replay_lookup_duration,
            profile.dedupe_filter_duration,
            profile.conversation_row_duration,
            profile.message_insert_duration,
            profile.snippet_insert_duration,
            profile.fts_entry_duration,
            profile.fts_flush_duration,
            profile.analytics_duration,
            profile.commit_duration,
        ] {
            accounted_duration = accounted_duration + stage;
        }

        let newly_inserted = iterations * msg_count;
        assert_eq!(profile.invocations, iterations);
        assert_eq!(profile.messages, newly_inserted * 2);
        assert_eq!(profile.inserted_messages, newly_inserted);
        assert!(
            profile.total_duration >= accounted_duration,
            "accounted append stage durations cannot exceed total duration"
        );
        profile.log_summary(&format!("append_remote_merge_{msg_count}_msgs"));
    }
}
/// Seeds two agents, two conversations, three messages and one
/// `message_metrics` row per message using raw SQL, wipes `daily_stats`, and
/// verifies that `rebuild_daily_stats` restores the session and message totals.
#[test]
fn rebuild_daily_stats_recomputes_materialized_totals_without_monolithic_group_by() {
    let tmp = TempDir::new().unwrap();
    let db_file = tmp.path().join("test.db");
    let storage = SqliteStorage::open(&db_file).unwrap();
    let base_ms = 1_700_000_000_000_i64;
    let day_id = FrankenStorage::day_id_from_millis(base_ms);
    let hour_id = FrankenStorage::hour_id_from_millis(base_ms);

    // Agents: (id, slug, display name).
    for (agent_id, slug, display_name) in
        [(1_i64, "codex", "Codex"), (2_i64, "claude", "Claude")]
    {
        storage
            .conn
            .execute_compat(
                "INSERT INTO agents (id, slug, name, version, kind, created_at, updated_at)
VALUES (?1, ?2, ?3, NULL, ?4, 0, 0)",
                fparams![agent_id, slug, display_name, "cli"],
            )
            .unwrap();
    }

    // Conversations: one per agent, both starting at `base_ms`.
    for (conversation_id, agent_id, external_id, title, source_path, ended_at) in [
        (
            1_i64,
            1_i64,
            "daily-a",
            "Daily A",
            "/tmp/daily-a.jsonl",
            base_ms + 200,
        ),
        (
            2_i64,
            2_i64,
            "daily-b",
            "Daily B",
            "/tmp/daily-b.jsonl",
            base_ms + 300,
        ),
    ] {
        storage
            .conn
            .execute_compat(
                "INSERT INTO conversations (
id, agent_id, workspace_id, source_id, external_id, title, source_path,
started_at, ended_at, approx_tokens, metadata_json, origin_host, metadata_bin
) VALUES (?1, ?2, NULL, ?3, ?4, ?5, ?6, ?7, ?8, NULL, ?9, NULL, NULL)",
                fparams![
                    conversation_id,
                    agent_id,
                    LOCAL_SOURCE_ID,
                    external_id,
                    title,
                    source_path,
                    base_ms,
                    ended_at,
                    "{}"
                ],
            )
            .unwrap();
    }

    // Messages: two in conversation 1, one in conversation 2.
    for (message_id, conversation_id, idx, role, created_at, content) in [
        (1_i64, 1_i64, 0_i64, "user", base_ms, "hello"),
        (2_i64, 1_i64, 1_i64, "assistant", base_ms + 100, "response"),
        (3_i64, 2_i64, 0_i64, "user", base_ms + 50, "abc"),
    ] {
        storage
            .conn
            .execute_compat(
                "INSERT INTO messages (
id, conversation_id, idx, role, author, created_at, content, extra_json, extra_bin
) VALUES (?1, ?2, ?3, ?4, NULL, ?5, ?6, NULL, NULL)",
                fparams![message_id, conversation_id, idx, role, created_at, content],
            )
            .unwrap();
    }

    // One metrics row per message; `chars` mirrors the content length above.
    let metric_rows = [
        (1_i64, "codex", "user", 5_i64),
        (2_i64, "codex", "assistant", 8_i64),
        (3_i64, "claude", "user", 3_i64),
    ];
    for (message_id, slug, role, chars) in metric_rows {
        let token_estimate = chars / 4;
        storage
            .conn
            .execute_compat(
                "INSERT INTO message_metrics (
message_id, created_at_ms, hour_id, day_id, agent_slug, workspace_id, source_id,
role, content_chars, content_tokens_est, api_input_tokens, api_output_tokens,
api_cache_read_tokens, api_cache_creation_tokens, api_thinking_tokens,
api_service_tier, api_data_source, tool_call_count, has_tool_calls, has_plan,
model_name, model_family, model_tier, provider
) VALUES (
?1, ?2, ?3, ?4, ?5, ?6, ?7,
?8, ?9, ?10, ?11, ?12,
?13, ?14, ?15,
?16, ?17, ?18, ?19, ?20,
?21, ?22, ?23, ?24
)",
                fparams![
                    message_id,
                    base_ms,
                    hour_id,
                    day_id,
                    slug,
                    0_i64,
                    LOCAL_SOURCE_ID,
                    role,
                    chars,
                    token_estimate,
                    0_i64,
                    0_i64,
                    0_i64,
                    0_i64,
                    0_i64,
                    "",
                    "estimated",
                    0_i64,
                    0_i64,
                    0_i64,
                    "",
                    "unknown",
                    "unknown",
                    "unknown"
                ],
            )
            .unwrap();
    }

    // Start the rebuild from an empty materialized table.
    storage.conn.execute("DELETE FROM daily_stats").unwrap();
    let rebuilt = storage.rebuild_daily_stats().unwrap();
    assert_eq!(rebuilt.total_sessions, 2);

    let health = storage.daily_stats_health().unwrap();
    assert_eq!(health.conversation_count, 2);
    assert_eq!(health.materialized_total, 2);
    assert_eq!(health.drift, 0);

    let rollup_messages: i64 = storage
        .conn
        .query_row_map(
            "SELECT message_count FROM daily_stats WHERE agent_slug = 'all' AND source_id = 'all'",
            fparams![],
            |row| row.get_typed(0),
        )
        .unwrap();
    assert_eq!(rollup_messages, 3);
}
/// Stores one multi-byte UTF-8 message together with a `message_metrics` row
/// whose `content_chars` is the content's byte length, then checks that the
/// `'all'/'all'` `total_chars` value equals that byte length both after the
/// inline stats update and after a full `rebuild_daily_stats`.
#[test]
fn rebuild_daily_stats_preserves_byte_counts_with_message_metrics() {
    let tmp = TempDir::new().unwrap();
    let db_file = tmp.path().join("test.db");
    let storage = SqliteStorage::open(&db_file).unwrap();

    // `str::len` is the UTF-8 byte length, which differs from the character
    // count for this mixed-width text.
    let body = "ASCII🙂é漢字";
    let byte_len = body.len() as i64;
    let base_ms = 1_704_067_200_000_i64;
    let day_id = FrankenStorage::day_id_from_millis(base_ms);
    let hour_id = FrankenStorage::hour_id_from_millis(base_ms);

    storage
        .conn
        .execute_compat(
            "INSERT INTO agents (id, slug, name, version, kind, created_at, updated_at)
VALUES (?1, ?2, ?3, NULL, ?4, 0, 0)",
            fparams![1_i64, "tester", "Tester", "cli"],
        )
        .unwrap();

    storage
        .conn
        .execute_compat(
            "INSERT INTO conversations (
id, agent_id, workspace_id, source_id, external_id, title, source_path,
started_at, ended_at, approx_tokens, metadata_json, origin_host, metadata_bin
) VALUES (?1, ?2, NULL, ?3, ?4, ?5, ?6, ?7, NULL, NULL, ?8, NULL, NULL)",
            fparams![
                1_i64,
                1_i64,
                LOCAL_SOURCE_ID,
                "unicode-metrics",
                "Unicode Metrics",
                "/tmp/unicode-metrics.jsonl",
                base_ms,
                "{}"
            ],
        )
        .unwrap();

    storage
        .conn
        .execute_compat(
            "INSERT INTO messages (
id, conversation_id, idx, role, author, created_at, content, extra_json, extra_bin
) VALUES (?1, ?2, ?3, ?4, NULL, ?5, ?6, NULL, NULL)",
            fparams![1_i64, 1_i64, 0_i64, "user", base_ms, body],
        )
        .unwrap();

    let token_estimate = byte_len / 4;
    storage
        .conn
        .execute_compat(
            "INSERT INTO message_metrics (
message_id, created_at_ms, hour_id, day_id, agent_slug, workspace_id, source_id,
role, content_chars, content_tokens_est, api_input_tokens, api_output_tokens,
api_cache_read_tokens, api_cache_creation_tokens, api_thinking_tokens,
api_service_tier, api_data_source, tool_call_count, has_tool_calls, has_plan,
model_name, model_family, model_tier, provider
) VALUES (
?1, ?2, ?3, ?4, ?5, ?6, ?7,
?8, ?9, ?10, ?11, ?12,
?13, ?14, ?15,
?16, ?17, ?18, ?19, ?20,
?21, ?22, ?23, ?24
)",
            fparams![
                1_i64,
                base_ms,
                hour_id,
                day_id,
                "tester",
                0_i64,
                LOCAL_SOURCE_ID,
                "user",
                byte_len,
                token_estimate,
                0_i64,
                0_i64,
                0_i64,
                0_i64,
                0_i64,
                "",
                "estimated",
                0_i64,
                0_i64,
                0_i64,
                "",
                "unknown",
                "unknown",
                "unknown"
            ],
        )
        .unwrap();

    // Apply the same delta the inline write path would record for this message.
    let delta = StatsDelta {
        session_count_delta: 1,
        message_count_delta: 1,
        total_chars_delta: byte_len,
    };
    let mut tx = storage.conn.transaction().unwrap();
    franken_update_daily_stats_in_tx(
        &storage,
        &tx,
        "tester",
        LOCAL_SOURCE_ID,
        Some(base_ms),
        delta,
    )
    .unwrap();
    tx.commit().unwrap();

    // Reads the `total_chars` value of the all-agents / all-sources row.
    let read_rollup_total_chars = || -> i64 {
        storage
            .conn
            .query_row_map(
                "SELECT total_chars FROM daily_stats WHERE agent_slug = 'all' AND source_id = 'all'",
                fparams![],
                |row| row.get_typed(0),
            )
            .unwrap()
    };

    let inline_total = read_rollup_total_chars();
    assert_eq!(inline_total, byte_len);

    storage.conn.execute("DELETE FROM daily_stats").unwrap();
    let rebuilt = storage.rebuild_daily_stats().unwrap();
    assert_eq!(rebuilt.total_sessions, 1);

    let rebuilt_total = read_rollup_total_chars();
    assert_eq!(rebuilt_total, byte_len);
}
/// Stores one multi-byte UTF-8 message with no `message_metrics` row, clears
/// `daily_stats`, and checks that `rebuild_daily_stats` reports a `total_chars`
/// equal to the content's UTF-8 byte length.
///
/// NOTE(review): with no metrics rows present the rebuild presumably takes the
/// raw-messages fallback the test name refers to — confirm against
/// `rebuild_daily_stats`.
#[test]
fn rebuild_daily_stats_raw_fallback_preserves_byte_counts() {
    let tmp = TempDir::new().unwrap();
    let db_file = tmp.path().join("test.db");
    let storage = SqliteStorage::open(&db_file).unwrap();

    let body = "fallback🙂é漢字";
    // Byte length, not character count.
    let byte_len = body.len() as i64;
    let base_ms = 1_704_067_200_000_i64;

    storage
        .conn
        .execute_compat(
            "INSERT INTO agents (id, slug, name, version, kind, created_at, updated_at)
VALUES (?1, ?2, ?3, NULL, ?4, 0, 0)",
            fparams![1_i64, "tester", "Tester", "cli"],
        )
        .unwrap();

    storage
        .conn
        .execute_compat(
            "INSERT INTO conversations (
id, agent_id, workspace_id, source_id, external_id, title, source_path,
started_at, ended_at, approx_tokens, metadata_json, origin_host, metadata_bin
) VALUES (?1, ?2, NULL, ?3, ?4, ?5, ?6, ?7, NULL, NULL, ?8, NULL, NULL)",
            fparams![
                1_i64,
                1_i64,
                LOCAL_SOURCE_ID,
                "unicode-fallback",
                "Unicode Fallback",
                "/tmp/unicode-fallback.jsonl",
                base_ms,
                "{}"
            ],
        )
        .unwrap();

    storage
        .conn
        .execute_compat(
            "INSERT INTO messages (
id, conversation_id, idx, role, author, created_at, content, extra_json, extra_bin
) VALUES (?1, ?2, ?3, ?4, NULL, ?5, ?6, NULL, NULL)",
            fparams![1_i64, 1_i64, 0_i64, "assistant", base_ms, body],
        )
        .unwrap();

    let delta = StatsDelta {
        session_count_delta: 1,
        message_count_delta: 1,
        total_chars_delta: byte_len,
    };
    let mut tx = storage.conn.transaction().unwrap();
    franken_update_daily_stats_in_tx(
        &storage,
        &tx,
        "tester",
        LOCAL_SOURCE_ID,
        Some(base_ms),
        delta,
    )
    .unwrap();
    tx.commit().unwrap();

    // Discard the inline stats so the value below comes from the rebuild alone.
    storage.conn.execute("DELETE FROM daily_stats").unwrap();
    let rebuilt = storage.rebuild_daily_stats().unwrap();
    assert_eq!(rebuilt.total_sessions, 1);

    let rebuilt_total: i64 = storage
        .conn
        .query_row_map(
            "SELECT total_chars FROM daily_stats WHERE agent_slug = 'all' AND source_id = 'all'",
            fparams![],
            |row| row.get_typed(0),
        )
        .unwrap();
    assert_eq!(rebuilt_total, byte_len);
}
/// Submits one batch containing two conversations that share an external id,
/// the second extending the first by two messages, and checks that only one
/// conversation row exists afterwards (via the default plan, `NOT INDEXED`, and
/// `idx_conversations_source_id`, both before and after reopening the database)
/// with four messages in total.
#[test]
fn insert_conversations_batched_appends_duplicate_external_id() {
    use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
    use std::path::PathBuf;

    let tmp = TempDir::new().unwrap();
    let db_file = tmp.path().join("test.db");
    let storage = SqliteStorage::open(&db_file).unwrap();
    let agent_id = storage
        .ensure_agent(&Agent {
            id: None,
            slug: "codex".into(),
            name: "Codex".into(),
            version: Some("0.2.3".into()),
            kind: AgentKind::Cli,
        })
        .unwrap();

    // Message `idx` is stamped 100 ms after message `idx - 1`.
    let message = |idx: i64, role: MessageRole, content: &str| Message {
        id: None,
        idx,
        role,
        author: None,
        created_at: Some(1_700_000_000_000 + idx * 100),
        content: content.into(),
        extra_json: serde_json::Value::Null,
        snippets: Vec::new(),
    };
    let session = |messages: Vec<Message>| Conversation {
        id: None,
        agent_slug: "codex".into(),
        workspace: Some(PathBuf::from("/tmp/workspace")),
        external_id: Some("shared-session".into()),
        title: Some("Shared Session".into()),
        source_path: PathBuf::from("/tmp/rollout.jsonl"),
        started_at: Some(1_700_000_000_000),
        ended_at: Some(1_700_000_000_999),
        approx_tokens: None,
        metadata_json: serde_json::Value::Null,
        messages,
        source_id: "local".into(),
        origin_host: None,
    };

    let prefix = session(vec![
        message(0, MessageRole::User, "first"),
        message(1, MessageRole::Agent, "second"),
    ]);
    let extended = session(vec![
        message(0, MessageRole::User, "first"),
        message(1, MessageRole::Agent, "second"),
        message(2, MessageRole::User, "third"),
        message(3, MessageRole::Agent, "fourth"),
    ]);

    let outcomes = storage
        .insert_conversations_batched(&[(agent_id, None, &prefix), (agent_id, None, &extended)])
        .unwrap();
    assert_eq!(outcomes.len(), 2);
    assert_eq!(outcomes[0].inserted_indices, vec![0, 1]);
    assert_eq!(outcomes[1].inserted_indices, vec![2, 3]);
    assert_eq!(outcomes[0].conversation_id, outcomes[1].conversation_id);

    // Single-value and id-list readers shared by the checks below.
    let scalar = |db: &SqliteStorage, sql: &str| -> i64 {
        db.conn
            .query_row_map(sql, fparams![], |row| row.get_typed(0))
            .unwrap()
    };
    let id_list = |db: &SqliteStorage, sql: &str| -> Vec<i64> {
        db.conn
            .query_map_collect(sql, fparams![], |row| row.get_typed(0))
            .unwrap()
    };

    // Same connection that performed the write.
    let conversation_count = scalar(&storage, "SELECT COUNT(*) FROM conversations");
    let conversation_count_not_indexed =
        scalar(&storage, "SELECT COUNT(*) FROM conversations NOT INDEXED");
    let conversation_count_source_index = scalar(
        &storage,
        "SELECT COUNT(*) FROM conversations INDEXED BY idx_conversations_source_id",
    );
    let message_count = scalar(&storage, "SELECT COUNT(*) FROM messages");

    // Fresh handle on the same file.
    let reopened = SqliteStorage::open(&db_file).unwrap();
    let reopened_count = scalar(&reopened, "SELECT COUNT(*) FROM conversations");
    let reopened_count_not_indexed =
        scalar(&reopened, "SELECT COUNT(*) FROM conversations NOT INDEXED");
    let reopened_ids = id_list(&reopened, "SELECT id FROM conversations ORDER BY id");
    let reopened_ids_not_indexed = id_list(
        &reopened,
        "SELECT id FROM conversations NOT INDEXED ORDER BY id",
    );
    let reopened_ids_source_index = id_list(
        &reopened,
        "SELECT id FROM conversations INDEXED BY idx_conversations_source_id ORDER BY id",
    );

    assert_eq!(reopened_ids, vec![outcomes[0].conversation_id]);
    assert_eq!(reopened_ids_not_indexed, vec![outcomes[0].conversation_id]);
    assert_eq!(reopened_ids_source_index, vec![outcomes[0].conversation_id]);
    assert_eq!(reopened_count, 1);
    assert_eq!(reopened_count_not_indexed, 1);
    assert_eq!(conversation_count_not_indexed, 1);
    assert_eq!(conversation_count_source_index, 1);
    assert_eq!(conversation_count, 1);
    assert_eq!(message_count, 4);
}
/// Inserts a conversation inside a transaction, then calls
/// `franken_insert_conversation_or_get_existing_after_miss` for the same
/// conversation in that transaction and expects it to report the already
/// inserted row instead of creating a second one.
#[test]
fn franken_insert_conversation_or_get_existing_recovers_unique_conflict() {
    use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
    use std::path::PathBuf;

    let tmp = TempDir::new().unwrap();
    let db_file = tmp.path().join("test.db");
    let storage = SqliteStorage::open(&db_file).unwrap();
    let agent_id = storage
        .ensure_agent(&Agent {
            id: None,
            slug: "codex".into(),
            name: "Codex".into(),
            version: Some("0.2.3".into()),
            kind: AgentKind::Cli,
        })
        .unwrap();

    let only_message = Message {
        id: None,
        idx: 0,
        role: MessageRole::User,
        author: None,
        created_at: Some(1_700_000_000_000),
        content: "hello".into(),
        extra_json: serde_json::Value::Null,
        snippets: Vec::new(),
    };
    let conv = Conversation {
        id: None,
        agent_slug: "codex".into(),
        workspace: Some(PathBuf::from("/tmp/workspace")),
        external_id: Some("recover-duplicate".into()),
        title: Some("Recover Duplicate".into()),
        source_path: PathBuf::from("/tmp/rollout.jsonl"),
        started_at: Some(1_700_000_000_000),
        ended_at: Some(1_700_000_000_100),
        approx_tokens: None,
        metadata_json: serde_json::Value::Null,
        messages: vec![only_message],
        source_id: "local".into(),
        origin_host: None,
    };

    // Everything below runs in one transaction, which is dropped without an
    // explicit commit at the end of the test.
    let tx = storage.conn.transaction().unwrap();
    let inserted_id = franken_insert_conversation(&tx, agent_id, None, &conv)
        .unwrap()
        .expect("first insert should succeed");

    let merge_key = conversation_merge_key(agent_id, &conv);
    let status = franken_insert_conversation_or_get_existing_after_miss(
        &tx, agent_id, None, &conv, &merge_key,
    )
    .unwrap();

    let existing_id = match status {
        ConversationInsertStatus::Existing(id) => id,
        ConversationInsertStatus::Inserted(new_id) => {
            panic!("expected existing conversation id, got freshly inserted {new_id}")
        }
    };
    assert_eq!(existing_id, inserted_id);

    let rows: i64 = tx
        .query_row_map("SELECT COUNT(*) FROM conversations", fparams![], |row| {
            row.get_typed(0)
        })
        .unwrap();
    assert_eq!(rows, 1);
}
/// Batches two conversations with the same external id where the first holds
/// only indices 2 and 3 and the second holds 0, 1 and 3. Expects the second to
/// contribute just the missing indices 0 and 1, leaving rows 0..=3 stored under
/// one conversation id.
#[test]
fn insert_conversations_batched_merges_duplicate_external_id_with_gaps() {
    use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
    use std::path::PathBuf;

    let tmp = TempDir::new().unwrap();
    let db_file = tmp.path().join("test.db");
    let storage = SqliteStorage::open(&db_file).unwrap();
    let agent_id = storage
        .ensure_agent(&Agent {
            id: None,
            slug: "codex".into(),
            name: "Codex".into(),
            version: Some("0.2.3".into()),
            kind: AgentKind::Cli,
        })
        .unwrap();

    // Message `idx` is stamped `idx * 100` ms after the session start.
    let message = |idx: i64, role: MessageRole, content: &str| Message {
        id: None,
        idx,
        role,
        author: None,
        created_at: Some(1_700_000_000_000 + idx * 100),
        content: content.into(),
        extra_json: serde_json::Value::Null,
        snippets: Vec::new(),
    };
    let session = |messages: Vec<Message>| Conversation {
        id: None,
        agent_slug: "codex".into(),
        workspace: Some(PathBuf::from("/tmp/workspace")),
        external_id: Some("shared-session-gap".into()),
        title: Some("Shared Session Gap".into()),
        source_path: PathBuf::from("/tmp/rollout.jsonl"),
        started_at: Some(1_700_000_000_000),
        ended_at: Some(1_700_000_000_999),
        approx_tokens: None,
        metadata_json: serde_json::Value::Null,
        messages,
        source_id: "local".into(),
        origin_host: None,
    };

    // The tail arrives first; the head (plus an overlapping idx 3) second.
    let tail_only = session(vec![
        message(2, MessageRole::User, "third"),
        message(3, MessageRole::Agent, "fourth"),
    ]);
    let head_with_overlap = session(vec![
        message(0, MessageRole::User, "first"),
        message(1, MessageRole::Agent, "second"),
        message(3, MessageRole::Agent, "fourth"),
    ]);

    let outcomes = storage
        .insert_conversations_batched(&[
            (agent_id, None, &tail_only),
            (agent_id, None, &head_with_overlap),
        ])
        .unwrap();
    assert_eq!(outcomes.len(), 2);
    assert_eq!(outcomes[0].inserted_indices, vec![2, 3]);
    assert_eq!(outcomes[1].inserted_indices, vec![0, 1]);
    assert_eq!(outcomes[0].conversation_id, outcomes[1].conversation_id);

    let persisted: Vec<i64> = storage
        .conn
        .query_map_collect("SELECT idx FROM messages ORDER BY idx", fparams![], |row| {
            row.get_typed(0)
        })
        .unwrap();
    assert_eq!(persisted, vec![0, 1, 2, 3]);
}
/// Stores a canonical conversation with messages at idx 0 and idx 20, then
/// batches two follow-ups for the same external id: an exact copy of idx 0 and
/// a differently-worded idx 20. Neither may insert anything, and the stored
/// content must remain the canonical text.
#[test]
fn insert_conversations_batched_refreshes_partial_pending_message_lookup() {
    use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
    use std::path::PathBuf;

    let tmp = TempDir::new().unwrap();
    let db_file = tmp.path().join("test.db");
    let storage = SqliteStorage::open(&db_file).unwrap();
    let agent_id = storage
        .ensure_agent(&Agent {
            id: None,
            slug: "codex".into(),
            name: "Codex".into(),
            version: Some("0.2.3".into()),
            kind: AgentKind::Cli,
        })
        .unwrap();

    // Index 0 is the user turn; every other index is an agent turn.
    let message_at = |idx: i64, content: &str| {
        let role = match idx {
            0 => MessageRole::User,
            _ => MessageRole::Agent,
        };
        Message {
            id: None,
            idx,
            role,
            author: None,
            created_at: Some(1_700_000_000_000 + idx),
            content: content.into(),
            extra_json: serde_json::Value::Null,
            snippets: Vec::new(),
        }
    };
    let session = |messages: Vec<Message>| Conversation {
        id: None,
        agent_slug: "codex".into(),
        workspace: Some(PathBuf::from("/tmp/workspace")),
        external_id: Some("partial-cache-session".into()),
        title: Some("Partial cache session".into()),
        source_path: PathBuf::from("/tmp/partial-cache.jsonl"),
        started_at: Some(1_700_000_000_000),
        ended_at: Some(1_700_000_000_100),
        approx_tokens: None,
        metadata_json: serde_json::Value::Null,
        messages,
        source_id: "local".into(),
        origin_host: None,
    };

    let canonical = session(vec![
        message_at(0, "canonical zero"),
        message_at(20, "canonical twenty"),
    ]);
    storage
        .insert_conversation_tree(agent_id, None, &canonical)
        .unwrap();

    let exact_prefix = session(vec![message_at(0, "canonical zero")]);
    let conflicting_tail = session(vec![message_at(20, "conflicting twenty")]);
    let batch = [
        (agent_id, None, &exact_prefix),
        (agent_id, None, &conflicting_tail),
    ];
    let outcomes = storage.insert_conversations_batched(&batch).unwrap();

    assert_eq!(outcomes.len(), 2);
    assert!(outcomes[0].inserted_indices.is_empty());
    assert!(
        outcomes[1].inserted_indices.is_empty(),
        "the second batch item must refresh the partial pending lookup and retain the canonical idx=20 row"
    );

    let persisted: Vec<(i64, String)> = storage
        .conn
        .query_map_collect(
            "SELECT idx, content FROM messages ORDER BY idx",
            fparams![],
            |row| Ok((row.get_typed(0)?, row.get_typed(1)?)),
        )
        .unwrap();
    let expected = vec![
        (0, "canonical zero".to_string()),
        (20, "canonical twenty".to_string()),
    ];
    assert_eq!(persisted, expected);
}
/// Inserts the same 64-message conversation twice through
/// `insert_conversations_batched` and checks that the second pass inserts
/// nothing, reuses the conversation id, and leaves the row counts unchanged.
#[test]
fn insert_conversations_batched_reprocessing_conversation_is_idempotent() {
    use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
    use std::path::PathBuf;
    const MESSAGE_COUNT: i64 = 64;

    let tmp = TempDir::new().unwrap();
    let db_file = tmp.path().join("test.db");
    let storage = SqliteStorage::open(&db_file).unwrap();
    let agent_id = storage
        .ensure_agent(&Agent {
            id: None,
            slug: "codex".into(),
            name: "Codex".into(),
            version: Some("0.2.3".into()),
            kind: AgentKind::Cli,
        })
        .unwrap();

    // Alternate user / agent turns, one millisecond apart.
    let mut messages: Vec<Message> = Vec::new();
    for idx in 0..MESSAGE_COUNT {
        let role = if idx % 2 == 0 {
            MessageRole::User
        } else {
            MessageRole::Agent
        };
        messages.push(Message {
            id: None,
            idx,
            role,
            author: None,
            created_at: Some(1_700_000_000_000 + idx),
            content: format!("message {idx}"),
            extra_json: serde_json::Value::Null,
            snippets: Vec::new(),
        });
    }

    let conversation = Conversation {
        id: None,
        agent_slug: "codex".into(),
        workspace: Some(PathBuf::from("/tmp/workspace")),
        external_id: Some("large-reprocess-session".into()),
        title: Some("Large Reprocess Session".into()),
        source_path: PathBuf::from("/tmp/large-reprocess-session.jsonl"),
        started_at: Some(1_700_000_000_000),
        ended_at: Some(1_700_000_000_000 + MESSAGE_COUNT - 1),
        approx_tokens: None,
        metadata_json: serde_json::Value::Null,
        messages,
        source_id: "local".into(),
        origin_host: None,
    };

    let batch = [(agent_id, None, &conversation)];
    let first = storage.insert_conversations_batched(&batch).unwrap();
    let second = storage.insert_conversations_batched(&batch).unwrap();

    assert_eq!(first.len(), 1);
    assert_eq!(second.len(), 1);
    assert_eq!(first[0].inserted_indices.len(), MESSAGE_COUNT as usize);
    assert!(
        second[0].inserted_indices.is_empty(),
        "full reprocessing of a large conversation must not attempt duplicate idx inserts"
    );
    assert_eq!(first[0].conversation_id, second[0].conversation_id);

    let count_rows = |sql: &str| -> i64 {
        storage
            .conn
            .query_row_map(sql, fparams![], |row| row.get_typed(0))
            .unwrap()
    };
    let conversation_rows = count_rows("SELECT COUNT(*) FROM conversations");
    let message_rows = count_rows("SELECT COUNT(*) FROM messages");
    assert_eq!(conversation_rows, 1);
    assert_eq!(message_rows, MESSAGE_COUNT);
}
/// Inserts ten conversations with distinct external ids from rayon workers,
/// each worker using its own writer connection to the same database file, and
/// checks that every conversation receives its own row and id with all three
/// of its messages inserted.
#[test]
fn parallel_insert_conversation_tree_keeps_unique_external_ids_distinct() {
use crate::connectors::{NormalizedConversation, NormalizedMessage};
use crate::indexer::persist::map_to_internal;
use crate::model::types::{Agent, AgentKind};
use frankensqlite::compat::{ConnectionExt, RowExt};
use rand::RngExt;
use rayon::prelude::*;
// Returns true when `err` itself, or failing that its root cause, is a
// `FrankenError` of one of the busy / conflict / serialization variants
// listed below.
fn retryable_franken_error(err: &anyhow::Error) -> bool {
err.downcast_ref::<frankensqlite::FrankenError>()
.or_else(|| {
err.root_cause()
.downcast_ref::<frankensqlite::FrankenError>()
})
.is_some_and(|inner| {
matches!(
inner,
frankensqlite::FrankenError::Busy
| frankensqlite::FrankenError::BusyRecovery
| frankensqlite::FrankenError::BusySnapshot { .. }
| frankensqlite::FrankenError::WriteConflict { .. }
| frankensqlite::FrankenError::SerializationFailure { .. }
)
})
}
// Calls `f` up to 25 times (attempts 0..=24). After a retryable failure it
// sleeps for `backoff_ms` plus a random jitter in `0..=backoff_ms`, then
// doubles `backoff_ms` (starting at 4, capped at 512). A non-retryable
// error, or a retryable one on the last attempt, is returned to the caller.
fn with_retry<F, T>(mut f: F) -> anyhow::Result<T>
where
F: FnMut() -> anyhow::Result<T>,
{
let mut rng = rand::rng();
let mut backoff_ms = 4_u64;
for attempt in 0..=24 {
match f() {
Ok(value) => return Ok(value),
Err(err) if attempt < 24 && retryable_franken_error(&err) => {
let sleep_ms = backoff_ms + rng.random_range(0..=backoff_ms);
std::thread::sleep(Duration::from_millis(sleep_ms));
backoff_ms = (backoff_ms * 2).min(512);
}
Err(err) => return Err(err),
}
}
// Every iteration either returns or, for attempt < 24, continues; attempt 24
// always returns, so control never falls out of the loop.
unreachable!("retry loop must return on success or final failure")
}
let dir = TempDir::new().unwrap();
let db_path = dir.path().join("parallel_insert_conversation_tree.db");
// Open and immediately drop one handle before any worker starts.
// NOTE(review): presumably this creates the file and schema up front so the
// workers do not race on initialization — confirm against `FrankenStorage::open`.
let seed = FrankenStorage::open(&db_path).unwrap();
drop(seed);
// Ten conversations (`conv-0`..`conv-9`) spread over three agent slugs, each
// with its own workspace path, source path and three messages.
let conversations: Vec<NormalizedConversation> = (0..10)
.map(|i| NormalizedConversation {
agent_slug: format!("agent-{}", i % 3),
external_id: Some(format!("conv-{i}")),
title: Some(format!("Conversation {i}")),
workspace: Some(PathBuf::from(format!("/ws/{i}"))),
source_path: PathBuf::from(format!("/log/{i}.jsonl")),
started_at: Some(1_000 + i * 100),
ended_at: Some(1_000 + i * 100 + 50),
metadata: serde_json::json!({}),
messages: (0..3)
.map(|j| NormalizedMessage {
idx: j,
role: if j % 2 == 0 { "user" } else { "assistant" }.to_string(),
author: Some("tester".into()),
created_at: Some(1_000 + i * 100 + j * 10),
content: format!("parallel-distinct-test conv={i} msg={j}"),
extra: serde_json::json!({}),
snippets: vec![],
invocations: Vec::new(),
})
.collect(),
})
.collect();
// Each parallel chunk of up to three conversations opens its own writer and
// records (external_id, conversation_id, inserted_indices) per conversation.
let mut outcomes: Vec<(String, i64, Vec<i64>)> = conversations
.par_chunks(3)
.map(|chunk| {
let storage = FrankenStorage::open_writer(&db_path).unwrap();
// Per-chunk id caches. They are filled inside the retried closure only after
// the corresponding `ensure_*` call has returned `Ok`.
let mut agent_cache: HashMap<String, i64> = HashMap::new();
let mut workspace_cache: HashMap<PathBuf, i64> = HashMap::new();
let mut chunk_outcomes = Vec::with_capacity(chunk.len());
for conv in chunk {
let agent_slug = conv.agent_slug.clone();
let workspace = conv.workspace.clone();
let external_id = conv.external_id.clone().expect("external id");
let internal = map_to_internal(conv);
// Agent lookup, workspace lookup and the tree insert are retried together
// as one unit.
let outcome = with_retry(|| {
let agent_id = if let Some(id) = agent_cache.get(&agent_slug) {
*id
} else {
let agent = Agent {
id: None,
slug: agent_slug.clone(),
name: agent_slug.clone(),
version: None,
kind: AgentKind::Cli,
};
let id = storage.ensure_agent(&agent)?;
agent_cache.insert(agent_slug.clone(), id);
id
};
let workspace_id = if let Some(path) = &workspace {
if let Some(id) = workspace_cache.get(path) {
Some(*id)
} else {
let id = storage.ensure_workspace(path, None)?;
workspace_cache.insert(path.clone(), id);
Some(id)
}
} else {
None
};
storage.insert_conversation_tree(agent_id, workspace_id, &internal)
})
.unwrap();
chunk_outcomes.push((
external_id,
outcome.conversation_id,
outcome.inserted_indices,
));
}
storage.close().unwrap();
chunk_outcomes
})
.flatten()
.collect();
// Sort by external id so the failure output below is in a stable order.
outcomes.sort_by(|left, right| left.0.cmp(&right.0));
// Every conversation must have inserted all three of its own messages.
assert!(
outcomes
.iter()
.all(|(_, _, inserted_indices)| inserted_indices == &vec![0, 1, 2]),
"unique external ids must not be routed through the existing-conversation merge path: {outcomes:?}"
);
let distinct_ids: HashSet<i64> = outcomes
.iter()
.map(|(_, conversation_id, _)| *conversation_id)
.collect();
assert_eq!(
distinct_ids.len(),
conversations.len(),
"unique external ids must produce distinct conversation ids: {outcomes:?}"
);
// Re-read through a fresh handle after all writers have been closed.
let reader = FrankenStorage::open(&db_path).unwrap();
let stored_rows: Vec<(i64, String)> = reader
.raw()
.query_map_collect(
"SELECT id, external_id FROM conversations ORDER BY id",
&[],
|row| Ok((row.get_typed(0)?, row.get_typed(1)?)),
)
.unwrap();
let stored_count: i64 = reader
.raw()
.query_row_map("SELECT COUNT(*) FROM conversations", &[], |row| {
row.get_typed(0)
})
.unwrap();
assert_eq!(
stored_count as usize,
conversations.len(),
"parallel distinct inserts must persist one row per external id; rows={stored_rows:?}; outcomes={outcomes:?}"
);
assert_eq!(
stored_rows.len(),
conversations.len(),
"parallel distinct inserts must remain visible after reopening; rows={stored_rows:?}; outcomes={outcomes:?}"
);
}
/// Calls `insert_conversation_tree` twice for the same external id, first with
/// indices 2 and 3 and then with 0, 1 and 3, both converted through
/// `map_to_internal`. Expects the second call to add only the missing indices
/// 0 and 1, leaving rows 0..=3 under one conversation id.
#[test]
fn insert_conversation_tree_merges_duplicate_external_id_with_gaps() {
    use crate::connectors::{NormalizedConversation, NormalizedMessage};
    use crate::indexer::persist::map_to_internal;
    use crate::model::types::{Agent, AgentKind};
    use std::path::PathBuf;

    let tmp = TempDir::new().unwrap();
    let db_file = tmp.path().join("test.db");
    let storage = SqliteStorage::open(&db_file).unwrap();
    let agent_id = storage
        .ensure_agent(&Agent {
            id: None,
            slug: "codex".into(),
            name: "Codex".into(),
            version: Some("0.2.3".into()),
            kind: AgentKind::Cli,
        })
        .unwrap();

    let message = |idx, role: &str, created_at, content: &str| NormalizedMessage {
        idx,
        role: role.into(),
        author: None,
        created_at: Some(created_at),
        content: content.into(),
        extra: serde_json::Value::Null,
        snippets: Vec::new(),
        invocations: Vec::new(),
    };
    let session = |messages: Vec<NormalizedMessage>| NormalizedConversation {
        agent_slug: "codex".into(),
        workspace: Some(PathBuf::from("/tmp/workspace")),
        external_id: Some("tree-gap-session".into()),
        title: Some("Tree Gap Session".into()),
        source_path: PathBuf::from("/tmp/tree.jsonl"),
        started_at: Some(1_700_000_000_000),
        ended_at: Some(1_700_000_000_999),
        metadata: serde_json::Value::Null,
        messages,
    };

    // The tail arrives first; the head (plus an overlapping idx 3) second.
    let tail_only = map_to_internal(&session(vec![
        message(2, "user", 1_700_000_000_200, "third"),
        message(3, "assistant", 1_700_000_000_300, "fourth"),
    ]));
    let head_with_overlap = map_to_internal(&session(vec![
        message(0, "user", 1_700_000_000_000, "first"),
        message(1, "assistant", 1_700_000_000_100, "second"),
        message(3, "assistant", 1_700_000_000_300, "fourth"),
    ]));

    let first = storage
        .insert_conversation_tree(agent_id, None, &tail_only)
        .unwrap();
    let second = storage
        .insert_conversation_tree(agent_id, None, &head_with_overlap)
        .unwrap();
    assert_eq!(first.inserted_indices, vec![2, 3]);
    assert_eq!(second.inserted_indices, vec![0, 1]);
    assert_eq!(first.conversation_id, second.conversation_id);

    let persisted: Vec<i64> = storage
        .conn
        .query_map_collect("SELECT idx FROM messages ORDER BY idx", fparams![], |row| {
            row.get_typed(0)
        })
        .unwrap();
    assert_eq!(persisted, vec![0, 1, 2, 3]);
}
/// Inserts a brand-new conversation whose message list repeats idx 0 and checks
/// that the first idx-0 message is the one stored, the repeat is dropped, and
/// idx 1 is stored as usual.
#[test]
fn insert_conversation_tree_skips_duplicate_message_indices_for_new_conversation() {
    use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
    use std::path::PathBuf;

    let tmp = TempDir::new().unwrap();
    let db_file = tmp.path().join("test.db");
    let storage = SqliteStorage::open(&db_file).unwrap();
    let agent_id = storage
        .ensure_agent(&Agent {
            id: None,
            slug: "codex".into(),
            name: "Codex".into(),
            version: Some("0.2.3".into()),
            kind: AgentKind::Cli,
        })
        .unwrap();

    let message = |idx: i64, role: MessageRole, created_at: i64, content: &str| Message {
        id: None,
        idx,
        role,
        author: None,
        created_at: Some(created_at),
        content: content.into(),
        extra_json: serde_json::Value::Null,
        snippets: Vec::new(),
    };

    // The second entry reuses idx 0 with a later timestamp and different text.
    let messages = vec![
        message(0, MessageRole::User, 1_700_000_000_000, "first canonical"),
        message(
            0,
            MessageRole::User,
            1_700_000_000_001,
            "duplicate idx should be skipped",
        ),
        message(1, MessageRole::Agent, 1_700_000_000_100, "second"),
    ];
    let conversation = Conversation {
        id: None,
        agent_slug: "codex".into(),
        workspace: Some(PathBuf::from("/tmp/workspace")),
        external_id: Some("duplicate-new-session".into()),
        title: Some("Duplicate New Session".into()),
        source_path: PathBuf::from("/tmp/duplicate-new-session.jsonl"),
        started_at: Some(1_700_000_000_000),
        ended_at: Some(1_700_000_000_999),
        approx_tokens: None,
        metadata_json: serde_json::Value::Null,
        messages,
        source_id: "local".into(),
        origin_host: None,
    };

    let outcome = storage
        .insert_conversation_tree(agent_id, None, &conversation)
        .unwrap();
    assert_eq!(outcome.inserted_indices, vec![0, 1]);

    let persisted: Vec<(i64, String)> = storage
        .conn
        .query_map_collect(
            "SELECT idx, content FROM messages ORDER BY idx",
            fparams![],
            |row| Ok((row.get_typed(0)?, row.get_typed(1)?)),
        )
        .unwrap();
    let expected = vec![
        (0, "first canonical".to_string()),
        (1, "second".to_string()),
    ];
    assert_eq!(persisted, expected);
}
/// Two ingests that share a `source_path` and carry no `external_id` must
/// resolve to the same conversation row; the second ingest reports only the
/// message index it newly contributes.
#[test]
fn insert_conversation_tree_merges_duplicate_source_path_without_external_id() {
    use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
    use std::path::PathBuf;

    // Message fixture: only idx, role, timestamp and content vary in this test.
    fn message(idx: i64, role: MessageRole, created_at: i64, content: &str) -> Message {
        Message {
            id: None,
            idx,
            role,
            author: None,
            created_at: Some(created_at),
            content: content.into(),
            extra_json: serde_json::Value::Null,
            snippets: Vec::new(),
        }
    }

    // Conversation fixture: fixed source path, no external id.
    fn shared_path_conversation(messages: Vec<Message>) -> Conversation {
        Conversation {
            id: None,
            agent_slug: "codex".into(),
            workspace: Some(PathBuf::from("/tmp/workspace")),
            external_id: None,
            title: Some("Source Path Merge".into()),
            source_path: PathBuf::from("/tmp/shared-session.jsonl"),
            started_at: Some(1_700_000_000_000),
            ended_at: Some(1_700_000_000_999),
            approx_tokens: None,
            metadata_json: serde_json::Value::Null,
            messages,
            source_id: "local".into(),
            origin_host: None,
        }
    }

    let tmp = TempDir::new().unwrap();
    let db_file = tmp.path().join("test.db");
    let storage = SqliteStorage::open(&db_file).unwrap();
    let codex = Agent {
        id: None,
        slug: "codex".into(),
        name: "Codex".into(),
        version: Some("0.2.3".into()),
        kind: AgentKind::Cli,
    };
    let agent_id = storage.ensure_agent(&codex).unwrap();

    // The second batch repeats idx 1 verbatim and adds idx 2.
    let initial_batch = shared_path_conversation(vec![
        message(0, MessageRole::User, 1_700_000_000_000, "first"),
        message(1, MessageRole::Agent, 1_700_000_000_100, "second"),
    ]);
    let overlapping_batch = shared_path_conversation(vec![
        message(1, MessageRole::Agent, 1_700_000_000_100, "second"),
        message(2, MessageRole::User, 1_700_000_000_200, "third"),
    ]);

    let first = storage
        .insert_conversation_tree(agent_id, None, &initial_batch)
        .unwrap();
    let second = storage
        .insert_conversation_tree(agent_id, None, &overlapping_batch)
        .unwrap();

    assert_eq!(first.conversation_id, second.conversation_id);
    assert_eq!(first.inserted_indices, vec![0, 1]);
    assert_eq!(second.inserted_indices, vec![2]);

    let stored_indices: Vec<i64> = storage
        .conn
        .query_map_collect("SELECT idx FROM messages ORDER BY idx", fparams![], |row| {
            row.get_typed(0)
        })
        .unwrap();
    assert_eq!(stored_indices, vec![0, 1, 2]);
}
/// A re-ingest of the same `source_path` whose `started_at` differs by four
/// seconds, but which repeats an already stored message, must still merge
/// into the existing conversation and insert only the new index.
#[test]
fn insert_conversation_tree_merges_source_path_duplicates_with_start_drift() {
    use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
    use std::path::PathBuf;

    // Message fixture: only idx, role, timestamp and content vary in this test.
    fn message(idx: i64, role: MessageRole, created_at: i64, content: &str) -> Message {
        Message {
            id: None,
            idx,
            role,
            author: None,
            created_at: Some(created_at),
            content: content.into(),
            extra_json: serde_json::Value::Null,
            snippets: Vec::new(),
        }
    }

    // Conversation fixture: fixed source path, caller-chosen start timestamp.
    fn drifting_conversation(started_at: Option<i64>, messages: Vec<Message>) -> Conversation {
        Conversation {
            id: None,
            agent_slug: "codex".into(),
            workspace: Some(PathBuf::from("/tmp/workspace")),
            external_id: None,
            title: Some("Drift Merge".into()),
            source_path: PathBuf::from("/tmp/drift-session.jsonl"),
            started_at,
            ended_at: Some(1_700_000_000_999),
            approx_tokens: None,
            metadata_json: serde_json::Value::Null,
            messages,
            source_id: "local".into(),
            origin_host: None,
        }
    }

    let tmp = TempDir::new().unwrap();
    let db_file = tmp.path().join("test.db");
    let storage = SqliteStorage::open(&db_file).unwrap();
    let codex = Agent {
        id: None,
        slug: "codex".into(),
        name: "Codex".into(),
        version: Some("0.2.3".into()),
        kind: AgentKind::Cli,
    };
    let agent_id = storage.ensure_agent(&codex).unwrap();

    let original_ingest = drifting_conversation(
        Some(1_700_000_000_000),
        vec![
            message(0, MessageRole::User, 1_700_000_000_000, "first"),
            message(1, MessageRole::Agent, 1_700_000_000_100, "second"),
        ],
    );
    // Same path, start shifted by 4_000 ms; idx 1 is repeated unchanged.
    let drifted_ingest = drifting_conversation(
        Some(1_700_000_004_000),
        vec![
            message(1, MessageRole::Agent, 1_700_000_000_100, "second"),
            message(2, MessageRole::User, 1_700_000_004_200, "third"),
        ],
    );

    let first = storage
        .insert_conversation_tree(agent_id, None, &original_ingest)
        .unwrap();
    let second = storage
        .insert_conversation_tree(agent_id, None, &drifted_ingest)
        .unwrap();

    assert_eq!(first.conversation_id, second.conversation_id);
    assert_eq!(second.inserted_indices, vec![2]);
}
/// Two sessions written to the same `source_path` that share only the content
/// of their opening message (with different timestamps and start times) must
/// be stored as two separate conversations.
#[test]
fn insert_conversation_tree_keeps_single_message_overlap_sessions_separate() {
    use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
    use std::path::PathBuf;

    // Message fixture: only idx, role, timestamp and content vary in this test.
    fn message(idx: i64, role: MessageRole, created_at: i64, content: &str) -> Message {
        Message {
            id: None,
            idx,
            role,
            author: None,
            created_at: Some(created_at),
            content: content.into(),
            extra_json: serde_json::Value::Null,
            snippets: Vec::new(),
        }
    }

    // Conversation fixture: fixed source path, 500 ms long session.
    fn reused_path_session(started_at: i64, messages: Vec<Message>) -> Conversation {
        Conversation {
            id: None,
            agent_slug: "codex".into(),
            workspace: Some(PathBuf::from("/tmp/workspace")),
            external_id: None,
            title: Some("Partial overlap".into()),
            source_path: PathBuf::from("/tmp/reused-session.jsonl"),
            started_at: Some(started_at),
            ended_at: Some(started_at + 500),
            approx_tokens: None,
            metadata_json: serde_json::Value::Null,
            messages,
            source_id: "local".into(),
            origin_host: None,
        }
    }

    let tmp = TempDir::new().unwrap();
    let db_file = tmp.path().join("test.db");
    let storage = SqliteStorage::open(&db_file).unwrap();
    let codex = Agent {
        id: None,
        slug: "codex".into(),
        name: "Codex".into(),
        version: Some("0.2.3".into()),
        kind: AgentKind::Cli,
    };
    let agent_id = storage.ensure_agent(&codex).unwrap();

    let earlier_session = reused_path_session(
        1_700_000_000_000,
        vec![
            message(0, MessageRole::User, 1_700_000_000_000, "shared opener"),
            message(
                1,
                MessageRole::Agent,
                1_700_000_000_100,
                "first session unique",
            ),
        ],
    );
    // Later session: same opener text, but a different timestamp and start.
    let later_session = reused_path_session(
        1_700_000_900_000,
        vec![message(
            0,
            MessageRole::User,
            1_700_000_900_000,
            "shared opener",
        )],
    );

    storage
        .insert_conversation_tree(agent_id, None, &earlier_session)
        .unwrap();
    storage
        .insert_conversation_tree(agent_id, None, &later_session)
        .unwrap();

    let conversation_count: i64 = storage
        .conn
        .query_row_map("SELECT COUNT(*) FROM conversations", fparams![], |row| {
            row.get_typed(0)
        })
        .unwrap();
    assert_eq!(conversation_count, 2);
}
/// Two unrelated single-message sessions that reuse one `source_path` (with
/// different start times, timestamps and content) must produce two
/// conversation rows.
#[test]
fn insert_conversation_tree_keeps_distinct_source_path_sessions_separate() {
    use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
    use std::path::PathBuf;

    // Single-message conversation fixture on a fixed, reused source path.
    fn single_message_session(started_at: i64, created_at: i64, content: &str) -> Conversation {
        let only_message = Message {
            id: None,
            idx: 0,
            role: MessageRole::User,
            author: None,
            created_at: Some(created_at),
            content: content.into(),
            extra_json: serde_json::Value::Null,
            snippets: Vec::new(),
        };
        Conversation {
            id: None,
            agent_slug: "codex".into(),
            workspace: Some(PathBuf::from("/tmp/workspace")),
            external_id: None,
            title: Some("Same Path Different Session".into()),
            source_path: PathBuf::from("/tmp/reused-session.jsonl"),
            started_at: Some(started_at),
            ended_at: Some(started_at + 500),
            approx_tokens: None,
            metadata_json: serde_json::Value::Null,
            messages: vec![only_message],
            source_id: "local".into(),
            origin_host: None,
        }
    }

    let tmp = TempDir::new().unwrap();
    let db_file = tmp.path().join("test.db");
    let storage = SqliteStorage::open(&db_file).unwrap();
    let codex = Agent {
        id: None,
        slug: "codex".into(),
        name: "Codex".into(),
        version: Some("0.2.3".into()),
        kind: AgentKind::Cli,
    };
    let agent_id = storage.ensure_agent(&codex).unwrap();

    // (started_at, created_at, content) for each session, inserted in order.
    let sessions = [
        (1_700_000_000_000, 1_700_000_000_000, "first session"),
        (1_700_000_900_000, 1_700_000_900_000, "second session"),
    ];
    for (started_at, created_at, content) in sessions {
        storage
            .insert_conversation_tree(
                agent_id,
                None,
                &single_message_session(started_at, created_at, content),
            )
            .unwrap();
    }

    let conversation_count: i64 = storage
        .conn
        .query_row_map("SELECT COUNT(*) FROM conversations", fparams![], |row| {
            row.get_typed(0)
        })
        .unwrap();
    assert_eq!(conversation_count, 2);
}
/// A replay of the same `source_path` whose messages repeat the stored
/// role/timestamp/content at shifted indices (10, 11) plus one new message
/// (12) must merge into the first conversation and insert only idx 12.
#[test]
fn insert_conversation_tree_merges_replay_equivalent_messages_with_shifted_idx() {
    use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
    use std::path::PathBuf;

    // Message fixture: only idx, role, timestamp and content vary in this test.
    fn message(idx: i64, role: MessageRole, created_at: i64, content: &str) -> Message {
        Message {
            id: None,
            idx,
            role,
            author: None,
            created_at: Some(created_at),
            content: content.into(),
            extra_json: serde_json::Value::Null,
            snippets: Vec::new(),
        }
    }

    // Conversation fixture: fixed source path, 500 ms long session.
    fn replay_session(started_at: i64, messages: Vec<Message>) -> Conversation {
        Conversation {
            id: None,
            agent_slug: "codex".into(),
            workspace: Some(PathBuf::from("/tmp/workspace")),
            external_id: None,
            title: Some("Shifted replay".into()),
            source_path: PathBuf::from("/tmp/replay-session.jsonl"),
            started_at: Some(started_at),
            ended_at: Some(started_at + 500),
            approx_tokens: None,
            metadata_json: serde_json::Value::Null,
            messages,
            source_id: "local".into(),
            origin_host: None,
        }
    }

    let tmp = TempDir::new().unwrap();
    let db_file = tmp.path().join("test.db");
    let storage = SqliteStorage::open(&db_file).unwrap();
    let codex = Agent {
        id: None,
        slug: "codex".into(),
        name: "Codex".into(),
        version: Some("0.2.3".into()),
        kind: AgentKind::Cli,
    };
    let agent_id = storage.ensure_agent(&codex).unwrap();

    let original_run = replay_session(
        1_700_000_000_000,
        vec![
            message(0, MessageRole::User, 1_700_000_000_000, "first"),
            message(1, MessageRole::Agent, 1_700_000_000_100, "second"),
        ],
    );
    // Same two messages re-numbered to 10 and 11, followed by a new one at 12.
    let shifted_replay = replay_session(
        1_700_000_900_000,
        vec![
            message(10, MessageRole::User, 1_700_000_000_000, "first"),
            message(11, MessageRole::Agent, 1_700_000_000_100, "second"),
            message(12, MessageRole::User, 1_700_000_000_200, "third"),
        ],
    );

    let first = storage
        .insert_conversation_tree(agent_id, None, &original_run)
        .unwrap();
    let second = storage
        .insert_conversation_tree(agent_id, None, &shifted_replay)
        .unwrap();

    assert_eq!(first.conversation_id, second.conversation_id);
    assert_eq!(second.inserted_indices, vec![12]);

    let stored_indices: Vec<i64> = storage
        .conn
        .query_map_collect(
            "SELECT idx FROM messages WHERE conversation_id = ?1 ORDER BY idx",
            fparams![first.conversation_id],
            |row| row.get_typed(0),
        )
        .unwrap();
    assert_eq!(stored_indices, vec![0, 1, 12]);
}
/// Salvaging two historical bundles (a `.bak` backup and a `.corrupt` file)
/// must import both, merge the conversation they overlap on into indices
/// 0..=2, and import nothing on a second salvage run.
#[test]
fn salvage_historical_databases_imports_backups_once_and_merges_overlap() {
    use crate::model::types::{Conversation, Message, MessageRole};
    use std::path::PathBuf;

    // Message fixture: only idx, role, timestamp and content vary in this test.
    fn message(idx: i64, role: MessageRole, created_at: i64, content: &str) -> Message {
        Message {
            id: None,
            idx,
            role,
            author: None,
            created_at: Some(created_at),
            content: content.into(),
            extra_json: serde_json::Value::Null,
            snippets: Vec::new(),
        }
    }

    // Conversation fixture with fixed timestamps and no external id.
    fn recovered_conversation(source_path: &str, messages: Vec<Message>) -> Conversation {
        Conversation {
            id: None,
            agent_slug: "codex".into(),
            workspace: Some(PathBuf::from("/tmp/workspace")),
            external_id: None,
            title: Some("Recovered".into()),
            source_path: PathBuf::from(source_path),
            started_at: Some(1_700_000_000_000),
            ended_at: Some(1_700_000_000_999),
            approx_tokens: None,
            metadata_json: serde_json::Value::Null,
            messages,
            source_id: "local".into(),
            origin_host: None,
        }
    }

    let tmp = TempDir::new().unwrap();
    let canonical_db = tmp.path().join("agent_search.db");
    let storage = SqliteStorage::open(&canonical_db).unwrap();

    // Halves of one conversation: indices {0, 1} and {1, 2}.
    let overlapping_a = recovered_conversation(
        "/tmp/shared-history.jsonl",
        vec![
            message(0, MessageRole::User, 1_700_000_000_000, "first"),
            message(1, MessageRole::Agent, 1_700_000_000_100, "second"),
        ],
    );
    let overlapping_b = recovered_conversation(
        "/tmp/shared-history.jsonl",
        vec![
            message(1, MessageRole::Agent, 1_700_000_000_100, "second"),
            message(2, MessageRole::User, 1_700_000_000_200, "third"),
        ],
    );
    // Unrelated conversation that only exists in the second bundle.
    let mut unique = recovered_conversation(
        "/tmp/unique-history.jsonl",
        vec![message(0, MessageRole::User, 1_700_000_001_000, "unique")],
    );
    unique.started_at = Some(1_700_000_001_000);
    unique.ended_at = Some(1_700_000_001_100);

    let backup_bundle = tmp
        .path()
        .join("backups/agent_search.db.20260322T020200.bak");
    seed_historical_db_direct(&backup_bundle, std::slice::from_ref(&overlapping_a));
    let corrupt_bundle = tmp.path().join("agent_search.corrupt.20260324_212907");
    seed_historical_db_direct(&corrupt_bundle, &[overlapping_b, unique]);

    let first = storage.salvage_historical_databases(&canonical_db).unwrap();
    assert_eq!(first.bundles_considered, 2);
    assert_eq!(first.bundles_imported, 2);
    assert_eq!(first.messages_imported, 4);

    let conversations = storage.list_conversations(10, 0).unwrap();
    assert_eq!(conversations.len(), 2);

    let shared_id = conversations
        .iter()
        .find(|conv| conv.source_path == std::path::Path::new("/tmp/shared-history.jsonl"))
        .and_then(|conv| conv.id)
        .unwrap();
    let shared_messages = storage.fetch_messages(shared_id).unwrap();
    let shared_indices: Vec<i64> = shared_messages.into_iter().map(|msg| msg.idx).collect();
    assert_eq!(shared_indices, vec![0, 1, 2]);

    // A repeat run must find nothing left to import.
    let second = storage.salvage_historical_databases(&canonical_db).unwrap();
    assert_eq!(second.bundles_imported, 0);
    assert_eq!(second.messages_imported, 0);
}
/// Salvaging a backup whose conversation has a whitespace-only `source_id`
/// but a concrete `origin_host` must store the conversation under a source
/// id equal to that host ("builder-5"), as checked by the assertions below.
#[test]
fn salvage_historical_databases_normalizes_host_only_remote_provenance() {
use crate::model::types::{Conversation, Message, MessageRole};
use std::path::PathBuf;
let dir = TempDir::new().unwrap();
let canonical_db = dir.path().join("agent_search.db");
let storage = SqliteStorage::open(&canonical_db).unwrap();
// Fixture conversation: blank source id, host label only.
let host_only_remote = Conversation {
id: None,
agent_slug: "codex".into(),
workspace: Some(PathBuf::from("/tmp/workspace")),
external_id: None,
title: Some("Recovered Host Only Remote".into()),
source_path: PathBuf::from("/tmp/host-only-history.jsonl"),
started_at: Some(1_700_000_000_000),
ended_at: Some(1_700_000_000_999),
approx_tokens: None,
metadata_json: serde_json::Value::Null,
messages: vec![Message {
id: None,
idx: 0,
role: MessageRole::User,
author: None,
created_at: Some(1_700_000_000_000),
content: "host-only remote".into(),
extra_json: serde_json::Value::Null,
snippets: Vec::new(),
}],
source_id: " ".into(),
origin_host: Some("builder-5".into()),
};
let historical_db = dir
.path()
.join("backups/agent_search.db.20260322T020200.bak");
seed_historical_db_direct(&historical_db, std::slice::from_ref(&host_only_remote));
// Rewrite the seeded backup with raw SQL so it is guaranteed to hold the
// un-normalized shape: a blank-id source row, a conversation pointing at it,
// and no "builder-5" source row.
// NOTE(review): presumably needed because the seeding helper may already
// normalize provenance — confirm against `seed_historical_db_direct`.
let historical_conn =
FrankenConnection::open(historical_db.to_string_lossy().into_owned()).unwrap();
historical_conn
.execute_compat(
"INSERT INTO sources(id, kind, host_label, created_at, updated_at) VALUES(?1, ?2, ?3, ?4, ?5)",
fparams![" ", "ssh", "builder-5", 0_i64, 0_i64],
)
.unwrap();
historical_conn
.execute_compat(
"UPDATE conversations SET source_id = ?1, origin_host = ?2 WHERE source_path = ?3",
fparams![" ", "builder-5", "/tmp/host-only-history.jsonl"],
)
.unwrap();
historical_conn
.execute_compat("DELETE FROM sources WHERE id = ?1", fparams!["builder-5"])
.unwrap();
// Close the backup connection before the salvage pass opens the file.
drop(historical_conn);
let first = storage.salvage_historical_databases(&canonical_db).unwrap();
assert_eq!(first.bundles_imported, 1);
assert_eq!(first.messages_imported, 1);
// The only source id reported afterwards is the host label.
let source_ids = storage.get_source_ids().unwrap();
assert_eq!(source_ids, vec!["builder-5".to_string()]);
let conversations = storage.list_conversations(10, 0).unwrap();
assert_eq!(conversations.len(), 1);
assert_eq!(conversations[0].source_id, "builder-5");
assert_eq!(conversations[0].origin_host.as_deref(), Some("builder-5"));
}
/// Drives `import_historical_batch_with_retry` with an import callback that
/// fails whenever a batch carries more than one message. A single
/// four-message conversation must still be imported in full, with the first
/// attempt covering all four messages and at least four one-message attempts.
#[test]
fn historical_salvage_retry_splits_single_conversation_until_it_fits() {
    use crate::model::types::{Conversation, Message, MessageRole};
    use std::path::PathBuf;

    let four_messages: Vec<Message> = (0..4)
        .map(|idx| Message {
            id: None,
            idx,
            role: MessageRole::User,
            author: None,
            created_at: Some(1_700_000_000_000 + idx),
            content: format!("message-{idx}"),
            extra_json: serde_json::Value::Null,
            snippets: Vec::new(),
        })
        .collect();
    let entry = HistoricalBatchEntry {
        source_row_id: 77,
        agent_id: 1,
        workspace_id: None,
        conversation: Conversation {
            id: None,
            agent_slug: "gemini".into(),
            workspace: Some(PathBuf::from("/tmp/workspace")),
            external_id: Some("conv-77".into()),
            title: Some("Large recovered conversation".into()),
            source_path: PathBuf::from("/tmp/history.jsonl"),
            started_at: Some(1_700_000_000_000),
            ended_at: Some(1_700_000_000_999),
            approx_tokens: None,
            metadata_json: serde_json::Value::Null,
            messages: four_messages,
            source_id: LOCAL_SOURCE_ID.into(),
            origin_host: None,
        },
    };

    // Per-attempt record of the message count of every entry in the batch.
    let mut attempts: Vec<Vec<usize>> = Vec::new();
    let totals = SqliteStorage::import_historical_batch_with_retry(
        std::slice::from_ref(&entry),
        &mut |batch| {
            let sizes: Vec<usize> = batch
                .iter()
                .map(|entry| entry.conversation.messages.len())
                .collect();
            let total_messages: usize = sizes.iter().sum();
            attempts.push(sizes);
            // Only batches of at most one message are allowed to succeed.
            if total_messages <= 1 {
                Ok(HistoricalBatchImportTotals {
                    inserted_source_rows: batch.len(),
                    inserted_messages: total_messages,
                })
            } else {
                Err(anyhow!("out of memory"))
            }
        },
    )
    .unwrap();

    assert_eq!(
        totals,
        HistoricalBatchImportTotals {
            inserted_source_rows: 1,
            inserted_messages: 4,
        }
    );
    assert_eq!(attempts.first().cloned(), Some(vec![4]));
    let single_message_attempts = attempts
        .iter()
        .filter(|sizes| matches!(sizes.as_slice(), [1]))
        .count();
    assert!(
        single_message_attempts >= 4,
        "expected recursive fallback to reach one-message slices"
    );
}
/// With the first backup conversation already in the canonical DB and a
/// progress checkpoint recorded at its row id (counters 50 / 99), a salvage
/// run must finish the bundle, report 52 conversations / 101 messages, clear
/// the checkpoint, and leave nothing for a second run.
#[test]
fn salvage_historical_databases_resumes_from_progress_checkpoint() {
    use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
    use std::path::PathBuf;

    // Single-message conversation whose ids, titles and timestamps derive from `idx_seed`.
    fn seeded_conversation(source_path: &str, idx_seed: i64) -> Conversation {
        let only_message = Message {
            id: None,
            idx: 0,
            role: MessageRole::User,
            author: None,
            created_at: Some(1_700_000_000_000 + idx_seed),
            content: format!("message-{idx_seed}"),
            extra_json: serde_json::Value::Null,
            snippets: Vec::new(),
        };
        Conversation {
            id: None,
            agent_slug: "codex".into(),
            workspace: Some(PathBuf::from("/tmp/workspace")),
            external_id: Some(format!("conv-{idx_seed}")),
            title: Some(format!("Recovered {idx_seed}")),
            source_path: PathBuf::from(source_path),
            started_at: Some(1_700_000_000_000 + idx_seed),
            ended_at: Some(1_700_000_000_100 + idx_seed),
            approx_tokens: None,
            metadata_json: serde_json::Value::Null,
            messages: vec![only_message],
            source_id: LOCAL_SOURCE_ID.into(),
            origin_host: None,
        }
    }

    let tmp = TempDir::new().unwrap();
    let canonical_db = tmp.path().join("agent_search.db");
    let backup_db = tmp
        .path()
        .join("backups/agent_search.db.20260322T020200.bak");
    let storage = SqliteStorage::open(&canonical_db).unwrap();

    let backup_contents = [
        seeded_conversation("/tmp/one.jsonl", 1),
        seeded_conversation("/tmp/two.jsonl", 2),
        seeded_conversation("/tmp/three.jsonl", 3),
    ];
    seed_historical_db_direct(&backup_db, &backup_contents);

    // The canonical DB already holds the first backup conversation.
    let codex = Agent {
        id: None,
        slug: "codex".into(),
        name: "Codex".into(),
        version: Some("0.2.3".into()),
        kind: AgentKind::Cli,
    };
    let agent_id = storage.ensure_agent(&codex).unwrap();
    storage
        .insert_conversation_tree(agent_id, None, &backup_contents[0])
        .unwrap();

    let bundle = discover_historical_database_bundles(&canonical_db)
        .into_iter()
        .find(|bundle| bundle.root_path == backup_db)
        .unwrap();
    let first_row_id: i64 = FrankenConnection::open(backup_db.to_string_lossy().into_owned())
        .unwrap()
        .query_row_map(
            "SELECT id FROM conversations WHERE source_path = ?1",
            fparams!["/tmp/one.jsonl"],
            |row| row.get_typed(0),
        )
        .unwrap();
    // Checkpoint at the first backup row with pre-existing counters 50 and 99.
    storage
        .record_historical_bundle_progress(&bundle, "direct-readonly", first_row_id, 50, 99)
        .unwrap();

    let outcome = storage.salvage_historical_databases(&canonical_db).unwrap();
    assert_eq!(outcome.bundles_imported, 1);
    assert_eq!(outcome.conversations_imported, 52);
    assert_eq!(outcome.messages_imported, 101);
    assert_eq!(storage.list_conversations(10, 0).unwrap().len(), 3);

    let progress_key = SqliteStorage::historical_bundle_progress_key(&bundle);
    let progress_left: Option<String> = storage
        .conn
        .query_row_map(
            "SELECT value FROM meta WHERE key = ?1",
            fparams![progress_key.as_str()],
            |row| row.get_typed(0),
        )
        .optional()
        .unwrap();
    assert!(
        progress_left.is_none(),
        "completed salvage should clear bundle progress"
    );

    let second = storage.salvage_historical_databases(&canonical_db).unwrap();
    assert_eq!(second.bundles_imported, 0);
    assert_eq!(second.messages_imported, 0);
}
/// When the recorded progress checkpoint already sits at the backup's highest
/// conversation id, a salvage run must import nothing, mark the bundle as
/// already imported, and remove the checkpoint from `meta`.
#[test]
fn salvage_historical_databases_skips_bundle_when_checkpoint_covers_backup() {
    use crate::model::types::{Conversation, Message, MessageRole};
    use std::path::PathBuf;

    // Single-message conversation whose ids, titles and timestamps derive from `idx_seed`.
    fn seeded_conversation(source_path: &str, idx_seed: i64) -> Conversation {
        let only_message = Message {
            id: None,
            idx: 0,
            role: MessageRole::User,
            author: None,
            created_at: Some(1_700_000_000_000 + idx_seed),
            content: format!("message-{idx_seed}"),
            extra_json: serde_json::Value::Null,
            snippets: Vec::new(),
        };
        Conversation {
            id: None,
            agent_slug: "codex".into(),
            workspace: Some(PathBuf::from("/tmp/workspace")),
            external_id: Some(format!("conv-{idx_seed}")),
            title: Some(format!("Recovered {idx_seed}")),
            source_path: PathBuf::from(source_path),
            started_at: Some(1_700_000_000_000 + idx_seed),
            ended_at: Some(1_700_000_000_100 + idx_seed),
            approx_tokens: None,
            metadata_json: serde_json::Value::Null,
            messages: vec![only_message],
            source_id: LOCAL_SOURCE_ID.into(),
            origin_host: None,
        }
    }

    let tmp = TempDir::new().unwrap();
    let canonical_db = tmp.path().join("agent_search.db");
    let backup_db = tmp
        .path()
        .join("backups/agent_search.db.20260322T020200.bak");
    let storage = SqliteStorage::open(&canonical_db).unwrap();

    let backup_contents = [
        seeded_conversation("/tmp/one.jsonl", 1),
        seeded_conversation("/tmp/two.jsonl", 2),
        seeded_conversation("/tmp/three.jsonl", 3),
    ];
    seed_historical_db_direct(&backup_db, &backup_contents);

    let bundle = discover_historical_database_bundles(&canonical_db)
        .into_iter()
        .find(|bundle| bundle.root_path == backup_db)
        .unwrap();
    let backup_max_id: i64 = FrankenConnection::open(backup_db.to_string_lossy().into_owned())
        .unwrap()
        .query_row_map(
            "SELECT COALESCE(MAX(id), 0) FROM conversations",
            fparams![],
            |row| row.get_typed(0),
        )
        .unwrap();
    assert!(backup_max_id > 0, "seeded backup should have conversations");

    // Checkpoint at the last backup row: nothing is left beyond it.
    storage
        .record_historical_bundle_progress(&bundle, "direct-readonly", backup_max_id, 3, 3)
        .unwrap();

    let outcome = storage.salvage_historical_databases(&canonical_db).unwrap();
    assert_eq!(
        outcome.bundles_imported, 0,
        "fully-checkpointed bundle must not be re-scanned"
    );
    assert_eq!(outcome.conversations_imported, 0);
    assert_eq!(outcome.messages_imported, 0);

    let stored_conversations = storage.list_conversations(10, 0).unwrap().len();
    assert_eq!(
        stored_conversations, 0,
        "skip path must not import anything"
    );
    assert!(
        storage.historical_bundle_already_imported(&bundle).unwrap(),
        "skipped bundle must be ledgered as salvaged so future runs short-circuit"
    );

    let progress_key = SqliteStorage::historical_bundle_progress_key(&bundle);
    let progress_left: Option<String> = storage
        .conn
        .query_row_map(
            "SELECT value FROM meta WHERE key = ?1",
            fparams![progress_key.as_str()],
            |row| row.get_typed(0),
        )
        .optional()
        .unwrap();
    assert!(
        progress_left.is_none(),
        "skip path must clear the bundle progress checkpoint"
    );
}
/// Conversations inserted as a, b, c with start times 3000, 1000, 2000 are
/// listed a, c, b by `list_conversations`, while the lexical-rebuild listing
/// returns them in insertion (id) order and pages by "after id" / "through id".
#[test]
fn list_conversations_for_lexical_rebuild_uses_stable_id_order() {
    use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
    use std::path::PathBuf;

    // Single-message conversation keyed by its source path.
    fn conversation_at(source_path: &str, started_at: i64) -> Conversation {
        let only_message = Message {
            id: None,
            idx: 0,
            role: MessageRole::User,
            author: None,
            created_at: Some(started_at),
            content: format!("message for {source_path}"),
            extra_json: serde_json::Value::Null,
            snippets: Vec::new(),
        };
        Conversation {
            id: None,
            agent_slug: "codex".into(),
            workspace: Some(PathBuf::from("/tmp/workspace")),
            external_id: Some(source_path.to_string()),
            title: Some(source_path.to_string()),
            source_path: PathBuf::from(source_path),
            started_at: Some(started_at),
            ended_at: Some(started_at + 1),
            approx_tokens: None,
            metadata_json: serde_json::Value::Null,
            messages: vec![only_message],
            source_id: LOCAL_SOURCE_ID.into(),
            origin_host: None,
        }
    }

    // Expected-value helper: turns path literals into owned `PathBuf`s.
    let paths = |raw: &[&str]| -> Vec<PathBuf> { raw.iter().map(|p| PathBuf::from(*p)).collect() };

    let tmp = TempDir::new().unwrap();
    let db_file = tmp.path().join("agent_search.db");
    let storage = SqliteStorage::open(&db_file).unwrap();
    let codex = Agent {
        id: None,
        slug: "codex".into(),
        name: "Codex".into(),
        version: Some("0.2.3".into()),
        kind: AgentKind::Cli,
    };
    let agent_id = storage.ensure_agent(&codex).unwrap();

    // Insertion order a, b, c deliberately disagrees with start-time order.
    let fixtures = [
        ("/tmp/a.jsonl", 3_000),
        ("/tmp/b.jsonl", 1_000),
        ("/tmp/c.jsonl", 2_000),
    ];
    for (source_path, started_at) in fixtures {
        storage
            .insert_conversation_tree(agent_id, None, &conversation_at(source_path, started_at))
            .unwrap();
    }

    let user_order: Vec<PathBuf> = storage
        .list_conversations(10, 0)
        .unwrap()
        .into_iter()
        .map(|conv| conv.source_path)
        .collect();
    assert_eq!(
        user_order,
        paths(&["/tmp/a.jsonl", "/tmp/c.jsonl", "/tmp/b.jsonl"])
    );

    let (agent_slugs, workspace_paths) = storage.build_lexical_rebuild_lookups().unwrap();

    let rebuild_order: Vec<PathBuf> = storage
        .list_conversations_for_lexical_rebuild_after_id(10, 0, &agent_slugs, &workspace_paths)
        .unwrap()
        .into_iter()
        .map(|conv| conv.source_path)
        .collect();
    assert_eq!(
        rebuild_order,
        paths(&["/tmp/a.jsonl", "/tmp/b.jsonl", "/tmp/c.jsonl"])
    );

    // Page of two starting after id 0.
    let first_page = storage
        .list_conversations_for_lexical_rebuild_after_id(2, 0, &agent_slugs, &workspace_paths)
        .unwrap();
    let first_page_paths: Vec<PathBuf> = first_page
        .iter()
        .map(|conv| conv.source_path.clone())
        .collect();
    assert_eq!(first_page_paths, paths(&["/tmp/a.jsonl", "/tmp/b.jsonl"]));

    // Next page resumes after the last id of the first page.
    let resume_after = first_page
        .last()
        .and_then(|conv| conv.id)
        .expect("first page should include an id");
    let second_page = storage
        .list_conversations_for_lexical_rebuild_after_id(
            2,
            resume_after,
            &agent_slugs,
            &workspace_paths,
        )
        .unwrap();
    let second_page_paths: Vec<PathBuf> = second_page
        .iter()
        .map(|conv| conv.source_path.clone())
        .collect();
    assert_eq!(second_page_paths, paths(&["/tmp/c.jsonl"]));

    // Bounded listing: after id 0, through the last id of the first page.
    let through_id = first_page
        .last()
        .and_then(|conv| conv.id)
        .expect("first page should include an id");
    let bounded_page = storage
        .list_conversations_for_lexical_rebuild_after_id_through_id(
            10,
            0,
            through_id,
            &agent_slugs,
            &workspace_paths,
        )
        .unwrap();
    let bounded_paths: Vec<PathBuf> = bounded_page
        .iter()
        .map(|conv| conv.source_path.clone())
        .collect();
    assert_eq!(bounded_paths, paths(&["/tmp/a.jsonl", "/tmp/b.jsonl"]));
}
/// After deleting conversations 2 and 4 out of six, paging the lexical-rebuild
/// listing two rows at a time must yield ids [1, 3], then [5, 6], then an
/// empty page.
#[test]
fn keyset_traversal_handles_sparse_holey_conversation_ids() {
    use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
    use std::path::PathBuf;

    // Single-message conversation named after `label`.
    fn labeled_conversation(label: &str, ts: i64) -> Conversation {
        let only_message = Message {
            id: None,
            idx: 0,
            role: MessageRole::User,
            author: None,
            created_at: Some(ts),
            content: format!("msg for {label}"),
            extra_json: serde_json::Value::Null,
            snippets: Vec::new(),
        };
        Conversation {
            id: None,
            agent_slug: "codex".into(),
            workspace: Some(PathBuf::from("/tmp/workspace")),
            external_id: Some(label.to_string()),
            title: Some(label.to_string()),
            source_path: PathBuf::from(format!("/tmp/{label}.jsonl")),
            started_at: Some(ts),
            ended_at: Some(ts + 1),
            approx_tokens: None,
            metadata_json: serde_json::Value::Null,
            messages: vec![only_message],
            source_id: LOCAL_SOURCE_ID.into(),
            origin_host: None,
        }
    }

    let tmp = TempDir::new().unwrap();
    let db_file = tmp.path().join("agent_search.db");
    let storage = SqliteStorage::open(&db_file).unwrap();
    let codex = Agent {
        id: None,
        slug: "codex".into(),
        name: "Codex".into(),
        version: Some("0.2.3".into()),
        kind: AgentKind::Cli,
    };
    let agent_id = storage.ensure_agent(&codex).unwrap();

    for ordinal in 0..6 {
        let conversation = labeled_conversation(&format!("conv-{ordinal}"), 1000 + ordinal);
        storage
            .insert_conversation_tree(agent_id, None, &conversation)
            .unwrap();
    }

    // Punch holes at ids 2 and 4; foreign keys are disabled around the deletes
    // because the conversation rows are removed before their messages.
    storage.conn.execute("PRAGMA foreign_keys = OFF").unwrap();
    storage
        .conn
        .execute_compat("DELETE FROM conversations WHERE id IN (2, 4)", fparams![])
        .unwrap();
    storage
        .conn
        .execute_compat(
            "DELETE FROM messages WHERE conversation_id IN (2, 4)",
            fparams![],
        )
        .unwrap();
    storage.conn.execute("PRAGMA foreign_keys = ON").unwrap();

    let (agent_slugs, workspace_paths) = storage.build_lexical_rebuild_lookups().unwrap();

    let page1 = storage
        .list_conversations_for_lexical_rebuild_after_id(2, 0, &agent_slugs, &workspace_paths)
        .unwrap();
    assert_eq!(page1.len(), 2);
    let page1_ids: Vec<i64> = page1.iter().map(|c| c.id.unwrap()).collect();
    assert_eq!(page1_ids, vec![1, 3]);

    let page1_cursor = *page1_ids.last().unwrap();
    let page2 = storage
        .list_conversations_for_lexical_rebuild_after_id(
            2,
            page1_cursor,
            &agent_slugs,
            &workspace_paths,
        )
        .unwrap();
    assert_eq!(page2.len(), 2);
    let page2_ids: Vec<i64> = page2.iter().map(|c| c.id.unwrap()).collect();
    assert_eq!(page2_ids, vec![5, 6]);

    let page2_cursor = *page2_ids.last().unwrap();
    let page3 = storage
        .list_conversations_for_lexical_rebuild_after_id(
            2,
            page2_cursor,
            &agent_slugs,
            &workspace_paths,
        )
        .unwrap();
    assert!(page3.is_empty());

    // Both pages together cover every surviving id exactly once.
    let all_ids: Vec<i64> = [page1_ids.as_slice(), page2_ids.as_slice()].concat();
    assert_eq!(all_ids, vec![1, 3, 5, 6]);
}
/// With ids 3, 5, 7 and 8 deleted out of ten, the bounded lexical-rebuild
/// listing must return [1, 2, 4] for (after 0, through 5), [6, 9, 10] for
/// (after 4, through 10) and nothing for (after 10, through 20).
#[test]
fn keyset_traversal_through_id_with_sparse_ranges() {
    use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
    use std::path::PathBuf;

    // Single-message conversation named after `label`.
    fn labeled_conversation(label: &str, ts: i64) -> Conversation {
        let only_message = Message {
            id: None,
            idx: 0,
            role: MessageRole::User,
            author: None,
            created_at: Some(ts),
            content: format!("msg for {label}"),
            extra_json: serde_json::Value::Null,
            snippets: Vec::new(),
        };
        Conversation {
            id: None,
            agent_slug: "codex".into(),
            workspace: Some(PathBuf::from("/tmp/workspace")),
            external_id: Some(label.to_string()),
            title: Some(label.to_string()),
            source_path: PathBuf::from(format!("/tmp/{label}.jsonl")),
            started_at: Some(ts),
            ended_at: Some(ts + 1),
            approx_tokens: None,
            metadata_json: serde_json::Value::Null,
            messages: vec![only_message],
            source_id: LOCAL_SOURCE_ID.into(),
            origin_host: None,
        }
    }

    let tmp = TempDir::new().unwrap();
    let db_file = tmp.path().join("agent_search.db");
    let storage = SqliteStorage::open(&db_file).unwrap();
    let codex = Agent {
        id: None,
        slug: "codex".into(),
        name: "Codex".into(),
        version: Some("0.2.3".into()),
        kind: AgentKind::Cli,
    };
    let agent_id = storage.ensure_agent(&codex).unwrap();

    for ordinal in 0..10 {
        let conversation = labeled_conversation(&format!("conv-{ordinal}"), 1000 + ordinal);
        storage
            .insert_conversation_tree(agent_id, None, &conversation)
            .unwrap();
    }

    // Remove ids 3, 5, 7, 8; foreign keys are disabled around the deletes
    // because the conversation rows are removed before their messages.
    storage.conn.execute("PRAGMA foreign_keys = OFF").unwrap();
    storage
        .conn
        .execute_compat(
            "DELETE FROM conversations WHERE id IN (3, 5, 7, 8)",
            fparams![],
        )
        .unwrap();
    storage
        .conn
        .execute_compat(
            "DELETE FROM messages WHERE conversation_id IN (3, 5, 7, 8)",
            fparams![],
        )
        .unwrap();
    storage.conn.execute("PRAGMA foreign_keys = ON").unwrap();

    let (agent_slugs, workspace_paths) = storage.build_lexical_rebuild_lookups().unwrap();

    // Upper bound 5 is itself a deleted id.
    let through_5 = storage
        .list_conversations_for_lexical_rebuild_after_id_through_id(
            100,
            0,
            5,
            &agent_slugs,
            &workspace_paths,
        )
        .unwrap();
    let through_5_ids: Vec<i64> = through_5.iter().map(|c| c.id.unwrap()).collect();
    assert_eq!(through_5_ids, vec![1, 2, 4]);

    let after_4_through_10 = storage
        .list_conversations_for_lexical_rebuild_after_id_through_id(
            100,
            4,
            10,
            &agent_slugs,
            &workspace_paths,
        )
        .unwrap();
    let ids: Vec<i64> = after_4_through_10.iter().map(|c| c.id.unwrap()).collect();
    assert_eq!(ids, vec![6, 9, 10]);

    // Range entirely beyond the highest stored id.
    let after_10 = storage
        .list_conversations_for_lexical_rebuild_after_id_through_id(
            100,
            10,
            20,
            &agent_slugs,
            &workspace_paths,
        )
        .unwrap();
    assert!(after_10.is_empty());
}
/// Checks the footprint listing against four conversation shapes: two ASCII
/// messages, no messages at all, one multi-byte UTF-8 message whose
/// `conversation_tail_state` row is deleted before listing, and a single
/// message stored at the sparse index 10.
///
/// The expected rows keep the empty conversation with zero counts, count the
/// sparse conversation as 11 messages, and price every counted message at
/// `LEXICAL_REBUILD_PLANNER_ESTIMATED_BYTES_PER_MESSAGE`.
#[test]
fn list_conversation_footprints_for_lexical_rebuild_estimates_bytes_and_keeps_empty_conversations()
{
    use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
    use std::path::PathBuf;

    let workdir = TempDir::new().unwrap();
    let database_file = workdir.path().join("agent_search.db");
    let storage = SqliteStorage::open(&database_file).unwrap();

    let codex = Agent {
        id: None,
        slug: "codex".into(),
        name: "Codex".into(),
        version: Some("0.2.3".into()),
        kind: AgentKind::Cli,
    };
    let agent_id = storage.ensure_agent(&codex).unwrap();

    // Messages in this test differ only by index, role, timestamp and text.
    let message = |idx: i64, role: MessageRole, created_at: i64, content: &str| Message {
        id: None,
        idx,
        role,
        author: None,
        created_at: Some(created_at),
        content: content.into(),
        extra_json: serde_json::Value::Null,
        snippets: Vec::new(),
    };

    // Stores one conversation and returns the id reported by the insert.
    let store_conversation = |external_id: &str, base_ts: i64, messages: Vec<Message>| {
        let conversation = Conversation {
            id: None,
            agent_slug: "codex".into(),
            workspace: Some(PathBuf::from("/tmp/workspace")),
            external_id: Some(external_id.to_string()),
            title: Some(external_id.to_string()),
            source_path: PathBuf::from(format!("/tmp/{external_id}.jsonl")),
            started_at: Some(base_ts),
            ended_at: Some(base_ts + 100),
            approx_tokens: None,
            metadata_json: serde_json::Value::Null,
            messages,
            source_id: LOCAL_SOURCE_ID.into(),
            origin_host: None,
        };
        let inserted = storage
            .insert_conversation_tree(agent_id, None, &conversation)
            .unwrap();
        inserted.conversation_id
    };

    let ascii_id = store_conversation(
        "footprint-ascii",
        1_700_000_000_000,
        vec![
            message(0, MessageRole::User, 1_700_000_000_001, "abc"),
            message(1, MessageRole::Agent, 1_700_000_000_002, "defg"),
        ],
    );
    let empty_id = store_conversation("footprint-empty", 1_700_000_001_000, Vec::new());
    let utf8_id = store_conversation(
        "footprint-utf8",
        1_700_000_002_000,
        vec![message(0, MessageRole::Tool, 1_700_000_002_001, "hé🙂")],
    );
    let sparse_id = store_conversation(
        "footprint-sparse",
        1_700_000_003_000,
        vec![message(10, MessageRole::User, 1_700_000_003_010, "sparse")],
    );

    // Remove the tail-state row for the UTF-8 conversation only; its expected
    // footprint below is still one message.
    storage
        .conn
        .execute_compat(
            "DELETE FROM conversation_tail_state WHERE conversation_id = ?1",
            fparams![utf8_id],
        )
        .unwrap();

    let footprints = storage
        .list_conversation_footprints_for_lexical_rebuild()
        .unwrap();

    let expected = vec![
        LexicalRebuildConversationFootprintRow {
            conversation_id: ascii_id,
            message_count: 2,
            message_bytes: 2 * LEXICAL_REBUILD_PLANNER_ESTIMATED_BYTES_PER_MESSAGE,
        },
        LexicalRebuildConversationFootprintRow {
            conversation_id: empty_id,
            message_count: 0,
            message_bytes: 0,
        },
        LexicalRebuildConversationFootprintRow {
            conversation_id: utf8_id,
            message_count: 1,
            message_bytes: LEXICAL_REBUILD_PLANNER_ESTIMATED_BYTES_PER_MESSAGE,
        },
        LexicalRebuildConversationFootprintRow {
            conversation_id: sparse_id,
            message_count: 11,
            message_bytes: 11 * LEXICAL_REBUILD_PLANNER_ESTIMATED_BYTES_PER_MESSAGE,
        },
    ];
    assert_eq!(footprints, expected);
}
/// A conversation whose only message sits at index 10 has both cached tail
/// locations wiped: the `last_message_*` columns on `conversations` are set to
/// NULL and its `conversation_tail_state` row is deleted. The footprint listing
/// must still report 11 messages for it.
#[test]
fn list_conversation_footprints_for_lexical_rebuild_falls_back_for_missing_tail_cache() {
    use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
    use std::path::PathBuf;

    let workdir = TempDir::new().unwrap();
    let database_file = workdir.path().join("agent_search.db");
    let storage = SqliteStorage::open(&database_file).unwrap();

    let codex = Agent {
        id: None,
        slug: "codex".into(),
        name: "Codex".into(),
        version: Some("0.2.3".into()),
        kind: AgentKind::Cli,
    };
    let agent_id = storage.ensure_agent(&codex).unwrap();

    let sparse_tail_message = Message {
        id: None,
        idx: 10,
        role: MessageRole::User,
        author: None,
        created_at: Some(1_700_000_000_010),
        content: "legacy sparse tail".into(),
        extra_json: serde_json::Value::Null,
        snippets: Vec::new(),
    };
    let legacy_conversation = Conversation {
        id: None,
        agent_slug: "codex".into(),
        workspace: Some(PathBuf::from("/tmp/workspace")),
        external_id: Some("footprint-missing-tail".to_string()),
        title: Some("footprint-missing-tail".to_string()),
        source_path: PathBuf::from("/tmp/footprint-missing-tail.jsonl"),
        started_at: Some(1_700_000_000_000),
        ended_at: Some(1_700_000_000_100),
        approx_tokens: None,
        metadata_json: serde_json::Value::Null,
        messages: vec![sparse_tail_message],
        source_id: LOCAL_SOURCE_ID.into(),
        origin_host: None,
    };
    let conversation_id = storage
        .insert_conversation_tree(agent_id, None, &legacy_conversation)
        .unwrap()
        .conversation_id;

    // Both statements take the conversation id as their single parameter and
    // run in the same order as listed.
    let wipe_tail_cache = [
        "UPDATE conversations
SET last_message_idx = NULL, last_message_created_at = NULL
WHERE id = ?1",
        "DELETE FROM conversation_tail_state WHERE conversation_id = ?1",
    ];
    for sql in wipe_tail_cache {
        storage
            .conn
            .execute_compat(sql, fparams![conversation_id])
            .unwrap();
    }

    let footprints = storage
        .list_conversation_footprints_for_lexical_rebuild()
        .unwrap();
    let expected = vec![LexicalRebuildConversationFootprintRow {
        conversation_id,
        message_count: 11,
        message_bytes: 11 * LEXICAL_REBUILD_PLANNER_ESTIMATED_BYTES_PER_MESSAGE,
    }];
    assert_eq!(
        footprints, expected,
        "missing tail-cache metadata should fall back to messages MAX(idx) instead of treating legacy conversations as empty"
    );
}
/// Three messages (indexes 0..3) are stored, then both the `conversations`
/// columns and the `conversation_tail_state` row are overwritten to claim the
/// last message index is 0. The footprint listing must still report three
/// messages.
#[test]
fn list_conversation_footprints_for_lexical_rebuild_raises_stale_low_tail_cache() {
    use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
    use std::path::PathBuf;

    let workdir = TempDir::new().unwrap();
    let database_file = workdir.path().join("agent_search.db");
    let storage = SqliteStorage::open(&database_file).unwrap();

    let codex = Agent {
        id: None,
        slug: "codex".into(),
        name: "Codex".into(),
        version: Some("0.2.3".into()),
        kind: AgentKind::Cli,
    };
    let agent_id = storage.ensure_agent(&codex).unwrap();

    let mut messages = Vec::new();
    for idx in 0..3 {
        messages.push(Message {
            id: None,
            idx,
            role: MessageRole::User,
            author: None,
            created_at: Some(1_700_000_000_010 + idx),
            content: format!("message {idx}"),
            extra_json: serde_json::Value::Null,
            snippets: Vec::new(),
        });
    }
    let conversation = Conversation {
        id: None,
        agent_slug: "codex".into(),
        workspace: Some(PathBuf::from("/tmp/workspace")),
        external_id: Some("footprint-stale-tail".to_string()),
        title: Some("footprint-stale-tail".to_string()),
        source_path: PathBuf::from("/tmp/footprint-stale-tail.jsonl"),
        started_at: Some(1_700_000_000_000),
        ended_at: Some(1_700_000_000_100),
        approx_tokens: None,
        metadata_json: serde_json::Value::Null,
        messages,
        source_id: LOCAL_SOURCE_ID.into(),
        origin_host: None,
    };
    let conversation_id = storage
        .insert_conversation_tree(agent_id, None, &conversation)
        .unwrap()
        .conversation_id;

    // Rewind both cached tails to index 0, conversations table first.
    let rewind_tail_cache = [
        "UPDATE conversations
SET last_message_idx = 0, last_message_created_at = 1700000000010
WHERE id = ?1",
        "UPDATE conversation_tail_state
SET last_message_idx = 0, last_message_created_at = 1700000000010
WHERE conversation_id = ?1",
    ];
    for sql in rewind_tail_cache {
        storage
            .conn
            .execute_compat(sql, fparams![conversation_id])
            .unwrap();
    }

    let footprints = storage
        .list_conversation_footprints_for_lexical_rebuild()
        .unwrap();
    let expected = vec![LexicalRebuildConversationFootprintRow {
        conversation_id,
        message_count: 3,
        message_bytes: 3 * LEXICAL_REBUILD_PLANNER_ESTIMATED_BYTES_PER_MESSAGE,
    }];
    assert_eq!(
        footprints, expected,
        "stale-low tail caches must not under-plan lexical shards and trip doc>plan invariants"
    );
}
/// The `last_message_*` columns on `conversations` are set to NULL and the
/// whole `conversation_tail_state` table is dropped. The footprint listing
/// must succeed anyway and report 11 messages for a conversation whose only
/// message sits at index 10.
#[test]
fn list_conversation_footprints_for_lexical_rebuild_tolerates_missing_tail_state_table() {
    use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
    use std::path::PathBuf;

    let workdir = TempDir::new().unwrap();
    let database_file = workdir.path().join("agent_search.db");
    let storage = SqliteStorage::open(&database_file).unwrap();

    let codex = Agent {
        id: None,
        slug: "codex".into(),
        name: "Codex".into(),
        version: Some("0.2.3".into()),
        kind: AgentKind::Cli,
    };
    let agent_id = storage.ensure_agent(&codex).unwrap();

    let sparse_tail_message = Message {
        id: None,
        idx: 10,
        role: MessageRole::User,
        author: None,
        created_at: Some(1_700_000_000_010),
        content: "legacy sparse tail without hot table".into(),
        extra_json: serde_json::Value::Null,
        snippets: Vec::new(),
    };
    let legacy_conversation = Conversation {
        id: None,
        agent_slug: "codex".into(),
        workspace: Some(PathBuf::from("/tmp/workspace")),
        external_id: Some("footprint-missing-tail-table".to_string()),
        title: Some("footprint-missing-tail-table".to_string()),
        source_path: PathBuf::from("/tmp/footprint-missing-tail-table.jsonl"),
        started_at: Some(1_700_000_000_000),
        ended_at: Some(1_700_000_000_100),
        approx_tokens: None,
        metadata_json: serde_json::Value::Null,
        messages: vec![sparse_tail_message],
        source_id: LOCAL_SOURCE_ID.into(),
        origin_host: None,
    };
    let conversation_id = storage
        .insert_conversation_tree(agent_id, None, &legacy_conversation)
        .unwrap()
        .conversation_id;

    // Clear the cached tail on the conversation row first...
    let clear_cached_tail = "UPDATE conversations
SET last_message_idx = NULL, last_message_created_at = NULL
WHERE id = ?1";
    storage
        .conn
        .execute_compat(clear_cached_tail, fparams![conversation_id])
        .unwrap();
    // ...then remove the tail-state table altogether.
    let drop_tail_table = "DROP TABLE conversation_tail_state";
    storage
        .conn
        .execute_compat(drop_tail_table, fparams![])
        .unwrap();

    let footprints = storage
        .list_conversation_footprints_for_lexical_rebuild()
        .unwrap();
    let expected = vec![LexicalRebuildConversationFootprintRow {
        conversation_id,
        message_count: 11,
        message_bytes: 11 * LEXICAL_REBUILD_PLANNER_ESTIMATED_BYTES_PER_MESSAGE,
    }];
    assert_eq!(
        footprints, expected,
        "read-only lexical self-heal must tolerate pre-tail-cache databases and use messages MAX(idx)"
    );
}
/// Opens the checked-in `tests/fixtures/search_demo_data/agent_search.db`
/// read-only and requires the footprint listing to return at least one row,
/// each with a positive message count.
#[test]
fn list_conversation_footprints_for_lexical_rebuild_tolerates_legacy_search_demo_fixture() {
    // Append the components one by one, starting from the crate root.
    let mut fixture_db = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
    fixture_db.extend([
        "tests",
        "fixtures",
        "search_demo_data",
        "agent_search.db",
    ]);

    let storage = FrankenStorage::open_readonly(&fixture_db).unwrap();
    let footprints = storage
        .list_conversation_footprints_for_lexical_rebuild()
        .unwrap();

    let listing_is_empty = footprints.is_empty();
    assert!(
        !listing_is_empty,
        "search self-heal should be able to plan a lexical rebuild from the legacy search demo fixture"
    );

    let every_count_positive = footprints
        .iter()
        .all(|footprint| footprint.message_count > 0);
    assert!(
        every_count_positive,
        "legacy fixture conversations should derive message counts from messages when tail caches are absent"
    );
}
/// A stored conversation row is rewritten (with foreign keys switched off) so
/// that `source_id` is whitespace-only and `origin_host` is `dev@laptop`. Both
/// `list_conversations` and the lexical-rebuild listing must then report
/// `dev@laptop` as the source id and as the origin host.
#[test]
fn lexical_rebuild_listing_normalizes_host_only_remote_source_from_blank_source_id() {
    use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
    use std::path::PathBuf;

    let workdir = TempDir::new().unwrap();
    let database_file = workdir.path().join("agent_search.db");
    let storage = SqliteStorage::open(&database_file).unwrap();

    let codex = Agent {
        id: None,
        slug: "codex".into(),
        name: "Codex".into(),
        version: Some("0.2.3".into()),
        kind: AgentKind::Cli,
    };
    let agent_id = storage.ensure_agent(&codex).unwrap();

    let greeting = Message {
        id: None,
        idx: 0,
        role: MessageRole::User,
        author: None,
        created_at: Some(1_700_000_000_000),
        content: "hello".into(),
        extra_json: serde_json::Value::Null,
        snippets: Vec::new(),
    };
    let conversation = Conversation {
        id: None,
        agent_slug: "codex".into(),
        workspace: Some(PathBuf::from("/tmp/workspace")),
        external_id: Some("legacy-blank-source".into()),
        title: Some("Legacy blank source".into()),
        source_path: PathBuf::from("/tmp/legacy-blank-source.jsonl"),
        started_at: Some(1_700_000_000_000),
        ended_at: Some(1_700_000_000_100),
        approx_tokens: None,
        metadata_json: serde_json::Value::Null,
        messages: vec![greeting],
        source_id: LOCAL_SOURCE_ID.into(),
        origin_host: None,
    };
    let inserted = storage
        .insert_conversation_tree(agent_id, None, &conversation)
        .unwrap();
    let conversation_id = inserted.conversation_id;

    // The rewrite is bracketed by turning foreign-key enforcement off and on.
    storage.conn.execute("PRAGMA foreign_keys = OFF").unwrap();
    let blank_out_source = "UPDATE conversations SET source_id = ?1, origin_host = ?2 WHERE id = ?3";
    storage
        .conn
        .execute_compat(
            blank_out_source,
            fparams!["   ", "dev@laptop", conversation_id],
        )
        .unwrap();
    storage.conn.execute("PRAGMA foreign_keys = ON").unwrap();

    // Regular listing.
    let listed = storage.list_conversations(10, 0).unwrap();
    assert_eq!(listed.len(), 1);
    let listed_row = &listed[0];
    assert_eq!(listed_row.source_id, "dev@laptop");
    assert_eq!(listed_row.origin_host.as_deref(), Some("dev@laptop"));

    // Lexical-rebuild listing.
    let (agent_slugs, workspace_paths) = storage.build_lexical_rebuild_lookups().unwrap();
    let rebuild_listed = storage
        .list_conversations_for_lexical_rebuild_after_id(10, 0, &agent_slugs, &workspace_paths)
        .unwrap();
    assert_eq!(rebuild_listed.len(), 1);
    let rebuild_row = &rebuild_listed[0];
    assert_eq!(rebuild_row.source_id, "dev@laptop");
    assert_eq!(rebuild_row.origin_host.as_deref(), Some("dev@laptop"));
}
/// End-to-end check of `seed_canonical_from_best_historical_bundle`.
///
/// A backup database under `backups/` is populated with one conversation and
/// runtime metadata, then rewritten to look like a schema-version-13 database
/// carrying two `fts_messages` rows in `sqlite_master`. After seeding a fresh
/// canonical database from it, the test asserts that the conversation and
/// message were imported, that the scan / embed / index markers are absent,
/// that exactly one `historical_bundle_salvaged:%` key exists, and that the
/// canonical database ends up with a single, queryable `fts_messages` entry at
/// `CURRENT_SCHEMA_VERSION`.
#[test]
fn seed_canonical_from_best_historical_bundle_copies_data_and_resets_runtime_meta() {
use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
use std::path::PathBuf;
let dir = TempDir::new().unwrap();
let canonical_db = dir.path().join("agent_search.db");
// The historical bundle lives in a `backups/` directory next to the canonical db.
let source_db = dir
.path()
.join("backups/agent_search.db.20260322T020200.bak");
fs::create_dir_all(source_db.parent().unwrap()).unwrap();
// --- Populate the backup with one conversation and runtime metadata. ---
let source = SqliteStorage::open(&source_db).unwrap();
let agent = Agent {
id: None,
slug: "codex".into(),
name: "Codex".into(),
version: Some("0.2.3".into()),
kind: AgentKind::Cli,
};
let agent_id = source.ensure_agent(&agent).unwrap();
let conversation = Conversation {
id: None,
agent_slug: "codex".into(),
workspace: Some(PathBuf::from("/tmp/workspace")),
external_id: Some("seed-conv".into()),
title: Some("Historical seed".into()),
source_path: PathBuf::from("/tmp/historical-seed.jsonl"),
started_at: Some(1_700_000_000_000),
ended_at: Some(1_700_000_000_100),
approx_tokens: Some(42),
metadata_json: serde_json::json!({"seed": true}),
messages: vec![Message {
id: None,
idx: 0,
role: MessageRole::Agent,
author: Some("assistant".into()),
created_at: Some(1_700_000_000_050),
content: "seeded message".into(),
extra_json: serde_json::json!({"usage": {"total_tokens": 12}}),
snippets: Vec::new(),
}],
source_id: LOCAL_SOURCE_ID.into(),
origin_host: None,
};
source
.insert_conversation_tree(agent_id, None, &conversation)
.unwrap();
// Runtime markers set on the backup; the assertions further down expect
// them to be absent from the seeded canonical database.
source.set_last_scan_ts(123).unwrap();
source.set_last_indexed_at(456).unwrap();
source.set_last_embedded_message_id(789).unwrap();
// A pre-existing salvage marker in the backup; after seeding, exactly one
// `historical_bundle_salvaged:%` key is expected in the canonical database.
source
.conn
.execute_compat(
"INSERT OR REPLACE INTO meta(key, value) VALUES(?1, ?2)",
fparams!["historical_bundle_salvaged:stale", "{\"stale\":true}"],
)
.unwrap();
// Close the storage handle before the backup is reopened through the
// rusqlite fixture connection.
drop(source);
// --- Rewrite the backup so it looks like a legacy v13 database. ---
let legacy_v13_fts_sql = "CREATE VIRTUAL TABLE fts_messages USING fts5(content, title, agent, workspace, source_path, created_at UNINDEXED, content='', tokenize='porter')";
let duplicate_legacy_fts_sql = "CREATE VIRTUAL TABLE fts_messages USING fts5(content, title, agent, workspace, source_path, created_at UNINDEXED, message_id UNINDEXED, tokenize='porter')";
let legacy = rusqlite_test_fixture_conn(&source_db);
// Rewind the recorded schema version, forget migration 14, and enable
// direct writes to `sqlite_master`.
legacy
.execute_batch(
"UPDATE meta SET value = '13' WHERE key = 'schema_version';
DELETE FROM _schema_migrations WHERE version = 14;
PRAGMA writable_schema = ON;",
)
.unwrap();
legacy
.execute(
"DELETE FROM meta WHERE key = ?1",
[FTS_FRANKEN_REBUILD_META_KEY],
)
.unwrap();
// Insert two `fts_messages` schema rows by hand, one per legacy definition.
legacy
.execute(
"INSERT INTO sqlite_master(type, name, tbl_name, rootpage, sql)
VALUES('table', 'fts_messages', 'fts_messages', 0, ?1)",
[legacy_v13_fts_sql],
)
.unwrap();
legacy
.execute(
"INSERT INTO sqlite_master(type, name, tbl_name, rootpage, sql)
VALUES('table', 'fts_messages', 'fts_messages', 0, ?1)",
[duplicate_legacy_fts_sql],
)
.unwrap();
legacy
.execute_batch("PRAGMA writable_schema = OFF;")
.unwrap();
drop(legacy);
// Sanity-check the fixture through a fresh connection: two `fts_messages`
// schema rows and the single message must be visible.
{
let verify = rusqlite_test_fixture_conn(&source_db);
verify
.execute_batch("PRAGMA writable_schema = ON;")
.unwrap();
let fts_entries: i64 = verify
.query_row(
"SELECT COUNT(*) FROM sqlite_master WHERE name = 'fts_messages'",
[],
|row| row.get(0),
)
.unwrap();
assert_eq!(
fts_entries, 2,
"test fixture should reproduce the duplicate legacy fts_messages rows"
);
let msg_count: i64 = verify
.query_row("SELECT COUNT(*) FROM messages", [], |row| row.get(0))
.unwrap();
assert_eq!(msg_count, 1);
}
// --- Create the canonical database, close it, then seed it. ---
let fresh = SqliteStorage::open(&canonical_db).unwrap();
drop(fresh);
let outcome = seed_canonical_from_best_historical_bundle(&canonical_db)
.unwrap()
.unwrap();
assert_eq!(outcome.bundles_imported, 1);
assert_eq!(outcome.conversations_imported, 1);
assert_eq!(outcome.messages_imported, 1);
// The seeded database must be readable through a read-only connection.
let readonly = open_franken_with_flags(
&canonical_db.to_string_lossy(),
FrankenOpenFlags::SQLITE_OPEN_READ_ONLY,
)
.unwrap();
let readonly_message_count: i64 = readonly
.query_row_map("SELECT COUNT(*) FROM messages", fparams![], |row| {
row.get_typed(0)
})
.unwrap();
assert_eq!(readonly_message_count, 1);
// --- Inspect the seeded database through the regular storage handle. ---
let seeded = SqliteStorage::open(&canonical_db).unwrap();
assert_eq!(
seeded
.count_sessions_in_range(None, None, None, None)
.unwrap()
.0,
1
);
let message_count: i64 = seeded
.conn
.query_row_map("SELECT COUNT(*) FROM messages", fparams![], |row| {
row.get_typed(0)
})
.unwrap();
assert_eq!(message_count, 1);
// Scan, embed and index markers must all be absent after seeding.
assert_eq!(seeded.get_last_scan_ts().unwrap(), None);
assert_eq!(seeded.get_last_embedded_message_id().unwrap(), None);
let last_indexed: Option<String> = seeded
.conn
.query_row_map(
"SELECT value FROM meta WHERE key = 'last_indexed_at'",
fparams![],
|row| row.get_typed(0),
)
.optional()
.unwrap();
assert!(last_indexed.is_none());
// Exactly one salvage marker is expected.
let salvage_keys: Vec<String> = seeded
.conn
.query_map_collect(
"SELECT key FROM meta WHERE key LIKE 'historical_bundle_salvaged:%' ORDER BY key",
fparams![],
|row| row.get_typed(0),
)
.unwrap();
assert_eq!(salvage_keys.len(), 1);
// --- Re-check the schema through another read-only connection. ---
let reopened_readonly = open_franken_with_flags(
&canonical_db.to_string_lossy(),
FrankenOpenFlags::SQLITE_OPEN_READ_ONLY,
)
.unwrap();
let reopened_fts_entries: i64 = reopened_readonly
.query_row_map(
"SELECT COUNT(*) FROM sqlite_master WHERE name = 'fts_messages'",
fparams![],
|row| row.get_typed(0),
)
.unwrap();
assert_eq!(
reopened_fts_entries, 1,
"seeded canonical db should keep a single stock-SQLite fts_messages schema row"
);
let reopened_message_count: i64 = reopened_readonly
.query_row_map("SELECT COUNT(*) FROM messages", fparams![], |row| {
row.get_typed(0)
})
.unwrap();
assert_eq!(reopened_message_count, 1);
// --- Finally open through FrankenStorage and verify version and FTS. ---
let franken_seeded = FrankenStorage::open(&canonical_db).unwrap();
assert_eq!(
franken_seeded.schema_version().unwrap(),
CURRENT_SCHEMA_VERSION
);
franken_seeded
.ensure_search_fallback_fts_consistency()
.expect("ensure FTS consistency after seed");
let post_franken_schema_rows: i64 = franken_seeded
.raw()
.query_row_map(
"SELECT COUNT(*) FROM sqlite_master WHERE name = 'fts_messages'",
fparams![],
|row| row.get_typed(0),
)
.unwrap();
assert_eq!(post_franken_schema_rows, 1);
// The FTS table must be queryable, not merely present in the schema.
let fts_probe = franken_seeded
.raw()
.query("SELECT COUNT(*) FROM fts_messages");
assert!(
fts_probe.is_ok(),
"expected post-seed FTS to be queryable, got {fts_probe:?}"
);
}
/// A canonical database carrying a `sentinel` meta row must survive a seed
/// attempt that fails. The only backup available has its recorded
/// `schema_version` rewritten to 12, seeding is expected to return the
/// "too old for baseline import" error, and afterwards the canonical database
/// must still hold the sentinel and zero conversations.
#[test]
fn failed_baseline_seed_preserves_existing_canonical_bundle() {
    use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
    use std::path::PathBuf;

    let workdir = TempDir::new().unwrap();
    let canonical_db = workdir.path().join("agent_search.db");
    let source_db = workdir
        .path()
        .join("backups/agent_search.db.20260325T120000Z.bad-seed.bak");
    fs::create_dir_all(source_db.parent().unwrap()).unwrap();

    // Canonical database: create it, plant the sentinel, close it.
    {
        let canonical = SqliteStorage::open(&canonical_db).unwrap();
        canonical
            .conn
            .execute_compat(
                "INSERT OR REPLACE INTO meta(key, value) VALUES(?1, ?2)",
                fparams!["sentinel", "keep-me"],
            )
            .unwrap();
    }

    // Backup database: one conversation, then close it.
    {
        let source = SqliteStorage::open(&source_db).unwrap();
        let codex = Agent {
            id: None,
            slug: "codex".into(),
            name: "Codex".into(),
            version: Some("0.2.3".into()),
            kind: AgentKind::Cli,
        };
        let agent_id = source.ensure_agent(&codex).unwrap();
        let doomed_message = Message {
            id: None,
            idx: 0,
            role: MessageRole::Agent,
            author: Some("assistant".into()),
            created_at: Some(1_700_000_000_050),
            content: "this seed should fail".into(),
            extra_json: serde_json::Value::Null,
            snippets: Vec::new(),
        };
        let conversation = Conversation {
            id: None,
            agent_slug: "codex".into(),
            workspace: Some(PathBuf::from("/tmp/workspace")),
            external_id: Some("bad-seed-conv".into()),
            title: Some("Bad seed".into()),
            source_path: PathBuf::from("/tmp/bad-seed.jsonl"),
            started_at: Some(1_700_000_000_000),
            ended_at: Some(1_700_000_000_100),
            approx_tokens: Some(42),
            metadata_json: serde_json::json!({"seed": "bad"}),
            messages: vec![doomed_message],
            source_id: LOCAL_SOURCE_ID.into(),
            origin_host: None,
        };
        source
            .insert_conversation_tree(agent_id, None, &conversation)
            .unwrap();
    }

    // Rewrite the backup's recorded schema version through a raw connection.
    {
        let legacy = FrankenConnection::open(source_db.to_string_lossy().into_owned()).unwrap();
        legacy
            .execute("UPDATE meta SET value = '12' WHERE key = 'schema_version'")
            .unwrap();
    }

    let err = seed_canonical_from_best_historical_bundle(&canonical_db).unwrap_err();
    let rendered = err.to_string();
    assert!(
        rendered.contains("schema_version 12 is too old for baseline import"),
        "unexpected seed error: {err:#}"
    );

    // The canonical database must be unchanged when opened read-write...
    let reopened = SqliteStorage::open(&canonical_db).unwrap();
    let sentinel: Option<String> = reopened
        .conn
        .query_row_map(
            "SELECT value FROM meta WHERE key = 'sentinel'",
            fparams![],
            |row| row.get_typed(0),
        )
        .optional()
        .unwrap();
    assert_eq!(sentinel.as_deref(), Some("keep-me"));
    let count_conversations = "SELECT COUNT(*) FROM conversations";
    let conversation_count: i64 = reopened
        .conn
        .query_row_map(count_conversations, fparams![], |row| row.get_typed(0))
        .unwrap();
    assert_eq!(conversation_count, 0);

    // ...and when opened read-only.
    let readonly = open_franken_with_flags(
        &canonical_db.to_string_lossy(),
        FrankenOpenFlags::SQLITE_OPEN_READ_ONLY,
    )
    .unwrap();
    let readonly_conversation_count: i64 = readonly
        .query_row_map(count_conversations, fparams![], |row| row.get_typed(0))
        .unwrap();
    assert_eq!(readonly_conversation_count, 0);
}
/// Stores one message with a non-null `extra_json` payload and compares the two
/// fetch paths: `fetch_messages` must return the payload, while
/// `fetch_messages_for_lexical_rebuild` must return the same content and author
/// with `extra_json` null.
#[test]
fn fetch_messages_for_lexical_rebuild_skips_extra_json() {
    let workdir = TempDir::new().unwrap();
    let database_file = workdir.path().join("test.db");
    let storage = SqliteStorage::open(&database_file).unwrap();

    let codex = Agent {
        id: None,
        slug: "codex".into(),
        name: "Codex".into(),
        version: Some("0.2.3".into()),
        kind: AgentKind::Cli,
    };
    let agent_id = storage.ensure_agent(&codex).unwrap();

    let payload = serde_json::json!({
        "usage": { "total_tokens": 1234 },
        "irrelevant_blob": "still preserved in canonical storage"
    });
    let indexed_message = Message {
        id: None,
        idx: 0,
        role: MessageRole::Agent,
        author: Some("assistant".into()),
        created_at: Some(1_700_000_000_050),
        content: "indexed text".into(),
        extra_json: payload,
        snippets: Vec::new(),
    };
    let conversation = Conversation {
        id: None,
        agent_slug: "codex".into(),
        workspace: Some(PathBuf::from("/tmp/workspace")),
        external_id: Some("lexical-rebuild-test".into()),
        title: Some("Lexical rebuild".into()),
        source_path: PathBuf::from("/tmp/lexical-rebuild.jsonl"),
        started_at: Some(1_700_000_000_000),
        ended_at: Some(1_700_000_000_100),
        approx_tokens: Some(42),
        metadata_json: serde_json::Value::Null,
        messages: vec![indexed_message],
        source_id: LOCAL_SOURCE_ID.into(),
        origin_host: None,
    };
    let conversation_id = storage
        .insert_conversation_tree(agent_id, None, &conversation)
        .unwrap()
        .conversation_id;

    // Full fetch: the payload is still there.
    let stored = storage.fetch_messages(conversation_id).unwrap();
    assert_eq!(stored.len(), 1);
    let stored_row = &stored[0];
    assert!(!stored_row.extra_json.is_null());

    // Lexical-rebuild fetch: same text and author, payload null.
    let lexical = storage
        .fetch_messages_for_lexical_rebuild(conversation_id)
        .unwrap();
    assert_eq!(lexical.len(), 1);
    let lexical_row = &lexical[0];
    assert_eq!(lexical_row.content, "indexed text");
    assert_eq!(lexical_row.author.as_deref(), Some("assistant"));
    assert!(lexical_row.extra_json.is_null());
}
/// Inserts three conversations and batch-fetches the third and first (in that
/// order). The result must group messages per requested conversation, keep the
/// first conversation's two messages in index order, leave out the second
/// conversation, and return null `extra_json` for every message.
#[test]
fn fetch_messages_for_lexical_rebuild_batch_groups_and_orders_messages() {
    let workdir = TempDir::new().unwrap();
    let database_file = workdir.path().join("test.db");
    let storage = SqliteStorage::open(&database_file).unwrap();

    let codex = Agent {
        id: None,
        slug: "codex".into(),
        name: "Codex".into(),
        version: Some("0.2.3".into()),
        kind: AgentKind::Cli,
    };
    let agent_id = storage.ensure_agent(&codex).unwrap();

    // Every message carries the same opaque payload; only these fields vary.
    let opaque_message =
        |idx: i64, role: MessageRole, author: &str, created_at: i64, content: &str| Message {
            id: None,
            idx,
            role,
            author: Some(author.into()),
            created_at: Some(created_at),
            content: content.into(),
            extra_json: serde_json::json!({"opaque": true}),
            snippets: Vec::new(),
        };

    let first = Conversation {
        id: None,
        agent_slug: "codex".into(),
        workspace: Some(PathBuf::from("/tmp/workspace")),
        external_id: Some("lexical-batch-1".into()),
        title: Some("Lexical batch 1".into()),
        source_path: PathBuf::from("/tmp/lexical-batch-1.jsonl"),
        started_at: Some(1_700_000_000_000),
        ended_at: Some(1_700_000_000_100),
        approx_tokens: Some(42),
        metadata_json: serde_json::Value::Null,
        messages: vec![
            opaque_message(0, MessageRole::User, "user", 1_700_000_000_010, "first-a"),
            opaque_message(
                1,
                MessageRole::Agent,
                "assistant",
                1_700_000_000_020,
                "first-b",
            ),
        ],
        source_id: LOCAL_SOURCE_ID.into(),
        origin_host: None,
    };
    let second = Conversation {
        id: None,
        agent_slug: "codex".into(),
        workspace: Some(PathBuf::from("/tmp/workspace")),
        external_id: Some("lexical-batch-2".into()),
        title: Some("Lexical batch 2".into()),
        source_path: PathBuf::from("/tmp/lexical-batch-2.jsonl"),
        started_at: Some(1_700_000_000_200),
        ended_at: Some(1_700_000_000_300),
        approx_tokens: Some(84),
        metadata_json: serde_json::Value::Null,
        messages: vec![opaque_message(
            0,
            MessageRole::Tool,
            "tool",
            1_700_000_000_210,
            "second-a",
        )],
        source_id: LOCAL_SOURCE_ID.into(),
        origin_host: None,
    };
    // The third conversation reuses the second's remaining fields.
    let third = Conversation {
        external_id: Some("lexical-batch-3".into()),
        title: Some("Lexical batch 3".into()),
        source_path: PathBuf::from("/tmp/lexical-batch-3.jsonl"),
        messages: vec![opaque_message(
            0,
            MessageRole::System,
            "system",
            1_700_000_000_410,
            "third-a",
        )],
        ..second.clone()
    };

    let store = |conversation: &Conversation| {
        storage
            .insert_conversation_tree(agent_id, None, conversation)
            .unwrap()
            .conversation_id
    };
    let first_id = store(&first);
    let second_id = store(&second);
    let third_id = store(&third);

    let requested = [third_id, first_id];
    let lexical = storage
        .fetch_messages_for_lexical_rebuild_batch(&requested, None, None)
        .unwrap();

    let first_messages = lexical.get(&first_id).expect("first conversation");
    assert_eq!(first_messages.len(), 2);
    assert_eq!(first_messages[0].content, "first-a");
    assert_eq!(first_messages[1].content, "first-b");
    assert!(
        first_messages
            .iter()
            .all(|message| message.extra_json.is_null())
    );

    assert!(
        !lexical.contains_key(&second_id),
        "batch fetch must exclude conversations not requested by the caller"
    );

    let third_messages = lexical.get(&third_id).expect("third conversation");
    assert_eq!(third_messages.len(), 1);
    let only_third = &third_messages[0];
    assert_eq!(only_third.content, "third-a");
    assert!(only_third.extra_json.is_null());
}
/// Stores two six-byte messages and calls the batch fetch with limits
/// `Some(10)` and `Some(8)`. The call must fail, and the rendered error must
/// mention the "content-byte guardrail".
#[test]
fn fetch_messages_for_lexical_rebuild_batch_enforces_content_byte_guardrail() {
    let workdir = TempDir::new().unwrap();
    let database_file = workdir.path().join("test.db");
    let storage = SqliteStorage::open(&database_file).unwrap();

    let codex = Agent {
        id: None,
        slug: "codex".into(),
        name: "Codex".into(),
        version: Some("0.2.3".into()),
        kind: AgentKind::Cli,
    };
    let agent_id = storage.ensure_agent(&codex).unwrap();

    let six_byte_message =
        |idx: i64, role: MessageRole, author: &str, created_at: i64, content: &str| Message {
            id: None,
            idx,
            role,
            author: Some(author.into()),
            created_at: Some(created_at),
            content: content.into(),
            extra_json: serde_json::Value::Null,
            snippets: Vec::new(),
        };
    let conversation = Conversation {
        id: None,
        agent_slug: "codex".into(),
        workspace: Some(PathBuf::from("/tmp/workspace")),
        external_id: Some("lexical-batch-guard".into()),
        title: Some("Lexical batch guard".into()),
        source_path: PathBuf::from("/tmp/lexical-batch-guard.jsonl"),
        started_at: Some(1_700_000_000_000),
        ended_at: Some(1_700_000_000_100),
        approx_tokens: Some(42),
        metadata_json: serde_json::Value::Null,
        messages: vec![
            six_byte_message(0, MessageRole::User, "user", 1_700_000_000_010, "123456"),
            six_byte_message(
                1,
                MessageRole::Agent,
                "assistant",
                1_700_000_000_020,
                "abcdef",
            ),
        ],
        source_id: LOCAL_SOURCE_ID.into(),
        origin_host: None,
    };
    let inserted = storage
        .insert_conversation_tree(agent_id, None, &conversation)
        .unwrap();
    let conversation_id = inserted.conversation_id;

    let outcome =
        storage.fetch_messages_for_lexical_rebuild_batch(&[conversation_id], Some(10), Some(8));
    let error = outcome.expect_err("guardrail should reject oversized batch content");

    let message = format!("{error:#}");
    assert!(
        message.contains("content-byte guardrail"),
        "expected guardrail reason in error, got {message}"
    );
}
/// Writes an agent, a conversation and a message with plain SQL through the raw
/// connection (the message has JSON text in `extra_json` and NULL in
/// `extra_bin`), then checks that both fetch paths return the row and that the
/// full fetch yields the JSON as `{"k": 1}`.
#[test]
fn fetch_messages_handles_manual_rows_inserted_via_raw_connection() {
    let workdir = TempDir::new().unwrap();
    let database_file = workdir.path().join("manual-rows.db");
    let storage = FrankenStorage::open(&database_file).unwrap();

    // Parent rows first: agent, then conversation, then message.
    let manual_inserts = [
        "INSERT INTO agents (id, slug, name, kind, created_at, updated_at)
VALUES (1, 'claude_code', 'Claude Code', 'local', 0, 0)",
        "INSERT INTO conversations
(id, agent_id, external_id, title, source_path, source_id, started_at)
VALUES (1, 1, 'manual-ext', 'Manual Session', '/tmp/manual.jsonl', 'local', 200)",
        "INSERT INTO messages
(id, conversation_id, idx, role, author, created_at, content, extra_json, extra_bin)
VALUES (1, 1, 0, 'user', 'tester', 1700000000000, 'manual body', '{\"k\":1}', NULL)",
    ];
    let conn = storage.raw();
    for sql in manual_inserts {
        conn.execute(sql).unwrap();
    }

    let lexical = storage.fetch_messages_for_lexical_rebuild(1).unwrap();
    assert_eq!(lexical.len(), 1);
    assert_eq!(lexical[0].content, "manual body");

    let full = storage.fetch_messages(1).unwrap();
    assert_eq!(full.len(), 1);
    let manual_row = &full[0];
    assert_eq!(manual_row.content, "manual body");
    assert_eq!(manual_row.author.as_deref(), Some("tester"));
    assert_eq!(manual_row.extra_json, serde_json::json!({ "k": 1 }));
}
/// Runs `EXPLAIN QUERY PLAN` on a two-conversation `messages` query ordered by
/// `(conversation_id, idx)` and inspects column 3 of each plan row: some row
/// must mention `sqlite_autoindex_messages_1` and none may mention
/// `TEMP B-TREE`.
#[test]
fn lexical_rebuild_batch_messages_query_avoids_sorter_temp_btrees() {
    use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
    use std::path::PathBuf;

    let workdir = TempDir::new().unwrap();
    let database_file = workdir.path().join("agent_search.db");
    let storage = SqliteStorage::open(&database_file).unwrap();

    let claude = Agent {
        id: None,
        slug: "claude_code".into(),
        name: "Claude Code".into(),
        version: None,
        kind: AgentKind::Cli,
    };
    let agent_id = storage.ensure_agent(&claude).unwrap();

    // Builds a user/assistant exchange whose text is derived from the external id.
    let two_turn_conversation = |external_id: &str, base_ts: i64| Conversation {
        id: None,
        agent_slug: "claude_code".into(),
        workspace: Some(PathBuf::from("/tmp/workspace")),
        external_id: Some(external_id.to_string()),
        title: Some("Lexical rebuild".into()),
        source_path: PathBuf::from(format!("/tmp/{external_id}.jsonl")),
        started_at: Some(base_ts),
        ended_at: Some(base_ts + 100),
        approx_tokens: None,
        metadata_json: serde_json::Value::Null,
        messages: vec![
            Message {
                id: None,
                idx: 0,
                role: MessageRole::User,
                author: Some("user".into()),
                created_at: Some(base_ts + 10),
                content: format!("{external_id}-first"),
                extra_json: serde_json::Value::Null,
                snippets: Vec::new(),
            },
            Message {
                id: None,
                idx: 1,
                role: MessageRole::Agent,
                author: Some("assistant".into()),
                created_at: Some(base_ts + 20),
                content: format!("{external_id}-second"),
                extra_json: serde_json::Value::Null,
                snippets: Vec::new(),
            },
        ],
        source_id: LOCAL_SOURCE_ID.into(),
        origin_host: None,
    };
    let seeds = [
        ("conv-1", 1_700_000_000_000_i64),
        ("conv-2", 1_700_000_001_000_i64),
    ];
    for (external_id, base_ts) in seeds {
        let conversation = two_turn_conversation(external_id, base_ts);
        storage
            .insert_conversation_tree(agent_id, None, &conversation)
            .unwrap();
    }

    let conversation_ids: Vec<i64> = storage
        .conn
        .query_map_collect(
            "SELECT id FROM conversations ORDER BY id",
            fparams![],
            |row| row.get_typed(0),
        )
        .unwrap();
    assert_eq!(conversation_ids.len(), 2);

    let plan_sql = "EXPLAIN QUERY PLAN \
        SELECT conversation_id, id, idx, role, author, created_at, content \
        FROM messages \
        WHERE conversation_id IN (?1, ?2) \
        ORDER BY conversation_id ASC, idx ASC";
    let plan_details: Vec<String> = storage
        .conn
        .query_map_collect(
            plan_sql,
            fparams![conversation_ids[0], conversation_ids[1]],
            |row| row.get_typed(3),
        )
        .unwrap();

    let uses_composite_index = plan_details
        .iter()
        .any(|detail| detail.contains("sqlite_autoindex_messages_1"));
    assert!(
        uses_composite_index,
        "expected batched lexical rebuild fetch to use the conversation_id/idx composite index, got {plan_details:?}"
    );

    let uses_temp_btree = plan_details
        .iter()
        .any(|detail| detail.contains("TEMP B-TREE"));
    assert!(
        !uses_temp_btree,
        "expected batched lexical rebuild fetch to avoid sorter temp b-trees, got {plan_details:?}"
    );
}
/// Streams messages starting at the first inserted conversation id and checks
/// that rows arrive grouped per conversation and ordered by `idx` inside each.
#[test]
fn stream_messages_for_lexical_rebuild_groups_and_orders_messages() {
    let tmp = TempDir::new().unwrap();
    let db_file = tmp.path().join("test.db");
    let storage = SqliteStorage::open(&db_file).unwrap();

    let agent_id = storage
        .ensure_agent(&Agent {
            id: None,
            slug: "codex".into(),
            name: "Codex".into(),
            version: Some("0.2.3".into()),
            kind: AgentKind::Cli,
        })
        .unwrap();

    // First conversation: a user turn followed by an agent turn.
    let first_messages = vec![
        Message {
            id: None,
            idx: 0,
            role: MessageRole::User,
            author: Some("user".into()),
            created_at: Some(1_700_000_000_010),
            content: "first-a".into(),
            extra_json: serde_json::json!({"opaque": true}),
            snippets: Vec::new(),
        },
        Message {
            id: None,
            idx: 1,
            role: MessageRole::Agent,
            author: Some("assistant".into()),
            created_at: Some(1_700_000_000_020),
            content: "first-b".into(),
            extra_json: serde_json::json!({"opaque": true}),
            snippets: Vec::new(),
        },
    ];
    let first = Conversation {
        id: None,
        agent_slug: "codex".into(),
        workspace: Some(PathBuf::from("/tmp/workspace")),
        external_id: Some("lexical-stream-1".into()),
        title: Some("Lexical stream 1".into()),
        source_path: PathBuf::from("/tmp/lexical-stream-1.jsonl"),
        started_at: Some(1_700_000_000_000),
        ended_at: Some(1_700_000_000_100),
        approx_tokens: Some(42),
        metadata_json: serde_json::Value::Null,
        messages: first_messages,
        source_id: LOCAL_SOURCE_ID.into(),
        origin_host: None,
    };

    // Second conversation: a single tool message.
    let second_messages = vec![Message {
        id: None,
        idx: 0,
        role: MessageRole::Tool,
        author: Some("tool".into()),
        created_at: Some(1_700_000_000_210),
        content: "second-a".into(),
        extra_json: serde_json::json!({"opaque": true}),
        snippets: Vec::new(),
    }];
    let second = Conversation {
        id: None,
        agent_slug: "codex".into(),
        workspace: Some(PathBuf::from("/tmp/workspace")),
        external_id: Some("lexical-stream-2".into()),
        title: Some("Lexical stream 2".into()),
        source_path: PathBuf::from("/tmp/lexical-stream-2.jsonl"),
        started_at: Some(1_700_000_000_200),
        ended_at: Some(1_700_000_000_300),
        approx_tokens: Some(84),
        metadata_json: serde_json::Value::Null,
        messages: second_messages,
        source_id: LOCAL_SOURCE_ID.into(),
        origin_host: None,
    };

    let first_outcome = storage
        .insert_conversation_tree(agent_id, None, &first)
        .unwrap();
    let first_id = first_outcome.conversation_id;
    let second_outcome = storage
        .insert_conversation_tree(agent_id, None, &second)
        .unwrap();
    let second_id = second_outcome.conversation_id;

    // Collect only the fields this test asserts on.
    let mut streamed = Vec::new();
    storage
        .stream_messages_for_lexical_rebuild_from_conversation_id(first_id, |row| {
            let entry = (
                row.conversation_id,
                row.idx,
                row.role,
                row.author,
                row.content,
            );
            streamed.push(entry);
            Ok(())
        })
        .unwrap();

    let expected = vec![
        (
            first_id,
            0,
            String::from("user"),
            Some(String::from("user")),
            String::from("first-a"),
        ),
        (
            first_id,
            1,
            String::from("agent"),
            Some(String::from("assistant")),
            String::from("first-b"),
        ),
        (
            second_id,
            0,
            String::from("tool"),
            Some(String::from("tool")),
            String::from("second-a"),
        ),
    ];
    assert_eq!(streamed, expected);
}
/// Streams the range `[first_id, first_id]` and checks that no message from the
/// later conversation is delivered.
#[test]
fn stream_messages_for_lexical_rebuild_between_conversation_ids_respects_upper_bound() {
    use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
    use std::path::PathBuf;

    let tmp = TempDir::new().unwrap();
    let db_file = tmp.path().join("agent_search.db");
    let storage = SqliteStorage::open(&db_file).unwrap();

    let agent_id = storage
        .ensure_agent(&Agent {
            id: None,
            slug: "claude_code".into(),
            name: "Claude Code".into(),
            version: Some("1.2.3".into()),
            kind: AgentKind::Cli,
        })
        .unwrap();

    // Conversation inside the requested range.
    let in_range_message = Message {
        id: None,
        idx: 0,
        role: MessageRole::User,
        author: Some("user".into()),
        created_at: Some(1_700_000_000_010),
        content: "first-only".into(),
        extra_json: serde_json::json!({"opaque": true}),
        snippets: Vec::new(),
    };
    let first = Conversation {
        id: None,
        agent_slug: "claude_code".into(),
        workspace: Some(PathBuf::from("/tmp/workspace")),
        external_id: Some("lexical-range-1".into()),
        title: Some("Lexical range 1".into()),
        source_path: PathBuf::from("/tmp/lexical-range-1.jsonl"),
        started_at: Some(1_700_000_000_000),
        ended_at: Some(1_700_000_000_100),
        approx_tokens: Some(42),
        metadata_json: serde_json::Value::Null,
        messages: vec![in_range_message],
        source_id: LOCAL_SOURCE_ID.into(),
        origin_host: None,
    };

    // Conversation inserted afterwards; it must not be streamed.
    let out_of_range_message = Message {
        id: None,
        idx: 0,
        role: MessageRole::Tool,
        author: Some("tool".into()),
        created_at: Some(1_700_000_000_210),
        content: "second-should-not-appear".into(),
        extra_json: serde_json::json!({"opaque": true}),
        snippets: Vec::new(),
    };
    let second = Conversation {
        id: None,
        agent_slug: "claude_code".into(),
        workspace: Some(PathBuf::from("/tmp/workspace")),
        external_id: Some("lexical-range-2".into()),
        title: Some("Lexical range 2".into()),
        source_path: PathBuf::from("/tmp/lexical-range-2.jsonl"),
        started_at: Some(1_700_000_000_200),
        ended_at: Some(1_700_000_000_300),
        approx_tokens: Some(84),
        metadata_json: serde_json::Value::Null,
        messages: vec![out_of_range_message],
        source_id: LOCAL_SOURCE_ID.into(),
        origin_host: None,
    };

    let first_outcome = storage
        .insert_conversation_tree(agent_id, None, &first)
        .unwrap();
    let first_id = first_outcome.conversation_id;
    let second_outcome = storage
        .insert_conversation_tree(agent_id, None, &second)
        .unwrap();
    let second_id = second_outcome.conversation_id;

    let mut streamed = Vec::new();
    storage
        .stream_messages_for_lexical_rebuild_between_conversation_ids(
            first_id,
            first_id,
            |row| {
                let entry = (row.conversation_id, row.idx, row.content);
                streamed.push(entry);
                Ok(())
            },
        )
        .unwrap();

    let expected = vec![(first_id, 0, String::from("first-only"))];
    assert_eq!(streamed, expected);

    let leaked_later_conversation = streamed
        .iter()
        .any(|(conversation_id, _, _)| *conversation_id == second_id);
    assert!(
        !leaked_later_conversation,
        "upper bound should exclude later conversation ids"
    );
}
// Inserts seven conversations across two agents (five "claude_code", two
// "aider") and checks that streaming the full id range returns every message
// in insertion order, with the stored role/author/created_at/content values.
#[test]
fn stream_messages_for_lexical_rebuild_between_conversation_ids_handles_mixed_ranges() {
use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
use std::path::PathBuf;
let dir = TempDir::new().unwrap();
let db_path = dir.path().join("agent_search.db");
let storage = SqliteStorage::open(&db_path).unwrap();
let claude_agent_id = storage
.ensure_agent(&Agent {
id: None,
slug: "claude_code".into(),
name: "Claude Code".into(),
version: None,
kind: AgentKind::Cli,
})
.unwrap();
let aider_agent_id = storage
.ensure_agent(&Agent {
id: None,
slug: "aider".into(),
name: "Aider".into(),
version: None,
kind: AgentKind::Cli,
})
.unwrap();
// (idx, role, author, created_at, content) for one message fixture.
type MessageSpec = (i64, MessageRole, Option<String>, Option<i64>, String);
// Rows the stream is expected to yield, accumulated as conversations are inserted.
let mut expected = Vec::new();
// Bounds of the inserted id range; filled in by the closure below.
let mut first_conversation_id = None;
let mut last_conversation_id = None;
// Helper: builds a conversation from the specs, inserts it, records its id as
// the first/last bound, and appends the matching expected rows.
let mut insert_conversation =
|agent_id: i64,
external_id: &str,
title: &str,
source_path: &str,
started_at: i64,
message_specs: Vec<MessageSpec>| {
let conversation = Conversation {
id: None,
// Only two agents exist in this test, so anything that is not aider is claude_code.
agent_slug: if agent_id == aider_agent_id {
"aider".into()
} else {
"claude_code".into()
},
workspace: Some(PathBuf::from("/tmp/workspace")),
external_id: Some(external_id.to_string()),
title: Some(title.to_string()),
source_path: PathBuf::from(source_path),
started_at: Some(started_at),
ended_at: Some(started_at + 100),
approx_tokens: None,
metadata_json: serde_json::Value::Null,
messages: message_specs
.iter()
.map(|(idx, role, author, created_at, content)| Message {
id: None,
idx: *idx,
role: role.clone(),
author: author.clone(),
created_at: *created_at,
content: content.clone(),
extra_json: serde_json::Value::Null,
snippets: Vec::new(),
})
.collect(),
source_id: LOCAL_SOURCE_ID.into(),
origin_host: None,
};
let conversation_id = storage
.insert_conversation_tree(agent_id, None, &conversation)
.unwrap()
.conversation_id;
if first_conversation_id.is_none() {
first_conversation_id = Some(conversation_id);
}
last_conversation_id = Some(conversation_id);
// Expected rows carry the role as the string the stream is compared against.
expected.extend(message_specs.into_iter().map(
|(idx, role, author, created_at, content)| {
(
conversation_id,
idx,
match role {
MessageRole::User => "user".to_string(),
MessageRole::Agent => "agent".to_string(),
MessageRole::Tool => "tool".to_string(),
MessageRole::System => "system".to_string(),
MessageRole::Other(other) => other,
},
author,
created_at,
content,
)
},
));
};
// Five claude_code conversations: no author, timestamps on every message.
for (label, base_ts) in [
("alpha", 1_700_000_000_000_i64),
("beta", 1_700_000_001_000_i64),
("gamma", 1_700_000_002_000_i64),
("delta", 1_700_000_003_000_i64),
("epsilon", 1_700_000_004_000_i64),
] {
insert_conversation(
claude_agent_id,
&format!("lexical-{label}"),
&format!("Lexical {label}"),
&format!("/tmp/{label}.jsonl"),
base_ts,
vec![
(
0,
MessageRole::User,
None,
Some(base_ts + 10),
format!("{label}_content"),
),
(
1,
MessageRole::Agent,
None,
Some(base_ts + 20),
format!("{label}_content_response"),
),
],
);
}
// First aider conversation: authors set, no per-message timestamps, includes a system message.
insert_conversation(
aider_agent_id,
"lexical-aider-history",
"Aider Chat: coding_agent_session_search",
"/tmp/.aider.chat.history.md",
1_764_619_673_394,
vec![
(
0,
MessageRole::System,
Some("system".to_string()),
None,
"# aider chat started at 2025-12-01 20:07:47".to_string(),
),
(
1,
MessageRole::User,
Some("user".to_string()),
None,
"/tmp/workspace/.venv/bin/aider --no-git --message hello world".to_string(),
),
],
);
// Second aider conversation: four messages, one with multi-line content.
insert_conversation(
aider_agent_id,
"lexical-aider-fixture",
"Aider Chat: aider",
"/tmp/tests/fixtures/aider/.aider.chat.history.md",
1_764_621_401_399,
vec![
(
0,
MessageRole::User,
Some("user".to_string()),
None,
"/add src/main.rs".to_string(),
),
(
1,
MessageRole::Agent,
Some("assistant".to_string()),
None,
"Added src/main.rs to the chat.
#### /add src/main.rs"
.to_string(),
),
(
2,
MessageRole::User,
Some("user".to_string()),
None,
"Please refactor.".to_string(),
),
(
3,
MessageRole::Agent,
Some("assistant".to_string()),
None,
"Sure, here is the code.".to_string(),
),
],
);
// Stream the whole inserted range and compare row-for-row with the expectation.
let mut streamed = Vec::new();
storage
.stream_messages_for_lexical_rebuild_between_conversation_ids(
first_conversation_id.unwrap(),
last_conversation_id.unwrap(),
|row| {
streamed.push((
row.conversation_id,
row.idx,
row.role,
row.author,
row.created_at,
row.content,
));
Ok(())
},
)
.unwrap();
assert_eq!(streamed, expected);
}
/// Seeds two conversations, then inspects `EXPLAIN QUERY PLAN` output for the
/// conversation-range listing and the per-conversation message fetch: neither
/// may use a temp b-tree, and the message fetch must use the
/// conversation_id/idx index.
#[test]
fn lexical_rebuild_stream_queries_use_rowid_and_per_conversation_probes() {
    use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
    use std::path::PathBuf;

    let tmp = TempDir::new().unwrap();
    let db_file = tmp.path().join("agent_search.db");
    let storage = SqliteStorage::open(&db_file).unwrap();
    let agent_id = storage
        .ensure_agent(&Agent {
            id: None,
            slug: "claude_code".into(),
            name: "Claude Code".into(),
            version: None,
            kind: AgentKind::Cli,
        })
        .unwrap();

    let seeds = [
        ("conv-1", 1_700_000_000_000_i64),
        ("conv-2", 1_700_000_001_000_i64),
    ];
    for (external_id, base_ts) in seeds {
        let turns = vec![
            Message {
                id: None,
                idx: 0,
                role: MessageRole::User,
                author: Some("user".into()),
                created_at: Some(base_ts + 10),
                content: format!("{external_id}-first"),
                extra_json: serde_json::Value::Null,
                snippets: Vec::new(),
            },
            Message {
                id: None,
                idx: 1,
                role: MessageRole::Agent,
                author: Some("assistant".into()),
                created_at: Some(base_ts + 20),
                content: format!("{external_id}-second"),
                extra_json: serde_json::Value::Null,
                snippets: Vec::new(),
            },
        ];
        let conversation = Conversation {
            id: None,
            agent_slug: "claude_code".into(),
            workspace: Some(PathBuf::from("/tmp/workspace")),
            external_id: Some(external_id.to_string()),
            title: Some("Lexical rebuild".into()),
            source_path: PathBuf::from(format!("/tmp/{external_id}.jsonl")),
            started_at: Some(base_ts),
            ended_at: Some(base_ts + 100),
            approx_tokens: None,
            metadata_json: serde_json::Value::Null,
            messages: turns,
            source_id: LOCAL_SOURCE_ID.into(),
            origin_host: None,
        };
        storage
            .insert_conversation_tree(agent_id, None, &conversation)
            .unwrap();
    }

    // Lowest and highest conversation ids bound the range query below.
    let lowest_id_sql = "SELECT id FROM conversations ORDER BY id LIMIT 1";
    let first_id: i64 = storage
        .conn
        .query_row_map(lowest_id_sql, fparams![], |row| row.get_typed(0))
        .unwrap();
    let highest_id_sql = "SELECT id FROM conversations ORDER BY id DESC LIMIT 1";
    let last_id: i64 = storage
        .conn
        .query_row_map(highest_id_sql, fparams![], |row| row.get_typed(0))
        .unwrap();

    // Column 3 of each EXPLAIN QUERY PLAN row is read as the detail text.
    let conversation_plan_sql = "EXPLAIN QUERY PLAN SELECT id FROM conversations WHERE id >= ?1 AND id <= ?2 ORDER BY id ASC";
    let conversation_plan_details: Vec<String> = storage
        .conn
        .query_map_collect(conversation_plan_sql, fparams![first_id, last_id], |row| {
            row.get_typed(3)
        })
        .unwrap();
    let conversation_listing_sorts = conversation_plan_details
        .iter()
        .any(|detail| detail.contains("TEMP B-TREE"));
    assert!(
        !conversation_listing_sorts,
        "expected streamed lexical rebuild conversation listing to avoid sorter temp b-trees, got {conversation_plan_details:?}"
    );

    let message_plan_sql = "EXPLAIN QUERY PLAN SELECT id, idx, role, author, created_at, content FROM messages INDEXED BY sqlite_autoindex_messages_1 WHERE conversation_id = ?1 ORDER BY idx";
    let message_plan_details: Vec<String> = storage
        .conn
        .query_map_collect(message_plan_sql, fparams![first_id], |row| {
            row.get_typed(3)
        })
        .unwrap();
    let message_fetch_uses_index = message_plan_details.iter().any(|detail| {
        detail.contains("sqlite_autoindex_messages_1") || detail.contains("idx_messages_conv_idx")
    });
    assert!(
        message_fetch_uses_index,
        "expected per-conversation lexical rebuild fetch to use the conversation_id/idx index, got {message_plan_details:?}"
    );
    let message_fetch_sorts = message_plan_details
        .iter()
        .any(|detail| detail.contains("TEMP B-TREE"));
    assert!(
        !message_fetch_sorts,
        "expected per-conversation lexical rebuild fetch to avoid sorter temp b-trees, got {message_plan_details:?}"
    );
}
/// With two non-database candidate files, discovery lists the larger one first.
#[test]
fn discover_historical_database_bundles_prefers_larger_archives_first() {
    let tmp = TempDir::new().unwrap();
    let root = tmp.path();

    let canonical_db = root.join("agent_search.db");
    fs::write(&canonical_db, b"canonical").unwrap();

    // 32-byte candidate next to the canonical database.
    let smaller = root.join("agent_search.corrupt.small");
    fs::write(&smaller, vec![0_u8; 32]).unwrap();

    // 128-byte candidate inside the backups directory.
    let backups_dir = root.join("backups");
    fs::create_dir_all(&backups_dir).unwrap();
    let larger = backups_dir.join("agent_search.db.20260322T020200.bak");
    fs::write(&larger, vec![0_u8; 128]).unwrap();

    let mut ordered_paths: Vec<PathBuf> = Vec::new();
    for bundle in discover_historical_database_bundles(&canonical_db) {
        ordered_paths.push(bundle.root_path);
    }
    let expected_order = vec![larger, smaller];
    assert_eq!(ordered_paths, expected_order);
}
/// A small backup that is a real database with conversations/messages tables is
/// listed ahead of a larger file that is only zero bytes, and only the former
/// is flagged `supports_direct_readonly`.
#[test]
fn discover_historical_database_bundles_prefers_queryable_direct_bundles_first() {
    let tmp = TempDir::new().unwrap();
    let root = tmp.path();

    let canonical_db = root.join("agent_search.db");
    fs::write(&canonical_db, b"canonical").unwrap();

    // Larger candidate: 4096 zero bytes, not a usable database.
    let larger_corrupt = root.join("agent_search.corrupt.20260324_212907");
    fs::write(&larger_corrupt, vec![0_u8; 4096]).unwrap();

    // Smaller candidate: a real database seeded with one conversation and one message.
    let backups_dir = root.join("backups");
    fs::create_dir_all(&backups_dir).unwrap();
    let smaller_healthy = backups_dir.join("agent_search.db.20260322T020200.bak");
    let seed_sql = "CREATE TABLE conversations (id INTEGER PRIMARY KEY, source_path TEXT);
CREATE TABLE messages (
id INTEGER PRIMARY KEY,
conversation_id INTEGER NOT NULL,
idx INTEGER NOT NULL,
content TEXT
);
INSERT INTO conversations(id, source_path) VALUES (1, '/tmp/history.jsonl');
INSERT INTO messages(id, conversation_id, idx, content)
VALUES (1, 1, 0, 'seed');";
    {
        let conn =
            FrankenConnection::open(smaller_healthy.to_string_lossy().into_owned()).unwrap();
        conn.execute_batch(seed_sql).unwrap();
        // The connection is closed at the end of this scope, before discovery runs.
    }

    let bundles = discover_historical_database_bundles(&canonical_db);
    let mut ordered_paths: Vec<PathBuf> = Vec::new();
    for bundle in bundles.iter() {
        ordered_paths.push(bundle.root_path.clone());
    }
    assert_eq!(ordered_paths, vec![smaller_healthy, larger_corrupt]);
    assert!(bundles[0].supports_direct_readonly);
    assert!(!bundles[1].supports_direct_readonly);
}
/// A quarantined file that is not a SQLite database is still discovered as a
/// bundle, but salvage counts it as considered and imports nothing from it.
#[test]
fn salvage_historical_databases_skips_unreadable_quarantined_bundles() {
    let tmp = TempDir::new().unwrap();
    let canonical_db = tmp.path().join("agent_search.db");
    let storage = SqliteStorage::open(&canonical_db).unwrap();

    let quarantined = tmp.path().join("agent_search.corrupt.20260324_212907");
    fs::write(&quarantined, b"not a sqlite database").unwrap();

    let mut discovered: Vec<PathBuf> = Vec::new();
    for bundle in discover_historical_database_bundles(&canonical_db) {
        discovered.push(bundle.root_path);
    }
    assert_eq!(discovered, vec![quarantined]);

    let outcome = storage.salvage_historical_databases(&canonical_db).unwrap();
    assert_eq!(outcome.bundles_considered, 1);
    assert_eq!(outcome.bundles_imported, 0);
    assert_eq!(outcome.conversations_imported, 0);
    assert_eq!(outcome.messages_imported, 0);

    let stored_conversations = storage.list_conversations(10, 0).unwrap();
    assert!(stored_conversations.is_empty());
}
/// Files named `agent_search.db` under `repair-lab/` and `snapshots/` are
/// discovered, while `agent_search.rebuild-test.db` is not.
#[test]
fn discover_historical_database_bundles_includes_repair_lab_and_snapshots_named_roots() {
    let tmp = TempDir::new().unwrap();
    let root = tmp.path();

    let canonical_db = root.join("agent_search.db");
    fs::write(&canonical_db, b"canonical").unwrap();

    // repair-lab/live-copy holds both a canonical-named db and a rebuild-test db.
    let repair_lab_dir = root.join("repair-lab").join("live-copy");
    fs::create_dir_all(&repair_lab_dir).unwrap();
    let repair_lab_db = repair_lab_dir.join("agent_search.db");
    fs::write(&repair_lab_db, vec![0_u8; 96]).unwrap();
    let rebuild_test_db = repair_lab_dir.join("agent_search.rebuild-test.db");
    fs::write(rebuild_test_db, vec![0_u8; 192]).unwrap();

    let snapshots_dir = root.join("snapshots").join("20260324T013201Z");
    fs::create_dir_all(&snapshots_dir).unwrap();
    let snapshot_db = snapshots_dir.join("agent_search.db");
    fs::write(&snapshot_db, vec![0_u8; 64]).unwrap();

    let mut ordered_paths: Vec<PathBuf> = Vec::new();
    for bundle in discover_historical_database_bundles(&canonical_db) {
        ordered_paths.push(bundle.root_path);
    }

    assert!(ordered_paths.contains(&repair_lab_db));
    assert!(ordered_paths.contains(&snapshot_db));
    let rebuild_test_listed = ordered_paths.iter().any(|path| {
        let file_name = path.file_name().and_then(|name| name.to_str());
        file_name == Some("agent_search.rebuild-test.db")
    });
    assert!(!rebuild_test_listed);
}
// Builds two candidate bundles from the same conversation: a repair-lab replay
// database that is then tampered with (schema version set to 13 plus an extra
// `fts_messages` row written into sqlite_master) and an untouched backup.
// Discovery must list the untouched backup first.
#[test]
fn discover_historical_database_bundles_prefers_healthy_backup_over_replay_priority() {
use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
let dir = TempDir::new().unwrap();
let canonical_db = dir.path().join("agent_search.db");
fs::write(&canonical_db, b"canonical").unwrap();
let replay_dir = dir
.path()
.join("repair-lab")
.join("replay-20260324T070101Z");
fs::create_dir_all(&replay_dir).unwrap();
let replay_db = replay_dir.join("agent_search.db");
let replay_storage = SqliteStorage::open(&replay_db).unwrap();
let agent = Agent {
id: None,
slug: "codex".into(),
name: "Codex".into(),
version: Some("0.2.3".into()),
kind: AgentKind::Cli,
};
let agent_id = replay_storage.ensure_agent(&agent).unwrap();
// The same conversation is inserted into both the replay db and the clean backup.
let conversation = Conversation {
id: None,
agent_slug: "codex".into(),
workspace: Some(PathBuf::from("/tmp/workspace")),
external_id: Some("replay-conv".into()),
title: Some("Replay bundle".into()),
source_path: PathBuf::from("/tmp/replay.jsonl"),
started_at: Some(1_700_000_000_000),
ended_at: Some(1_700_000_000_100),
approx_tokens: Some(42),
metadata_json: serde_json::Value::Null,
messages: vec![Message {
id: None,
idx: 0,
role: MessageRole::Agent,
author: Some("assistant".into()),
created_at: Some(1_700_000_000_050),
content: "replay message".into(),
extra_json: serde_json::Value::Null,
snippets: Vec::new(),
}],
source_id: LOCAL_SOURCE_ID.into(),
origin_host: None,
};
replay_storage
.insert_conversation_tree(agent_id, None, &conversation)
.unwrap();
// Close the storage handle before reopening the file through the fixture connection.
drop(replay_storage);
let duplicate_legacy_fts_sql = "CREATE VIRTUAL TABLE fts_messages USING fts5(content, title, agent, workspace, source_path, created_at UNINDEXED, message_id UNINDEXED, tokenize='porter')";
// NOTE(review): `rusqlite_test_fixture_conn` is defined elsewhere; presumably it
// opens the file with rusqlite for direct schema edits — confirm.
let replay_legacy = rusqlite_test_fixture_conn(&replay_db);
// Rewind the recorded schema version to 13, drop the migration-14 record, and
// enable writable_schema so sqlite_master can be edited directly.
replay_legacy
.execute_batch(
"UPDATE meta SET value = '13' WHERE key = 'schema_version';
DELETE FROM _schema_migrations WHERE version = 14;
PRAGMA writable_schema = ON;",
)
.unwrap();
replay_legacy
.execute(
"DELETE FROM meta WHERE key = ?1",
[FTS_FRANKEN_REBUILD_META_KEY],
)
.unwrap();
// Write an `fts_messages` schema row by hand (rootpage 0) into sqlite_master.
replay_legacy
.execute(
"INSERT INTO sqlite_master(type, name, tbl_name, rootpage, sql)
VALUES('table', 'fts_messages', 'fts_messages', 0, ?1)",
[duplicate_legacy_fts_sql],
)
.unwrap();
replay_legacy
.execute_batch("PRAGMA writable_schema = OFF;")
.unwrap();
drop(replay_legacy);
// Clean backup: same agent and conversation, no schema tampering.
let backups_dir = dir.path().join("backups");
fs::create_dir_all(&backups_dir).unwrap();
let clean_backup = backups_dir.join("agent_search.db.20260322T020200.bak");
let clean_storage = SqliteStorage::open(&clean_backup).unwrap();
let clean_agent_id = clean_storage.ensure_agent(&agent).unwrap();
clean_storage
.insert_conversation_tree(clean_agent_id, None, &conversation)
.unwrap();
drop(clean_storage);
let bundles = discover_historical_database_bundles(&canonical_db);
let ordered_paths: Vec<PathBuf> = bundles
.iter()
.map(|bundle| bundle.root_path.clone())
.collect();
// The clean backup sorts ahead of the tampered replay database.
assert_eq!(ordered_paths[0], clean_backup);
assert_eq!(ordered_paths[1], replay_db);
// Probe of the clean backup: current schema version, no FTS schema rows, FTS not queryable.
assert_eq!(
bundles[0].probe.schema_version,
Some(CURRENT_SCHEMA_VERSION)
);
assert_eq!(bundles[0].probe.fts_schema_rows, Some(0));
assert!(!bundles[0].probe.fts_queryable);
// Probe of the replay db reflects the tampering: version 13 and one FTS schema row.
assert_eq!(bundles[1].probe.schema_version, Some(13));
assert_eq!(bundles[1].probe.fts_schema_rows, Some(1));
}
// After an FTS rebuild, a message row inserted directly into `messages` must be
// picked up by `ensure_fts_consistency_via_rusqlite` as an incremental catch-up
// of exactly one row.
#[test]
fn ensure_fts_consistency_via_rusqlite_catches_up_missing_rows() {
use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
let dir = TempDir::new().unwrap();
let db_path = dir.path().join("fts-catchup.db");
let storage = SqliteStorage::open(&db_path).unwrap();
let agent = Agent {
id: None,
slug: "codex".into(),
name: "Codex".into(),
version: Some("0.2.3".into()),
kind: AgentKind::Cli,
};
let agent_id = storage.ensure_agent(&agent).unwrap();
let conversation = Conversation {
id: None,
agent_slug: "codex".into(),
workspace: Some(PathBuf::from("/tmp/workspace")),
external_id: Some("fts-catchup".into()),
title: Some("FTS catchup".into()),
source_path: PathBuf::from("/tmp/fts-catchup.jsonl"),
started_at: Some(1_700_000_000_000),
ended_at: Some(1_700_000_000_100),
approx_tokens: Some(42),
metadata_json: serde_json::Value::Null,
messages: vec![Message {
id: None,
idx: 0,
role: MessageRole::User,
author: Some("user".into()),
created_at: Some(1_700_000_000_050),
content: "initial message".into(),
extra_json: serde_json::Value::Null,
snippets: Vec::new(),
}],
source_id: LOCAL_SOURCE_ID.into(),
origin_host: None,
};
storage
.insert_conversation_tree(agent_id, None, &conversation)
.unwrap();
// Close the storage handle before the path-based helpers reopen the file.
drop(storage);
// Baseline: rebuild FTS while the database holds the single seeded message.
rebuild_fts_via_rusqlite(&db_path).unwrap();
let conn = FrankenConnection::open(db_path.to_string_lossy().into_owned()).unwrap();
let conversation_id: i64 = conn
.query_row_map("SELECT id FROM conversations LIMIT 1", fparams![], |row| {
row.get_typed(0)
})
.unwrap();
// Insert a second message (id 2) straight into `messages`, bypassing SqliteStorage.
conn.execute_compat(
"INSERT INTO messages(id, conversation_id, idx, role, author, created_at, content, extra_json, extra_bin)
VALUES(2, ?1, 1, 'assistant', 'assistant', 1700000000060, 'authentication catchup', NULL, NULL)",
fparams![conversation_id],
)
.unwrap();
drop(conn);
// The repair must report one newly inserted row out of two total.
let repair = ensure_fts_consistency_via_rusqlite(&db_path).unwrap();
assert_eq!(
repair,
FtsConsistencyRepair::IncrementalCatchUp {
inserted_rows: 1,
total_rows: 2
}
);
// The new message must now have an FTS row keyed by its id.
let conn = FrankenConnection::open(db_path.to_string_lossy().into_owned()).unwrap();
let auth_rows: i64 = conn
.query_row_map(
"SELECT COUNT(*) FROM fts_messages WHERE rowid = 2",
fparams![],
|row| row.get_typed(0),
)
.unwrap();
assert_eq!(auth_rows, 1);
}
// Writes a second `fts_messages` row into sqlite_master by hand, then checks
// that `rebuild_fts_via_rusqlite` leaves exactly one FTS schema row and indexes
// the single stored message.
#[test]
fn rebuild_fts_via_rusqlite_cleans_duplicate_legacy_schema_rows() {
use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
let dir = TempDir::new().unwrap();
let db_path = dir.path().join("fts-duplicate-rebuild.db");
let storage = SqliteStorage::open(&db_path).unwrap();
let agent = Agent {
id: None,
slug: "codex".into(),
name: "Codex".into(),
version: Some("0.2.3".into()),
kind: AgentKind::Cli,
};
let agent_id = storage.ensure_agent(&agent).unwrap();
let conversation = Conversation {
id: None,
agent_slug: "codex".into(),
workspace: Some(PathBuf::from("/ws")),
external_id: Some("retro".into()),
title: Some("retro".into()),
source_path: PathBuf::from("/tmp/retro.jsonl"),
started_at: Some(42),
ended_at: Some(42),
approx_tokens: None,
metadata_json: serde_json::Value::Null,
messages: vec![Message {
id: None,
idx: 0,
role: MessageRole::User,
author: None,
created_at: Some(42),
content: "retro investigation".into(),
extra_json: serde_json::Value::Null,
snippets: Vec::new(),
}],
source_id: LOCAL_SOURCE_ID.into(),
origin_host: None,
};
storage
.insert_conversation_tree(agent_id, None, &conversation)
.unwrap();
// Close the storage handle before the path-based helpers reopen the file.
drop(storage);
// NOTE(review): helper defined elsewhere; presumably creates the current
// `fts_messages` table so the row inserted below becomes a second one — the
// `duplicate_rows == 2` assertion below relies on that.
materialize_fresh_fts_schema_via_rusqlite(&db_path).unwrap();
let conn = rusqlite_test_fixture_conn(&db_path);
// Enable direct edits to sqlite_master and add a legacy-style FTS schema row (rootpage 0).
conn.execute_batch("PRAGMA writable_schema = ON;").unwrap();
conn.execute(
"INSERT INTO sqlite_master(type, name, tbl_name, rootpage, sql)
VALUES('table', 'fts_messages', 'fts_messages', 0, ?1)",
["CREATE VIRTUAL TABLE fts_messages USING fts5(content, title, agent, workspace, source_path, created_at UNINDEXED, message_id UNINDEXED, tokenize='porter')"],
)
.unwrap();
conn.execute_batch("PRAGMA writable_schema = OFF;").unwrap();
// Precondition: the schema now holds two rows named `fts_messages`.
let duplicate_rows: i64 = conn
.query_row(
"SELECT COUNT(*) FROM sqlite_master WHERE name = 'fts_messages'",
[],
|row| row.get(0),
)
.unwrap();
assert_eq!(duplicate_rows, 2);
drop(conn);
// The rebuild reports one indexed row, matching the single stored message.
let inserted = rebuild_fts_via_rusqlite(&db_path).unwrap();
assert_eq!(inserted, 1);
let conn = FrankenConnection::open(db_path.to_string_lossy().into_owned()).unwrap();
let schema_rows = franken_fts_schema_rows(&conn).unwrap();
assert_eq!(
schema_rows, 1,
"DROP TABLE should leave one clean FTS schema"
);
let match_count: i64 = conn
.query_row_map("SELECT COUNT(*) FROM fts_messages", fparams![], |row| {
row.get_typed(0)
})
.unwrap();
assert_eq!(match_count, 1);
}
/// Registering an agent in a fresh database yields a positive row id.
#[test]
fn ensure_agent_creates_new() {
    let tmp = TempDir::new().unwrap();
    let db_file = tmp.path().join("test.db");
    let storage = SqliteStorage::open(&db_file).unwrap();

    let new_agent = Agent {
        id: None,
        slug: "test_agent".into(),
        name: "Test Agent".into(),
        version: Some("1.0".into()),
        kind: AgentKind::Cli,
    };
    let agent_id = storage.ensure_agent(&new_agent).unwrap();

    assert!(agent_id > 0);
}
/// Calling `ensure_agent` twice with the same agent returns the same id.
#[test]
fn ensure_agent_returns_existing_id() {
    let tmp = TempDir::new().unwrap();
    let db_file = tmp.path().join("test.db");
    let storage = SqliteStorage::open(&db_file).unwrap();

    let codex = Agent {
        id: None,
        slug: "codex".into(),
        name: "Codex".into(),
        version: None,
        kind: AgentKind::Cli,
    };
    let first_call_id = storage.ensure_agent(&codex).unwrap();
    let second_call_id = storage.ensure_agent(&codex).unwrap();

    assert_eq!(first_call_id, second_call_id);
}
/// Re-ensuring an agent with identical metadata must leave `updated_at` untouched.
#[test]
fn ensure_agent_unchanged_preserves_updated_at() {
    let tmp = TempDir::new().unwrap();
    let db_file = tmp.path().join("test.db");
    let storage = SqliteStorage::open(&db_file).unwrap();

    let agent = Agent {
        id: None,
        slug: "codex".into(),
        name: "Codex".into(),
        version: Some("1.0".into()),
        kind: AgentKind::Cli,
    };
    let updated_at_sql = "SELECT updated_at FROM agents WHERE slug = ?1";

    storage.ensure_agent(&agent).unwrap();
    let initial_updated_at: i64 = storage
        .conn
        .query_row_map(updated_at_sql, fparams![agent.slug.as_str()], |row| {
            row.get_typed(0)
        })
        .unwrap();

    // Pause so that a rewritten timestamp would differ from the initial one.
    std::thread::sleep(std::time::Duration::from_millis(5));

    storage.ensure_agent(&agent).unwrap();
    let fetched_updated_at: i64 = storage
        .conn
        .query_row_map(updated_at_sql, fparams![agent.slug.as_str()], |row| {
            row.get_typed(0)
        })
        .unwrap();

    assert_eq!(fetched_updated_at, initial_updated_at);
}
/// Re-ensuring an agent whose name/version changed keeps the id and persists
/// the new metadata.
#[test]
fn ensure_agent_changed_metadata_updates_cached_slug() {
    let tmp = TempDir::new().unwrap();
    let db_file = tmp.path().join("test.db");
    let storage = SqliteStorage::open(&db_file).unwrap();

    let mut agent = Agent {
        id: None,
        slug: "codex".into(),
        name: "Codex".into(),
        version: Some("1.0".into()),
        kind: AgentKind::Cli,
    };
    let original_id = storage.ensure_agent(&agent).unwrap();

    // Same slug, new display name and version.
    agent.name = "Codex CLI".into();
    agent.version = Some("1.1".into());
    let updated_id = storage.ensure_agent(&agent).unwrap();

    let stored: (String, Option<String>) = storage
        .conn
        .query_row_map(
            "SELECT name, version FROM agents WHERE slug = ?1",
            fparams![agent.slug.as_str()],
            |row| {
                let name = row.get_typed(0)?;
                let version = row.get_typed(1)?;
                Ok((name, version))
            },
        )
        .unwrap();

    assert_eq!(original_id, updated_id);
    let expected: (String, Option<String>) = ("Codex CLI".into(), Some("1.1".into()));
    assert_eq!(stored, expected);
}
/// An agent registered via `ensure_agent` shows up in `list_agents`.
#[test]
fn list_agents_returns_inserted() {
    let tmp = TempDir::new().unwrap();
    let db_file = tmp.path().join("test.db");
    let storage = SqliteStorage::open(&db_file).unwrap();

    storage
        .ensure_agent(&Agent {
            id: None,
            slug: "new_agent".into(),
            name: "New Agent".into(),
            version: None,
            kind: AgentKind::VsCode,
        })
        .unwrap();

    let listed = storage.list_agents().unwrap();
    let contains_new_agent = listed.iter().any(|entry| entry.slug == "new_agent");
    assert!(contains_new_agent);
}
/// Registering a workspace in a fresh database yields a positive row id.
#[test]
fn ensure_workspace_creates_new() {
    let tmp = TempDir::new().unwrap();
    let db_file = tmp.path().join("test.db");
    let storage = SqliteStorage::open(&db_file).unwrap();

    let workspace_path = Path::new("/home/user/project");
    let workspace_id = storage
        .ensure_workspace(workspace_path, Some("My Project"))
        .unwrap();

    assert!(workspace_id > 0);
}
/// Calling `ensure_workspace` twice for the same path returns the same id.
#[test]
fn ensure_workspace_returns_existing() {
    let tmp = TempDir::new().unwrap();
    let db_file = tmp.path().join("test.db");
    let storage = SqliteStorage::open(&db_file).unwrap();

    let workspace_path = Path::new("/home/user/myproject");
    let first_call_id = storage.ensure_workspace(workspace_path, None).unwrap();
    let second_call_id = storage.ensure_workspace(workspace_path, None).unwrap();

    assert_eq!(first_call_id, second_call_id);
}
/// Re-ensuring a workspace with a different display name keeps the id and
/// persists the new name.
#[test]
fn ensure_workspace_changed_display_name_updates_cached_path() {
    let tmp = TempDir::new().unwrap();
    let db_file = tmp.path().join("test.db");
    let storage = SqliteStorage::open(&db_file).unwrap();

    let path = Path::new("/home/user/myproject");
    let original_id = storage.ensure_workspace(path, Some("Before")).unwrap();
    let updated_id = storage.ensure_workspace(path, Some("After")).unwrap();

    let path_text = path.to_string_lossy();
    let display_name: Option<String> = storage
        .conn
        .query_row_map(
            "SELECT display_name FROM workspaces WHERE path = ?1",
            fparams![path_text.as_ref()],
            |row| row.get_typed(0),
        )
        .unwrap();

    assert_eq!(original_id, updated_id);
    assert_eq!(display_name.as_deref(), Some("After"));
}
/// A workspace registered via `ensure_workspace` shows up in `list_workspaces`.
#[test]
fn list_workspaces_returns_inserted() {
    let tmp = TempDir::new().unwrap();
    let db_file = tmp.path().join("test.db");
    let storage = SqliteStorage::open(&db_file).unwrap();

    let workspace_path = Path::new("/test/workspace");
    storage
        .ensure_workspace(workspace_path, Some("Test WS"))
        .unwrap();

    let listed = storage.list_workspaces().unwrap();
    let contains_workspace = listed
        .iter()
        .any(|entry| entry.path.to_str() == Some("/test/workspace"));
    assert!(contains_workspace);
}
/// Upserting a previously unknown source makes it retrievable with its host label.
#[test]
fn upsert_source_creates_new() {
    let tmp = TempDir::new().unwrap();
    let db_file = tmp.path().join("test.db");
    let storage = SqliteStorage::open(&db_file).unwrap();

    let laptop = Source {
        id: "test-laptop".into(),
        kind: SourceKind::Ssh,
        host_label: Some("test.local".into()),
        machine_id: Some("test-machine-id".into()),
        platform: None,
        config_json: None,
        created_at: Some(SqliteStorage::now_millis()),
        updated_at: None,
    };
    storage.upsert_source(&laptop).unwrap();

    let lookup = storage.get_source("test-laptop").unwrap();
    assert!(lookup.is_some());
    let stored = lookup.unwrap();
    assert_eq!(stored.host_label, Some("test.local".into()));
}
/// Upserting a source whose id already exists must overwrite its mutable
/// fields. Both the host label and the platform are checked against the exact
/// values written by the second upsert, so a stale or wrong value cannot pass.
#[test]
fn upsert_source_updates_existing() {
    let dir = TempDir::new().unwrap();
    let db_path = dir.path().join("test.db");
    let storage = SqliteStorage::open(&db_path).unwrap();

    // Initial row: no platform recorded.
    let source1 = Source {
        id: "my-source".into(),
        kind: SourceKind::Ssh,
        host_label: Some("Original Label".into()),
        machine_id: None,
        platform: None,
        config_json: None,
        created_at: Some(SqliteStorage::now_millis()),
        updated_at: None,
    };
    storage.upsert_source(&source1).unwrap();

    // Same id with a new label and a platform.
    let source2 = Source {
        id: "my-source".into(),
        kind: SourceKind::Ssh,
        host_label: Some("Updated Label".into()),
        machine_id: None,
        platform: Some("linux".into()),
        config_json: None,
        created_at: Some(SqliteStorage::now_millis()),
        updated_at: Some(SqliteStorage::now_millis()),
    };
    storage.upsert_source(&source2).unwrap();

    let fetched = storage.get_source("my-source").unwrap().unwrap();
    assert_eq!(fetched.host_label, Some("Updated Label".into()));
    // Pin the exact platform value rather than only checking that one is present.
    assert_eq!(fetched.platform, Some("linux".into()));
}
/// Upserting an identical source a second time must not change any stored
/// field, including the `created_at` / `updated_at` timestamps.
#[test]
fn upsert_source_unchanged_preserves_updated_at() {
    let tmp = TempDir::new().unwrap();
    let db_file = tmp.path().join("test.db");
    let storage = SqliteStorage::open(&db_file).unwrap();

    let stable = Source {
        id: "stable-source".into(),
        kind: SourceKind::Ssh,
        host_label: Some("builder.local".into()),
        machine_id: None,
        platform: Some("linux".into()),
        config_json: Some(serde_json::json!({"role": "bench"})),
        created_at: None,
        updated_at: None,
    };

    storage.upsert_source(&stable).unwrap();
    let before = storage.get_source("stable-source").unwrap().unwrap();

    // Pause so that a rewritten timestamp would differ from the first one.
    std::thread::sleep(std::time::Duration::from_millis(5));

    storage.upsert_source(&stable).unwrap();
    let after = storage.get_source("stable-source").unwrap().unwrap();

    assert_eq!(after.created_at, before.created_at);
    assert_eq!(after.updated_at, before.updated_at);
    assert_eq!(after.host_label, before.host_label);
    assert_eq!(after.platform, before.platform);
    assert_eq!(after.config_json, before.config_json);
}
/// After a remote source is deleted, ensuring the source for the same
/// conversation again must recreate it with the conversation's origin host as
/// its label.
#[test]
fn ensure_source_for_conversation_recreates_remote_source_after_delete() {
    let tmp = TempDir::new().unwrap();
    let db_file = tmp.path().join("test.db");
    let storage = SqliteStorage::open(&db_file).unwrap();

    let only_message = Message {
        id: None,
        idx: 0,
        role: MessageRole::User,
        author: Some("tester".into()),
        created_at: Some(1_700_000_000_000),
        content: "cache recreate".into(),
        extra_json: serde_json::json!({}),
        snippets: Vec::new(),
    };
    let conversation = Conversation {
        id: None,
        agent_slug: "codex".into(),
        workspace: Some(PathBuf::from("/ws/cache-recreate")),
        external_id: Some("cache-recreate".into()),
        title: Some("Cache Recreate".into()),
        source_path: PathBuf::from("/log/cache-recreate.jsonl"),
        started_at: Some(1_700_000_000_000),
        ended_at: Some(1_700_000_000_001),
        approx_tokens: Some(16),
        metadata_json: serde_json::json!({}),
        messages: vec![only_message],
        source_id: "cache-remote-source".into(),
        origin_host: Some("builder-cache".into()),
    };

    // First ensure: the source row appears.
    storage
        .ensure_source_for_conversation(&conversation)
        .unwrap();
    let after_first_ensure = storage.get_source("cache-remote-source").unwrap();
    assert!(after_first_ensure.is_some());

    // Delete it and confirm it is gone.
    let deleted = storage.delete_source("cache-remote-source", false).unwrap();
    assert!(deleted);
    let after_delete = storage.get_source("cache-remote-source").unwrap();
    assert!(after_delete.is_none());

    // Second ensure: the source row must come back.
    storage
        .ensure_source_for_conversation(&conversation)
        .unwrap();
    let recreated = storage.get_source("cache-remote-source").unwrap();
    assert!(recreated.is_some());
    let recreated_source = recreated.unwrap();
    assert_eq!(
        recreated_source.host_label.as_deref(),
        Some("builder-cache")
    );
}
/// Deleting a previously upserted source reports success and the row can no
/// longer be fetched afterwards.
#[test]
fn delete_source_removes_entry() {
    let tmp = TempDir::new().unwrap();
    let db_file = tmp.path().join("test.db");
    let storage = SqliteStorage::open(&db_file).unwrap();

    let doomed = Source {
        id: "to-delete".into(),
        kind: SourceKind::Local,
        host_label: None,
        machine_id: None,
        platform: None,
        config_json: None,
        created_at: Some(SqliteStorage::now_millis()),
        updated_at: None,
    };
    storage.upsert_source(&doomed).unwrap();

    let was_deleted = storage.delete_source("to-delete", false).unwrap();
    assert!(was_deleted);

    let lookup = storage.get_source("to-delete").unwrap();
    assert!(lookup.is_none());
}
/// Attempting to delete the built-in local source must return an error.
#[test]
fn delete_source_cannot_delete_local() {
    let tmp = TempDir::new().unwrap();
    let db_file = tmp.path().join("test.db");
    let storage = SqliteStorage::open(&db_file).unwrap();

    assert!(storage.delete_source(LOCAL_SOURCE_ID, false).is_err());
}
/// A freshly opened database already lists the local source.
#[test]
fn list_sources_includes_local() {
    let tmp = TempDir::new().unwrap();
    let db_file = tmp.path().join("test.db");
    let storage = SqliteStorage::open(&db_file).unwrap();

    let all_sources = storage.list_sources().unwrap();
    let has_local = all_sources.iter().any(|s| s.id == LOCAL_SOURCE_ID);
    assert!(has_local);
}
/// A conversation whose `source_id` is whitespace-only and which has no origin
/// host is stored under `LOCAL_SOURCE_ID`; no source row is created for the
/// blank id itself.
#[test]
fn insert_conversation_tree_blank_local_source_normalizes_to_local_id() {
    let tmp = TempDir::new().unwrap();
    let db_file = tmp.path().join("test.db");
    let storage = SqliteStorage::open(&db_file).unwrap();

    let codex = Agent {
        id: None,
        slug: "codex".into(),
        name: "Codex".into(),
        version: None,
        kind: AgentKind::Cli,
    };
    let agent_id = storage.ensure_agent(&codex).unwrap();

    let greeting = Message {
        id: None,
        idx: 0,
        role: MessageRole::User,
        author: None,
        created_at: Some(1_700_000_000_000),
        content: "hello".into(),
        extra_json: serde_json::Value::Null,
        snippets: Vec::new(),
    };
    let conversation = Conversation {
        id: None,
        agent_slug: "codex".into(),
        workspace: None,
        external_id: Some("blank-local-source".into()),
        title: Some("Blank local source".into()),
        source_path: tmp.path().join("blank-local.jsonl"),
        started_at: Some(1_700_000_000_000),
        ended_at: Some(1_700_000_000_001),
        approx_tokens: None,
        metadata_json: serde_json::Value::Null,
        messages: vec![greeting],
        source_id: "   ".into(),
        origin_host: None,
    };
    storage
        .insert_conversation_tree(agent_id, None, &conversation)
        .unwrap();

    // The blank id must not have been persisted as its own source.
    assert!(storage.get_source("   ").unwrap().is_none());

    let local_row = storage
        .get_source(LOCAL_SOURCE_ID)
        .unwrap()
        .expect("local source row should exist");
    assert_eq!(local_row.kind, SourceKind::Local);
    assert_eq!(local_row.host_label, None);

    let listed = storage.list_conversations(10, 0).unwrap();
    assert_eq!(listed.len(), 1);
    let stored = &listed[0];
    assert_eq!(stored.source_id, LOCAL_SOURCE_ID);
    assert_eq!(stored.origin_host, None);
}
/// Inserting local conversations must not rewrite `updated_at` on the local
/// source row that already exists right after the database is opened.
#[test]
fn repeated_local_inserts_do_not_touch_bootstrap_source_row() {
    let tmp = TempDir::new().unwrap();
    let db_file = tmp.path().join("test.db");
    let storage = SqliteStorage::open(&db_file).unwrap();

    let codex = Agent {
        id: None,
        slug: "codex".into(),
        name: "Codex".into(),
        version: None,
        kind: AgentKind::Cli,
    };
    let agent_id = storage.ensure_agent(&codex).unwrap();

    // Reads the current `updated_at` of the local source row.
    let local_updated_at = || -> i64 {
        storage
            .conn
            .query_row_map(
                "SELECT updated_at FROM sources WHERE id = ?1",
                fparams![LOCAL_SOURCE_ID],
                |row| row.get_typed(0),
            )
            .unwrap()
    };
    // Builds a one-message local conversation distinguished by `tag`.
    let local_conversation = |ext_id: &str, tag: &str| {
        let message = Message {
            id: None,
            idx: 0,
            role: MessageRole::User,
            author: None,
            created_at: Some(1_700_000_000_000),
            content: format!("hello-{tag}"),
            extra_json: serde_json::Value::Null,
            snippets: Vec::new(),
        };
        Conversation {
            id: None,
            agent_slug: "codex".into(),
            workspace: None,
            external_id: Some(ext_id.into()),
            title: Some(format!("Local source {tag}")),
            source_path: tmp.path().join(format!("local-{tag}.jsonl")),
            started_at: Some(1_700_000_000_000),
            ended_at: Some(1_700_000_000_001),
            approx_tokens: None,
            metadata_json: serde_json::Value::Null,
            messages: vec![message],
            source_id: LOCAL_SOURCE_ID.into(),
            origin_host: None,
        }
    };
    let pause = std::time::Duration::from_millis(5);

    let bootstrap_updated_at = local_updated_at();

    std::thread::sleep(pause);
    storage
        .insert_conversation_tree(agent_id, None, &local_conversation("local-source-1", "one"))
        .unwrap();
    let after_first_insert = local_updated_at();

    std::thread::sleep(pause);
    storage
        .insert_conversation_tree(agent_id, None, &local_conversation("local-source-2", "two"))
        .unwrap();
    let after_second_insert = local_updated_at();

    assert_eq!(after_first_insert, bootstrap_updated_at);
    assert_eq!(after_second_insert, bootstrap_updated_at);
}
/// A conversation with a whitespace-only `source_id` but a populated
/// `origin_host` is stored under a source whose id and label are that host,
/// with kind `Ssh`.
#[test]
fn insert_conversation_tree_blank_remote_source_normalizes_to_origin_host() {
    const ORIGIN: &str = "user@work-laptop";

    let tmp = TempDir::new().unwrap();
    let db_file = tmp.path().join("test.db");
    let storage = SqliteStorage::open(&db_file).unwrap();

    let codex = Agent {
        id: None,
        slug: "codex".into(),
        name: "Codex".into(),
        version: None,
        kind: AgentKind::Cli,
    };
    let agent_id = storage.ensure_agent(&codex).unwrap();

    let greeting = Message {
        id: None,
        idx: 0,
        role: MessageRole::User,
        author: None,
        created_at: Some(1_700_000_000_000),
        content: "hello".into(),
        extra_json: serde_json::Value::Null,
        snippets: Vec::new(),
    };
    let conversation = Conversation {
        id: None,
        agent_slug: "codex".into(),
        workspace: None,
        external_id: Some("blank-remote-source".into()),
        title: Some("Blank remote source".into()),
        source_path: tmp.path().join("blank-remote.jsonl"),
        started_at: Some(1_700_000_000_000),
        ended_at: Some(1_700_000_000_001),
        approx_tokens: None,
        metadata_json: serde_json::Value::Null,
        messages: vec![greeting],
        source_id: "   ".into(),
        origin_host: Some(ORIGIN.into()),
    };
    storage
        .insert_conversation_tree(agent_id, None, &conversation)
        .unwrap();

    // The blank id must not have been persisted as its own source.
    assert!(storage.get_source("   ").unwrap().is_none());

    let remote_row = storage
        .get_source(ORIGIN)
        .unwrap()
        .expect("normalized remote source row should exist");
    assert_eq!(remote_row.kind, SourceKind::Ssh);
    assert_eq!(remote_row.host_label.as_deref(), Some(ORIGIN));

    let listed = storage.list_conversations(10, 0).unwrap();
    assert_eq!(listed.len(), 1);
    let stored = &listed[0];
    assert_eq!(stored.source_id, ORIGIN);
    assert_eq!(stored.origin_host.as_deref(), Some(ORIGIN));
}
/// The batched insert path applies the same normalization as the single-tree
/// path: a blank `source_id` with an origin host ends up under a source named
/// after that host, with kind `Ssh`.
#[test]
fn insert_conversations_batched_normalizes_host_only_remote_source_id() {
    const ORIGIN: &str = "user@batch-host";

    let tmp = TempDir::new().unwrap();
    let db_file = tmp.path().join("test.db");
    let storage = SqliteStorage::open(&db_file).unwrap();

    let codex = Agent {
        id: None,
        slug: "codex".into(),
        name: "Codex".into(),
        version: None,
        kind: AgentKind::Cli,
    };
    let agent_id = storage.ensure_agent(&codex).unwrap();

    let greeting = Message {
        id: None,
        idx: 0,
        role: MessageRole::User,
        author: None,
        created_at: Some(1_700_000_000_000),
        content: "hello".into(),
        extra_json: serde_json::Value::Null,
        snippets: Vec::new(),
    };
    let conversation = Conversation {
        id: None,
        agent_slug: "codex".into(),
        workspace: None,
        external_id: Some("batched-blank-remote-source".into()),
        title: Some("Batched blank remote source".into()),
        source_path: tmp.path().join("batched-blank-remote.jsonl"),
        started_at: Some(1_700_000_000_000),
        ended_at: Some(1_700_000_000_001),
        approx_tokens: None,
        metadata_json: serde_json::Value::Null,
        messages: vec![greeting],
        source_id: "   ".into(),
        origin_host: Some(ORIGIN.into()),
    };
    storage
        .insert_conversations_batched(&[(agent_id, None, &conversation)])
        .unwrap();

    // The blank id must not have been persisted as its own source.
    assert!(storage.get_source("   ").unwrap().is_none());

    let remote_row = storage
        .get_source(ORIGIN)
        .unwrap()
        .expect("normalized batched remote source row should exist");
    assert_eq!(remote_row.kind, SourceKind::Ssh);
    assert_eq!(remote_row.host_label.as_deref(), Some(ORIGIN));

    let listed = storage.list_conversations(10, 0).unwrap();
    assert_eq!(listed.len(), 1);
    let stored = &listed[0];
    assert_eq!(stored.source_id, ORIGIN);
    assert_eq!(stored.origin_host.as_deref(), Some(ORIGIN));
}
/// `get_source_ids` reports registered remote sources but leaves out the
/// built-in local source.
#[test]
fn get_source_ids_excludes_local() {
    let tmp = TempDir::new().unwrap();
    let db_file = tmp.path().join("test.db");
    let storage = SqliteStorage::open(&db_file).unwrap();

    let remote = Source {
        id: "remote-1".into(),
        kind: SourceKind::Ssh,
        host_label: Some("server".into()),
        machine_id: None,
        platform: None,
        config_json: None,
        created_at: Some(SqliteStorage::now_millis()),
        updated_at: None,
    };
    storage.upsert_source(&remote).unwrap();

    let ids = storage.get_source_ids().unwrap();
    let local_id = LOCAL_SOURCE_ID.to_string();
    let remote_id = "remote-1".to_string();
    assert!(!ids.contains(&local_id));
    assert!(ids.contains(&remote_id));
}
/// A brand-new database has no recorded last-scan timestamp.
#[test]
fn get_last_scan_ts_returns_none_initially() {
    let tmp = TempDir::new().unwrap();
    let db_file = tmp.path().join("test.db");
    let storage = SqliteStorage::open(&db_file).unwrap();

    assert!(storage.get_last_scan_ts().unwrap().is_none());
}
/// A last-scan timestamp written with `set_last_scan_ts` is returned verbatim
/// by `get_last_scan_ts`.
#[test]
fn set_and_get_last_scan_ts() {
    let tmp = TempDir::new().unwrap();
    let db_file = tmp.path().join("test.db");
    let storage = SqliteStorage::open(&db_file).unwrap();

    let written = 1_700_000_000_000_i64;
    storage.set_last_scan_ts(written).unwrap();

    assert_eq!(storage.get_last_scan_ts().unwrap(), Some(written));
}
/// Sanity-checks that `now_millis` lands strictly between two fixed bounds.
#[test]
fn now_millis_returns_reasonable_value() {
    // 2020-01-01T00:00:00Z and 2100-01-01T00:00:00Z as Unix milliseconds.
    let lower_bound = 1_577_836_800_000;
    let upper_bound = 4_102_444_800_000;

    let now = SqliteStorage::now_millis();
    assert!(now > lower_bound);
    assert!(now < upper_bound);
}
/// A small nested JSON object survives a MessagePack encode/decode round trip.
#[test]
fn msgpack_roundtrip_basic_object() {
    let original = serde_json::json!({
        "key": "value",
        "number": 42,
        "nested": { "inner": true }
    });

    let encoded = serialize_json_to_msgpack(&original).expect("should serialize");
    let decoded = deserialize_msgpack_to_json(&encoded);

    assert_eq!(original, decoded);
}
/// JSON `null` is not encoded: the serializer returns `None` for it.
#[test]
fn msgpack_returns_none_for_null() {
    let encoded = serialize_json_to_msgpack(&serde_json::Value::Null);
    assert!(encoded.is_none());
}
/// A message whose `extra_json` is JSON null is persisted with both the
/// `extra_json` and `extra_bin` columns NULL, and reads back as JSON null.
#[test]
fn message_insert_stores_null_extra_json_as_sql_null() {
    let tmp = TempDir::new().unwrap();
    let db_file = tmp.path().join("test.db");
    let storage = SqliteStorage::open(&db_file).unwrap();

    let codex = Agent {
        id: None,
        slug: "codex".into(),
        name: "Codex".into(),
        version: None,
        kind: AgentKind::Cli,
    };
    let agent_id = storage.ensure_agent(&codex).unwrap();

    let message = Message {
        id: None,
        idx: 0,
        role: MessageRole::User,
        author: None,
        created_at: Some(1_700_000_000_000),
        content: "null metadata message".into(),
        extra_json: serde_json::Value::Null,
        snippets: Vec::new(),
    };
    let conversation = Conversation {
        id: None,
        agent_slug: "codex".into(),
        workspace: None,
        external_id: Some("null-extra-json".into()),
        title: Some("Null extra_json".into()),
        source_path: PathBuf::from("/tmp/null-extra-json.jsonl"),
        started_at: Some(1_700_000_000_000),
        ended_at: Some(1_700_000_000_001),
        approx_tokens: None,
        metadata_json: serde_json::Value::Null,
        messages: vec![message],
        source_id: LOCAL_SOURCE_ID.into(),
        origin_host: None,
    };
    let outcome = storage
        .insert_conversation_tree(agent_id, None, &conversation)
        .unwrap();
    let conversation_id = outcome.conversation_id;

    // Inspect the raw columns rather than the decoded message.
    let (text_column, binary_column): (Option<String>, Option<Vec<u8>>) = storage
        .conn
        .query_row_map(
            "SELECT extra_json, extra_bin FROM messages WHERE conversation_id = ?1",
            fparams![conversation_id],
            |row| Ok((row.get_typed(0)?, row.get_typed(1)?)),
        )
        .unwrap();
    assert!(text_column.is_none());
    assert!(binary_column.is_none());

    let fetched = storage.fetch_messages(conversation_id).unwrap();
    assert!(fetched[0].extra_json.is_null());
}
/// A message with a non-empty `extra_json` is persisted with the text column
/// NULL and the binary column populated, and decodes back to the same JSON.
#[test]
fn message_insert_stores_nonempty_extra_json_as_msgpack_only() {
    let tmp = TempDir::new().unwrap();
    let db_file = tmp.path().join("test.db");
    let storage = SqliteStorage::open(&db_file).unwrap();

    let codex = Agent {
        id: None,
        slug: "codex".into(),
        name: "Codex".into(),
        version: None,
        kind: AgentKind::Cli,
    };
    let agent_id = storage.ensure_agent(&codex).unwrap();

    let expected_extra = serde_json::json!({ "idx": 7, "kind": "profile" });
    let message = Message {
        id: None,
        idx: 0,
        role: MessageRole::User,
        author: None,
        created_at: Some(1_700_000_000_000),
        content: "msgpack metadata message".into(),
        extra_json: expected_extra.clone(),
        snippets: Vec::new(),
    };
    let conversation = Conversation {
        id: None,
        agent_slug: "codex".into(),
        workspace: None,
        external_id: Some("msgpack-extra-json".into()),
        title: Some("MessagePack extra_json".into()),
        source_path: PathBuf::from("/tmp/msgpack-extra-json.jsonl"),
        started_at: Some(1_700_000_000_000),
        ended_at: Some(1_700_000_000_001),
        approx_tokens: None,
        metadata_json: serde_json::Value::Null,
        messages: vec![message],
        source_id: LOCAL_SOURCE_ID.into(),
        origin_host: None,
    };
    let outcome = storage
        .insert_conversation_tree(agent_id, None, &conversation)
        .unwrap();
    let conversation_id = outcome.conversation_id;

    // Inspect the raw columns rather than the decoded message.
    let (text_column, binary_column): (Option<String>, Option<Vec<u8>>) = storage
        .conn
        .query_row_map(
            "SELECT extra_json, extra_bin FROM messages WHERE conversation_id = ?1",
            fparams![conversation_id],
            |row| Ok((row.get_typed(0)?, row.get_typed(1)?)),
        )
        .unwrap();
    assert!(text_column.is_none());
    assert!(binary_column.is_some());

    let fetched = storage.fetch_messages(conversation_id).unwrap();
    assert_eq!(fetched[0].extra_json, expected_extra);
}
/// A conversation whose `metadata_json` is JSON null is stored with the text
/// column holding the literal `null` and no binary payload, and lists back as
/// JSON null.
#[test]
fn conversation_insert_preserves_null_metadata_json_as_json_null() {
    let tmp = TempDir::new().unwrap();
    let db_file = tmp.path().join("test.db");
    let storage = SqliteStorage::open(&db_file).unwrap();

    let codex = Agent {
        id: None,
        slug: "codex".into(),
        name: "Codex".into(),
        version: None,
        kind: AgentKind::Cli,
    };
    let agent_id = storage.ensure_agent(&codex).unwrap();

    let message = Message {
        id: None,
        idx: 0,
        role: MessageRole::User,
        author: None,
        created_at: Some(1_700_000_000_000),
        content: "null conversation metadata message".into(),
        extra_json: serde_json::Value::Null,
        snippets: Vec::new(),
    };
    let conversation = Conversation {
        id: None,
        agent_slug: "codex".into(),
        workspace: None,
        external_id: Some("null-conversation-metadata".into()),
        title: Some("Null conversation metadata".into()),
        source_path: PathBuf::from("/tmp/null-conversation-metadata.jsonl"),
        started_at: Some(1_700_000_000_000),
        ended_at: Some(1_700_000_000_001),
        approx_tokens: None,
        metadata_json: serde_json::Value::Null,
        messages: vec![message],
        source_id: LOCAL_SOURCE_ID.into(),
        origin_host: None,
    };
    storage
        .insert_conversation_tree(agent_id, None, &conversation)
        .unwrap();

    // Inspect the raw columns rather than the decoded conversation.
    let (text_column, binary_column): (Option<String>, Option<Vec<u8>>) = storage
        .conn
        .query_row_map(
            "SELECT metadata_json, metadata_bin FROM conversations WHERE external_id = ?1",
            fparams!["null-conversation-metadata"],
            |row| Ok((row.get_typed(0)?, row.get_typed(1)?)),
        )
        .unwrap();
    assert_eq!(text_column.as_deref(), Some("null"));
    assert!(binary_column.is_none());

    let conversations = storage.list_conversations(10, 0).unwrap();
    assert!(conversations[0].metadata_json.is_null());
}
/// A conversation with non-empty metadata is stored with the text column NULL
/// and the binary column populated, and lists back with the same JSON value.
#[test]
fn conversation_insert_stores_nonempty_metadata_as_msgpack_only() {
    let tmp = TempDir::new().unwrap();
    let db_file = tmp.path().join("test.db");
    let storage = SqliteStorage::open(&db_file).unwrap();

    let codex = Agent {
        id: None,
        slug: "codex".into(),
        name: "Codex".into(),
        version: None,
        kind: AgentKind::Cli,
    };
    let agent_id = storage.ensure_agent(&codex).unwrap();

    let expected_metadata = serde_json::json!({ "bench": true, "source": "profile" });
    let message = Message {
        id: None,
        idx: 0,
        role: MessageRole::User,
        author: None,
        created_at: Some(1_700_000_000_000),
        content: "msgpack conversation metadata message".into(),
        extra_json: serde_json::Value::Null,
        snippets: Vec::new(),
    };
    let conversation = Conversation {
        id: None,
        agent_slug: "codex".into(),
        workspace: None,
        external_id: Some("msgpack-conversation-metadata".into()),
        title: Some("MessagePack conversation metadata".into()),
        source_path: PathBuf::from("/tmp/msgpack-conversation-metadata.jsonl"),
        started_at: Some(1_700_000_000_000),
        ended_at: Some(1_700_000_000_001),
        approx_tokens: None,
        metadata_json: expected_metadata.clone(),
        messages: vec![message],
        source_id: LOCAL_SOURCE_ID.into(),
        origin_host: None,
    };
    storage
        .insert_conversation_tree(agent_id, None, &conversation)
        .unwrap();

    // Inspect the raw columns rather than the decoded conversation.
    let (text_column, binary_column): (Option<String>, Option<Vec<u8>>) = storage
        .conn
        .query_row_map(
            "SELECT metadata_json, metadata_bin FROM conversations WHERE external_id = ?1",
            fparams!["msgpack-conversation-metadata"],
            |row| Ok((row.get_typed(0)?, row.get_typed(1)?)),
        )
        .unwrap();
    assert!(text_column.is_none());
    assert!(binary_column.is_some());

    let conversations = storage.list_conversations(10, 0).unwrap();
    assert_eq!(conversations[0].metadata_json, expected_metadata);
}
/// An empty JSON object is not encoded: the serializer returns `None` for it.
#[test]
fn msgpack_returns_none_for_empty_object() {
    let empty_object = serde_json::json!({});
    let encoded = serialize_json_to_msgpack(&empty_object);
    assert!(encoded.is_none());
}
/// A roughly 1 MB JSON column value is kept as its raw text, and the size hint
/// for the parsed value equals the raw text length.
#[test]
fn parse_historical_json_column_preserves_large_payloads_as_raw_json() {
    let filler = "x".repeat(1_000_000);
    let raw = format!(r#"{{"blob":"{}"}}"#, filler);

    let parsed = parse_historical_json_column(Some(raw.clone()));

    assert_eq!(historical_raw_json(&parsed), Some(raw.as_str()));
    assert_eq!(json_value_size_hint(&parsed), raw.len());
}
/// A short JSON column value is likewise kept as its raw text.
#[test]
fn parse_historical_json_column_preserves_small_payloads_as_raw_json() {
    let raw = r#"{"ok":true,"n":1}"#.to_string();

    let parsed = parse_historical_json_column(Some(raw.clone()));

    assert_eq!(historical_raw_json(&parsed), Some(raw.as_str()));
}
/// A non-empty top-level JSON array is encoded and decodes back unchanged.
#[test]
fn msgpack_serializes_non_empty_array() {
    let original = serde_json::json!([1, 2, 3]);

    let encoded = serialize_json_to_msgpack(&original).expect("should serialize array");
    let decoded = deserialize_msgpack_to_json(&encoded);

    assert_eq!(original, decoded);
}
/// For a representative object, the MessagePack encoding takes fewer bytes
/// than the compact JSON encoding.
#[test]
fn msgpack_smaller_than_json() {
    let sample = serde_json::json!({
        "field_name_one": "some_value",
        "field_name_two": 123456,
        "field_name_three": [1, 2, 3, 4, 5],
        "field_name_four": { "nested": true }
    });

    let json_len = serde_json::to_vec(&sample).unwrap().len();
    let msgpack_len = serialize_json_to_msgpack(&sample).unwrap().len();

    assert!(
        msgpack_len < json_len,
        "MessagePack ({} bytes) should be smaller than JSON ({} bytes)",
        msgpack_len,
        json_len
    );
}
/// After opening a fresh database, `conversations` has a `metadata_bin` column
/// and `messages` has an `extra_bin` column.
#[test]
fn migration_v7_adds_binary_columns() {
    let tmp = TempDir::new().unwrap();
    let db_file = tmp.path().join("test.db");
    let storage = SqliteStorage::open(&db_file).unwrap();

    // Runs a `PRAGMA table_info(...)` statement and looks for `column` in
    // result column 1, which holds the column name.
    let pragma_lists_column = |pragma: &str, column: &str| {
        storage
            .raw()
            .query(pragma)
            .unwrap()
            .iter()
            .any(|row| row.get_typed::<String>(1).unwrap() == column)
    };

    assert!(
        pragma_lists_column("PRAGMA table_info(conversations)", "metadata_bin"),
        "conversations should have metadata_bin column"
    );
    assert!(
        pragma_lists_column("PRAGMA table_info(messages)", "extra_bin"),
        "messages should have extra_bin column"
    );
}
/// If the `conversation_tail_state` row is deleted by hand (and `ended_at` is
/// changed directly in `conversations`), a later append through
/// `insert_conversation_tree` still inserts only the new messages and leaves a
/// tail-state row that reflects the appended tail.
#[test]
fn insert_conversation_tree_rehydrates_append_tail_state_cache_after_manual_clear() {
    let tmp = TempDir::new().unwrap();
    let db_file = tmp.path().join("append-tail-state-cache.db");
    let storage = SqliteStorage::open(&db_file).unwrap();

    let codex = Agent {
        id: None,
        slug: "codex".into(),
        name: "Codex".into(),
        version: None,
        kind: AgentKind::Cli,
    };
    let agent_id = storage.ensure_agent(&codex).unwrap();
    let workspace_id = storage
        .ensure_workspace(&PathBuf::from("/ws/profiled-append-remote"), None)
        .unwrap();

    // Seed the conversation with its first five messages.
    let seed = make_profiled_append_remote_merge_conversation(11, 5);
    let conversation_id = storage
        .insert_conversation_tree(agent_id, Some(workspace_id), &seed)
        .unwrap()
        .conversation_id;

    // Reads (ended_at, last_message_idx, last_message_created_at) for the
    // conversation under test.
    let read_tail_state = || -> (Option<i64>, Option<i64>, Option<i64>) {
        storage
            .raw()
            .query_row_map(
                "SELECT ended_at, last_message_idx, last_message_created_at
FROM conversation_tail_state
WHERE conversation_id = ?1",
                fparams![conversation_id],
                |row| Ok((row.get_typed(0)?, row.get_typed(1)?, row.get_typed(2)?)),
            )
            .unwrap()
    };

    let tail_after_seed = read_tail_state();
    assert_eq!(tail_after_seed, (Some(111_005), Some(4), Some(111_004)));

    // Tamper with the stored state: bump `ended_at` and drop the tail row.
    storage
        .raw()
        .execute_compat(
            "UPDATE conversations SET ended_at = ?1 WHERE id = ?2",
            fparams![111_999_i64, conversation_id],
        )
        .unwrap();
    storage
        .raw()
        .execute_compat(
            "DELETE FROM conversation_tail_state WHERE conversation_id = ?1",
            fparams![conversation_id],
        )
        .unwrap();

    // Append five more messages on top of the tampered state.
    let extended = make_profiled_append_remote_merge_conversation(11, 10);
    let append_outcome = storage
        .insert_conversation_tree(agent_id, Some(workspace_id), &extended)
        .unwrap();
    assert_eq!(append_outcome.inserted_indices, vec![5, 6, 7, 8, 9]);

    let tail_after_append = read_tail_state();
    assert_eq!(tail_after_append, (Some(111_999), Some(9), Some(111_009)));
}
/// Decoding an empty byte slice yields an empty JSON object.
#[test]
fn msgpack_deserialize_empty_returns_default() {
    let expected = serde_json::Value::Object(serde_json::Map::new());
    assert_eq!(deserialize_msgpack_to_json(&[]), expected);
}
/// Decoding undecodable input yields an empty JSON object.
#[test]
fn msgpack_deserialize_garbage_returns_default() {
    // 0x85 is a MessagePack fixmap header announcing five pairs; with nothing
    // after it the payload is truncated.
    let truncated = [0x85];
    let expected = serde_json::Value::Object(serde_json::Map::new());
    assert_eq!(deserialize_msgpack_to_json(&truncated), expected);
}
/// Three recorded entries over two days expand into ten rows, and the
/// `("all", "all")` row for day 100 sums both of that day's entries.
#[test]
fn stats_aggregator_collects_and_expands() {
    let mut aggregator = StatsAggregator::new();
    assert!(aggregator.is_empty());

    aggregator.record("claude", "local", 100, 5, 500);
    aggregator.record("codex", "local", 100, 3, 300);
    aggregator.record("claude", "local", 101, 2, 200);
    assert!(!aggregator.is_empty());
    assert_eq!(aggregator.raw_entry_count(), 3);

    let expanded = aggregator.expand();
    assert_eq!(expanded.len(), 10);

    let day100_all = expanded
        .iter()
        .find(|(d, a, s, _)| *d == 100 && a == "all" && s == "all")
        .unwrap();
    let rollup = &day100_all.3;
    assert_eq!(rollup.session_count_delta, 2);
    assert_eq!(rollup.message_count_delta, 8);
    assert_eq!(rollup.total_chars_delta, 800);
}
/// Constructing a `LazyFrankenDb` must not open the database.
#[test]
fn lazy_franken_db_not_open_before_get() {
    let tmp = TempDir::new().unwrap();
    let db_file = tmp.path().join("lazy_test.db");
    // Create the database file; the handle stays alive until the test ends.
    let _storage = SqliteStorage::open(&db_file).unwrap();

    let lazy = LazyFrankenDb::new(db_file);
    let opened_eagerly = lazy.is_open();
    assert!(
        !opened_eagerly,
        "LazyFrankenDb must not open on construction"
    );
}
/// The first `get` opens the database, returns a usable connection, and
/// leaves the handle reporting itself as open.
#[test]
fn lazy_franken_db_opens_on_first_get() {
    let tmp = TempDir::new().unwrap();
    let db_file = tmp.path().join("lazy_test.db");
    // Create the database file, then release the handle straight away.
    drop(SqliteStorage::open(&db_file).unwrap());

    let lazy = LazyFrankenDb::new(db_file);
    assert!(!lazy.is_open());

    {
        let conn = lazy.get("test").expect("should open successfully");
        let conversation_count: i64 = conn
            .query_row_map("SELECT COUNT(*) FROM conversations", fparams![], |r| {
                r.get_typed(0)
            })
            .unwrap();
        assert_eq!(conversation_count, 0);
    }

    assert!(lazy.is_open(), "LazyFrankenDb must be open after get()");
}
/// A table created through one `get` call is visible through a later `get`
/// call on the same `LazyFrankenDb`.
#[test]
fn lazy_franken_db_reuses_connection() {
    let tmp = TempDir::new().unwrap();
    let db_file = tmp.path().join("lazy_test.db");
    // Create the database file, then release the handle straight away.
    drop(SqliteStorage::open(&db_file).unwrap());

    let lazy = LazyFrankenDb::new(db_file);

    let first = lazy.get("first").unwrap();
    first
        .execute_batch("CREATE TABLE IF NOT EXISTS test_tbl (id INTEGER)")
        .unwrap();
    drop(first);

    let second = lazy.get("second").unwrap();
    let row_count: i64 = second
        .query_row_map("SELECT COUNT(*) FROM test_tbl", fparams![], |r| {
            r.get_typed(0)
        })
        .unwrap();
    assert_eq!(row_count, 0);
    drop(second);
}
/// `get` on a path with no database file fails with `LazyDbError::NotFound`.
#[test]
fn lazy_franken_db_not_found_error() {
    let tmp = TempDir::new().unwrap();
    let missing = tmp.path().join("nonexistent.db");
    let lazy = LazyFrankenDb::new(missing);

    let outcome = lazy.get("test");
    assert!(outcome.is_err());

    let is_not_found = matches!(outcome.unwrap_err(), LazyDbError::NotFound(_));
    assert!(is_not_found, "should return NotFound for missing DB");
}
/// `path()` returns the path the handle was constructed with.
#[test]
fn lazy_franken_db_path_accessor() {
    let expected = PathBuf::from("/tmp/test_lazy.db");
    let lazy = LazyFrankenDb::new(expected.clone());
    assert_eq!(lazy.path(), expected.as_path());
}
/// Table-driven check of `sql_like_match` covering `%`, `_`, exact matches,
/// and a mixed-case input.
#[test]
fn sql_like_match_basic_patterns() {
    // (value, pattern, expected match result)
    let cases = [
        // Trailing `%`.
        ("claude-opus-4-20250101", "claude-opus-4%", true),
        ("claude-opus-4", "claude-opus-4%", true),
        ("claude-sonnet-4", "claude-opus-4%", false),
        // `%` in the middle and at the end.
        ("gemini-2.0-flash-001", "gemini-2%flash%", true),
        ("gemini-2-flash", "gemini-2%flash%", true),
        ("gemini-2-pro", "gemini-2%flash%", false),
        // No wildcards.
        ("hello", "hello", true),
        ("hello!", "hello", false),
        // `_` wildcard.
        ("gpt-4o", "gpt-4_", true),
        ("gpt-4oo", "gpt-4_", false),
        // Upper-case input against a lower-case pattern.
        ("Claude-Opus-4", "claude-opus-4%", true),
    ];

    for (value, pattern, expected) in cases {
        assert_eq!(
            sql_like_match(value, pattern),
            expected,
            "sql_like_match({value:?}, {pattern:?})"
        );
    }
}
/// Known dates map to their expected day ids, and a non-date string is
/// rejected.
#[test]
fn date_str_to_day_id_converts_correctly() {
    let known_dates = [("2025-10-01", 2100), ("2024-04-01", 1552)];
    for (date, expected_day_id) in known_dates {
        assert_eq!(date_str_to_day_id(date).unwrap(), expected_day_id);
    }

    assert!(date_str_to_day_id("invalid").is_err());
}
/// `lookup` returns the entry whose pattern matches the model name, and
/// nothing for a model that matches no pattern.
#[test]
fn pricing_table_lookup_selects_matching_entry() {
    let effective_day = date_str_to_day_id("2025-10-01").unwrap();
    let lookup_day = date_str_to_day_id("2026-02-06").unwrap();

    let opus = PricingEntry {
        model_pattern: "claude-opus-4%".into(),
        provider: "anthropic".into(),
        input_cost_per_mtok: 15.0,
        output_cost_per_mtok: 75.0,
        cache_read_cost_per_mtok: Some(1.5),
        cache_creation_cost_per_mtok: Some(18.75),
        effective_day_id: effective_day,
    };
    let sonnet = PricingEntry {
        model_pattern: "claude-sonnet-4%".into(),
        provider: "anthropic".into(),
        input_cost_per_mtok: 3.0,
        output_cost_per_mtok: 15.0,
        cache_read_cost_per_mtok: Some(0.3),
        cache_creation_cost_per_mtok: Some(3.75),
        effective_day_id: effective_day,
    };
    let table = PricingTable {
        entries: vec![opus, sonnet],
    };

    let opus_hit = table.lookup("claude-opus-4-20260101", lookup_day);
    assert!(opus_hit.is_some());
    assert_eq!(opus_hit.unwrap().input_cost_per_mtok, 15.0);

    let sonnet_hit = table.lookup("claude-sonnet-4-latest", lookup_day);
    assert!(sonnet_hit.is_some());
    assert_eq!(sonnet_hit.unwrap().input_cost_per_mtok, 3.0);

    let miss = table.lookup("unknown-model", lookup_day);
    assert!(miss.is_none());
}
/// With two entries for the same pattern, `lookup` picks the one in force on
/// the requested day and returns nothing for a day before either takes effect.
#[test]
fn pricing_table_lookup_respects_effective_date() {
    let day_id = |date: &str| date_str_to_day_id(date).unwrap();

    let older = PricingEntry {
        model_pattern: "claude-opus-4%".into(),
        provider: "anthropic".into(),
        input_cost_per_mtok: 15.0,
        output_cost_per_mtok: 75.0,
        cache_read_cost_per_mtok: None,
        cache_creation_cost_per_mtok: None,
        effective_day_id: day_id("2025-10-01"),
    };
    let newer = PricingEntry {
        model_pattern: "claude-opus-4%".into(),
        provider: "anthropic".into(),
        input_cost_per_mtok: 12.0,
        output_cost_per_mtok: 60.0,
        cache_read_cost_per_mtok: None,
        cache_creation_cost_per_mtok: None,
        effective_day_id: day_id("2026-01-01"),
    };
    let table = PricingTable {
        entries: vec![older, newer],
    };

    // Between the two effective dates only the older entry applies.
    let between = table.lookup("claude-opus-4", day_id("2025-11-01"));
    assert!(between.is_some());
    assert_eq!(between.unwrap().input_cost_per_mtok, 15.0);

    // After the second effective date the newer entry wins.
    let afterwards = table.lookup("claude-opus-4", day_id("2026-02-01"));
    assert!(afterwards.is_some());
    assert_eq!(afterwards.unwrap().input_cost_per_mtok, 12.0);

    // Before the first effective date nothing applies.
    let too_early = table.lookup("claude-opus-4", day_id("2024-01-01"));
    assert!(too_early.is_none());
}
/// When two patterns with the same effective day both match, `lookup` returns
/// the `gpt-4-turbo%` entry rather than `gpt-4%`; a model matching only
/// `gpt-4%` gets that entry.
#[test]
fn pricing_table_lookup_specificity_tiebreak() {
    let effective_day = date_str_to_day_id("2025-01-01").unwrap();
    let lookup_day = date_str_to_day_id("2026-01-01").unwrap();

    let broad = PricingEntry {
        model_pattern: "gpt-4%".into(),
        provider: "openai".into(),
        input_cost_per_mtok: 10.0,
        output_cost_per_mtok: 30.0,
        cache_read_cost_per_mtok: None,
        cache_creation_cost_per_mtok: None,
        effective_day_id: effective_day,
    };
    let narrow = PricingEntry {
        model_pattern: "gpt-4-turbo%".into(),
        provider: "openai".into(),
        input_cost_per_mtok: 5.0,
        output_cost_per_mtok: 15.0,
        cache_read_cost_per_mtok: None,
        cache_creation_cost_per_mtok: None,
        effective_day_id: effective_day,
    };
    let table = PricingTable {
        entries: vec![broad, narrow],
    };

    let turbo_hit = table.lookup("gpt-4-turbo-2025", lookup_day);
    assert!(turbo_hit.is_some());
    assert_eq!(turbo_hit.unwrap().input_cost_per_mtok, 5.0);

    let plain_hit = table.lookup("gpt-4o", lookup_day);
    assert!(plain_hit.is_some());
    assert_eq!(plain_hit.unwrap().input_cost_per_mtok, 10.0);
}
/// With only input and output token counts supplied, `compute_cost` for a
/// matching model returns 0.0525.
#[test]
fn pricing_table_compute_cost_basic() {
    let opus = PricingEntry {
        model_pattern: "claude-opus-4%".into(),
        provider: "anthropic".into(),
        input_cost_per_mtok: 15.0,
        output_cost_per_mtok: 75.0,
        cache_read_cost_per_mtok: Some(1.5),
        cache_creation_cost_per_mtok: Some(18.75),
        effective_day_id: date_str_to_day_id("2025-10-01").unwrap(),
    };
    let table = PricingTable {
        entries: vec![opus],
    };

    let lookup_day = date_str_to_day_id("2026-02-06").unwrap();
    let cost = table.compute_cost(
        Some("claude-opus-4-latest"),
        lookup_day,
        Some(1000),
        Some(500),
        None,
        None,
    );

    assert!(cost.is_some());
    let deviation = (cost.unwrap() - 0.0525).abs();
    assert!(deviation < 1e-10);
}
/// With all four token counts supplied (input, output, and the two cache
/// counts), `compute_cost` for a matching model returns 16.5.
#[test]
fn pricing_table_compute_cost_with_cache() {
    let opus = PricingEntry {
        model_pattern: "claude-opus-4%".into(),
        provider: "anthropic".into(),
        input_cost_per_mtok: 15.0,
        output_cost_per_mtok: 75.0,
        cache_read_cost_per_mtok: Some(1.5),
        cache_creation_cost_per_mtok: Some(18.75),
        effective_day_id: date_str_to_day_id("2025-10-01").unwrap(),
    };
    let table = PricingTable {
        entries: vec![opus],
    };

    let lookup_day = date_str_to_day_id("2026-02-06").unwrap();
    let cost = table.compute_cost(
        Some("claude-opus-4-latest"),
        lookup_day,
        Some(1_000_000),
        Some(100_000),
        Some(500_000),
        Some(200_000),
    );

    assert!(cost.is_some());
    let deviation = (cost.unwrap() - 16.5).abs();
    assert!(deviation < 1e-10);
}
/// `compute_cost` yields `None` when the model is unknown, when no model name
/// is given, and when no token counts are given.
#[test]
fn pricing_table_compute_cost_returns_none_for_unknown_model() {
    let lookup_day = date_str_to_day_id("2026-02-06").unwrap();
    let opus = PricingEntry {
        model_pattern: "claude-opus-4%".into(),
        provider: "anthropic".into(),
        input_cost_per_mtok: 15.0,
        output_cost_per_mtok: 75.0,
        cache_read_cost_per_mtok: None,
        cache_creation_cost_per_mtok: None,
        effective_day_id: date_str_to_day_id("2025-10-01").unwrap(),
    };
    let table = PricingTable {
        entries: vec![opus],
    };

    // Model name that matches no pattern.
    let unknown_model = table.compute_cost(
        Some("unknown-model"),
        lookup_day,
        Some(1000),
        Some(500),
        None,
        None,
    );
    assert!(unknown_model.is_none());

    // No model name at all.
    let missing_model = table.compute_cost(None, lookup_day, Some(1000), Some(500), None, None);
    assert!(missing_model.is_none());

    // Matching model but no token counts.
    let missing_tokens =
        table.compute_cost(Some("claude-opus-4"), lookup_day, None, None, None, None);
    assert!(missing_tokens.is_none());
}
/// A fresh database yields a non-empty pricing table when loaded, with the
/// expected input prices for an Opus model and a Gemini Flash model.
#[test]
fn pricing_table_load_from_db() {
    let tmp = TempDir::new().unwrap();
    let db_file = tmp.path().join("test.db");
    let storage = SqliteStorage::open(&db_file).unwrap();

    let table = PricingTable::load(&storage.conn).unwrap();
    assert!(!table.is_empty());

    let lookup_day = date_str_to_day_id("2026-02-06").unwrap();

    let opus_hit = table.lookup("claude-opus-4-latest", lookup_day);
    assert!(opus_hit.is_some());
    assert_eq!(opus_hit.unwrap().input_cost_per_mtok, 15.0);

    let flash_hit = table.lookup("gemini-2.0-flash-001", lookup_day);
    assert!(flash_hit.is_some());
    assert_eq!(flash_hit.unwrap().input_cost_per_mtok, 0.075);
}
/// Loading the pricing table fails with an "invalid effective_date" error when
/// a `model_pricing` row carries an unparseable date.
#[test]
fn pricing_table_load_rejects_invalid_effective_date() {
    let tmp = TempDir::new().unwrap();
    let db_file = tmp.path().join("test.db");
    let storage = SqliteStorage::open(&db_file).unwrap();

    let insert_pricing_row = "INSERT INTO model_pricing (
model_pattern, provider, input_cost_per_mtok, output_cost_per_mtok,
cache_read_cost_per_mtok, cache_creation_cost_per_mtok, effective_date
) VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7)";
    // Plant a row whose effective date cannot be parsed.
    storage
        .conn
        .execute_compat(
            insert_pricing_row,
            fparams![
                "broken-model%",
                "test",
                1.0_f64,
                2.0_f64,
                Option::<f64>::None,
                Option::<f64>::None,
                "not-a-date"
            ],
        )
        .unwrap();

    let load_error = PricingTable::load(&storage.conn).unwrap_err();
    let rendered = load_error.to_string();
    assert!(rendered.contains("invalid effective_date"));
}
/// `PricingDiagnostics` counts priced and unpriced records and tallies
/// unpriced ones per model name, filing a missing name under `"(none)"`.
#[test]
fn pricing_diagnostics_tracks_coverage() {
    let mut diagnostics = PricingDiagnostics::default();

    for _ in 0..2 {
        diagnostics.record_priced();
    }
    for model in [Some("custom-model-v1"), Some("custom-model-v1"), None] {
        diagnostics.record_unpriced(model);
    }

    assert_eq!(diagnostics.priced_count, 2);
    assert_eq!(diagnostics.unpriced_count, 3);

    let unknown = &diagnostics.unknown_models;
    assert_eq!(unknown.len(), 2);
    assert_eq!(unknown["custom-model-v1"], 2);
    assert_eq!(unknown["(none)"], 1);
}
/// Test helper: builds a `FrankenStorage` over an in-memory connection,
/// then runs migrations followed by `apply_config`. Panics on any failure.
fn franken_storage_in_memory() -> FrankenStorage {
    let in_memory = FrankenStorage::new(
        FrankenConnection::open(":memory:").unwrap(),
        PathBuf::from(":memory:"),
    );
    in_memory.run_migrations().unwrap();
    in_memory.apply_config().unwrap();
    in_memory
}
/// Verifies that a freshly migrated in-memory `FrankenStorage` reports
/// `CURRENT_SCHEMA_VERSION`, contains every table listed below, and records
/// exactly versions 13 through `CURRENT_SCHEMA_VERSION` in
/// `_schema_migrations`.
#[test]
fn franken_migrations_create_all_tables() {
    let storage = franken_storage_in_memory();
    let version = storage.schema_version().unwrap();
    assert_eq!(
        version, CURRENT_SCHEMA_VERSION,
        "fresh FrankenStorage should be at current schema version"
    );

    let rows = storage
        .raw()
        .query("SELECT name FROM sqlite_master WHERE type='table' ORDER BY name;")
        .unwrap();
    // Decode strictly: a row that fails to decode should fail the test at
    // this point rather than be dropped and later reported as a
    // "missing table". A set also avoids allocating a String per lookup.
    let table_names: HashSet<String> = rows
        .iter()
        .map(|row| row.get_typed::<String>(0))
        .collect::<std::result::Result<_, _>>()
        .unwrap();

    // Tables reported with the "missing table: <name>" message.
    for required in [
        "meta",
        "agents",
        "workspaces",
        "conversations",
        "messages",
        "snippets",
        "tags",
        "conversation_tags",
        "message_metrics",
        "usage_hourly",
        "usage_daily",
    ] {
        assert!(
            table_names.contains(required),
            "missing table: {required}"
        );
    }

    // Tables reported with the "missing <name> table" message.
    for required in [
        "sources",
        "daily_stats",
        "embedding_jobs",
        "conversation_tail_state",
        "conversation_external_lookup",
        "conversation_external_tail_lookup",
    ] {
        assert!(
            table_names.contains(required),
            "missing {required} table"
        );
    }

    let rows = storage
        .raw()
        .query("SELECT COUNT(*) FROM _schema_migrations;")
        .unwrap();
    let count: i64 = rows.first().unwrap().get_typed(0).unwrap();
    assert_eq!(
        count,
        (13..=CURRENT_SCHEMA_VERSION).count() as i64,
        "_schema_migrations should record the V13 base schema and post-V13 migrations"
    );

    let rows = storage
        .raw()
        .query("SELECT version FROM _schema_migrations ORDER BY version;")
        .unwrap();
    let versions: Vec<i64> = rows
        .iter()
        .map(|row| row.get_typed(0))
        .collect::<std::result::Result<_, _>>()
        .unwrap();
    assert_eq!(
        versions,
        (13..=CURRENT_SCHEMA_VERSION).collect::<Vec<i64>>(),
        "_schema_migrations should contain v13 through current"
    );
}
/// Running migrations a second time on an already-migrated storage must
/// succeed and leave the schema version at `CURRENT_SCHEMA_VERSION`.
#[test]
fn franken_migrations_idempotent() {
    let storage = franken_storage_in_memory();

    let version_after_first_run = storage.schema_version().unwrap();
    assert_eq!(version_after_first_run, CURRENT_SCHEMA_VERSION);

    // The helper already migrated once; this is the repeat run under test.
    storage.run_migrations().unwrap();

    let version_after_second_run = storage.schema_version().unwrap();
    assert_eq!(version_after_second_run, CURRENT_SCHEMA_VERSION);
}
/// Inserts a conversation, rewinds the database to look like schema
/// version 19 (empty `conversation_external_tail_lookup`, no v20 migration
/// record), re-runs migrations, and checks that the lookup row for the
/// conversation is present again with the expected tail values.
#[test]
fn migration_v20_backfills_conversation_external_tail_lookup() {
    let storage = franken_storage_in_memory();

    let codex = Agent {
        id: None,
        slug: "codex".into(),
        name: "Codex".into(),
        version: None,
        kind: AgentKind::Cli,
    };
    let agent_id = storage.ensure_agent(&codex).unwrap();
    let workspace_path = PathBuf::from("/ws/profiled-storage-remote");
    let workspace_id = storage.ensure_workspace(&workspace_path, None).unwrap();

    // Non-ASCII identifiers are part of the fixture and feed the lookup key.
    let mut conv = make_profiled_storage_remote_conversation(1919, 2);
    conv.source_id = "profiled-storage-remote-source-東京".into();
    conv.external_id = Some("profiled-storage-remote-☃-1919".into());
    let outcome = storage
        .insert_conversation_tree(agent_id, Some(workspace_id), &conv)
        .unwrap();

    let external_id = conv.external_id.as_deref().unwrap();
    let lookup_key = conversation_external_lookup_key(&conv.source_id, agent_id, external_id);

    // Rewind: clear the lookup table, forget the v20 record, and set the
    // meta marker back to 19.
    for rewind_sql in [
        "DELETE FROM conversation_external_tail_lookup",
        "DELETE FROM _schema_migrations WHERE version = 20",
    ] {
        storage.raw().execute(rewind_sql).unwrap();
    }
    storage
        .raw()
        .execute_compat(
            "UPDATE meta SET value = ?1 WHERE key = 'schema_version'",
            fparams!["19"],
        )
        .unwrap();

    storage.run_migrations().unwrap();

    let select_backfilled =
        "SELECT conversation_id, ended_at, last_message_idx, last_message_created_at
FROM conversation_external_tail_lookup
WHERE lookup_key = ?1";
    let backfilled: (i64, Option<i64>, Option<i64>, Option<i64>) = storage
        .raw()
        .query_row_map(select_backfilled, fparams![lookup_key.as_str()], |row| {
            Ok((
                row.get_typed(0)?,
                row.get_typed(1)?,
                row.get_typed(2)?,
                row.get_typed(3)?,
            ))
        })
        .unwrap();

    let expected = (
        outcome.conversation_id,
        conv.ended_at,
        Some(1),
        conv.messages[1].created_at,
    );
    assert_eq!(backfilled, expected);
    assert_eq!(storage.schema_version().unwrap(), CURRENT_SCHEMA_VERSION);
}
// Exercises `apply_conversation_tail_state_cache_migration` against a
// hand-built minimal schema: the first call must report that it applied, the
// second must report that it did not, `conversations` must gain the two
// `last_message_*` columns, `conversation_tail_state` must exist but stay
// empty, and exactly one `_schema_migrations` row for version 15 must exist.
#[test]
fn migration_v15_creates_lazy_tail_state_cache() {
let conn = FrankenConnection::open(":memory:").unwrap();
// Minimal pre-v15 shape: only the columns this test touches, plus a few
// seeded conversations and messages.
conn.execute_batch(
"CREATE TABLE conversations (
id INTEGER PRIMARY KEY,
ended_at INTEGER
);
CREATE TABLE messages (
id INTEGER PRIMARY KEY,
conversation_id INTEGER NOT NULL,
idx INTEGER NOT NULL,
created_at INTEGER
);
INSERT INTO conversations(id, ended_at) VALUES
(1, 1710000000300),
(2, NULL);
INSERT INTO messages(id, conversation_id, idx, created_at) VALUES
(10, 1, 0, 1710000000100),
(11, 1, 1, 1710000000200),
(12, 2, 0, 1710000000400);",
)
.unwrap();
// Empty migration ledger, created by hand so the migration has somewhere to
// record version 15.
conn.execute(
"CREATE TABLE _schema_migrations (
version INTEGER PRIMARY KEY,
name TEXT NOT NULL,
applied_at TEXT NOT NULL DEFAULT (strftime('%Y-%m-%dT%H:%M:%SZ', 'now'))
);",
)
.unwrap();
// The boolean result distinguishes "applied now" from "already recorded".
assert!(
apply_conversation_tail_state_cache_migration(&conn).unwrap(),
"v15 migration should apply once"
);
assert!(
!apply_conversation_tail_state_cache_migration(&conn).unwrap(),
"v15 migration should be idempotent once recorded"
);
// Column 1 of `PRAGMA table_info` is the column name.
let columns = conn.query("PRAGMA table_info(conversations);").unwrap();
let column_names: HashSet<String> = columns
.iter()
.map(|row| row.get_typed(1))
.collect::<std::result::Result<_, frankensqlite::FrankenError>>()
.unwrap();
assert!(column_names.contains("last_message_idx"));
assert!(column_names.contains("last_message_created_at"));
// Messages were seeded above, yet the cache table is expected to stay empty.
let tail_rows: i64 = conn
.query("SELECT COUNT(*) FROM conversation_tail_state;")
.unwrap()
.first()
.unwrap()
.get_typed(0)
.unwrap();
assert_eq!(
tail_rows, 0,
"v15 should create the cache without an open-time message scan"
);
// Two calls were made, but only one ledger row for version 15 may exist.
let applied: i64 = conn
.query("SELECT COUNT(*) FROM _schema_migrations WHERE version = 15;")
.unwrap()
.first()
.unwrap()
.get_typed(0)
.unwrap();
assert_eq!(applied, 1);
}
/// Starts from a `conversations` table with only three columns, runs the
/// token-column repair twice (the second call must also succeed), and checks
/// that every column named in `REQUIRED_CONVERSATION_TOKEN_COLUMNS` exists.
#[test]
fn schema_repair_adds_missing_conversations_token_columns() {
    let bare_conn = FrankenConnection::open(":memory:").unwrap();
    bare_conn
        .execute_batch(
            "CREATE TABLE conversations (
id INTEGER PRIMARY KEY,
agent_id INTEGER NOT NULL,
source_path TEXT NOT NULL
);",
        )
        .unwrap();
    let storage = FrankenStorage::new(bare_conn, std::path::PathBuf::from(":memory:"));

    // Two passes: the repair has to tolerate columns it already added.
    for _ in 0..2 {
        storage.repair_missing_conversation_token_columns().unwrap();
    }

    let present_columns = franken_table_column_names(&storage.conn, "conversations").unwrap();
    for &(column_name, _) in REQUIRED_CONVERSATION_TOKEN_COLUMNS {
        let is_present = present_columns.contains(column_name);
        assert!(
            is_present,
            "schema repair should add conversations.{column_name}"
        );
    }
}
/// After migrations, the `schema_version` row in `meta` must hold the text
/// form of `CURRENT_SCHEMA_VERSION`.
#[test]
fn franken_meta_schema_version_in_sync() {
    let storage = franken_storage_in_memory();
    let expected_marker = CURRENT_SCHEMA_VERSION.to_string();

    let meta_version: String = storage
        .raw()
        .query("SELECT value FROM meta WHERE key = 'schema_version';")
        .unwrap()
        .first()
        .unwrap()
        .get_typed(0)
        .unwrap();

    assert_eq!(
        meta_version, expected_marker,
        "meta.schema_version should match CURRENT_SCHEMA_VERSION"
    );
}
/// A legacy database whose `meta.schema_version` is `10` must, after
/// `transition_from_meta_version`, have `_schema_migrations` rows for
/// versions 1 through 13.
#[test]
fn franken_transition_from_meta_version() {
    let dir = TempDir::new().unwrap();
    let db_path = dir.path().join("test_transition.db");

    // Build the legacy fixture, then close the connection before reopening.
    let conn = FrankenConnection::open(db_path.to_string_lossy().to_string()).unwrap();
    conn.execute("CREATE TABLE meta (key TEXT PRIMARY KEY, value TEXT NOT NULL);")
        .unwrap();
    conn.execute("INSERT INTO meta(key, value) VALUES('schema_version', '10');")
        .unwrap();
    conn.execute("CREATE TABLE conversations (id INTEGER PRIMARY KEY);")
        .unwrap();
    drop(conn);

    let conn = FrankenConnection::open(db_path.to_string_lossy().to_string()).unwrap();
    transition_from_meta_version(&conn).unwrap();

    let rows = conn
        .query("SELECT version FROM _schema_migrations ORDER BY version;")
        .unwrap();
    // Decode strictly: a row that cannot be read as i64 fails here instead of
    // silently vanishing from the compared list.
    let versions: Vec<i64> = rows
        .iter()
        .map(|row| row.get_typed(0))
        .collect::<std::result::Result<_, _>>()
        .unwrap();
    assert_eq!(
        versions,
        (1..=13).collect::<Vec<i64>>(),
        "transition should bridge legacy V10 databases through the combined V13 base marker"
    );
}
/// A database whose `meta.schema_version` already equals
/// `CURRENT_SCHEMA_VERSION` must, after `transition_from_meta_version`, have
/// `_schema_migrations` rows for every version from 1 through current.
#[test]
fn franken_transition_from_current_meta_backfills_current_schema_marker() {
    let dir = TempDir::new().unwrap();
    let db_path = dir.path().join("test_current_transition.db");

    // Build the fixture, then close the connection before reopening.
    let conn = FrankenConnection::open(db_path.to_string_lossy().to_string()).unwrap();
    conn.execute("CREATE TABLE meta (key TEXT PRIMARY KEY, value TEXT NOT NULL);")
        .unwrap();
    conn.execute_compat(
        "INSERT INTO meta(key, value) VALUES('schema_version', ?1);",
        &[ParamValue::from(CURRENT_SCHEMA_VERSION.to_string())],
    )
    .unwrap();
    conn.execute("CREATE TABLE conversations (id INTEGER PRIMARY KEY);")
        .unwrap();
    drop(conn);

    let conn = FrankenConnection::open(db_path.to_string_lossy().to_string()).unwrap();
    transition_from_meta_version(&conn).unwrap();

    let rows = conn
        .query("SELECT version FROM _schema_migrations ORDER BY version;")
        .unwrap();
    // Decode strictly: a row that cannot be read as i64 fails here instead of
    // silently vanishing from the compared list.
    let versions: Vec<i64> = rows
        .iter()
        .map(|row| row.get_typed(0))
        .collect::<std::result::Result<_, _>>()
        .unwrap();
    assert_eq!(
        versions,
        (1..=CURRENT_SCHEMA_VERSION).collect::<Vec<i64>>(),
        "current meta schema marker should backfill every known migration"
    );
}
/// When `_schema_migrations` already exists with a row in it,
/// `transition_from_meta_version` must leave the row count unchanged.
#[test]
fn franken_transition_skips_when_already_done() {
    let tmp = TempDir::new().unwrap();
    let db_path = tmp.path().join("test_transition_skip.db");
    let conn = FrankenConnection::open(db_path.to_string_lossy().to_string()).unwrap();

    // Pre-populate the ledger with a single placeholder row.
    for setup_sql in [
        "CREATE TABLE _schema_migrations (version INTEGER PRIMARY KEY, name TEXT NOT NULL, applied_at TEXT NOT NULL DEFAULT 'now');",
        "INSERT INTO _schema_migrations (version, name) VALUES (1, 'test');",
    ] {
        conn.execute(setup_sql).unwrap();
    }

    transition_from_meta_version(&conn).unwrap();

    let ledger_rows: i64 = conn
        .query("SELECT COUNT(*) FROM _schema_migrations;")
        .unwrap()
        .first()
        .unwrap()
        .get_typed(0)
        .unwrap();
    assert_eq!(
        ledger_rows, 1,
        "transition should not re-run on already-transitioned DB"
    );
}
/// On an empty database file, `transition_from_meta_version` must succeed
/// and must not create `_schema_migrations` (querying it has to fail).
#[test]
fn franken_transition_fresh_db_is_noop() {
    let tmp = TempDir::new().unwrap();
    let db_path = tmp.path().join("test_fresh_noop.db");
    let conn = FrankenConnection::open(db_path.to_string_lossy().to_string()).unwrap();

    transition_from_meta_version(&conn).unwrap();

    // A failing SELECT is how this test detects that the table is absent.
    let ledger_is_absent = conn
        .query("SELECT * FROM \"_schema_migrations\";")
        .is_err();
    assert!(
        ledger_is_absent,
        "transition should not create _schema_migrations on fresh DB"
    );
}
/// A legacy version-13 database that also contains an `fts_messages` FTS5
/// virtual table must transition cleanly and end up with
/// `_schema_migrations` rows for versions 1 through 13.
#[test]
fn franken_transition_with_fts_virtual_table_succeeds() {
    let dir = TempDir::new().unwrap();
    let db_path = dir.path().join("test_transition_with_fts.db");

    // Build the fixture, then close the connection before reopening.
    let conn = FrankenConnection::open(db_path.to_string_lossy().into_owned()).unwrap();
    conn.execute_batch(
        "CREATE TABLE meta (key TEXT PRIMARY KEY, value TEXT NOT NULL);
INSERT INTO meta(key, value) VALUES('schema_version', '13');
CREATE TABLE conversations (id INTEGER PRIMARY KEY);
CREATE VIRTUAL TABLE fts_messages USING fts5(
content,
title,
agent,
workspace,
source_path,
created_at,
content='',
tokenize='porter unicode61'
);",
    )
    .unwrap();
    drop(conn);

    let conn = FrankenConnection::open(db_path.to_string_lossy().to_string()).unwrap();
    transition_from_meta_version(&conn).unwrap();

    let rows = conn
        .query("SELECT version FROM _schema_migrations ORDER BY version;")
        .unwrap();
    // Decode strictly: a row that cannot be read as i64 fails here instead of
    // silently vanishing from the compared list.
    let versions: Vec<i64> = rows
        .iter()
        .map(|row| row.get_typed(0))
        .collect::<std::result::Result<_, _>>()
        .unwrap();
    assert_eq!(versions, (1..=13).collect::<Vec<i64>>());
}
/// Builds a hand-written legacy version-13 database (core tables, one
/// conversation with one message, and an `fts_messages` FTS5 virtual table),
/// then checks that `FrankenStorage::open` brings it to
/// `CURRENT_SCHEMA_VERSION` with `_schema_migrations` rows for every version
/// from 1 through current.
#[test]
fn franken_storage_open_legacy_v13_with_fts_virtual_table_succeeds() {
    let dir = TempDir::new().unwrap();
    let db_path = dir.path().join("test_open_legacy_v13_with_fts.db");

    // Build the legacy fixture, then close the connection before reopening
    // through FrankenStorage.
    let conn = FrankenConnection::open(db_path.to_string_lossy().into_owned()).unwrap();
    conn.execute_batch(
        "CREATE TABLE meta (key TEXT PRIMARY KEY, value TEXT NOT NULL);
INSERT INTO meta(key, value) VALUES('schema_version', '13');
CREATE TABLE agents (
id INTEGER PRIMARY KEY,
slug TEXT NOT NULL
);
CREATE TABLE workspaces (
id INTEGER PRIMARY KEY,
path TEXT NOT NULL
);
CREATE TABLE sources (
id TEXT PRIMARY KEY,
kind TEXT NOT NULL,
host_label TEXT,
machine_id TEXT,
platform TEXT,
config_json TEXT,
created_at INTEGER NOT NULL,
updated_at INTEGER NOT NULL
);
CREATE TABLE conversations (
id INTEGER PRIMARY KEY,
agent_id INTEGER NOT NULL,
workspace_id INTEGER,
source_id TEXT NOT NULL DEFAULT 'local',
external_id TEXT,
title TEXT,
source_path TEXT NOT NULL,
started_at INTEGER,
ended_at INTEGER
);
CREATE TABLE messages (
id INTEGER PRIMARY KEY,
conversation_id INTEGER NOT NULL,
idx INTEGER NOT NULL,
role TEXT NOT NULL,
author TEXT,
created_at INTEGER,
content TEXT NOT NULL,
extra_json TEXT,
extra_bin BLOB
);
INSERT INTO agents(id, slug) VALUES (1, 'codex');
INSERT INTO workspaces(id, path) VALUES (1, '/data/projects/coding_agent_session_search');
INSERT INTO sources(id, kind, host_label, created_at, updated_at)
VALUES ('local', 'local', NULL, 1710000000000, 1710000000000);
INSERT INTO conversations(
id,
agent_id,
workspace_id,
source_id,
external_id,
title,
source_path,
started_at
)
VALUES (
1,
1,
1,
'local',
'legacy-session',
'legacy session',
'/tmp/legacy.jsonl',
1710000000000
);
INSERT INTO messages(id, conversation_id, idx, role, author, created_at, content)
VALUES (1, 1, 0, 'user', 'tester', 1710000000000, 'legacy content');
CREATE VIRTUAL TABLE fts_messages USING fts5(
content,
title,
agent,
workspace,
source_path,
created_at,
message_id,
content='',
tokenize='porter unicode61'
);",
    )
    .unwrap();
    drop(conn);

    let storage = FrankenStorage::open(&db_path).unwrap();
    assert_eq!(storage.schema_version().unwrap(), CURRENT_SCHEMA_VERSION);

    let rows = storage
        .raw()
        .query("SELECT version FROM _schema_migrations ORDER BY version;")
        .unwrap();
    // Decode strictly: a row that cannot be read as i64 fails here instead of
    // silently vanishing from the compared list.
    let versions: Vec<i64> = rows
        .iter()
        .map(|row| row.get_typed(0))
        .collect::<std::result::Result<_, _>>()
        .unwrap();
    assert_eq!(versions, (1..=CURRENT_SCHEMA_VERSION).collect::<Vec<i64>>());
}
// Creates a database with one conversation, then uses a rusqlite fixture
// connection to inject a second `fts_messages` row into `sqlite_master` and to
// delete the FTS rebuild marker from `meta`. After reopening through
// `FrankenStorage` and calling `ensure_search_fallback_fts_consistency`, the
// test expects exactly one `fts_messages` schema row and as many FTS rows as
// there are messages.
#[test]
fn franken_storage_open_repairs_duplicate_fts_messages_schema_rows() {
let dir = TempDir::new().unwrap();
let db_path = dir.path().join("test_open_repairs_duplicate_fts_schema.db");
let storage = FrankenStorage::open(&db_path).unwrap();
let agent = Agent {
id: None,
slug: "codex".into(),
name: "Codex".into(),
version: None,
kind: AgentKind::Cli,
};
let agent_id = storage.ensure_agent(&agent).unwrap();
// A single one-message conversation is enough to give the FTS table content.
let conversation = Conversation {
id: None,
agent_slug: "codex".into(),
workspace: Some(PathBuf::from("/tmp/workspace")),
external_id: Some("dup-fts-schema".into()),
title: Some("Duplicate FTS schema".into()),
source_path: PathBuf::from("/tmp/dup-fts-schema.jsonl"),
started_at: Some(1_700_000_000_000),
ended_at: Some(1_700_000_000_100),
approx_tokens: Some(42),
metadata_json: serde_json::Value::Null,
messages: vec![Message {
id: None,
idx: 0,
role: MessageRole::User,
author: Some("user".into()),
created_at: Some(1_700_000_000_050),
content: "message that should remain queryable".into(),
extra_json: serde_json::Value::Null,
snippets: Vec::new(),
}],
source_id: LOCAL_SOURCE_ID.into(),
origin_host: None,
};
storage
.insert_conversation_tree(agent_id, None, &conversation)
.unwrap();
// Close the storage before the fixture helpers below work on the same file.
drop(storage);
// NOTE(review): helper defined elsewhere; by its name it creates the FTS
// schema through rusqlite before the duplicate is injected — confirm.
materialize_fresh_fts_schema_via_rusqlite(&db_path).unwrap();
let duplicate_legacy_fts_sql = "CREATE VIRTUAL TABLE fts_messages USING fts5(content, title, agent, workspace, source_path, created_at UNINDEXED, message_id UNINDEXED, tokenize='porter')";
let conn = rusqlite_test_fixture_conn(&db_path);
// `writable_schema` is switched on only around the direct `sqlite_master`
// insert and the marker delete, then switched off again.
conn.execute_batch("PRAGMA writable_schema = ON;").unwrap();
conn.execute(
"INSERT INTO sqlite_master(type, name, tbl_name, rootpage, sql)
VALUES('table', 'fts_messages', 'fts_messages', 0, ?1)",
[duplicate_legacy_fts_sql],
)
.unwrap();
// Remove the rebuild marker so the later check can tell whether open()
// wrote it back.
conn.execute(
"DELETE FROM meta WHERE key = ?1",
[FTS_FRANKEN_REBUILD_META_KEY],
)
.unwrap();
conn.execute_batch("PRAGMA writable_schema = OFF;").unwrap();
// Precondition: the corruption is really in place (two schema rows).
let duplicate_rows: i64 = conn
.query_row(
"SELECT COUNT(*) FROM sqlite_master WHERE name = 'fts_messages'",
[],
|row| row.get(0),
)
.unwrap();
assert_eq!(duplicate_rows, 2);
drop(conn);
let reopened = FrankenStorage::open(&db_path).unwrap();
assert_eq!(reopened.schema_version().unwrap(), CURRENT_SCHEMA_VERSION);
// Opening alone must not have re-created the rebuild marker.
let generation_rows: Vec<String> = reopened
.raw()
.query_map_collect(
"SELECT value FROM meta WHERE key = ?1",
fparams![FTS_FRANKEN_REBUILD_META_KEY],
|row| row.get_typed(0),
)
.unwrap();
assert_eq!(
generation_rows.len(),
0,
"canonical open should not eagerly rewrite FTS repair metadata"
);
// The explicit consistency pass is what is expected to fix the schema.
reopened.ensure_search_fallback_fts_consistency().unwrap();
// The schema-row count is read through a separate, fresh connection.
let repaired = FrankenConnection::open(db_path.to_string_lossy().into_owned()).unwrap();
assert_eq!(franken_fts_schema_rows(&repaired).unwrap(), 1);
let total_messages: i64 = reopened
.raw()
.query_row_map("SELECT COUNT(*) FROM messages", fparams![], |row| {
row.get_typed(0)
})
.unwrap();
let total_fts_rows: i64 = reopened
.raw()
.query_row_map("SELECT COUNT(*) FROM fts_messages", fparams![], |row| {
row.get_typed(0)
})
.unwrap();
// Every message must have exactly one FTS row after the repair.
assert_eq!(total_fts_rows, total_messages);
}
/// Opens a new database, runs the FTS consistency pass, and checks through a
/// separate connection that exactly one `fts_messages` schema row exists.
/// A final reopen must still be able to query `fts_messages`.
#[test]
fn franken_storage_open_fresh_db_keeps_single_franken_fts_schema_row() {
    let tmp = TempDir::new().unwrap();
    let db_path = tmp.path().join("fresh-franken-storage-open.db");

    let first_open = FrankenStorage::open(&db_path).unwrap();
    assert_eq!(first_open.schema_version().unwrap(), CURRENT_SCHEMA_VERSION);
    first_open
        .ensure_search_fallback_fts_consistency()
        .expect("ensure FTS consistency after fresh open");
    drop(first_open);

    // Inspect sqlite_master through a plain connection, then release it
    // before the storage is opened again.
    let inspector = FrankenConnection::open(db_path.to_string_lossy().into_owned())
        .expect("open DB via frankensqlite for sqlite_master inspection");
    let schema_row_count = franken_fts_schema_rows(&inspector).unwrap();
    assert_eq!(
        schema_row_count,
        1,
        "exactly one fts_messages schema row should exist after ensure_search_fallback_fts_consistency"
    );
    drop(inspector);

    let second_open = FrankenStorage::open(&db_path).unwrap();
    let fts_is_queryable = second_open
        .raw()
        .query("SELECT COUNT(*) FROM fts_messages")
        .is_ok();
    assert!(
        fts_is_queryable,
        "fts_messages must be queryable through frankensqlite after open"
    );
}
/// Drops nine derived/analytics tables from an up-to-date database while
/// leaving `meta.schema_version` at the current value, then checks that
/// `FrankenStorage::open` recreates all nine.
#[test]
fn franken_storage_open_repairs_missing_analytics_tables_when_version_markers_lie() {
    let tmp = TempDir::new().unwrap();
    let db_path = tmp.path().join("test_repair_missing_analytics.db");

    // Step 1: create a healthy database and close it.
    let healthy = FrankenStorage::open(&db_path).unwrap();
    assert_eq!(healthy.schema_version().unwrap(), CURRENT_SCHEMA_VERSION);
    drop(healthy);

    // Step 2: remove the tables behind the storage layer's back, keep the
    // version marker claiming "current", and close the connection.
    let saboteur = FrankenConnection::open(db_path.to_string_lossy().into_owned()).unwrap();
    let doomed_tables = [
        "usage_models_daily",
        "usage_daily",
        "usage_hourly",
        "message_metrics",
        "token_daily_stats",
        "token_usage",
        "model_pricing",
        "embedding_jobs",
        "daily_stats",
    ];
    for table in doomed_tables {
        saboteur
            .execute(&format!("DROP TABLE IF EXISTS {table}"))
            .unwrap();
    }
    saboteur
        .execute_compat(
            "UPDATE meta SET value = ?1 WHERE key = 'schema_version'",
            &[ParamValue::from(CURRENT_SCHEMA_VERSION.to_string())],
        )
        .unwrap();
    drop(saboteur);

    // Step 3: reopen and count how many of the nine tables exist again.
    let repaired = FrankenStorage::open(&db_path).unwrap();
    assert_eq!(repaired.schema_version().unwrap(), CURRENT_SCHEMA_VERSION);

    let analytics_probe_sql = "SELECT COUNT(*) FROM sqlite_master
WHERE type='table'
AND name IN (
'daily_stats',
'embedding_jobs',
'token_usage',
'token_daily_stats',
'model_pricing',
'message_metrics',
'usage_hourly',
'usage_daily',
'usage_models_daily'
)";
    let analytics_count: i64 = repaired
        .raw()
        .query_row_map(analytics_probe_sql, &[], |row| row.get_typed(0))
        .unwrap();
    assert_eq!(
        analytics_count, 9,
        "open() should recreate the missing analytics tables even when schema_version already says current"
    );
}
/// Treats every table in `REQUIRED_CURRENT_SCHEMA_TABLE_PROBES` as missing
/// and checks that the repair batches selected for them list each of those
/// tables at least once.
#[test]
fn current_schema_repair_batches_cover_every_required_probe() {
    let mut missing_tables: Vec<&'static str> = Vec::new();
    for (table_name, _) in REQUIRED_CURRENT_SCHEMA_TABLE_PROBES.iter() {
        missing_tables.push(*table_name);
    }

    let batches = current_schema_repair_batches_for_missing_tables(&missing_tables).unwrap();

    // Union of the tables named by all selected batches.
    let mut covered_tables: HashSet<&'static str> = HashSet::new();
    for batch in batches.iter() {
        covered_tables.extend(batch.tables.iter().copied());
    }

    for table_name in &missing_tables {
        assert!(
            covered_tables.contains(*table_name),
            "missing repair coverage for {table_name}"
        );
    }
}
/// Checks that no entry in `CURRENT_SCHEMA_REPAIR_BATCHES` contains SQL that
/// creates one of the core tables, creates the `fts_messages` virtual table,
/// or drops a table.
#[test]
fn current_schema_repair_batches_do_not_replay_core_schema_bootstrap() {
    // (forbidden SQL fragment, expectation appended to the failure message),
    // checked in this order for every batch.
    const FORBIDDEN_FRAGMENTS: [(&str, &str); 8] = [
        ("CREATE TABLE IF NOT EXISTS meta", "should not recreate meta"),
        (
            "CREATE TABLE IF NOT EXISTS agents",
            "should not recreate agents",
        ),
        (
            "CREATE TABLE IF NOT EXISTS workspaces",
            "should not recreate workspaces",
        ),
        (
            "CREATE TABLE IF NOT EXISTS conversations",
            "should not recreate conversations",
        ),
        (
            "CREATE TABLE IF NOT EXISTS messages",
            "should not recreate messages",
        ),
        (
            "CREATE TABLE IF NOT EXISTS snippets",
            "should not recreate snippets",
        ),
        (
            "CREATE VIRTUAL TABLE fts_messages",
            "should not recreate FTS tables",
        ),
        ("DROP TABLE", "should never drop tables"),
    ];

    for batch in CURRENT_SCHEMA_REPAIR_BATCHES {
        for (fragment, expectation) in FORBIDDEN_FRAGMENTS {
            assert!(
                !batch.sql.contains(fragment),
                "repair batch {} {}",
                batch.name,
                expectation
            );
        }
    }
}
/// Runs the three migration stages on an empty in-memory database (the
/// runner before the tail cache, the tail-cache migration itself, and the
/// runner after it) and checks that together they apply versions 13 through
/// `CURRENT_SCHEMA_VERSION` in order.
#[test]
fn build_cass_migrations_applies_combined_v13() {
    let conn = FrankenConnection::open(":memory:").unwrap();

    let before_tail_cache = build_cass_migrations_before_tail_cache()
        .run(&conn)
        .unwrap();
    let tail_cache_applied = apply_conversation_tail_state_cache_migration(&conn).unwrap();
    assert!(tail_cache_applied);
    let after_tail_cache = build_cass_migrations_after_tail_cache().run(&conn).unwrap();
    assert!(before_tail_cache.was_fresh);

    // The tail-cache step does not report a version list, so 15 is spliced
    // in between the two runner results by hand.
    let applied: Vec<i64> = before_tail_cache
        .applied
        .into_iter()
        .chain(std::iter::once(15))
        .chain(after_tail_cache.applied)
        .collect();
    assert_eq!(
        applied,
        (13..=CURRENT_SCHEMA_VERSION).collect::<Vec<i64>>(),
        "should apply combined V13 plus additive post-V13 migrations"
    );

    let max_version_rows = conn
        .query("SELECT MAX(version) FROM _schema_migrations;")
        .unwrap();
    let current: i64 = max_version_rows.first().unwrap().get_typed(0).unwrap();
    assert_eq!(current, CURRENT_SCHEMA_VERSION);
}
/// Inserts one two-message conversation through
/// `insert_conversations_batched` (the agent message carries a usage
/// payload in `extra_json`) and checks that `daily_stats`,
/// `token_daily_stats`, `usage_daily` and `usage_models_daily` each end up
/// with at least one row.
#[test]
fn franken_insert_conversations_batched_populates_analytics_rollups() {
    use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
    use frankensqlite::compat::{ConnectionExt, RowExt};
    use std::path::PathBuf;

    let tmp = TempDir::new().unwrap();
    let db_path = tmp.path().join("franken-index.db");
    let storage = FrankenStorage::open(&db_path).unwrap();

    let claude_code = Agent {
        id: None,
        slug: "claude_code".into(),
        name: "Claude Code".into(),
        version: Some("1.0".into()),
        kind: AgentKind::Cli,
    };
    let agent_id = storage.ensure_agent(&claude_code).unwrap();

    let base_ms = 1_770_551_400_000_i64;
    let user_turn = Message {
        id: None,
        idx: 0,
        role: MessageRole::User,
        author: None,
        created_at: Some(base_ms),
        content: "Please make a plan.".into(),
        extra_json: serde_json::Value::Null,
        snippets: vec![],
    };
    // Only the agent turn carries model and token usage metadata.
    let agent_turn = Message {
        id: None,
        idx: 1,
        role: MessageRole::Agent,
        author: None,
        created_at: Some(base_ms + 30_000),
        content: "## Plan\n\n1. Reproduce\n2. Patch\n3. Verify".into(),
        extra_json: serde_json::json!({
            "message": {
                "model": "claude-opus-4-6",
                "usage": {
                    "input_tokens": 100,
                    "output_tokens": 50,
                    "cache_read_input_tokens": 25,
                    "cache_creation_input_tokens": 10,
                    "service_tier": "standard"
                }
            }
        }),
        snippets: vec![],
    };
    let conv = Conversation {
        id: None,
        agent_slug: "claude_code".into(),
        workspace: Some(PathBuf::from("/tmp/workspace")),
        external_id: Some("franken-batch-upsert".into()),
        title: Some("Franken batch upsert".into()),
        source_path: PathBuf::from("/tmp/franken.jsonl"),
        started_at: Some(base_ms),
        ended_at: Some(base_ms + 60_000),
        approx_tokens: None,
        metadata_json: serde_json::Value::Null,
        messages: vec![user_turn, agent_turn],
        source_id: "local".into(),
        origin_host: None,
    };

    let outcomes = storage
        .insert_conversations_batched(&[(agent_id, None, &conv)])
        .unwrap();
    assert_eq!(outcomes.len(), 1);
    assert_eq!(outcomes[0].inserted_indices, vec![0, 1]);

    // All four counts are read before any of them is asserted.
    let conn = storage.raw();
    let row_count = |sql: &str| -> i64 {
        conn.query_row_map(sql, fparams![], |row| row.get_typed(0))
            .unwrap()
    };
    let daily_stats_rows = row_count("SELECT COUNT(*) FROM daily_stats");
    let token_daily_rows = row_count("SELECT COUNT(*) FROM token_daily_stats");
    let usage_daily_rows = row_count("SELECT COUNT(*) FROM usage_daily");
    let model_daily_rows = row_count("SELECT COUNT(*) FROM usage_models_daily");

    assert!(daily_stats_rows > 0, "daily_stats should be populated");
    assert!(
        token_daily_rows > 0,
        "token_daily_stats should be populated"
    );
    assert!(usage_daily_rows > 0, "usage_daily should be populated");
    assert!(
        model_daily_rows > 0,
        "usage_models_daily should be populated"
    );
}
/// A manager built with `reader_count: 3` and `max_writers: 2` must report
/// exactly those values.
#[test]
fn connection_manager_creates_readers() {
    let tmp = TempDir::new().unwrap();
    let db_path = tmp.path().join("cm.db");

    // Open and immediately close a storage so the database file exists
    // before the manager is constructed.
    drop(FrankenStorage::open(&db_path).unwrap());

    let mgr = FrankenConnectionManager::new(
        &db_path,
        ConnectionManagerConfig {
            reader_count: 3,
            max_writers: 2,
        },
    )
    .unwrap();

    assert_eq!(mgr.reader_count(), 3);
    assert_eq!(mgr.max_writers(), 2);
}
/// A manager configured with zero readers and zero writers must report one
/// of each, and acquiring a writer from another thread must complete within
/// ten seconds rather than block forever.
#[test]
fn connection_manager_clamps_zero_writer_limit_to_prevent_deadlock() {
    let dir = TempDir::new().unwrap();
    let db_path = dir.path().join("cm.db");
    let fs = FrankenStorage::open(&db_path).unwrap();
    drop(fs);

    let mgr = std::sync::Arc::new(
        FrankenConnectionManager::new(
            &db_path,
            ConnectionManagerConfig {
                reader_count: 0,
                max_writers: 0,
            },
        )
        .unwrap(),
    );
    assert_eq!(mgr.reader_count(), 1);
    assert_eq!(mgr.max_writers(), 1);

    // The acquisition runs on a helper thread so that a deadlock shows up as
    // a timeout on the channel instead of hanging the test. The thread is
    // deliberately not joined: joining a blocked thread would hang too.
    let (tx, rx) = std::sync::mpsc::channel();
    let mgr_for_thread = std::sync::Arc::clone(&mgr);
    std::thread::spawn(move || {
        let result = mgr_for_thread.writer().map(|mut guard| {
            guard.mark_committed();
        });
        tx.send(result.is_ok()).expect("writer result send");
    });

    // Match on the whole result instead of unwrapping it: on a timeout (the
    // exact failure this test guards against) or a disconnected sender, the
    // descriptive message below is reported rather than a bare unwrap panic.
    let acquired = rx.recv_timeout(Duration::from_secs(10));
    assert!(
        matches!(acquired, Ok(true)),
        "writer acquisition should not block forever when configured with zero writer slots (got {acquired:?})"
    );
}
/// Taking one reader from the manager must advance its `reader_idx`
/// counter by exactly one.
#[test]
fn connection_manager_reader_round_robin() {
    let tmp = TempDir::new().unwrap();
    let db_path = tmp.path().join("cm.db");
    drop(FrankenStorage::open(&db_path).unwrap());

    let mgr = FrankenConnectionManager::new(
        &db_path,
        ConnectionManagerConfig {
            reader_count: 2,
            max_writers: 1,
        },
    )
    .unwrap();

    let current_idx = || mgr.reader_idx.load(std::sync::atomic::Ordering::Relaxed);

    let idx_before = current_idx();
    // The reader stays bound until the end of the test.
    let _reader = mgr.reader();
    let idx_after = current_idx();

    assert_eq!(idx_after, idx_before + 1, "reader index should advance");
}
/// Creates a table and inserts one row through a writer guard, releases the
/// guard, and checks that a reader from the same manager sees the row.
#[test]
fn connection_manager_writer_reads_and_writes() {
    use frankensqlite::compat::RowExt;

    let tmp = TempDir::new().unwrap();
    let db_path = tmp.path().join("cm.db");
    drop(FrankenStorage::open(&db_path).unwrap());
    let mgr = FrankenConnectionManager::new(&db_path, Default::default()).unwrap();

    let mut writer = mgr.writer().unwrap();
    for statement in [
        "CREATE TABLE IF NOT EXISTS cm_test (id INTEGER PRIMARY KEY, val TEXT)",
        "INSERT INTO cm_test (val) VALUES ('hello')",
    ] {
        writer.storage().raw().execute(statement).unwrap();
    }
    writer.mark_committed();
    // Release the writer before reading.
    drop(writer);

    let reader = mgr.reader();
    let fetched = reader.query("SELECT val FROM cm_test").unwrap();
    assert_eq!(fetched.len(), 1);
    assert_eq!(fetched[0].get_typed::<String>(0).unwrap(), "hello");
}
/// With a single writer slot, a second writer can only be acquired if
/// dropping the first guard released the slot.
#[test]
fn connection_manager_writer_guard_drops_releases_slot() {
    let tmp = TempDir::new().unwrap();
    let db_path = tmp.path().join("cm.db");
    drop(FrankenStorage::open(&db_path).unwrap());

    let single_writer = ConnectionManagerConfig {
        reader_count: 1,
        max_writers: 1,
    };
    let mgr = FrankenConnectionManager::new(&db_path, single_writer).unwrap();

    // First acquisition: commit, then give the slot back explicitly.
    let mut first = mgr.writer().unwrap();
    first.mark_committed();
    drop(first);

    // Second acquisition must succeed now that the slot is free.
    let mut second = mgr.writer().unwrap();
    second.mark_committed();
}
/// Creates a table and inserts one row through a `concurrent_writer` guard,
/// releases the guard, and checks that a reader sees the row.
#[test]
fn connection_manager_concurrent_writer_works() {
    use frankensqlite::compat::RowExt;

    let tmp = TempDir::new().unwrap();
    let db_path = tmp.path().join("cm.db");
    drop(FrankenStorage::open(&db_path).unwrap());

    let mgr = FrankenConnectionManager::new(
        &db_path,
        ConnectionManagerConfig {
            reader_count: 1,
            max_writers: 2,
        },
    )
    .unwrap();

    let mut writer = mgr.concurrent_writer().unwrap();
    for statement in [
        "CREATE TABLE IF NOT EXISTS cm_conc (id INTEGER PRIMARY KEY, val TEXT)",
        "INSERT INTO cm_conc (val) VALUES ('concurrent')",
    ] {
        writer.storage().raw().execute(statement).unwrap();
    }
    writer.mark_committed();
    // Release the writer before reading.
    drop(writer);

    let reader = mgr.reader();
    let fetched = reader.query("SELECT val FROM cm_conc").unwrap();
    assert_eq!(fetched.len(), 1);
    assert_eq!(fetched[0].get_typed::<String>(0).unwrap(), "concurrent");
}
/// The default configuration must ask for four readers and at least one
/// writer.
#[test]
fn connection_manager_default_config() {
    let ConnectionManagerConfig {
        reader_count,
        max_writers,
    } = ConnectionManagerConfig::default();

    assert_eq!(reader_count, 4);
    assert!(max_writers > 0);
}
/// Seeds one conversation each for agents `openclaw` and `codex`, purges
/// `openclaw`, rebuilds the derived tables, and checks that only the `codex`
/// data remains in the agent list, the conversation/message totals, the FTS
/// table, `daily_stats` and `token_daily_stats`.
#[test]
fn purge_agent_archive_data_removes_only_target_agent_and_rebuilds_derived_tables() {
    use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
    use std::path::PathBuf;

    /// Inserts a two-message (user, then assistant) conversation for
    /// `agent_slug`; `marker` keeps the external id, title, path and
    /// message contents distinct.
    fn seed_conversation(storage: &FrankenStorage, agent_slug: &str, marker: &str) {
        let agent_id = storage
            .ensure_agent(&Agent {
                id: None,
                slug: agent_slug.into(),
                name: agent_slug.into(),
                version: None,
                kind: AgentKind::Cli,
            })
            .unwrap();

        // (idx, role, author, created_at); the author doubles as the last
        // word of the message content.
        let turns = [
            (0, MessageRole::User, "user", 1_700_000_000_010),
            (1, MessageRole::Agent, "assistant", 1_700_000_000_020),
        ];
        let messages = turns
            .into_iter()
            .map(|(idx, role, speaker, created_at)| Message {
                id: None,
                idx,
                role,
                author: Some(speaker.into()),
                created_at: Some(created_at),
                content: format!("{agent_slug} {marker} {speaker}"),
                extra_json: serde_json::Value::Null,
                snippets: Vec::new(),
            })
            .collect();

        let conversation = Conversation {
            id: None,
            agent_slug: agent_slug.into(),
            workspace: Some(PathBuf::from("/tmp/workspace")),
            external_id: Some(format!("{agent_slug}-{marker}")),
            title: Some(format!("{agent_slug} {marker}")),
            source_path: PathBuf::from(format!("/tmp/{agent_slug}-{marker}.jsonl")),
            started_at: Some(1_700_000_000_000),
            ended_at: Some(1_700_000_000_100),
            approx_tokens: None,
            metadata_json: serde_json::Value::Null,
            messages,
            source_id: LOCAL_SOURCE_ID.into(),
            origin_host: None,
        };
        storage
            .insert_conversation_tree(agent_id, None, &conversation)
            .unwrap();
    }

    let tmp = TempDir::new().unwrap();
    let db_path = tmp.path().join("agent_search.db");
    let storage = FrankenStorage::open(&db_path).unwrap();
    seed_conversation(&storage, "openclaw", "purge-target");
    seed_conversation(&storage, "codex", "keep-target");

    let purge = storage.purge_agent_archive_data("openclaw").unwrap();
    assert_eq!(purge.conversations_deleted, 1);
    assert_eq!(purge.messages_deleted, 2);

    // Derived tables are rebuilt explicitly before they are inspected.
    storage.rebuild_fts().unwrap();
    storage.rebuild_analytics().unwrap();
    storage.rebuild_daily_stats().unwrap();
    storage.rebuild_token_daily_stats().unwrap();

    let remaining_agents = storage.list_agents().unwrap();
    assert_eq!(remaining_agents.len(), 1);
    assert_eq!(remaining_agents[0].slug, "codex");
    assert_eq!(storage.total_conversation_count().unwrap(), 1);
    assert_eq!(storage.total_message_count().unwrap(), 2);

    // Runs a single-value i64 query against the storage's raw connection.
    let scalar = |sql: &str| -> i64 {
        storage
            .raw()
            .query_row_map(sql, fparams![], |row| row.get_typed(0))
            .unwrap()
    };

    let fts_rows = scalar("SELECT COUNT(*) FROM fts_messages");
    assert_eq!(fts_rows, 2);

    let total_daily_sessions = scalar(
        "SELECT COALESCE(SUM(session_count), 0)
FROM daily_stats
WHERE agent_slug = 'all' AND source_id = 'all'",
    );
    assert_eq!(total_daily_sessions, 1);

    let openclaw_token_rows =
        scalar("SELECT COUNT(*) FROM token_daily_stats WHERE agent_slug = 'openclaw'");
    assert_eq!(openclaw_token_rows, 0);
}
/// Checks that `cleanup_orphan_fk_rows` deletes rows whose parent row is
/// missing, keeps the single fully-linked message together with its metric
/// and token rows, and reports nothing when run a second time.
///
/// The fixture is inserted while `PRAGMA foreign_keys` is switched off
/// (presumably so that rows with dangling references are accepted) and the
/// pragma is switched back on before the cleanup runs.
#[test]
fn cleanup_orphan_fk_rows_removes_orphans_and_is_noop_on_clean_db() {
    let tmp = TempDir::new().unwrap();
    let db_file = tmp.path().join("orphan_fk_self_heal.db");
    let storage = FrankenStorage::open(&db_file).unwrap();

    // Runs a query that yields one integer in column 0 and returns it.
    let count_rows = |sql: &str| -> i64 {
        storage
            .raw()
            .query_row_map(sql, fparams![], |row| row.get_typed(0))
            .unwrap()
    };
    // Executes a statement that takes no bound parameters.
    let run = |sql: &str| {
        storage.raw().execute_compat(sql, fparams![]).unwrap();
    };

    storage.raw().execute("PRAGMA foreign_keys = OFF").unwrap();

    // One valid agent -> conversation -> message chain that must survive.
    run(
        "INSERT INTO agents(id, slug, name, kind, created_at, updated_at) \
         VALUES(1, 'test-agent', 'Test Agent', 'cli', 0, 0)",
    );
    run(
        "INSERT INTO conversations(id, agent_id, source_id, source_path, started_at) \
         VALUES(1, 1, 'local', '/tmp/real.jsonl', 0)",
    );
    run(
        "INSERT INTO messages(id, conversation_id, idx, role, content) \
         VALUES(1, 1, 0, 'user', 'real message')",
    );

    // Three messages pointing at conversation ids (99_999 and 0) that were
    // never inserted above.
    let orphan_messages: [(i64, i64, i64); 3] = [(101, 99_999, 0), (102, 0, 0), (103, 0, 1)];
    for (orphan_id, missing_conversation, position) in orphan_messages {
        storage
            .raw()
            .execute_compat(
                "INSERT INTO messages(id, conversation_id, idx, role, content) \
                 VALUES(?1, ?2, ?3, 'user', 'orphan message')",
                fparams![orphan_id, missing_conversation, position],
            )
            .unwrap();
    }

    // Metric and token rows for the real message and for two of the orphans.
    for owner_id in [1_i64, 101_i64, 102_i64] {
        storage
            .raw()
            .execute_compat(
                "INSERT INTO message_metrics(
message_id, created_at_ms, hour_id, day_id, agent_slug,
role, content_chars, content_tokens_est
) VALUES(?1, 0, 0, 0, 'test-agent', 'user', 13, 2)",
                fparams![owner_id],
            )
            .unwrap();
        storage
            .raw()
            .execute_compat(
                "INSERT INTO token_usage(
message_id, conversation_id, agent_id, timestamp_ms, day_id,
role, content_chars
) VALUES(?1, 1, 1, 0, 0, 'user', 13)",
                fparams![owner_id],
            )
            .unwrap();
    }

    // A snippet referencing message id 99999, which was never inserted.
    run(
        "INSERT INTO snippets(message_id, file_path, start_line, end_line, language, snippet_text) \
         VALUES(99999, '/tmp/orphan-snippet.rs', 1, 2, 'rust', 'fn main() {}')",
    );

    storage.raw().execute("PRAGMA foreign_keys = ON").unwrap();

    // Sanity-check the fixture before running the cleanup.
    assert_eq!(count_rows("SELECT COUNT(*) FROM messages"), 4);
    assert_eq!(count_rows("SELECT COUNT(*) FROM snippets"), 1);
    assert_eq!(count_rows("SELECT COUNT(*) FROM message_metrics"), 3);
    assert_eq!(count_rows("SELECT COUNT(*) FROM token_usage"), 3);

    let report = storage.cleanup_orphan_fk_rows().unwrap();

    assert_eq!(
        count_rows("SELECT COUNT(*) FROM messages"),
        1,
        "real message must be preserved"
    );
    assert_eq!(count_rows("SELECT COUNT(*) FROM snippets"), 0);
    assert_eq!(
        count_rows("SELECT COUNT(*) FROM message_metrics"),
        1,
        "real message metric must be preserved"
    );
    assert_eq!(
        count_rows("SELECT COUNT(*) FROM token_usage"),
        1,
        "real token row must be preserved"
    );

    // The expected total of 4 matches the 3 orphan messages plus the 1 orphan
    // snippet asserted individually below.
    assert_eq!(report.total, 4, "report total: {:?}", report);

    // Looks up the count recorded for the first report entry of a table.
    let reported_for = |table: &str| {
        report
            .per_table
            .iter()
            .find_map(|(name, deleted)| (*name == table).then_some(*deleted))
    };
    assert_eq!(reported_for("messages"), Some(3));
    assert_eq!(reported_for("snippets"), Some(1));

    // A second pass over the now-clean database must report nothing.
    let rerun = storage.cleanup_orphan_fk_rows().unwrap();
    assert_eq!(rerun.total, 0);
    assert!(rerun.per_table.is_empty());
}
/// Seeds `ORPHAN_FK_ID_CHUNK_SIZE + 3` orphan messages (each with one
/// `message_metrics` row) and checks that a single `cleanup_orphan_fk_rows`
/// call removes all of them even though the count exceeds one chunk.
#[test]
fn cleanup_orphan_fk_rows_handles_more_than_one_delete_chunk() {
    let tmp = TempDir::new().unwrap();
    let db_file = tmp.path().join("orphan_fk_chunked_self_heal.db");
    let storage = FrankenStorage::open(&db_file).unwrap();
    let orphan_count = ORPHAN_FK_ID_CHUNK_SIZE + 3;

    storage.raw().execute("PRAGMA foreign_keys = OFF").unwrap();
    {
        // Insert the whole fixture inside one transaction.
        let mut tx = storage.raw().transaction().unwrap();
        for offset in (0..orphan_count).map(|n| i64::try_from(n).unwrap()) {
            // No conversation rows are inserted in this test, so every
            // message created here references a missing parent.
            let message_id = 10_000_i64 + offset;
            let conversation_id = 20_000_i64 + offset;
            tx.execute_compat(
                "INSERT INTO messages(id, conversation_id, idx, role, content) \
                 VALUES(?1, ?2, 0, 'user', 'orphan message')",
                fparams![message_id, conversation_id],
            )
            .unwrap();
            tx.execute_compat(
                "INSERT INTO message_metrics(
message_id, created_at_ms, hour_id, day_id, agent_slug,
role, content_chars, content_tokens_est
) VALUES(?1, 0, 0, 0, 'test-agent', 'user', 14, 2)",
                fparams![message_id],
            )
            .unwrap();
        }
        tx.commit().unwrap();
    }
    storage.raw().execute("PRAGMA foreign_keys = ON").unwrap();

    let report = storage.cleanup_orphan_fk_rows().unwrap();
    let expected_deleted = i64::try_from(orphan_count).unwrap();

    assert_eq!(report.total, expected_deleted);
    let reported_messages = report
        .per_table
        .iter()
        .find_map(|(name, deleted)| (*name == "messages").then_some(*deleted));
    assert_eq!(reported_messages, Some(expected_deleted));

    // Runs a query that yields one integer in column 0 and returns it.
    let count_rows = |sql: &str| -> i64 {
        storage
            .raw()
            .query_row_map(sql, fparams![], |row| row.get_typed(0))
            .unwrap()
    };
    assert_eq!(count_rows("SELECT COUNT(*) FROM messages"), 0);
    assert_eq!(count_rows("SELECT COUNT(*) FROM message_metrics"), 0);
}
/// Seeds `ORPHAN_FK_ID_CHUNK_SIZE * 2 + 5` `message_metrics` rows without
/// inserting any message, then checks that `cleanup_orphan_fk_rows` removes
/// them all and reports them under a single `message_metrics` entry.
#[test]
fn cleanup_orphan_fk_rows_pages_direct_child_orphans() {
    let tmp = TempDir::new().unwrap();
    let db_file = tmp.path().join("direct_orphan_fk_paged_self_heal.db");
    let storage = FrankenStorage::open(&db_file).unwrap();
    let orphan_count = (ORPHAN_FK_ID_CHUNK_SIZE * 2) + 5;

    storage.raw().execute("PRAGMA foreign_keys = OFF").unwrap();
    {
        // Insert the whole fixture inside one transaction.
        let mut tx = storage.raw().transaction().unwrap();
        for offset in (0..orphan_count).map(|n| i64::try_from(n).unwrap()) {
            let message_id = 50_000_i64 + offset;
            tx.execute_compat(
                "INSERT INTO message_metrics(
message_id, created_at_ms, hour_id, day_id, agent_slug,
role, content_chars, content_tokens_est
) VALUES(?1, 0, 0, 0, 'test-agent', 'user', 21, 3)",
                fparams![message_id],
            )
            .unwrap();
        }
        tx.commit().unwrap();
    }
    storage.raw().execute("PRAGMA foreign_keys = ON").unwrap();

    let report = storage.cleanup_orphan_fk_rows().unwrap();
    let expected_deleted = i64::try_from(orphan_count).unwrap();
    assert_eq!(report.total, expected_deleted);

    // Gather the count of every report entry that names `message_metrics`.
    let metric_entries: Vec<i64> = report
        .per_table
        .iter()
        .filter(|(name, _)| *name == "message_metrics")
        .map(|(_, deleted)| *deleted)
        .collect();
    assert_eq!(metric_entries.iter().sum::<i64>(), expected_deleted);
    // Several entries for one table would mean the totals were not merged.
    assert_eq!(
        metric_entries.len(),
        1,
        "paged cleanup should aggregate report entries by table: {report:?}"
    );

    // Runs a query that yields one integer in column 0 and returns it.
    let count_rows = |sql: &str| -> i64 {
        storage
            .raw()
            .query_row_map(sql, fparams![], |row| row.get_typed(0))
            .unwrap()
    };
    assert_eq!(count_rows("SELECT COUNT(*) FROM message_metrics"), 0);
}
}