pub(crate) mod lexical_generation;
pub(crate) mod memoization;
pub(crate) mod parallel_wal_shadow;
pub mod quarantine;
pub mod redact_secrets;
pub mod refresh_ledger;
pub(crate) mod responsiveness;
pub mod semantic;
pub mod semantic_progress;
use self::quarantine::{QuarantineKey, QuarantineState};
use self::refresh_ledger::{
EquivalenceArtifacts as RefreshEquivalenceArtifacts, PhaseRecord, RefreshLedger,
RefreshLedgerEvidence, RefreshPhase,
};
use std::any::Any;
use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet, VecDeque};
#[cfg(target_os = "linux")]
use std::ffi::CString;
use std::fs::{self, File, OpenOptions};
use std::io::{BufWriter, Seek, Write};
#[cfg(target_os = "linux")]
use std::os::unix::ffi::OsStrExt;
use std::path::{Path, PathBuf};
#[cfg(target_os = "macos")]
use std::process::Command;
use std::rc::Rc;
use std::sync::atomic::{AtomicBool, AtomicI64, AtomicU64, AtomicUsize, Ordering};
use std::sync::{Arc, Condvar, Mutex};
use std::thread::{self, JoinHandle};
use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH};
use anyhow::{Context, Result};
use crossbeam_channel::{Receiver, Sender, TrySendError, bounded, never, select};
use frankensearch::index::VectorIndex as FsVectorIndex;
use frankensqlite::compat::{
ConnectionExt, ParamValue, RowExt, Transaction as FrankenTransaction,
TransactionExt as FrankenTransactionExt,
};
use fs2::FileExt;
use notify::event::{AccessKind, AccessMode, MetadataKind, ModifyKind};
use notify::{RecursiveMode, Watcher, recommended_watcher};
use rayon::{ThreadPool, ThreadPoolBuilder, prelude::*};
use tempfile::Builder as TempDirBuilder;
use crate::connectors::NormalizedConversation;
#[cfg(test)]
use crate::connectors::NormalizedMessage;
use crate::connectors::{
Connector, ScanRoot, aider::AiderConnector, amp::AmpConnector, chatgpt::ChatGptConnector,
claude_code::ClaudeCodeConnector, clawdbot::ClawdbotConnector, cline::ClineConnector,
codex::CodexConnector, copilot::CopilotConnector, copilot_cli::CopilotCliConnector,
cursor::CursorConnector, factory::FactoryConnector, gemini::GeminiConnector,
kimi::KimiConnector, openclaw::OpenClawConnector, opencode::OpenCodeConnector,
pi_agent::PiAgentConnector, qwen::QwenConnector, vibe::VibeConnector,
};
use crate::model::conversation_packet::{
CONVERSATION_PACKET_VERSION, ConversationPacket, ConversationPacketHashes,
ConversationPacketProvenance, ConversationPacketSinkProjections,
};
use crate::search::asset_state::{SearchMaintenanceJobKind, SearchMaintenanceMode};
use crate::search::canonicalize::is_hard_message_noise;
use crate::search::tantivy::{
SearchableIndexSummary, TantivyIndex, index_dir, schema_hash_matches,
};
use crate::search::vector_index::{
ROLE_ASSISTANT, ROLE_SYSTEM, ROLE_TOOL, ROLE_USER, vector_index_path,
};
use crate::sources::config::{Platform, SourcesConfig};
use crate::sources::provenance::{LOCAL_SOURCE_ID, Origin, Source, SourceKind};
use crate::sources::sync::path_to_safe_dirname;
use crate::storage::sqlite::{
DailyStatsRebuildResult, FrankenStorage, FtsConsistencyRepair, HistoricalSalvageOutcome,
LEXICAL_REBUILD_PLANNER_ESTIMATED_BYTES_PER_MESSAGE, StatsAggregator, StatsDelta,
seed_canonical_from_best_historical_bundle,
};
use semantic::{
EmbeddingInput, SemanticIndexer, packet_embedding_inputs_from_storage,
packet_embedding_inputs_from_storage_since,
};
use crate::search::policy::{CHUNKING_STRATEGY_VERSION, SEMANTIC_SCHEMA_VERSION};
use crate::search::semantic_manifest::{
ArtifactRecord, SemanticManifest, TierKind as SemanticTierKind,
};
#[cfg(test)]
use std::iter::Peekable;
/// Map keyed by `(connector kind, path)` whose values carry a [`ScanRoot`] plus two optional
/// `i64` values.
// NOTE(review): the meaning of the two `Option<i64>` slots is not visible from this
// declaration — confirm against the code that populates the map.
type BatchClassificationMap =
HashMap<(ConnectorKind, PathBuf), (ScanRoot, Option<i64>, Option<i64>)>;
/// Packet version used by the lexical rebuild path; always equal to
/// `CONVERSATION_PACKET_VERSION`.
const LEXICAL_REBUILD_PACKET_VERSION: u32 = CONVERSATION_PACKET_VERSION;
/// Byte threshold of 16 MiB.
// NOTE(review): presumably the size above which Codex sources get extra compaction — confirm at the use site.
const CODEX_INDEXER_EXTRA_COMPACT_THRESHOLD_BYTES: u64 = 16 * 1024 * 1024;
// NOTE(review): looks like an upper bound on captured primary sources during pre-parse — confirm at the use site.
const PREPARSE_PRIMARY_SOURCE_CAPTURE_LIMIT: usize = 256;
/// Default chunk size for watch-mode ingest.
const WATCH_INGEST_DEFAULT_CHUNK_SIZE: usize = 32;
/// Largest chunk size accepted for watch-mode ingest.
const WATCH_INGEST_CHUNK_SIZE_MAX: usize = 512;
/// Process-wide switch toggled by `set_robot_trace_ingest_enabled`.
static ROBOT_TRACE_INGEST_ENABLED: AtomicBool = AtomicBool::new(false);
/// Batch counter for ingest tracing; reset to zero whenever tracing is switched on.
static ROBOT_TRACE_INGEST_BATCH_N: AtomicU64 = AtomicU64::new(0);
/// Set to `true` once any session source has been skipped because it looked actively written.
static ACTIVE_SESSION_SOURCE_SKIP_OBSERVED: AtomicBool = AtomicBool::new(false);
/// Identity of a file on disk, expressed as the device and inode numbers reported by its
/// metadata (see `source_file_id`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
struct SourceFileId {
// Device number from the file's metadata.
dev: u64,
// Inode number from the file's metadata.
ino: u64,
}
/// Why a session source file was judged to have an active writer.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum ActiveSessionSourceReason {
/// The path was matched by the test-only override hook.
TestOverride,
/// The file's id was found among the ids collected for writable open file descriptors.
WritableFileDescriptor,
/// A non-blocking exclusive lock attempt on the file was refused.
AdvisoryLock,
/// The file's modification time fell inside the configured recent-write window.
RecentlyModified,
}
/// Decides whether a session log file looks like it is still being written to.
///
/// The set of writable open file ids is captured once, at construction time. The optional
/// recent-write window additionally enables a modification-time check.
#[derive(Debug, Default)]
struct ActiveSessionSourceFilter {
    writable_file_ids: HashSet<SourceFileId>,
    recent_write_window: Option<Duration>,
}

impl ActiveSessionSourceFilter {
    /// Builds a filter by collecting writable open file ids and resolving the recent-write
    /// window (which stays `None` unless `enable_recent_write_window` is set).
    fn new(enable_recent_write_window: bool) -> Self {
        let writable_file_ids = collect_writable_open_session_file_ids();
        let recent_write_window = active_session_recent_write_window(enable_recent_write_window);
        Self {
            writable_file_ids,
            recent_write_window,
        }
    }

    /// Test-only constructor: no writable file ids, caller-chosen recent-write window.
    #[cfg(test)]
    fn with_recent_write_window_for_test(recent_write_window: Option<Duration>) -> Self {
        let writable_file_ids = HashSet::new();
        Self {
            writable_file_ids,
            recent_write_window,
        }
    }

    /// Returns the first reason, in check order, for treating `path` as actively written,
    /// or `None` when no check fires.
    ///
    /// Check order: candidate extension gate, test override, mtime window, writable file
    /// descriptor, advisory lock.
    fn active_writer_reason(&self, path: &Path) -> Option<ActiveSessionSourceReason> {
        // Files without a session-log extension are never considered.
        if !is_session_log_source_candidate(path) {
            return None;
        }
        if test_active_session_source_path_matches(path) {
            return Some(ActiveSessionSourceReason::TestOverride);
        }

        match self.recent_write_window {
            Some(window) if source_file_recently_modified(path, window) => {
                tracing::debug!(
                    source_path = %path.display(),
                    "active-source filter: mtime fallback fired"
                );
                return Some(ActiveSessionSourceReason::RecentlyModified);
            }
            _ => {}
        }

        let held_open_for_write = match source_file_id(path) {
            Some(file_id) => self.writable_file_ids.contains(&file_id),
            None => false,
        };
        if held_open_for_write {
            return Some(ActiveSessionSourceReason::WritableFileDescriptor);
        }

        if source_file_has_active_advisory_lock(path) {
            Some(ActiveSessionSourceReason::AdvisoryLock)
        } else {
            None
        }
    }
}
/// Resolves the recent-write window used by the active-session filter.
///
/// Returns `None` when the feature is disabled or the resolved value is zero. Otherwise the
/// window comes from `CASS_ACTIVE_SESSION_RECENT_WRITE_WINDOW_SECS` (trimmed, parsed as
/// `u64`), defaulting to 120 seconds and capped at 3600 seconds.
fn active_session_recent_write_window(enable_recent_write_window: bool) -> Option<Duration> {
    const DEFAULT_WINDOW_SECS: u64 = 120;
    const MAX_WINDOW_SECS: u64 = 3_600;

    if !enable_recent_write_window {
        return None;
    }

    let configured = match dotenvy::var("CASS_ACTIVE_SESSION_RECENT_WRITE_WINDOW_SECS") {
        Ok(raw) => raw.trim().parse::<u64>().ok(),
        Err(_) => None,
    };
    let window_secs = configured
        .unwrap_or(DEFAULT_WINDOW_SECS)
        .min(MAX_WINDOW_SECS);

    if window_secs == 0 {
        None
    } else {
        Some(Duration::from_secs(window_secs))
    }
}
/// Reports whether `path` was modified no longer than `window` ago.
///
/// Returns `false` when the metadata or the modification time cannot be read. A modification
/// time that lies ahead of the current clock counts as recent.
fn source_file_recently_modified(path: &Path, window: Duration) -> bool {
    let modified_at = match fs::metadata(path).and_then(|metadata| metadata.modified()) {
        Ok(timestamp) => timestamp,
        Err(_) => return false,
    };
    // `duration_since` only fails when the mtime is later than "now".
    SystemTime::now()
        .duration_since(modified_at)
        .map_or(true, |age| age <= window)
}
/// Returns `true` when the path's extension is `jsonl`, `json`, or `claude`
/// (ASCII case-insensitive). Paths without a UTF-8 extension are rejected.
fn is_session_log_source_candidate(path: &Path) -> bool {
    const SESSION_LOG_EXTENSIONS: [&str; 3] = ["jsonl", "json", "claude"];

    match path.extension().and_then(|ext| ext.to_str()) {
        Some(ext) => SESSION_LOG_EXTENSIONS
            .iter()
            .any(|known| ext.eq_ignore_ascii_case(known)),
        None => false,
    }
}
/// Test-build hook: reports whether `path` is listed in
/// `CASS_TEST_ACTIVE_SESSION_SOURCE_PATHS` (a platform path-list, as split by
/// `std::env::split_paths`). Returns `false` when the variable is unset.
#[cfg(test)]
fn test_active_session_source_path_matches(path: &Path) -> bool {
    match dotenvy::var("CASS_TEST_ACTIVE_SESSION_SOURCE_PATHS") {
        Ok(listed) => std::env::split_paths(&listed).any(|candidate| candidate.as_path() == path),
        Err(_) => false,
    }
}
/// Non-test builds: the test override hook is compiled out and never matches.
#[cfg(not(test))]
fn test_active_session_source_path_matches(_path: &Path) -> bool {
false
}
/// Decides whether a source file should be skipped because it appears to be actively written.
///
/// Only sources whose id equals `LOCAL_SOURCE_ID` are checked. When a skip is decided the
/// process-wide `ACTIVE_SESSION_SOURCE_SKIP_OBSERVED` flag is raised and the decision is logged.
fn should_skip_active_session_source(
    active_source_filter: &ActiveSessionSourceFilter,
    source_id: &str,
    source_path: &Path,
) -> bool {
    let verdict = if source_id == LOCAL_SOURCE_ID {
        active_source_filter.active_writer_reason(source_path)
    } else {
        None
    };

    match verdict {
        None => false,
        Some(reason) => {
            ACTIVE_SESSION_SOURCE_SKIP_OBSERVED.store(true, Ordering::Relaxed);
            tracing::info!(
                source_path = %source_path.display(),
                ?reason,
                "skipping session source that appears to be actively written"
            );
            true
        }
    }
}
/// Looks up the device/inode identity of `path`; `None` when its metadata cannot be read.
#[cfg(unix)]
fn source_file_id(path: &Path) -> Option<SourceFileId> {
    use std::os::unix::fs::MetadataExt;

    let metadata = fs::metadata(path).ok()?;
    Some(SourceFileId {
        dev: metadata.dev(),
        ino: metadata.ino(),
    })
}
/// Non-Unix builds: no file identity is available, so this always returns `None`.
#[cfg(not(unix))]
fn source_file_id(_path: &Path) -> Option<SourceFileId> {
None
}
/// Scans `/proc/<pid>/fd` for every numeric process directory and collects the file ids of
/// descriptors that (a) resolve to a session-log candidate path and (b) are open for writing
/// according to `/proc/<pid>/fdinfo/<fd>`.
///
/// Every failure (unreadable `/proc`, unreadable process directories, vanished descriptors)
/// is skipped silently; the result is best-effort.
#[cfg(target_os = "linux")]
fn collect_writable_open_session_file_ids() -> HashSet<SourceFileId> {
    let mut file_ids = HashSet::new();
    let Ok(processes) = fs::read_dir("/proc") else {
        return file_ids;
    };
    for process in processes.flatten() {
        // Only purely numeric directory names are process ids.
        let is_pid_dir = process
            .file_name()
            .to_str()
            .is_some_and(|name| name.parse::<u32>().is_ok());
        if !is_pid_dir {
            continue;
        }
        // Resolve the per-process directories once instead of once per descriptor.
        let process_dir = process.path();
        let Ok(fds) = fs::read_dir(process_dir.join("fd")) else {
            continue;
        };
        let fdinfo_dir = process_dir.join("fdinfo");
        for fd in fds.flatten() {
            let fd_name = fd.file_name();
            let Some(fd_name) = fd_name.to_str() else {
                continue;
            };
            let fd_path = fd.path();
            // Cheap filter first: a single readlink plus an extension check discards the
            // vast majority of descriptors (sockets, pipes, unrelated files) before the
            // more expensive fdinfo read.
            let Ok(target) = fs::read_link(&fd_path) else {
                continue;
            };
            if !is_session_log_source_candidate(&target) {
                continue;
            }
            if !linux_fdinfo_is_writable(&fdinfo_dir.join(fd_name)) {
                continue;
            }
            // Stat through the fd link itself rather than the resolved target path.
            if let Some(file_id) = source_file_id(&fd_path) {
                file_ids.insert(file_id);
            }
        }
    }
    file_ids
}
/// Reads an fdinfo file and reports whether its `flags:` line (parsed as octal) has an
/// access mode of write-only or read-write.
///
/// Returns `false` when the file cannot be read or no parsable `flags:` line exists.
#[cfg(target_os = "linux")]
fn linux_fdinfo_is_writable(fdinfo_path: &Path) -> bool {
    const O_ACCMODE: u64 = 0o3;
    const O_WRONLY: u64 = 0o1;
    const O_RDWR: u64 = 0o2;

    let contents = match fs::read_to_string(fdinfo_path) {
        Ok(text) => text,
        Err(_) => return false,
    };

    // The first `flags:` line whose value parses as octal wins.
    let parsed_flags = contents
        .lines()
        .filter_map(|line| line.strip_prefix("flags:"))
        .find_map(|raw| u64::from_str_radix(raw.trim(), 8).ok());

    match parsed_flags {
        Some(flags) => {
            let access_mode = flags & O_ACCMODE;
            access_mode == O_WRONLY || access_mode == O_RDWR
        }
        None => false,
    }
}
/// Collects file ids of session-log candidates that some process holds open for writing,
/// using `lsof` field output.
///
/// Returns an empty set when `lsof` cannot be run or exits unsuccessfully.
///
/// In `lsof -F` output the `f` field carries only the descriptor (`3`, `cwd`, `txt`, ...);
/// the access mode (`r`, `w`, `u`) is reported in the separate `a` field, which therefore
/// has to be requested explicitly. Each file set starts with its `f` line, so the writable
/// flag is reset there and set from the following `a` line.
#[cfg(target_os = "macos")]
fn collect_writable_open_session_file_ids() -> HashSet<SourceFileId> {
    let mut file_ids = HashSet::new();
    let Ok(output) = Command::new("lsof").args(["-nP", "-F", "pfan"]).output() else {
        return file_ids;
    };
    if !output.status.success() {
        return file_ids;
    }
    let text = String::from_utf8_lossy(&output.stdout);
    let mut current_fd_is_writable = false;
    for line in text.lines() {
        // Split on a character boundary: lossy decoding may yield a multi-byte first
        // character, for which a byte-offset split at 1 would panic.
        let mut chars = line.chars();
        let Some(field) = chars.next() else {
            continue;
        };
        let value = chars.as_str();
        match field {
            // A new process or a new file set: forget the previous descriptor's mode.
            'p' | 'f' => {
                current_fd_is_writable = false;
            }
            // Access mode of the current descriptor: `w` = write, `u` = read/write.
            'a' => {
                current_fd_is_writable = value.contains('w') || value.contains('u');
            }
            'n' if current_fd_is_writable => {
                let path = Path::new(value);
                if is_session_log_source_candidate(path)
                    && let Some(file_id) = source_file_id(path)
                {
                    file_ids.insert(file_id);
                }
            }
            _ => {}
        }
    }
    file_ids
}
/// Platforms other than Linux and macOS: no open-descriptor scan is performed and the set
/// is always empty.
#[cfg(not(any(target_os = "linux", target_os = "macos")))]
fn collect_writable_open_session_file_ids() -> HashSet<SourceFileId> {
HashSet::new()
}
fn source_file_has_active_advisory_lock(path: &Path) -> bool {
let Ok(file) = OpenOptions::new().read(true).open(path) else {
return false;
};
match file.try_lock_exclusive() {
Ok(()) => {
let _ = FileExt::unlock(&file);
false
}
Err(error) => matches!(
error.kind(),
std::io::ErrorKind::WouldBlock | std::io::ErrorKind::Interrupted
),
}
}
/// Hand-written FFI declarations for `renameat2`, compiled on Linux only.
// NOTE(review): the constant values appear to mirror the Linux headers (`AT_FDCWD`,
// `RENAME_EXCHANGE`, `EINVAL`) — confirm against the target libc before relying on them.
#[cfg(target_os = "linux")]
mod linux_publish_swap {
use std::ffi::{c_char, c_int, c_uint};
// Directory-fd argument value for `renameat2`.
pub const AT_FDCWD: c_int = -100;
// Flag bit passed in `flags`.
pub const RENAME_EXCHANGE: c_uint = 0x2;
// Errno value callers can compare against.
pub const EINVAL: i32 = 22;
unsafe extern "C" {
// Raw C signature; both path arguments must be NUL-terminated C strings.
pub fn renameat2(
olddirfd: c_int,
oldpath: *const c_char,
newdirfd: c_int,
newpath: *const c_char,
flags: c_uint,
) -> c_int;
}
}
/// Identifies which kind of input a lexical rebuild packet was built from.
// `dead_code` is allowed outside test builds, so some variants may only be constructed by tests.
#[cfg_attr(not(test), allow(dead_code))]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum LexicalRebuildPacketSource {
CanonicalReplay,
NormalizedConversation,
}
/// Records how the provenance of a lexical rebuild packet was determined.
// `dead_code` is allowed outside test builds, so some variants may only be constructed by tests.
// NOTE(review): the exact resolution rule behind each variant is not visible here — see the packet builders.
#[cfg_attr(not(test), allow(dead_code))]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum LexicalRebuildPacketProvenanceMode {
SourceMapLookup,
ConversationFields,
MetadataFields,
HostFallback,
LocalDefault,
}
/// Diagnostic metadata attached to a lexical rebuild packet.
#[derive(Debug, Clone, PartialEq, Eq)]
struct LexicalRebuildPacketDiagnostics {
// Packet format version.
version: u32,
// Kind of input the packet was built from.
source: LexicalRebuildPacketSource,
// How the packet's provenance was determined.
provenance_mode: LexicalRebuildPacketProvenanceMode,
// Flag indicating the conversation id was absent.
missing_conversation_id: bool,
}
/// Identifying fields of the conversation carried by a lexical rebuild packet.
// NOTE(review): units of `started_at` / `ended_at` are not visible here — confirm (likely epoch-based).
#[derive(Debug, Clone, PartialEq, Eq)]
struct LexicalRebuildPacketIdentity {
conversation_id: Option<i64>,
external_id: Option<String>,
agent: String,
workspace: Option<String>,
source_path: String,
title: Option<String>,
started_at: Option<i64>,
ended_at: Option<i64>,
}
/// Provenance of a lexical rebuild packet: source id, origin kind, and optional origin host,
/// all stored as strings.
#[derive(Debug, Clone, PartialEq, Eq)]
struct LexicalRebuildPacketProvenance {
source_id: String,
origin_kind: String,
origin_host: Option<String>,
}
/// One conversation's worth of data for the lexical rebuild: diagnostics, identity,
/// provenance, contract hashes/projections, the grouped message rows, and size counters.
#[derive(Debug, Clone, PartialEq, Eq)]
struct LexicalRebuildConversationPacket {
diagnostics: LexicalRebuildPacketDiagnostics,
identity: LexicalRebuildPacketIdentity,
provenance: LexicalRebuildPacketProvenance,
contract_hashes: ConversationPacketHashes,
contract_projections: ConversationPacketSinkProjections,
messages: crate::storage::sqlite::LexicalRebuildGroupedMessageRows,
// Number of messages in the packet.
message_count: usize,
// Byte size attributed to the packet's messages.
message_bytes: usize,
// NOTE(review): presumably the byte amount reserved against the pipeline's flow-control budget — confirm.
flow_reservation_bytes: usize,
// Id of the last message, when known.
last_message_id: Option<i64>,
}
/// Borrowed view over the packet fields that feed a packet fingerprint.
///
/// All string and message data is borrowed for `'a`; nothing is owned by this struct.
#[derive(Debug, Clone, PartialEq, Eq)]
pub(crate) struct LexicalRebuildPacketFingerprintInput<'a> {
pub version: u32,
pub agent: &'a str,
pub external_id: Option<&'a str>,
pub workspace: Option<&'a str>,
pub source_path: &'a str,
pub title: Option<&'a str>,
pub started_at: Option<i64>,
pub ended_at: Option<i64>,
pub source_id: &'a str,
pub origin_kind: &'a str,
pub origin_host: Option<&'a str>,
pub lexical_projected_content_bytes: usize,
pub messages: &'a crate::storage::sqlite::LexicalRebuildGroupedMessageRows,
pub message_count: usize,
pub message_bytes: usize,
}
/// Converts a signed database id into `u64`; negative values yield `None`.
#[cfg(test)]
fn message_id_from_db(raw: i64) -> Option<u64> {
    match u64::try_from(raw) {
        Ok(id) => Some(id),
        Err(_) => None,
    }
}
/// Narrows an `i64` to `u32`, clamping negatives to `0` and oversized values to `u32::MAX`.
#[cfg(test)]
fn saturating_u32_from_i64(raw: i64) -> u32 {
    if raw <= 0 {
        0
    } else {
        u32::try_from(raw).unwrap_or(u32::MAX)
    }
}
/// Commands that can be sent to the indexer; currently only a full reindex.
#[derive(Debug, Clone)]
pub enum ReindexCommand {
/// Request a full reindex.
Full,
}
/// Events consumed by the indexer: either a batch of paths or an explicit command.
#[derive(Debug)]
pub enum IndexerEvent {
/// A batch of paths to process.
// NOTE(review): presumably produced by the filesystem watcher — confirm at the sender.
Notify(Vec<PathBuf>),
/// An explicit reindex command.
Command(ReindexCommand),
}
/// What to do when the watcher is judged stale.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum StaleAction {
    /// Emit a warning (the default).
    #[default]
    Warn,
    /// Trigger a rebuild.
    Rebuild,
    /// Stale detection is disabled.
    None,
}

impl StaleAction {
    /// Parses an environment-variable value into an action.
    ///
    /// Matching is case-insensitive and ignores surrounding whitespace, so values such as
    /// `" Rebuild"` or `"off\n"` are honoured instead of silently falling back to `Warn`.
    /// Recognised spellings:
    /// - `rebuild`, `auto`, `fix` → [`StaleAction::Rebuild`]
    /// - `none`, `off`, `disabled`, `0`, `false` → [`StaleAction::None`]
    /// - anything else (including the empty string) → [`StaleAction::Warn`]
    fn from_env_str(s: &str) -> Self {
        let normalized = s.trim().to_lowercase();
        match normalized.as_str() {
            "rebuild" | "auto" | "fix" => Self::Rebuild,
            "none" | "off" | "disabled" | "0" | "false" => Self::None,
            _ => Self::Warn,
        }
    }
}
/// Tunables for stale-watcher detection.
#[derive(Debug, Clone)]
pub struct StaleConfig {
/// Hours without a successful ingest before the watcher counts as stale.
pub threshold_hours: u64,
/// Action reported when staleness is detected.
pub action: StaleAction,
/// Minimum minutes between two staleness evaluations.
pub check_interval_mins: u64,
/// Minimum number of consecutive zero-conversation scans required before staleness is considered.
pub min_zero_scans: u64,
}
/// Defaults: 24 h threshold, warn action, 60 min check interval, 10 zero scans.
impl Default for StaleConfig {
fn default() -> Self {
Self {
threshold_hours: 24,
action: StaleAction::Warn,
check_interval_mins: 60,
min_zero_scans: 10,
}
}
}
impl StaleConfig {
    /// Builds a configuration from the `CASS_WATCH_STALE_*` environment variables, keeping
    /// the default for every variable that is unset (or, for the numeric ones, unparsable).
    pub fn from_env() -> Self {
        let defaults = Self::default();
        Self {
            threshold_hours: env_u64("CASS_WATCH_STALE_THRESHOLD_HOURS")
                .unwrap_or(defaults.threshold_hours),
            action: match dotenvy::var("CASS_WATCH_STALE_ACTION") {
                Ok(val) => StaleAction::from_env_str(&val),
                Err(_) => defaults.action,
            },
            check_interval_mins: env_u64("CASS_WATCH_STALE_CHECK_INTERVAL_MINS")
                .unwrap_or(defaults.check_interval_mins),
            min_zero_scans: env_u64("CASS_WATCH_STALE_MIN_ZERO_SCANS")
                .unwrap_or(defaults.min_zero_scans),
        }
    }

    /// Stale detection is enabled for every action except [`StaleAction::None`].
    pub fn is_enabled(&self) -> bool {
        !matches!(self.action, StaleAction::None)
    }
}
/// Reads `key` through `dotenvy` and parses it as a `u64`.
///
/// Surrounding whitespace is ignored so that values such as `"24 "` or `"24\n"` are accepted.
/// Returns `None` when the variable is unset or is not a valid unsigned integer.
fn env_u64(key: &str) -> Option<u64> {
    let raw = dotenvy::var(key).ok()?;
    raw.trim().parse().ok()
}
/// Tracks ingest activity to decide when the watcher has gone stale.
#[derive(Debug)]
pub struct StaleDetector {
// Thresholds and the action to report.
config: StaleConfig,
// Time of the most recent scan that indexed at least one conversation (or of the last reset).
last_successful_ingest: Mutex<Option<Instant>>,
// Number of zero-conversation scans since the last successful ingest or reset.
consecutive_zero_scans: std::sync::atomic::AtomicU64,
// Whether a stale condition has already been reported since the last successful ingest or reset.
warning_emitted: AtomicBool,
// Time of the last staleness evaluation that passed the interval gate.
last_check: Mutex<Instant>,
// Count of scans that indexed at least one conversation.
total_ingests: std::sync::atomic::AtomicU64,
// Construction time; used as the reference point when no ingest has succeeded yet.
start_time: Instant,
}
impl StaleDetector {
pub fn new(config: StaleConfig) -> Self {
Self {
config,
last_successful_ingest: Mutex::new(None),
consecutive_zero_scans: std::sync::atomic::AtomicU64::new(0),
warning_emitted: AtomicBool::new(false),
last_check: Mutex::new(Instant::now()),
total_ingests: std::sync::atomic::AtomicU64::new(0),
start_time: Instant::now(),
}
}
pub fn from_env() -> Self {
Self::new(StaleConfig::from_env())
}
pub fn record_scan(&self, conversations_indexed: usize) {
if conversations_indexed > 0 {
{
let mut guard = self
.last_successful_ingest
.lock()
.unwrap_or_else(|e| e.into_inner());
*guard = Some(Instant::now());
}
self.consecutive_zero_scans.store(0, Ordering::Relaxed);
self.warning_emitted.store(false, Ordering::Relaxed);
self.total_ingests.fetch_add(1, Ordering::Relaxed);
tracing::debug!(
conversations = conversations_indexed,
"stale_detector: successful ingest recorded"
);
} else {
let count = self.consecutive_zero_scans.fetch_add(1, Ordering::Relaxed) + 1;
tracing::trace!(
consecutive_zero_scans = count,
"stale_detector: zero-conversation scan"
);
}
}
pub fn check_stale(&self) -> Option<StaleAction> {
if !self.config.is_enabled() {
return None;
}
let now = Instant::now();
{
let mut last_check = self.last_check.lock().unwrap_or_else(|e| e.into_inner());
let check_interval = Duration::from_secs(self.config.check_interval_mins * 60);
if now.duration_since(*last_check) < check_interval {
return None;
}
*last_check = now;
}
let zero_count = self.consecutive_zero_scans.load(Ordering::Relaxed);
if zero_count < self.config.min_zero_scans {
return None;
}
let threshold = Duration::from_secs(self.config.threshold_hours * 3600);
let is_stale = match self
.last_successful_ingest
.lock()
.unwrap_or_else(|e| e.into_inner())
.as_ref()
{
Some(last) => now.duration_since(*last) > threshold,
None => now.duration_since(self.start_time) > threshold,
};
if is_stale {
let already_warned = self.warning_emitted.swap(true, Ordering::Relaxed);
if !already_warned || self.config.action == StaleAction::Rebuild {
return Some(self.config.action);
}
}
None
}
pub fn stats(&self) -> StaleStats {
let last_ingest = *self
.last_successful_ingest
.lock()
.unwrap_or_else(|e| e.into_inner());
StaleStats {
consecutive_zero_scans: self.consecutive_zero_scans.load(Ordering::Relaxed),
total_ingests: self.total_ingests.load(Ordering::Relaxed),
seconds_since_last_ingest: last_ingest.map(|t| t.elapsed().as_secs()),
warning_emitted: self.warning_emitted.load(Ordering::Relaxed),
config_action: format!("{:?}", self.config.action),
config_threshold_hours: self.config.threshold_hours,
}
}
pub fn reset(&self) {
{
let mut guard = self
.last_successful_ingest
.lock()
.unwrap_or_else(|e| e.into_inner());
*guard = Some(Instant::now());
}
self.consecutive_zero_scans.store(0, Ordering::Relaxed);
self.warning_emitted.store(false, Ordering::Relaxed);
}
}
/// Serializable snapshot of a `StaleDetector`'s state.
#[derive(Debug, Clone, serde::Serialize)]
pub struct StaleStats {
/// Zero-conversation scans since the last successful ingest or reset.
pub consecutive_zero_scans: u64,
/// Number of scans that indexed at least one conversation.
pub total_ingests: u64,
/// Whole seconds since the last recorded ingest; `None` when none has been recorded.
pub seconds_since_last_ingest: Option<u64>,
/// Whether a stale condition has already been reported.
pub warning_emitted: bool,
/// `Debug` rendering of the configured action.
pub config_action: String,
/// Configured staleness threshold in hours.
pub config_threshold_hours: u64,
}
/// Per-connector statistics; `error` is omitted from serialized output when `None`.
#[derive(Debug, Clone, Default, serde::Serialize)]
pub struct ConnectorStats {
/// Connector name.
pub name: String,
/// Number of conversations attributed to this connector.
pub conversations: usize,
/// Number of messages attributed to this connector.
pub messages: usize,
/// Scan time in milliseconds.
pub scan_ms: u64,
/// Error text, if any.
#[serde(skip_serializing_if = "Option::is_none")]
pub error: Option<String>,
}
/// Description of a lexical repair; `observed_tantivy_docs` is omitted from serialized
/// output when `None`.
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize)]
pub struct LexicalRepairStats {
/// Kind of repair, as free-form text.
pub kind: String,
/// Reason for the repair, as free-form text.
pub reason: String,
/// Conversation count on the canonical side.
pub canonical_conversations: usize,
/// Message count on the canonical side.
pub canonical_messages: usize,
/// Document count observed in the Tantivy index, when available.
#[serde(skip_serializing_if = "Option::is_none")]
pub observed_tantivy_docs: Option<usize>,
}
/// Outcome of a semantic watch-once pass; the two manifest fingerprints are omitted from
/// serialized output when `None`.
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize)]
pub struct SemanticWatchOnceStats {
/// Whether a result was published.
pub published: bool,
/// Number of documents selected.
pub selected_docs: usize,
/// Number of documents embedded.
pub embedded_docs: usize,
/// Tier name.
pub tier: String,
/// Vector index path, as a string.
pub vector_index_path: String,
/// Database fingerprint recorded in the manifest before the pass, if any.
#[serde(skip_serializing_if = "Option::is_none")]
pub manifest_before_db_fingerprint: Option<String>,
/// Database fingerprint recorded in the manifest after the pass, if any.
#[serde(skip_serializing_if = "Option::is_none")]
pub manifest_after_db_fingerprint: Option<String>,
/// Free-form explanation of the outcome.
pub reason: String,
}
/// Aggregate statistics for an indexing run.
///
/// `total_counts_exact` is never serialized; the optional fields are omitted from serialized
/// output when `None`.
#[derive(Debug, Clone, Default, serde::Serialize)]
pub struct IndexingStats {
/// Scan time in milliseconds.
pub scan_ms: u64,
/// Index time in milliseconds.
pub index_ms: u64,
/// Per-connector statistics.
pub connectors: Vec<ConnectorStats>,
/// Names of the agents discovered.
pub agents_discovered: Vec<String>,
pub total_conversations: usize,
pub total_messages: usize,
// NOTE(review): presumably marks whether the two totals above are exact rather than estimates — confirm.
#[serde(skip_serializing)]
pub total_counts_exact: bool,
#[serde(skip_serializing_if = "Option::is_none")]
pub lexical_strategy: Option<String>,
#[serde(skip_serializing_if = "Option::is_none")]
pub lexical_strategy_reason: Option<String>,
#[serde(skip_serializing_if = "Option::is_none")]
pub lexical_repair: Option<LexicalRepairStats>,
#[serde(skip_serializing_if = "Option::is_none")]
pub semantic_watch_once: Option<SemanticWatchOnceStats>,
/// Number of quarantined conversations; also surfaced by `IndexingProgress::snapshot_json`.
pub quarantined_conversations: usize,
/// Whether the lexical update was deferred; also surfaced by `IndexingProgress::snapshot_json`.
pub lexical_update_deferred: bool,
}
/// Counts of conversations and messages inserted into canonical storage.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
struct CanonicalMutationCounts {
    inserted_conversations: usize,
    inserted_messages: usize,
}

impl CanonicalMutationCounts {
    /// Field-wise saturating sum of `self` and `other`.
    fn accumulate(self, other: Self) -> Self {
        let Self {
            inserted_conversations: more_conversations,
            inserted_messages: more_messages,
        } = other;
        Self {
            inserted_conversations: self.inserted_conversations.saturating_add(more_conversations),
            inserted_messages: self.inserted_messages.saturating_add(more_messages),
        }
    }

    /// `true` when at least one counter is non-zero.
    fn changed(self) -> bool {
        self != Self::default()
    }
}
/// Result of a non-watch ingest: canonical mutation counts plus quarantine and deferral info.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
struct NonWatchIngestOutcome {
    canonical_mutations: CanonicalMutationCounts,
    quarantined_conversations: usize,
    lexical_update_deferred: bool,
}

impl NonWatchIngestOutcome {
    /// Merges two outcomes: mutation counts are accumulated, quarantine counts are added with
    /// saturation, and the deferral flag is set when either side has it set.
    fn accumulate(self, other: Self) -> Self {
        let canonical_mutations = self
            .canonical_mutations
            .accumulate(other.canonical_mutations);
        let quarantined_conversations = self
            .quarantined_conversations
            .saturating_add(other.quarantined_conversations);
        let lexical_update_deferred =
            self.lexical_update_deferred || other.lexical_update_deferred;
        Self {
            canonical_mutations,
            quarantined_conversations,
            lexical_update_deferred,
        }
    }
}

/// Exposes the wrapped mutation counts directly on the outcome.
impl std::ops::Deref for NonWatchIngestOutcome {
    type Target = CanonicalMutationCounts;

    fn deref(&self) -> &CanonicalMutationCounts {
        &self.canonical_mutations
    }
}

/// An outcome equals a bare `CanonicalMutationCounts` when its mutation counts match; the
/// quarantine count and deferral flag are ignored by this comparison.
impl PartialEq<CanonicalMutationCounts> for NonWatchIngestOutcome {
    fn eq(&self, other: &CanonicalMutationCounts) -> bool {
        *other == self.canonical_mutations
    }
}
/// Shared, thread-safe progress state for an indexing run.
///
/// Every field is an atomic or a mutex so that it can be updated through a shared reference;
/// `snapshot_json` renders the whole state as JSON.
#[derive(Debug, Default)]
pub struct IndexingProgress {
/// Total number of items; used with `current` for rate and ETA computation.
pub total: AtomicUsize,
/// Number of items processed so far.
pub current: AtomicUsize,
/// Phase code: `1` = scanning, `2` = indexing, anything else = preparing.
pub phase: AtomicUsize,
/// Whether a rebuild is in progress; gates the `staged_*` fields in the JSON snapshot.
pub is_rebuilding: AtomicBool,
/// Number of agents discovered.
pub discovered_agents: AtomicUsize,
/// Names of the agents discovered.
pub discovered_agent_names: Mutex<Vec<String>>,
/// Most recent error message, if any.
pub last_error: Mutex<Option<String>>,
/// Statistics for the run.
pub stats: Mutex<IndexingStats>,
// --- Rebuild pipeline gauges (reported under `rebuild_pipeline` in the snapshot) ---
pub rebuild_pipeline_queue_depth: AtomicUsize,
pub rebuild_pipeline_inflight_message_bytes: AtomicUsize,
pub rebuild_pipeline_pending_batch_conversations: AtomicUsize,
pub rebuild_pipeline_pending_batch_message_bytes: AtomicUsize,
pub rebuild_pipeline_page_prep_workers: AtomicUsize,
pub rebuild_pipeline_active_page_prep_jobs: AtomicUsize,
pub rebuild_pipeline_ordered_buffered_pages: AtomicUsize,
pub rebuild_pipeline_budget_generation: AtomicUsize,
pub rebuild_pipeline_producer_budget_wait_count: AtomicUsize,
pub rebuild_pipeline_producer_budget_wait_ms: AtomicUsize,
pub rebuild_pipeline_producer_handoff_wait_count: AtomicUsize,
pub rebuild_pipeline_producer_handoff_wait_ms: AtomicUsize,
/// One-minute load average in thousandths; the snapshot divides it by 1000.
pub rebuild_pipeline_host_loadavg_1m_milli: Mutex<Option<u32>>,
pub rebuild_pipeline_host_available_memory_bytes: Mutex<Option<u64>>,
pub rebuild_pipeline_process_rss_bytes: Mutex<Option<u64>>,
/// Controller mode; an empty string is reported as `null`.
pub rebuild_pipeline_controller_mode: Mutex<String>,
/// Controller reason; an empty string is reported as `null`.
pub rebuild_pipeline_controller_reason: Mutex<String>,
// --- Staged merge gauges (reported as `null` unless `is_rebuilding` is set) ---
pub rebuild_pipeline_staged_merge_workers_max: AtomicUsize,
pub rebuild_pipeline_staged_merge_allowed_jobs: AtomicUsize,
pub rebuild_pipeline_staged_merge_active_jobs: AtomicUsize,
pub rebuild_pipeline_staged_merge_ready_artifacts: AtomicUsize,
pub rebuild_pipeline_staged_merge_ready_groups: AtomicUsize,
pub rebuild_pipeline_staged_merge_controller_reason: Mutex<String>,
// --- Staged shard-build gauges (reported as `null` unless `is_rebuilding` is set) ---
pub rebuild_pipeline_staged_shard_build_workers_max: AtomicUsize,
pub rebuild_pipeline_staged_shard_build_allowed_jobs: AtomicUsize,
pub rebuild_pipeline_staged_shard_build_active_jobs: AtomicUsize,
pub rebuild_pipeline_staged_shard_build_pending_jobs: AtomicUsize,
pub rebuild_pipeline_staged_shard_build_controller_reason: Mutex<String>,
pub rebuild_pipeline_staged_shard_build_memory_reserve_bytes: AtomicUsize,
pub rebuild_pipeline_staged_shard_build_emergency_memory_reserve_bytes: AtomicUsize,
pub rebuild_pipeline_staged_shard_build_completed_jobs: AtomicUsize,
pub rebuild_pipeline_staged_shard_build_last_shard_index: Mutex<Option<usize>>,
pub rebuild_pipeline_staged_shard_build_last_message_bytes: AtomicUsize,
pub rebuild_pipeline_staged_shard_build_last_index_size_bytes: AtomicU64,
pub rebuild_pipeline_staged_shard_build_last_duration_ms: AtomicU64,
pub rebuild_pipeline_staged_shard_build_last_amplification_milli: Mutex<Option<u64>>,
pub rebuild_pipeline_staged_shard_build_observed_amplification_milli: Mutex<Option<u64>>,
}
impl IndexingProgress {
fn phase_label_for(phase: usize) -> &'static str {
match phase {
1 => "scanning",
2 => "indexing",
_ => "preparing",
}
}
pub fn phase_label(&self) -> &'static str {
Self::phase_label_for(self.phase.load(Ordering::Relaxed))
}
pub fn snapshot_json(&self, elapsed_ms: u128) -> serde_json::Value {
let phase = self.phase.load(Ordering::Relaxed);
let total = self.total.load(Ordering::Relaxed);
let current = self.current.load(Ordering::Relaxed);
let agents = self.discovered_agents.load(Ordering::Relaxed);
let is_rebuilding = self.is_rebuilding.load(Ordering::Relaxed);
let agent_names: Vec<String> = self
.discovered_agent_names
.lock()
.map(|g| g.clone())
.unwrap_or_default();
let last_error: Option<String> = self.last_error.lock().ok().and_then(|g| g.clone());
let rebuild_pipeline_queue_depth =
self.rebuild_pipeline_queue_depth.load(Ordering::Relaxed);
let rebuild_pipeline_inflight_message_bytes = self
.rebuild_pipeline_inflight_message_bytes
.load(Ordering::Relaxed);
let rebuild_pipeline_pending_batch_conversations = self
.rebuild_pipeline_pending_batch_conversations
.load(Ordering::Relaxed);
let rebuild_pipeline_pending_batch_message_bytes = self
.rebuild_pipeline_pending_batch_message_bytes
.load(Ordering::Relaxed);
let rebuild_pipeline_page_prep_workers = self
.rebuild_pipeline_page_prep_workers
.load(Ordering::Relaxed);
let rebuild_pipeline_active_page_prep_jobs = self
.rebuild_pipeline_active_page_prep_jobs
.load(Ordering::Relaxed);
let rebuild_pipeline_ordered_buffered_pages = self
.rebuild_pipeline_ordered_buffered_pages
.load(Ordering::Relaxed);
let rebuild_pipeline_budget_generation = self
.rebuild_pipeline_budget_generation
.load(Ordering::Relaxed);
let rebuild_pipeline_producer_budget_wait_count = self
.rebuild_pipeline_producer_budget_wait_count
.load(Ordering::Relaxed);
let rebuild_pipeline_producer_budget_wait_ms = self
.rebuild_pipeline_producer_budget_wait_ms
.load(Ordering::Relaxed);
let rebuild_pipeline_producer_handoff_wait_count = self
.rebuild_pipeline_producer_handoff_wait_count
.load(Ordering::Relaxed);
let rebuild_pipeline_producer_handoff_wait_ms = self
.rebuild_pipeline_producer_handoff_wait_ms
.load(Ordering::Relaxed);
let rebuild_pipeline_host_loadavg_1m_milli = self
.rebuild_pipeline_host_loadavg_1m_milli
.lock()
.ok()
.and_then(|value| *value);
let rebuild_pipeline_host_available_memory_bytes = self
.rebuild_pipeline_host_available_memory_bytes
.lock()
.ok()
.and_then(|value| *value);
let rebuild_pipeline_process_rss_bytes = self
.rebuild_pipeline_process_rss_bytes
.lock()
.ok()
.and_then(|value| *value);
let rebuild_pipeline_controller_mode = self
.rebuild_pipeline_controller_mode
.lock()
.map(|value| value.clone())
.unwrap_or_default();
let rebuild_pipeline_controller_reason = self
.rebuild_pipeline_controller_reason
.lock()
.map(|value| value.clone())
.unwrap_or_default();
let rebuild_pipeline_staged_merge_workers_max = self
.rebuild_pipeline_staged_merge_workers_max
.load(Ordering::Relaxed);
let rebuild_pipeline_staged_merge_allowed_jobs = self
.rebuild_pipeline_staged_merge_allowed_jobs
.load(Ordering::Relaxed);
let rebuild_pipeline_staged_merge_active_jobs = self
.rebuild_pipeline_staged_merge_active_jobs
.load(Ordering::Relaxed);
let rebuild_pipeline_staged_merge_ready_artifacts = self
.rebuild_pipeline_staged_merge_ready_artifacts
.load(Ordering::Relaxed);
let rebuild_pipeline_staged_merge_ready_groups = self
.rebuild_pipeline_staged_merge_ready_groups
.load(Ordering::Relaxed);
let rebuild_pipeline_staged_merge_controller_reason = self
.rebuild_pipeline_staged_merge_controller_reason
.lock()
.map(|value| value.clone())
.unwrap_or_default();
let rebuild_pipeline_staged_shard_build_workers_max = self
.rebuild_pipeline_staged_shard_build_workers_max
.load(Ordering::Relaxed);
let rebuild_pipeline_staged_shard_build_allowed_jobs = self
.rebuild_pipeline_staged_shard_build_allowed_jobs
.load(Ordering::Relaxed);
let rebuild_pipeline_staged_shard_build_active_jobs = self
.rebuild_pipeline_staged_shard_build_active_jobs
.load(Ordering::Relaxed);
let rebuild_pipeline_staged_shard_build_pending_jobs = self
.rebuild_pipeline_staged_shard_build_pending_jobs
.load(Ordering::Relaxed);
let rebuild_pipeline_staged_shard_build_controller_reason = self
.rebuild_pipeline_staged_shard_build_controller_reason
.lock()
.map(|value| value.clone())
.unwrap_or_default();
let rebuild_pipeline_staged_shard_build_memory_reserve_bytes = self
.rebuild_pipeline_staged_shard_build_memory_reserve_bytes
.load(Ordering::Relaxed);
let rebuild_pipeline_staged_shard_build_emergency_memory_reserve_bytes = self
.rebuild_pipeline_staged_shard_build_emergency_memory_reserve_bytes
.load(Ordering::Relaxed);
let rebuild_pipeline_staged_shard_build_completed_jobs = self
.rebuild_pipeline_staged_shard_build_completed_jobs
.load(Ordering::Relaxed);
let rebuild_pipeline_staged_shard_build_last_shard_index = self
.rebuild_pipeline_staged_shard_build_last_shard_index
.lock()
.ok()
.and_then(|value| *value);
let rebuild_pipeline_staged_shard_build_last_message_bytes = self
.rebuild_pipeline_staged_shard_build_last_message_bytes
.load(Ordering::Relaxed);
let rebuild_pipeline_staged_shard_build_last_index_size_bytes = self
.rebuild_pipeline_staged_shard_build_last_index_size_bytes
.load(Ordering::Relaxed);
let rebuild_pipeline_staged_shard_build_last_duration_ms = self
.rebuild_pipeline_staged_shard_build_last_duration_ms
.load(Ordering::Relaxed);
let rebuild_pipeline_staged_shard_build_last_amplification_milli = self
.rebuild_pipeline_staged_shard_build_last_amplification_milli
.lock()
.ok()
.and_then(|value| *value);
let rebuild_pipeline_staged_shard_build_observed_amplification_milli = self
.rebuild_pipeline_staged_shard_build_observed_amplification_milli
.lock()
.ok()
.and_then(|value| *value);
let (quarantined_conversations, lexical_update_deferred) = self
.stats
.lock()
.map(|stats| {
(
stats.quarantined_conversations,
stats.lexical_update_deferred,
)
})
.unwrap_or_default();
let (rate_per_sec, eta_seconds) = if phase == 2 && elapsed_ms > 0 && current > 0 {
let secs = (elapsed_ms as f64) / 1000.0;
let rate = (current as f64) / secs.max(0.001);
let remaining = total.saturating_sub(current) as f64;
let eta = if rate > 0.0 && total > 0 {
Some(remaining / rate)
} else {
None
};
(Some(rate), eta)
} else {
(None, None)
};
serde_json::json!({
"phase": Self::phase_label_for(phase),
"phase_code": phase,
"total": total,
"current": current,
"discovered_agents": agents,
"agent_names": agent_names,
"is_rebuilding": is_rebuilding,
"elapsed_ms": elapsed_ms,
"rate_per_sec": rate_per_sec,
"eta_seconds": eta_seconds,
"last_error": last_error,
"quarantined_conversations": quarantined_conversations,
"lexical_update_deferred": lexical_update_deferred,
"rebuild_pipeline": {
"queue_depth": rebuild_pipeline_queue_depth,
"inflight_message_bytes": rebuild_pipeline_inflight_message_bytes,
"pending_batch_conversations": rebuild_pipeline_pending_batch_conversations,
"pending_batch_message_bytes": rebuild_pipeline_pending_batch_message_bytes,
"page_prep_workers": rebuild_pipeline_page_prep_workers,
"active_page_prep_jobs": rebuild_pipeline_active_page_prep_jobs,
"ordered_buffered_pages": rebuild_pipeline_ordered_buffered_pages,
"budget_generation": rebuild_pipeline_budget_generation,
"producer_budget_wait_count": rebuild_pipeline_producer_budget_wait_count,
"producer_budget_wait_ms": rebuild_pipeline_producer_budget_wait_ms,
"producer_handoff_wait_count": rebuild_pipeline_producer_handoff_wait_count,
"producer_handoff_wait_ms": rebuild_pipeline_producer_handoff_wait_ms,
"host_loadavg_1m": rebuild_pipeline_host_loadavg_1m_milli.map(|value| {
f64::from(value) / 1000.0
}),
"host_available_memory_bytes": rebuild_pipeline_host_available_memory_bytes,
"process_rss_bytes": rebuild_pipeline_process_rss_bytes,
"controller_mode": non_empty_json_string(rebuild_pipeline_controller_mode),
"controller_reason": non_empty_json_string(rebuild_pipeline_controller_reason),
"staged_merge_workers_max": staged_field_or_null(is_rebuilding, rebuild_pipeline_staged_merge_workers_max),
"staged_merge_allowed_jobs": staged_field_or_null(is_rebuilding, rebuild_pipeline_staged_merge_allowed_jobs),
"staged_merge_active_jobs": staged_field_or_null(is_rebuilding, rebuild_pipeline_staged_merge_active_jobs),
"staged_merge_ready_artifacts": staged_field_or_null(is_rebuilding, rebuild_pipeline_staged_merge_ready_artifacts),
"staged_merge_ready_groups": staged_field_or_null(is_rebuilding, rebuild_pipeline_staged_merge_ready_groups),
"staged_merge_controller_reason": active_rebuild_json_string(is_rebuilding, rebuild_pipeline_staged_merge_controller_reason),
"staged_shard_build_workers_max": staged_field_or_null(is_rebuilding, rebuild_pipeline_staged_shard_build_workers_max),
"staged_shard_build_allowed_jobs": staged_field_or_null(is_rebuilding, rebuild_pipeline_staged_shard_build_allowed_jobs),
"staged_shard_build_active_jobs": staged_field_or_null(is_rebuilding, rebuild_pipeline_staged_shard_build_active_jobs),
"staged_shard_build_pending_jobs": staged_field_or_null(is_rebuilding, rebuild_pipeline_staged_shard_build_pending_jobs),
"staged_shard_build_controller_reason": active_rebuild_json_string(is_rebuilding, rebuild_pipeline_staged_shard_build_controller_reason),
"staged_shard_build_memory_reserve_bytes": staged_field_or_null(is_rebuilding, rebuild_pipeline_staged_shard_build_memory_reserve_bytes),
"staged_shard_build_emergency_memory_reserve_bytes": staged_field_or_null(is_rebuilding, rebuild_pipeline_staged_shard_build_emergency_memory_reserve_bytes),
"staged_shard_build_completed_jobs": staged_field_or_null(is_rebuilding, rebuild_pipeline_staged_shard_build_completed_jobs),
"staged_shard_build_last_shard_index": active_rebuild_json_optional_usize(is_rebuilding, rebuild_pipeline_staged_shard_build_last_shard_index),
"staged_shard_build_last_message_bytes": staged_field_or_null(is_rebuilding, rebuild_pipeline_staged_shard_build_last_message_bytes),
"staged_shard_build_last_index_size_bytes": staged_u64_field_or_null(is_rebuilding, rebuild_pipeline_staged_shard_build_last_index_size_bytes),
"staged_shard_build_last_duration_ms": staged_u64_field_or_null(is_rebuilding, rebuild_pipeline_staged_shard_build_last_duration_ms),
"staged_shard_build_last_amplification_milli": active_rebuild_json_optional_u64(is_rebuilding, rebuild_pipeline_staged_shard_build_last_amplification_milli),
"staged_shard_build_observed_amplification_milli": active_rebuild_json_optional_u64(is_rebuilding, rebuild_pipeline_staged_shard_build_observed_amplification_milli),
},
})
}
}
/// Renders a staged-rebuild `usize` gauge for a JSON status payload.
///
/// Returns the JSON number for `value` when `is_rebuilding` is true and JSON
/// `null` otherwise.
fn staged_field_or_null(is_rebuilding: bool, value: usize) -> serde_json::Value {
    if !is_rebuilding {
        return serde_json::Value::Null;
    }
    serde_json::json!(value)
}
/// Renders a staged-rebuild `u64` gauge for a JSON status payload.
///
/// Returns the JSON number for `value` when `is_rebuilding` is true and JSON
/// `null` otherwise.
fn staged_u64_field_or_null(is_rebuilding: bool, value: u64) -> serde_json::Value {
    if !is_rebuilding {
        return serde_json::Value::Null;
    }
    serde_json::json!(value)
}
/// Maps an empty string to `None` and any other string to `Some(value)`.
fn non_empty_json_string(value: String) -> Option<String> {
    if value.is_empty() {
        None
    } else {
        Some(value)
    }
}
/// Returns `Some(value)` only when a rebuild is active and `value` is
/// non-empty; every other combination yields `None`.
fn active_rebuild_json_string(is_rebuilding: bool, value: String) -> Option<String> {
    if !is_rebuilding || value.is_empty() {
        return None;
    }
    Some(value)
}
/// Renders an optional `usize` for a JSON status payload.
///
/// Returns JSON `null` when `is_rebuilding` is false. Otherwise serializes
/// `value` through `serde_json::json!`, so `None` also becomes `null` and
/// `Some(n)` becomes the number `n`.
fn active_rebuild_json_optional_usize(
    is_rebuilding: bool,
    value: Option<usize>,
) -> serde_json::Value {
    if !is_rebuilding {
        return serde_json::Value::Null;
    }
    serde_json::json!(value)
}
/// Renders an optional `u64` for a JSON status payload.
///
/// Returns JSON `null` when `is_rebuilding` is false. Otherwise serializes
/// `value` through `serde_json::json!`, so `None` also becomes `null` and
/// `Some(n)` becomes the number `n`.
fn active_rebuild_json_optional_u64(is_rebuilding: bool, value: Option<u64>) -> serde_json::Value {
    if !is_rebuilding {
        return serde_json::Value::Null;
    }
    serde_json::json!(value)
}
/// Options for a single indexing run.
///
/// NOTE(review): the field notes below describe only how helpers visible in
/// this part of the file read each field; confirm the full semantics against
/// the code that constructs and consumes this struct.
#[derive(Clone)]
pub struct IndexOptions {
/// When set, both read-only fast-path predicates in this file return false.
pub full: bool,
/// Required by `should_try_readonly_canonical_force_rebuild`; disables the
/// read-only non-resumable resume path.
pub force_rebuild: bool,
/// When set, both read-only fast-path predicates in this file return false.
pub watch: bool,
/// Optional explicit path list; the fast-path predicates treat `None` and an
/// empty list the same way.
pub watch_once_paths: Option<Vec<PathBuf>>,
/// Database file path (opened read-only by the force-rebuild fast path).
pub db_path: PathBuf,
/// Data directory handed to the deferred Tantivy rebuild.
pub data_dir: PathBuf,
/// When set, both read-only fast-path predicates in this file return false.
pub semantic: bool,
/// When set, the read-only force-rebuild fast path is not taken.
pub build_hnsw: bool,
/// Embedder identifier — presumably selects the semantic embedder; TODO confirm.
pub embedder: String,
/// Optional shared progress tracker updated by the `record_*` helpers.
pub progress: Option<Arc<IndexingProgress>>,
/// Watch interval in seconds (per the field name; not read in this section).
pub watch_interval_secs: u64,
}
/// Turns robot ingest tracing on or off and returns the previous setting.
///
/// Enabling tracing also resets the ingest batch counter to zero. The
/// message-lookup trace switch in the SQLite storage layer is set to the same
/// value; its return value is intentionally ignored.
pub fn set_robot_trace_ingest_enabled(enabled: bool) -> bool {
    let was_enabled = ROBOT_TRACE_INGEST_ENABLED.swap(enabled, Ordering::Relaxed);
    if enabled {
        // A fresh tracing session restarts batch numbering.
        ROBOT_TRACE_INGEST_BATCH_N.store(0, Ordering::Relaxed);
    }
    let _ = crate::storage::sqlite::set_message_lookup_trace_enabled(enabled);
    was_enabled
}
/// In-flight trace record for one ingest batch.
///
/// Created by `robot_trace_ingest_start` and consumed by
/// `robot_trace_ingest_finish`, which emits it as an `ingest_batch` JSON line.
#[derive(Debug)]
struct RobotIngestTraceSpan {
/// Batch number taken from the ingest batch counter (previous value plus one).
batch_n: u64,
/// Caller-supplied stage label, emitted verbatim.
stage: &'static str,
/// String form of the lexical population strategy in effect.
lexical_strategy: &'static str,
/// Caller-supplied flag, emitted verbatim as `defer_checkpoints`.
defer_checkpoints: bool,
/// Number of conversations in the batch.
batch_conversations: usize,
/// Total number of messages across the batch's conversations.
batch_msgs: usize,
/// Instant captured at span creation; used to compute `wall_ms`.
started: Instant,
/// Message-lookup trace counters captured at span creation; subtracted from
/// the counters at finish time.
lookup_before: crate::storage::sqlite::MessageLookupTraceCounters,
}
/// Opens a trace span for one ingest batch.
///
/// Returns `None` when robot ingest tracing is disabled. Otherwise bumps the
/// batch counter and captures the batch size, the start instant and the
/// current message-lookup trace counters.
fn robot_trace_ingest_start(
    stage: &'static str,
    convs: &[NormalizedConversation],
    lexical_strategy: LexicalPopulationStrategy,
    defer_checkpoints: bool,
) -> Option<RobotIngestTraceSpan> {
    let tracing_enabled = ROBOT_TRACE_INGEST_ENABLED.load(Ordering::Relaxed);
    if !tracing_enabled {
        return None;
    }
    // `fetch_add` yields the previous counter value; batch numbers are 1-based.
    let previous_batch_n = ROBOT_TRACE_INGEST_BATCH_N.fetch_add(1, Ordering::Relaxed);
    let span = RobotIngestTraceSpan {
        batch_n: previous_batch_n.saturating_add(1),
        stage,
        lexical_strategy: lexical_strategy.as_str(),
        defer_checkpoints,
        batch_conversations: convs.len(),
        batch_msgs: convs
            .iter()
            .map(|conversation| conversation.messages.len())
            .sum(),
        started: Instant::now(),
        lookup_before: crate::storage::sqlite::message_lookup_trace_snapshot(),
    };
    Some(span)
}
fn robot_trace_ingest_finish(
span: Option<RobotIngestTraceSpan>,
status: &str,
inserted_conversations: usize,
inserted_messages: usize,
error: Option<&anyhow::Error>,
) {
let Some(span) = span else {
return;
};
let lookup_delta =
crate::storage::sqlite::message_lookup_trace_snapshot().saturating_sub(span.lookup_before);
let mut payload = serde_json::json!({
"event": "ingest_batch",
"ts_ms": chrono::Utc::now().timestamp_millis(),
"batch_n": span.batch_n,
"stage": span.stage,
"status": status,
"batch_conversations": span.batch_conversations,
"batch_msgs": span.batch_msgs,
"inserted_conversations": inserted_conversations,
"inserted_messages": inserted_messages,
"wall_ms": span.started.elapsed().as_millis() as u64,
"lexical_strategy": span.lexical_strategy,
"defer_checkpoints": span.defer_checkpoints,
"lookups_against_global": lookup_delta.lookups_against_global(),
"lookup_trace": lookup_delta,
});
if let Some(error) = error
&& let serde_json::Value::Object(ref mut map) = payload
{
map.insert("error".to_string(), serde_json::json!(error.to_string()));
}
if let Ok(line) = serde_json::to_string(&payload) {
eprintln!("{line}");
}
}
/// How the lexical index gets populated during an indexing run.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(crate) enum LexicalPopulationStrategy {
    /// Chosen when neither a rebuild nor a deferral is requested.
    IncrementalInline,
    /// Chosen when the index needs a rebuild and no deferral is requested.
    InlineRebuildFromScan,
    /// Chosen whenever population is deferred to the authoritative DB rebuild.
    DeferredAuthoritativeDbRebuild,
}

impl LexicalPopulationStrategy {
    /// Stable snake_case label used in progress stats, logs and traces.
    fn as_str(self) -> &'static str {
        use LexicalPopulationStrategy::{
            DeferredAuthoritativeDbRebuild, IncrementalInline, InlineRebuildFromScan,
        };
        match self {
            IncrementalInline => "incremental_inline",
            InlineRebuildFromScan => "inline_rebuild_from_scan",
            DeferredAuthoritativeDbRebuild => "deferred_authoritative_db_rebuild",
        }
    }
}
/// Picks the lexical population strategy from two flags.
///
/// Deferral to the authoritative DB rebuild takes priority; otherwise a
/// required rebuild selects the inline rebuild, and the fallback is the
/// incremental inline strategy.
fn select_lexical_population_strategy(
    needs_rebuild: bool,
    defer_to_authoritative_db_rebuild: bool,
) -> LexicalPopulationStrategy {
    match (defer_to_authoritative_db_rebuild, needs_rebuild) {
        (true, _) => LexicalPopulationStrategy::DeferredAuthoritativeDbRebuild,
        (false, true) => LexicalPopulationStrategy::InlineRebuildFromScan,
        (false, false) => LexicalPopulationStrategy::IncrementalInline,
    }
}
/// Resolves the lexical population strategy together with a reason label.
///
/// Population is deferred to the authoritative DB rebuild when the run is a
/// full refresh or when any salvaged messages were imported. The reason label
/// is chosen with this priority: salvage import, full refresh, required
/// rebuild, plain incremental scan.
fn resolve_lexical_population_strategy(
    needs_rebuild: bool,
    full_refresh: bool,
    salvage_messages_imported: usize,
) -> (LexicalPopulationStrategy, &'static str) {
    let salvage_imported = salvage_messages_imported > 0;
    let strategy =
        select_lexical_population_strategy(needs_rebuild, full_refresh || salvage_imported);
    let reason = match (salvage_imported, full_refresh, needs_rebuild) {
        (true, _, _) => "historical_salvage_imported_messages_require_authoritative_db_rebuild",
        (false, true, _) => {
            "full_refresh_defers_inline_lexical_writes_to_authoritative_db_rebuild"
        }
        (false, false, true) => {
            "lexical_index_needs_rebuild_so_scan_results_repopulate_tantivy_directly"
        }
        (false, false, false) => {
            "incremental_scan_applies_inline_lexical_updates_only_for_new_messages"
        }
    };
    (strategy, reason)
}
/// Reports whether `strategy` needs inline Tantivy writes.
///
/// Only the deferred authoritative DB rebuild strategy returns `false`.
fn lexical_population_strategy_requires_inline_tantivy(
    strategy: LexicalPopulationStrategy,
) -> bool {
    match strategy {
        LexicalPopulationStrategy::DeferredAuthoritativeDbRebuild => false,
        LexicalPopulationStrategy::IncrementalInline
        | LexicalPopulationStrategy::InlineRebuildFromScan => true,
    }
}
/// Stores the chosen lexical strategy and its reason in the progress stats.
///
/// Overwrites any previously recorded values. Does nothing when there is no
/// progress tracker or when the stats lock cannot be acquired.
fn record_lexical_population_strategy(
    progress: Option<&Arc<IndexingProgress>>,
    strategy: LexicalPopulationStrategy,
    reason: &str,
) {
    let Some(mut stats) = progress.and_then(|tracker| tracker.stats.lock().ok()) else {
        return;
    };
    stats.lexical_strategy = Some(strategy.as_str().to_owned());
    stats.lexical_strategy_reason = Some(reason.to_owned());
}
fn record_lexical_population_strategy_if_unset(
progress: Option<&Arc<IndexingProgress>>,
strategy: LexicalPopulationStrategy,
reason: &str,
) {
let Some(progress) = progress else {
return;
};
if let Ok(mut stats) = progress.stats.lock()
&& stats.lexical_strategy.is_none()
{
stats.lexical_strategy = Some(strategy.as_str().to_string());
stats.lexical_strategy_reason = Some(reason.to_string());
}
}
/// Stores the semantic watch-once statistics in the progress stats.
///
/// Does nothing when there is no progress tracker or when the stats lock
/// cannot be acquired.
fn record_semantic_watch_once_stats(
    progress: Option<&Arc<IndexingProgress>>,
    stats: SemanticWatchOnceStats,
) {
    let Some(mut indexing_stats) = progress.and_then(|tracker| tracker.stats.lock().ok()) else {
        return;
    };
    indexing_stats.semantic_watch_once = Some(stats);
}
/// Records an incremental canonical lexical repair in the progress stats.
///
/// The repair is stored with kind `authoritative_canonical_db_rebuild`, the
/// plan's reason and counts, and the supplied conversation count. Does
/// nothing when there is no progress tracker or when the stats lock cannot be
/// acquired.
fn record_incremental_canonical_lexical_repair(
    progress: Option<&Arc<IndexingProgress>>,
    plan: &IncrementalCanonicalLexicalRepairPlan,
    canonical_conversations: usize,
) {
    let Some(mut stats) = progress.and_then(|tracker| tracker.stats.lock().ok()) else {
        return;
    };
    let repair = LexicalRepairStats {
        kind: "authoritative_canonical_db_rebuild".to_string(),
        reason: plan.reason.to_string(),
        canonical_conversations,
        canonical_messages: plan.canonical_messages,
        observed_tantivy_docs: plan.observed_tantivy_docs,
    };
    stats.lexical_repair = Some(repair);
}
/// Resets the shared progress tracker to an idle state.
///
/// Stores 0 in the phase, clears the rebuilding flag, zeroes the rebuild
/// pipeline gauges listed below, clears the host load-average sample, and
/// empties the controller mode/reason strings. Does nothing when `progress`
/// is `None`. Mutex-protected fields are skipped when their lock cannot be
/// acquired.
///
/// NOTE(review): `refresh_lexical_rebuild_pipeline_runtime` also writes host
/// available memory, process RSS, and several staged shard-build fields
/// (memory reserves, completed jobs, last-shard metrics, amplification) that
/// are not cleared here — confirm that is intentional.
fn reset_progress_to_idle(progress: Option<&Arc<IndexingProgress>>) {
let Some(progress) = progress else {
return;
};
// Phase 0 — presumably the idle phase, given this function's name; confirm.
progress.phase.store(0, Ordering::Relaxed);
progress.is_rebuilding.store(false, Ordering::Relaxed);
// Core pipeline gauges.
progress
.rebuild_pipeline_queue_depth
.store(0, Ordering::Relaxed);
progress
.rebuild_pipeline_inflight_message_bytes
.store(0, Ordering::Relaxed);
progress
.rebuild_pipeline_pending_batch_conversations
.store(0, Ordering::Relaxed);
progress
.rebuild_pipeline_pending_batch_message_bytes
.store(0, Ordering::Relaxed);
progress
.rebuild_pipeline_page_prep_workers
.store(0, Ordering::Relaxed);
progress
.rebuild_pipeline_active_page_prep_jobs
.store(0, Ordering::Relaxed);
progress
.rebuild_pipeline_ordered_buffered_pages
.store(0, Ordering::Relaxed);
progress
.rebuild_pipeline_budget_generation
.store(0, Ordering::Relaxed);
// Producer wait counters.
progress
.rebuild_pipeline_producer_budget_wait_count
.store(0, Ordering::Relaxed);
progress
.rebuild_pipeline_producer_budget_wait_ms
.store(0, Ordering::Relaxed);
progress
.rebuild_pipeline_producer_handoff_wait_count
.store(0, Ordering::Relaxed);
progress
.rebuild_pipeline_producer_handoff_wait_ms
.store(0, Ordering::Relaxed);
// Staged merge gauges.
progress
.rebuild_pipeline_staged_merge_workers_max
.store(0, Ordering::Relaxed);
progress
.rebuild_pipeline_staged_merge_allowed_jobs
.store(0, Ordering::Relaxed);
progress
.rebuild_pipeline_staged_merge_active_jobs
.store(0, Ordering::Relaxed);
progress
.rebuild_pipeline_staged_merge_ready_artifacts
.store(0, Ordering::Relaxed);
progress
.rebuild_pipeline_staged_merge_ready_groups
.store(0, Ordering::Relaxed);
// Staged shard-build gauges.
progress
.rebuild_pipeline_staged_shard_build_workers_max
.store(0, Ordering::Relaxed);
progress
.rebuild_pipeline_staged_shard_build_allowed_jobs
.store(0, Ordering::Relaxed);
progress
.rebuild_pipeline_staged_shard_build_active_jobs
.store(0, Ordering::Relaxed);
progress
.rebuild_pipeline_staged_shard_build_pending_jobs
.store(0, Ordering::Relaxed);
// Mutex-protected values: best-effort, a failed lock leaves the old value.
if let Ok(mut host_loadavg) = progress.rebuild_pipeline_host_loadavg_1m_milli.lock() {
*host_loadavg = None;
}
if let Ok(mut controller_mode) = progress.rebuild_pipeline_controller_mode.lock() {
controller_mode.clear();
}
if let Ok(mut controller_reason) = progress.rebuild_pipeline_controller_reason.lock() {
controller_reason.clear();
}
if let Ok(mut staged_merge_reason) = progress
.rebuild_pipeline_staged_merge_controller_reason
.lock()
{
staged_merge_reason.clear();
}
if let Ok(mut staged_shard_build_reason) = progress
.rebuild_pipeline_staged_shard_build_controller_reason
.lock()
{
staged_shard_build_reason.clear();
}
}
/// Sink-side gauges supplied by the caller when a pipeline runtime snapshot
/// is captured; the values are copied verbatim into that snapshot.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
struct LexicalRebuildPipelineSinkRuntimeSnapshot {
/// Copied into the runtime snapshot's `queue_depth`.
queue_depth: usize,
/// Copied into the runtime snapshot's `pending_batch_conversations`.
pending_batch_conversations: usize,
/// Copied into the runtime snapshot's `pending_batch_message_bytes`.
pending_batch_message_bytes: usize,
}
impl LexicalRebuildPipelineSinkRuntimeSnapshot {
/// Builds a snapshot from the three gauges, stored as given.
fn new(
queue_depth: usize,
pending_batch_conversations: usize,
pending_batch_message_bytes: usize,
) -> Self {
Self {
queue_depth,
pending_batch_conversations,
pending_batch_message_bytes,
}
}
}
/// Captures a point-in-time snapshot of the lexical rebuild pipeline.
///
/// Combines the flow limiter's byte counters, the optional producer
/// telemetry (defaults when absent), the optional responsiveness controller's
/// mode and reason (empty strings when absent), host and process metrics, the
/// supplied budget generation and the sink-side gauges. Every staged merge
/// and staged shard-build field is initialised to zero, empty or `None`, and
/// the snapshot is stamped with the current time in milliseconds.
fn capture_lexical_rebuild_pipeline_runtime(
    flow_limiter: &StreamingByteLimiter,
    producer_telemetry: Option<&LexicalRebuildProducerTelemetry>,
    responsiveness_controller: Option<&LexicalRebuildResponsivenessController>,
    budget_generation: usize,
    sink_runtime: LexicalRebuildPipelineSinkRuntimeSnapshot,
) -> LexicalRebuildPipelineRuntimeSnapshot {
    let producer = match producer_telemetry {
        Some(telemetry) => telemetry.snapshot(),
        None => Default::default(),
    };
    let (controller_mode, controller_reason) = match responsiveness_controller {
        Some(controller) => (
            controller.mode().to_string(),
            controller.reason().to_string(),
        ),
        None => Default::default(),
    };
    LexicalRebuildPipelineRuntimeSnapshot {
        // Sink and flow-control gauges.
        queue_depth: sink_runtime.queue_depth,
        inflight_message_bytes: flow_limiter.bytes_in_flight(),
        max_message_bytes_in_flight: flow_limiter.max_bytes_in_flight(),
        pending_batch_conversations: sink_runtime.pending_batch_conversations,
        pending_batch_message_bytes: sink_runtime.pending_batch_message_bytes,
        // Producer telemetry.
        page_prep_workers: producer.page_prep_workers,
        active_page_prep_jobs: producer.active_page_prep_jobs,
        ordered_buffered_pages: producer.ordered_buffered_pages,
        budget_generation,
        producer_budget_wait_count: producer.budget_wait_count,
        producer_budget_wait_ms: producer.budget_wait_ms,
        producer_handoff_wait_count: producer.handoff_wait_count,
        producer_handoff_wait_ms: producer.handoff_wait_ms,
        // Host and process metrics.
        host_loadavg_1m_milli: lexical_rebuild_host_loadavg_1m_milli(),
        host_available_memory_bytes: responsiveness::available_memory_bytes(),
        process_rss_bytes: responsiveness::process_resident_memory_bytes(),
        controller_mode,
        controller_reason,
        // Staged merge fields start out blank in this snapshot.
        staged_merge_workers_max: 0,
        staged_merge_allowed_jobs: 0,
        staged_merge_active_jobs: 0,
        staged_merge_ready_artifacts: 0,
        staged_merge_ready_groups: 0,
        staged_merge_controller_reason: String::new(),
        // Staged shard-build fields start out blank in this snapshot.
        staged_shard_build_workers_max: 0,
        staged_shard_build_allowed_jobs: 0,
        staged_shard_build_active_jobs: 0,
        staged_shard_build_pending_jobs: 0,
        staged_shard_build_controller_reason: String::new(),
        staged_shard_build_memory_reserve_bytes: 0,
        staged_shard_build_emergency_memory_reserve_bytes: 0,
        staged_shard_build_completed_jobs: 0,
        staged_shard_build_last_shard_index: None,
        staged_shard_build_last_message_bytes: 0,
        staged_shard_build_last_index_size_bytes: 0,
        staged_shard_build_last_duration_ms: 0,
        staged_shard_build_last_amplification_milli: None,
        staged_shard_build_observed_amplification_milli: None,
        updated_at_ms: FrankenStorage::now_millis(),
    }
}
/// Recaptures the pipeline runtime snapshot and mirrors it into `progress`.
///
/// `latest_runtime` is always overwritten with a fresh capture. When
/// `progress` is present, each snapshot field handled below is copied into
/// the matching `rebuild_pipeline_*` field: atomics via relaxed stores,
/// mutex-protected values via best-effort locking (a failed lock leaves the
/// previous value in place).
fn refresh_lexical_rebuild_pipeline_runtime(
latest_runtime: &mut LexicalRebuildPipelineRuntimeSnapshot,
progress: Option<&Arc<IndexingProgress>>,
flow_limiter: &StreamingByteLimiter,
producer_telemetry: Option<&LexicalRebuildProducerTelemetry>,
responsiveness_controller: Option<&LexicalRebuildResponsivenessController>,
budget_generation: usize,
sink_runtime: LexicalRebuildPipelineSinkRuntimeSnapshot,
) {
*latest_runtime = capture_lexical_rebuild_pipeline_runtime(
flow_limiter,
producer_telemetry,
responsiveness_controller,
budget_generation,
sink_runtime,
);
// Without a progress tracker the refreshed snapshot is the only output.
let Some(progress) = progress else {
return;
};
// Core pipeline gauges.
progress
.rebuild_pipeline_queue_depth
.store(latest_runtime.queue_depth, Ordering::Relaxed);
progress
.rebuild_pipeline_inflight_message_bytes
.store(latest_runtime.inflight_message_bytes, Ordering::Relaxed);
progress.rebuild_pipeline_pending_batch_conversations.store(
latest_runtime.pending_batch_conversations,
Ordering::Relaxed,
);
progress.rebuild_pipeline_pending_batch_message_bytes.store(
latest_runtime.pending_batch_message_bytes,
Ordering::Relaxed,
);
progress
.rebuild_pipeline_page_prep_workers
.store(latest_runtime.page_prep_workers, Ordering::Relaxed);
progress
.rebuild_pipeline_active_page_prep_jobs
.store(latest_runtime.active_page_prep_jobs, Ordering::Relaxed);
progress
.rebuild_pipeline_ordered_buffered_pages
.store(latest_runtime.ordered_buffered_pages, Ordering::Relaxed);
progress
.rebuild_pipeline_budget_generation
.store(latest_runtime.budget_generation, Ordering::Relaxed);
// Producer wait counters.
progress
.rebuild_pipeline_producer_budget_wait_count
.store(latest_runtime.producer_budget_wait_count, Ordering::Relaxed);
progress
.rebuild_pipeline_producer_budget_wait_ms
.store(latest_runtime.producer_budget_wait_ms, Ordering::Relaxed);
progress.rebuild_pipeline_producer_handoff_wait_count.store(
latest_runtime.producer_handoff_wait_count,
Ordering::Relaxed,
);
progress
.rebuild_pipeline_producer_handoff_wait_ms
.store(latest_runtime.producer_handoff_wait_ms, Ordering::Relaxed);
// Host/process metrics and controller state (mutex-protected).
if let Ok(mut host_loadavg) = progress.rebuild_pipeline_host_loadavg_1m_milli.lock() {
*host_loadavg = latest_runtime.host_loadavg_1m_milli;
}
if let Ok(mut host_available_memory) =
progress.rebuild_pipeline_host_available_memory_bytes.lock()
{
*host_available_memory = latest_runtime.host_available_memory_bytes;
}
if let Ok(mut process_rss) = progress.rebuild_pipeline_process_rss_bytes.lock() {
*process_rss = latest_runtime.process_rss_bytes;
}
if let Ok(mut controller_mode) = progress.rebuild_pipeline_controller_mode.lock() {
*controller_mode = latest_runtime.controller_mode.clone();
}
if let Ok(mut controller_reason) = progress.rebuild_pipeline_controller_reason.lock() {
*controller_reason = latest_runtime.controller_reason.clone();
}
// Staged merge gauges.
progress
.rebuild_pipeline_staged_merge_workers_max
.store(latest_runtime.staged_merge_workers_max, Ordering::Relaxed);
progress
.rebuild_pipeline_staged_merge_allowed_jobs
.store(latest_runtime.staged_merge_allowed_jobs, Ordering::Relaxed);
progress
.rebuild_pipeline_staged_merge_active_jobs
.store(latest_runtime.staged_merge_active_jobs, Ordering::Relaxed);
progress
.rebuild_pipeline_staged_merge_ready_artifacts
.store(
latest_runtime.staged_merge_ready_artifacts,
Ordering::Relaxed,
);
progress
.rebuild_pipeline_staged_merge_ready_groups
.store(latest_runtime.staged_merge_ready_groups, Ordering::Relaxed);
if let Ok(mut staged_merge_reason) = progress
.rebuild_pipeline_staged_merge_controller_reason
.lock()
{
*staged_merge_reason = latest_runtime.staged_merge_controller_reason.clone();
}
// Staged shard-build gauges.
progress
.rebuild_pipeline_staged_shard_build_workers_max
.store(
latest_runtime.staged_shard_build_workers_max,
Ordering::Relaxed,
);
progress
.rebuild_pipeline_staged_shard_build_allowed_jobs
.store(
latest_runtime.staged_shard_build_allowed_jobs,
Ordering::Relaxed,
);
progress
.rebuild_pipeline_staged_shard_build_active_jobs
.store(
latest_runtime.staged_shard_build_active_jobs,
Ordering::Relaxed,
);
progress
.rebuild_pipeline_staged_shard_build_pending_jobs
.store(
latest_runtime.staged_shard_build_pending_jobs,
Ordering::Relaxed,
);
if let Ok(mut staged_shard_build_reason) = progress
.rebuild_pipeline_staged_shard_build_controller_reason
.lock()
{
*staged_shard_build_reason = latest_runtime.staged_shard_build_controller_reason.clone();
}
progress
.rebuild_pipeline_staged_shard_build_memory_reserve_bytes
.store(
latest_runtime.staged_shard_build_memory_reserve_bytes,
Ordering::Relaxed,
);
progress
.rebuild_pipeline_staged_shard_build_emergency_memory_reserve_bytes
.store(
latest_runtime.staged_shard_build_emergency_memory_reserve_bytes,
Ordering::Relaxed,
);
progress
.rebuild_pipeline_staged_shard_build_completed_jobs
.store(
latest_runtime.staged_shard_build_completed_jobs,
Ordering::Relaxed,
);
// Metrics describing the most recently built shard.
if let Ok(mut last_shard_index) = progress
.rebuild_pipeline_staged_shard_build_last_shard_index
.lock()
{
*last_shard_index = latest_runtime.staged_shard_build_last_shard_index;
}
progress
.rebuild_pipeline_staged_shard_build_last_message_bytes
.store(
latest_runtime.staged_shard_build_last_message_bytes,
Ordering::Relaxed,
);
progress
.rebuild_pipeline_staged_shard_build_last_index_size_bytes
.store(
latest_runtime.staged_shard_build_last_index_size_bytes,
Ordering::Relaxed,
);
progress
.rebuild_pipeline_staged_shard_build_last_duration_ms
.store(
latest_runtime.staged_shard_build_last_duration_ms,
Ordering::Relaxed,
);
if let Ok(mut last_amplification) = progress
.rebuild_pipeline_staged_shard_build_last_amplification_milli
.lock()
{
*last_amplification = latest_runtime.staged_shard_build_last_amplification_milli;
}
if let Ok(mut observed_amplification) = progress
.rebuild_pipeline_staged_shard_build_observed_amplification_milli
.lock()
{
*observed_amplification = latest_runtime.staged_shard_build_observed_amplification_milli;
}
}
/// Refreshes the runtime snapshot, lets the responsiveness controller observe
/// it, and applies any budget transition the controller returns.
///
/// When a transition is applied the snapshot is refreshed a second time so
/// that `latest_runtime` and `progress` reflect the post-transition state.
// The argument list mirrors the pipeline state this helper must touch.
#[allow(clippy::too_many_arguments)]
fn refresh_and_maybe_apply_lexical_rebuild_pipeline_runtime(
latest_runtime: &mut LexicalRebuildPipelineRuntimeSnapshot,
progress: Option<&Arc<IndexingProgress>>,
flow_limiter: &StreamingByteLimiter,
producer_telemetry: Option<&LexicalRebuildProducerTelemetry>,
responsiveness_controller: &mut LexicalRebuildResponsivenessController,
pipeline_budget_controller: &LexicalRebuildPipelineBudgetController,
current_batch_conversation_limit: &mut usize,
active_commit_intervals: Option<(&mut usize, &mut usize, &mut usize)>,
sink_runtime: LexicalRebuildPipelineSinkRuntimeSnapshot,
) {
// First pass: capture the current state for the controller to observe.
refresh_lexical_rebuild_pipeline_runtime(
latest_runtime,
progress,
flow_limiter,
producer_telemetry,
Some(responsiveness_controller),
pipeline_budget_controller.generation(),
sink_runtime,
);
if let Some(transition) = responsiveness_controller.observe_runtime(latest_runtime) {
apply_lexical_rebuild_budget_transition(
transition,
flow_limiter,
pipeline_budget_controller,
current_batch_conversation_limit,
active_commit_intervals,
);
// Second pass: re-read the budget generation and controller state after
// the transition has been applied.
refresh_lexical_rebuild_pipeline_runtime(
latest_runtime,
progress,
flow_limiter,
producer_telemetry,
Some(responsiveness_controller),
pipeline_budget_controller.generation(),
sink_runtime,
);
}
}
/// Reads `(total_conversations, total_messages)` from the progress stats.
///
/// Returns `None` when there is no progress tracker, when the stats lock
/// cannot be acquired, or when the totals are not flagged as exact.
fn exact_total_counts_from_progress(
    progress: Option<&Arc<IndexingProgress>>,
) -> Option<(usize, usize)> {
    let stats = progress?.stats.lock().ok()?;
    if stats.total_counts_exact {
        Some((stats.total_conversations, stats.total_messages))
    } else {
        None
    }
}
fn record_exact_total_counts_in_progress(
progress: Option<&Arc<IndexingProgress>>,
total_conversations: usize,
total_messages: usize,
) {
if let Some(progress) = progress
&& let Ok(mut stats) = progress.stats.lock()
{
stats.total_conversations = total_conversations;
stats.total_messages = total_messages;
stats.total_counts_exact = true;
}
}
/// Summary of how a persisted lexical rebuild state relates to the current
/// run. The default value (all `false`/`None`) is what the helpers in this
/// file return when no state exists or the state does not match.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
struct MatchingLexicalRebuildStateStatus {
/// A matching state exists and is still incomplete.
has_pending_resume: bool,
/// A matching state is a completed checkpoint.
has_completed_checkpoint: bool,
/// Indexed document count from the completed checkpoint, if any.
completed_indexed_docs: Option<usize>,
/// `(total conversations, indexed docs)` from the completed checkpoint, if any.
completed_exact_totals: Option<(usize, usize)>,
/// Storage fingerprint stored in the completed checkpoint, if any.
completed_storage_fingerprint: Option<String>,
}
/// Test-only helper: loads the persisted rebuild state for `index_path` and
/// classifies it against `db_state`.
///
/// Returns the default status when no state is persisted. Errors from loading
/// the state are propagated.
#[cfg(test)]
fn matching_lexical_rebuild_state_status(
    index_path: &Path,
    db_state: &LexicalRebuildDbState,
) -> Result<MatchingLexicalRebuildStateStatus> {
    let status = match load_lexical_rebuild_state(index_path)? {
        Some(state) => matching_lexical_rebuild_state_status_for_loaded_state(state, db_state),
        None => MatchingLexicalRebuildStateStatus::default(),
    };
    Ok(status)
}
/// Classifies the persisted rebuild state, loading the current DB state only
/// when a persisted state actually exists.
///
/// Returns the default status without invoking `load_current_db_state` when
/// nothing is persisted. Errors from either load are propagated.
fn matching_lexical_rebuild_state_status_if_present(
    index_path: &Path,
    load_current_db_state: impl FnOnce() -> Result<LexicalRebuildDbState>,
) -> Result<MatchingLexicalRebuildStateStatus> {
    match load_lexical_rebuild_state(index_path)? {
        None => Ok(MatchingLexicalRebuildStateStatus::default()),
        Some(state) => {
            let current_db_state = load_current_db_state()?;
            Ok(matching_lexical_rebuild_state_status_for_loaded_state(
                state,
                &current_db_state,
            ))
        }
    }
}
/// Classifies a persisted rebuild state without comparing storage
/// fingerprints.
///
/// Returns:
/// - `Ok(Some(default))` when no state is persisted, or when the state's
///   version, schema hash, conversation count, DB path or page size does not
///   match the current run;
/// - `Ok(None)` when the persisted state is incomplete;
/// - `Ok(Some(status))` otherwise, with the completed-checkpoint fields filled
///   in only for a completed shared-writer state that has no pending work and
///   no observed runtime.
///
/// Errors from loading the state are propagated.
fn matching_completed_lexical_rebuild_state_status_without_fingerprint(
    index_path: &Path,
    db_path: &Path,
    total_conversations: usize,
) -> Result<Option<MatchingLexicalRebuildStateStatus>> {
    let state = match load_lexical_rebuild_state(index_path)? {
        Some(state) => state,
        None => return Ok(Some(MatchingLexicalRebuildStateStatus::default())),
    };
    if state.is_incomplete() {
        return Ok(None);
    }
    let normalized_db_path = crate::normalize_path_identity(db_path)
        .to_string_lossy()
        .into_owned();
    let describes_current_run = state.version == LEXICAL_REBUILD_STATE_VERSION
        && state.schema_hash == crate::search::tantivy::SCHEMA_HASH
        && state.db.total_conversations == total_conversations
        && lexical_rebuild_db_paths_match(&state.db.db_path, &normalized_db_path)
        && lexical_rebuild_page_size_is_compatible(state.page_size);
    if !describes_current_run {
        return Ok(Some(MatchingLexicalRebuildStateStatus::default()));
    }
    let is_completed_checkpoint = state.completed
        && state.pending.is_none()
        && state.execution_mode == LexicalRebuildExecutionMode::SharedWriter
        && !state.runtime.is_observed();
    let status = if is_completed_checkpoint {
        MatchingLexicalRebuildStateStatus {
            has_pending_resume: false,
            has_completed_checkpoint: true,
            completed_indexed_docs: Some(state.indexed_docs),
            completed_exact_totals: Some((state.db.total_conversations, state.indexed_docs)),
            completed_storage_fingerprint: Some(state.db.storage_fingerprint),
        }
    } else {
        MatchingLexicalRebuildStateStatus {
            has_pending_resume: false,
            has_completed_checkpoint: false,
            completed_indexed_docs: None,
            completed_exact_totals: None,
            completed_storage_fingerprint: None,
        }
    };
    Ok(Some(status))
}
/// Classifies an already-loaded rebuild state against the current DB state.
///
/// Returns the default status when the state does not match the current run
/// at the standard page size. Otherwise reports whether the state is still
/// incomplete, and fills in the completed-checkpoint fields only for a
/// completed shared-writer state with no pending work and no observed runtime.
fn matching_lexical_rebuild_state_status_for_loaded_state(
    state: LexicalRebuildState,
    db_state: &LexicalRebuildDbState,
) -> MatchingLexicalRebuildStateStatus {
    if !state.matches_run(db_state, LEXICAL_REBUILD_PAGE_SIZE) {
        return MatchingLexicalRebuildStateStatus::default();
    }
    let has_completed_checkpoint = state.completed
        && state.pending.is_none()
        && state.execution_mode == LexicalRebuildExecutionMode::SharedWriter
        && !state.runtime.is_observed();
    let has_pending_resume = state.is_incomplete();
    let (completed_indexed_docs, completed_exact_totals, completed_storage_fingerprint) =
        if has_completed_checkpoint {
            (
                Some(state.indexed_docs),
                Some((state.db.total_conversations, state.indexed_docs)),
                Some(state.db.storage_fingerprint),
            )
        } else {
            (None, None, None)
        };
    MatchingLexicalRebuildStateStatus {
        has_pending_resume,
        has_completed_checkpoint,
        completed_indexed_docs,
        completed_exact_totals,
        completed_storage_fingerprint,
    }
}
/// Detects a pending rebuild checkpoint that must restart from zero, without
/// comparing storage fingerprints.
///
/// Returns `Ok(Some(status))` with `has_pending_resume` set only when a
/// persisted state exists, is incomplete, uses an execution mode that requires
/// a restart from zero on resume, and matches the current state version,
/// schema hash, page size, conversation count and DB path. Returns `Ok(None)`
/// in every other case. Errors from loading the state are propagated.
fn nonresumable_pending_lexical_rebuild_status_without_fingerprint(
    index_path: &Path,
    db_path: &Path,
    total_conversations: usize,
) -> Result<Option<MatchingLexicalRebuildStateStatus>> {
    let state = match load_lexical_rebuild_state(index_path)? {
        Some(state) => state,
        None => return Ok(None),
    };
    let is_restart_only_checkpoint = state.is_incomplete()
        && state.execution_mode.requires_restart_from_zero_on_resume()
        && state.version == LEXICAL_REBUILD_STATE_VERSION
        && state.schema_hash == crate::search::tantivy::SCHEMA_HASH
        && lexical_rebuild_page_size_is_compatible(state.page_size)
        && state.db.total_conversations == total_conversations;
    if !is_restart_only_checkpoint {
        return Ok(None);
    }
    // The path comparison runs last, after the cheaper field checks.
    let normalized_db_path = crate::normalize_path_identity(db_path)
        .to_string_lossy()
        .into_owned();
    let paths_match = lexical_rebuild_db_paths_match(&state.db.db_path, &normalized_db_path);
    Ok(paths_match.then(|| MatchingLexicalRebuildStateStatus {
        has_pending_resume: true,
        ..MatchingLexicalRebuildStateStatus::default()
    }))
}
/// Opens the database read-only, counts its conversations, and checks for a
/// pending rebuild checkpoint that must restart from zero.
///
/// The storage handle is closed (best effort) before the checkpoint is
/// inspected. Returns the matching status paired with the conversation count,
/// or `Ok(None)` when no such checkpoint exists. Open, count and state-load
/// errors are propagated.
fn nonresumable_pending_lexical_rebuild_status_from_readonly_db(
    index_path: &Path,
    db_path: &Path,
) -> Result<Option<(MatchingLexicalRebuildStateStatus, usize)>> {
    let mut storage = FrankenStorage::open_readonly(db_path).with_context(|| {
        format!(
            "opening readonly storage to classify pending lexical rebuild checkpoint: {}",
            db_path.display()
        )
    })?;
    let total_conversations = count_total_conversations_exact(&storage)?;
    storage.close_best_effort_in_place();
    let pending = nonresumable_pending_lexical_rebuild_status_without_fingerprint(
        index_path,
        db_path,
        total_conversations,
    )?;
    match pending {
        Some(status) => Ok(Some((status, total_conversations))),
        None => Ok(None),
    }
}
/// Reports whether the read-only non-resumable resume path may be tried.
///
/// True only for a plain run: none of `full`, `force_rebuild`, `watch` or
/// `semantic` is set, and `watch_once_paths` is absent or empty.
fn should_try_readonly_nonresumable_lexical_resume(opts: &IndexOptions) -> bool {
    let no_watch_once_paths = opts
        .watch_once_paths
        .as_ref()
        .is_none_or(|requested| requested.is_empty());
    let any_special_mode = opts.full || opts.force_rebuild || opts.watch || opts.semantic;
    !any_special_mode && no_watch_once_paths
}
/// Reports whether a force rebuild may run from a read-only canonical DB.
///
/// True only when `force_rebuild` is set, none of `full`, `watch`, `semantic`
/// or `build_hnsw` is set, `watch_once_paths` is absent or empty, and the
/// database file exists. The filesystem check is evaluated last and only when
/// every flag condition already holds.
fn should_try_readonly_canonical_force_rebuild(opts: &IndexOptions) -> bool {
    let conflicting_mode = opts.full || opts.watch || opts.semantic || opts.build_hnsw;
    if !opts.force_rebuild || conflicting_mode {
        return false;
    }
    let no_watch_once_paths = opts
        .watch_once_paths
        .as_ref()
        .is_none_or(|requested| requested.is_empty());
    no_watch_once_paths && opts.db_path.exists()
}
/// Attempts a force rebuild of the lexical index straight from a populated
/// canonical database opened read-only.
///
/// Returns `Ok(false)` when the options do not qualify for this fast path or
/// when the database holds zero conversations. Returns `Ok(true)` after the
/// deferred Tantivy rebuild has run and the progress stats have been updated.
///
/// # Errors
///
/// Propagates failures from opening, counting and closing the database (with
/// added context for open/close) and from the rebuild itself.
fn try_readonly_canonical_force_rebuild(
opts: &IndexOptions,
progress_bump: &Arc<AtomicI64>,
) -> Result<bool> {
if !should_try_readonly_canonical_force_rebuild(opts) {
return Ok(false);
}
let storage = FrankenStorage::open_readonly(&opts.db_path).with_context(|| {
format!(
"opening canonical database read-only for force rebuild: {}",
opts.db_path.display()
)
})?;
let total_conversations = count_total_conversations_exact(&storage)?;
// An empty database gives this fast path nothing to rebuild from.
if total_conversations == 0 {
storage.close_without_checkpoint().with_context(|| {
format!(
"closing empty canonical database after read-only force rebuild preflight: {}",
opts.db_path.display()
)
})?;
return Ok(false);
}
let total_messages = count_total_messages_exact(&storage)?;
// The preflight handle is closed before the rebuild starts.
storage.close_without_checkpoint().with_context(|| {
format!(
"closing canonical database before read-only force rebuild: {}",
opts.db_path.display()
)
})?;
tracing::info!(
db_path = %opts.db_path.display(),
conversations = total_conversations,
messages = total_messages,
"running force rebuild from populated canonical database without writable storage preflight"
);
record_lexical_population_strategy(
opts.progress.as_ref(),
LexicalPopulationStrategy::DeferredAuthoritativeDbRebuild,
"force_rebuild_uses_readonly_authoritative_canonical_db_rebuild_only",
);
tracing::info!(
strategy = LexicalPopulationStrategy::DeferredAuthoritativeDbRebuild.as_str(),
reason = "force_rebuild_uses_readonly_authoritative_canonical_db_rebuild_only",
"selected_lexical_population_strategy"
);
let rebuild_start = Instant::now();
let rebuild = rebuild_tantivy_from_db_deferred_startup_with_progress_bump(
&opts.db_path,
&opts.data_dir,
total_conversations,
opts.progress.clone(),
Arc::clone(progress_bump),
)?;
// No scan ran on this path, so scan time is zero and all elapsed time is
// attributed to indexing.
if let Some(p) = &opts.progress
&& let Ok(mut stats) = p.stats.lock()
{
stats.scan_ms = 0;
stats.index_ms = rebuild_start.elapsed().as_millis() as u64;
stats.total_conversations = total_conversations;
stats.total_messages = total_messages;
stats.total_counts_exact = true;
}
// A message count observed by the rebuild replaces the preflight count.
if let Some(observed_messages) = rebuild.observed_messages {
record_exact_total_counts_in_progress(
opts.progress.as_ref(),
total_conversations,
observed_messages,
);
}
Ok(true)
}
/// Reports whether a matching completed checkpoint should be kept during a
/// full scan.
///
/// True only for a full rebuild that is neither a resumed lexical rebuild nor
/// a canonical-only full rebuild, and only when the initial checkpoint status
/// reports a completed checkpoint.
fn should_preserve_matching_completed_lexical_checkpoint_during_full_scan(
    full_rebuild: bool,
    resume_lexical_rebuild: bool,
    canonical_only_full_rebuild: bool,
    initial_checkpoint_status: &MatchingLexicalRebuildStateStatus,
) -> bool {
    if !full_rebuild || resume_lexical_rebuild || canonical_only_full_rebuild {
        return false;
    }
    initial_checkpoint_status.has_completed_checkpoint
}
/// Reports whether the existing Tantivy reader should be preflighted: only
/// when the run is neither a resumed lexical rebuild nor a full rebuild.
fn should_preflight_existing_tantivy_reader(
    resume_lexical_rebuild: bool,
    full_rebuild: bool,
) -> bool {
    !(resume_lexical_rebuild || full_rebuild)
}
/// Reports whether the live Tantivy document count still needs to be probed
/// to decide on skipping the post-full-scan rebuild.
///
/// True only when the run is a full rebuild for which no rebuild was
/// required, no salvaged messages were imported, the scan changed nothing in
/// canonical storage, a completed checkpoint exists, and no document count
/// has been observed yet.
fn should_probe_live_tantivy_docs_for_post_full_scan_skip(
    full_rebuild: bool,
    rebuild_was_required: bool,
    salvage_messages_imported: usize,
    initial_checkpoint_status: &MatchingLexicalRebuildStateStatus,
    scan_canonical_mutations: CanonicalMutationCounts,
    observed_tantivy_docs: Option<usize>,
) -> bool {
    if !full_rebuild || rebuild_was_required || salvage_messages_imported != 0 {
        return false;
    }
    if scan_canonical_mutations.changed() {
        return false;
    }
    initial_checkpoint_status.has_completed_checkpoint && observed_tantivy_docs.is_none()
}
/// Returns the Tantivy document count to use for the post-full-scan skip
/// decision.
///
/// When `should_probe_live_tantivy_docs_for_post_full_scan_skip` says a probe
/// is needed, the live count is read from the index at `index_path` (errors
/// propagate). Otherwise the already observed value is returned unchanged.
fn observed_tantivy_docs_for_post_full_scan_skip(
    index_path: &Path,
    full_rebuild: bool,
    rebuild_was_required: bool,
    salvage_messages_imported: usize,
    initial_checkpoint_status: &MatchingLexicalRebuildStateStatus,
    scan_canonical_mutations: CanonicalMutationCounts,
    observed_tantivy_docs: Option<usize>,
) -> Result<Option<usize>> {
    let needs_live_probe = should_probe_live_tantivy_docs_for_post_full_scan_skip(
        full_rebuild,
        rebuild_was_required,
        salvage_messages_imported,
        initial_checkpoint_status,
        scan_canonical_mutations,
        observed_tantivy_docs,
    );
    if !needs_live_probe {
        return Ok(observed_tantivy_docs);
    }
    live_tantivy_doc_count(index_path)
}
/// Reports whether an authoritative rebuild must be forced: true when the
/// canonical storage was rebuilt or when Tantivy itself requires a rebuild.
fn should_force_authoritative_rebuild(
    canonical_storage_rebuilt: bool,
    tantivy_requires_rebuild: bool,
) -> bool {
    if canonical_storage_rebuilt {
        return true;
    }
    tantivy_requires_rebuild
}
/// Reports whether the final lexical checkpoint refresh can be skipped as a
/// no-op.
///
/// True only when the run was not a full rebuild, no rebuild was required, no
/// exact totals were computed, canonical storage did not change, and a
/// completed checkpoint already exists.
fn should_skip_noop_final_lexical_checkpoint_refresh(
    full_rebuild: bool,
    rebuild_was_required: bool,
    initial_checkpoint_status: &MatchingLexicalRebuildStateStatus,
    exact_total_counts: Option<(usize, usize)>,
    canonical_mutations: CanonicalMutationCounts,
) -> bool {
    if full_rebuild || rebuild_was_required || exact_total_counts.is_some() {
        return false;
    }
    if canonical_mutations.changed() {
        return false;
    }
    initial_checkpoint_status.has_completed_checkpoint
}
/// Reports whether the authoritative rebuild after a full scan can be skipped.
///
/// True only when the run is a full rebuild for which no rebuild was
/// required, no salvaged messages were imported, the scan changed nothing in
/// canonical storage, a completed checkpoint exists, and the checkpoint's
/// indexed document count equals the observed Tantivy document count.
fn should_skip_post_full_scan_authoritative_rebuild(
    full_rebuild: bool,
    rebuild_was_required: bool,
    salvage_messages_imported: usize,
    initial_checkpoint_status: &MatchingLexicalRebuildStateStatus,
    scan_canonical_mutations: CanonicalMutationCounts,
    observed_tantivy_docs: Option<usize>,
) -> bool {
    if !full_rebuild || rebuild_was_required || salvage_messages_imported != 0 {
        return false;
    }
    if scan_canonical_mutations.changed() {
        return false;
    }
    if !initial_checkpoint_status.has_completed_checkpoint {
        return false;
    }
    initial_checkpoint_status.completed_indexed_docs == observed_tantivy_docs
}
/// Guard that resets the shared progress tracker to idle when dropped.
struct RunIndexProgressReset {
/// Tracker to reset on drop; `None` makes the guard a no-op.
progress: Option<Arc<IndexingProgress>>,
}
impl RunIndexProgressReset {
/// Wraps the optional progress tracker in a reset-on-drop guard.
fn new(progress: Option<Arc<IndexingProgress>>) -> Self {
Self { progress }
}
}
impl Drop for RunIndexProgressReset {
/// Calls `reset_progress_to_idle` with the wrapped tracker, whichever way
/// the guard goes out of scope.
fn drop(&mut self) {
reset_progress_to_idle(self.progress.as_ref());
}
}
/// Version that a persisted lexical rebuild state must carry to be treated as
/// matching by the checkpoint helpers in this file.
const LEXICAL_REBUILD_STATE_VERSION: u8 = 2;
/// Current rebuild page size; passed to `matches_run` when classifying a
/// loaded state.
const LEXICAL_REBUILD_PAGE_SIZE: i64 = 1024;
/// Older page size that `lexical_rebuild_page_size_is_compatible` still accepts.
const LEXICAL_REBUILD_LEGACY_COMPAT_PAGE_SIZE: i64 = 200;
/// Crate-visible alias of `LEXICAL_REBUILD_PAGE_SIZE`.
pub(crate) const LEXICAL_REBUILD_PAGE_SIZE_PUBLIC: i64 = LEXICAL_REBUILD_PAGE_SIZE;
pub(crate) fn lexical_rebuild_page_size_is_compatible(page_size: i64) -> bool {
page_size == LEXICAL_REBUILD_PAGE_SIZE || page_size == LEXICAL_REBUILD_LEGACY_COMPAT_PAGE_SIZE
}
/// Compares a saved DB path string with the current one by delegating to the
/// crate-level `path_identities_match`.
fn lexical_rebuild_db_paths_match(saved: &str, current: &str) -> bool {
    let saved_path = Path::new(saved);
    let current_path = Path::new(current);
    crate::path_identities_match(saved_path, current_path)
}
/// Reports whether a saved DB state describes the same database as the
/// current one.
///
/// Both states must agree on the total conversation count and the storage
/// fingerprint, and their DB paths must match according to
/// `lexical_rebuild_db_paths_match`. The path comparison runs last, only when
/// the two field comparisons already hold.
fn lexical_rebuild_db_state_matches(
    saved: &LexicalRebuildDbState,
    current: &LexicalRebuildDbState,
) -> bool {
    saved.total_conversations == current.total_conversations
        && saved.storage_fingerprint == current.storage_fingerprint
        && lexical_rebuild_db_paths_match(&saved.db_path, &current.db_path)
}
/// Legacy comparison of a saved DB state with the current one that does not
/// look at storage fingerprints.
///
/// The states match when the total conversation counts are equal, the DB
/// paths match according to `lexical_rebuild_db_paths_match`, and the message
/// counts are compatible. Message counts are compatible when they are equal
/// or when either side is zero.
fn lexical_rebuild_db_state_matches_legacy(
    saved: &LexicalRebuildDbState,
    current: &LexicalRebuildDbState,
) -> bool {
    // A zero message count on either side is not treated as a mismatch.
    let messages_match = saved.total_messages == 0
        || current.total_messages == 0
        || saved.total_messages == current.total_messages;
    saved.total_conversations == current.total_conversations
        && messages_match
        && lexical_rebuild_db_paths_match(&saved.db_path, &current.db_path)
}
/// Guard over the index-run lock file; it also carries the metadata that is
/// written into that file.
#[derive(Debug)]
struct IndexRunLockGuard {
/// Open handle to the lock file; truncated and unlocked when the guard drops.
file: File,
/// Path of the lock file, used when building error context messages.
_path: PathBuf,
/// Written as `started_at_ms` in the lock metadata.
started_at_ms: i64,
/// Refreshed to the current time on every metadata write.
updated_at_ms: i64,
/// Refreshed to the current time on every metadata write.
last_progress_at_ms: i64,
/// Shared atomic that receives the same timestamp as `last_progress_at_ms`.
last_progress_at_ms_atomic: Arc<AtomicI64>,
/// Written as `db_path` in the lock metadata.
db_path: PathBuf,
/// Written as `job_id` in the lock metadata.
job_id: String,
/// Written as `job_kind` in the lock metadata.
job_kind: SearchMaintenanceJobKind,
/// Mutex held while the metadata file is being rewritten.
metadata_write_lock: Arc<Mutex<()>>,
}
impl Drop for IndexRunLockGuard {
/// Best-effort cleanup: empties the lock file, rewinds and flushes the
/// handle, then releases the file lock. Every error is deliberately ignored.
fn drop(&mut self) {
// Clear the metadata before the lock itself is released.
let _ = self.file.set_len(0);
let _ = self.file.rewind();
let _ = self.file.flush();
let _ = self.file.unlock();
}
}
impl IndexRunLockGuard {
/// Rewrites the lock file with fresh metadata for `mode`.
///
/// Holds `metadata_write_lock` for the whole rewrite, stamps
/// `updated_at_ms`, `last_progress_at_ms` and the shared atomic with the
/// current time, then truncates, rewinds, writes, flushes and syncs the file.
///
/// # Errors
/// Fails if the metadata mutex is poisoned or any file operation fails.
fn write_metadata(&mut self, mode: SearchMaintenanceMode) -> Result<()> {
let _write_guard = self
.metadata_write_lock
.lock()
.map_err(|_| anyhow::anyhow!("index-run metadata write lock poisoned"))?;
let now_ms = FrankenStorage::now_millis();
self.updated_at_ms = now_ms;
self.last_progress_at_ms = now_ms;
self.last_progress_at_ms_atomic
.store(now_ms, Ordering::Relaxed);
// Truncate and rewind so the new metadata replaces the old content.
self.file.set_len(0).with_context(|| {
format!(
"truncating index-run lock file before metadata update: {}",
self._path.display()
)
})?;
self.file.rewind().with_context(|| {
format!(
"rewinding index-run lock file after truncation: {}",
self._path.display()
)
})?;
// One `key=value` pair per line; `mode` and `phase` both carry
// `mode.as_lock_value()`.
writeln!(
self.file,
"pid={}\nstarted_at_ms={}\nupdated_at_ms={}\nlast_progress_at_ms={}\ndb_path={}\nmode={}\njob_id={}\njob_kind={}\nphase={}",
std::process::id(),
self.started_at_ms,
self.updated_at_ms,
self.last_progress_at_ms,
self.db_path.display(),
mode.as_lock_value(),
self.job_id,
self.job_kind.as_lock_value(),
mode.as_lock_value()
)
.with_context(|| format!("writing index-run metadata to {}", self._path.display()))?;
self.file
.flush()
.with_context(|| format!("flushing index-run lock file {}", self._path.display()))?;
self.file
.sync_all()
.with_context(|| format!("syncing index-run lock file {}", self._path.display()))?;
Ok(())
}
/// Records a new maintenance mode by rewriting the lock metadata.
fn set_mode(&mut self, mode: SearchMaintenanceMode) -> Result<()> {
self.write_metadata(mode)
}
}
/// Stores the current time (milliseconds, from `FrankenStorage::now_millis`)
/// into `atomic` with relaxed ordering and returns the stored value.
fn bump_index_run_lock_progress_atomic(atomic: &Arc<AtomicI64>) -> i64 {
    let stamped_at_ms = FrankenStorage::now_millis();
    atomic.store(stamped_at_ms, Ordering::Relaxed);
    stamped_at_ms
}
/// Bumps the progress timestamp when a progress atomic was supplied; does
/// nothing for `None`.
fn bump_index_run_lock_progress_if_present(progress_bump: Option<&Arc<AtomicI64>>) {
    let Some(atomic) = progress_bump else {
        return;
    };
    bump_index_run_lock_progress_atomic(atomic);
}
/// Handle to the background thread that periodically refreshes the index-run
/// lock heartbeat.
struct IndexRunLockHeartbeat {
/// Flag the worker polls; set to `true` to request shutdown.
stop: Arc<AtomicBool>,
/// Join handle of the worker thread; taken and joined on drop.
join: Option<JoinHandle<()>>,
}
impl IndexRunLockHeartbeat {
/// Spawns the heartbeat worker thread.
///
/// The worker loops until the stop flag is set. Each iteration parks for up
/// to `interval`, re-checks the stop flag, reads the latest progress
/// timestamp from `last_progress_at_ms_atomic`, and refreshes the lock file
/// under `data_dir` while holding `metadata_write_lock`. Refresh failures
/// are logged at debug level and do not stop the loop.
fn start(
data_dir: PathBuf,
interval: Duration,
metadata_write_lock: Arc<Mutex<()>>,
last_progress_at_ms_atomic: Arc<AtomicI64>,
) -> Self {
let stop = Arc::new(AtomicBool::new(false));
let stop_flag = Arc::clone(&stop);
let join = thread::spawn(move || {
while !stop_flag.load(Ordering::Relaxed) {
// Parking (rather than sleeping) lets `drop` wake the worker early via
// `unpark`.
thread::park_timeout(interval);
if stop_flag.load(Ordering::Relaxed) {
break;
}
let last_progress_at_ms = last_progress_at_ms_atomic.load(Ordering::Relaxed);
if let Err(err) = heartbeat_index_run_lock_with_lock_and_progress(
&data_dir,
Some(&metadata_write_lock),
last_progress_at_ms,
) {
tracing::debug!(
error = %err,
path = %data_dir.display(),
"failed to refresh index-run heartbeat from background worker"
);
}
}
});
Self {
stop,
join: Some(join),
}
}
}
impl Drop for IndexRunLockHeartbeat {
/// Signals the worker to stop, wakes it if it is parked, and waits for it
/// to exit. The join result is ignored.
fn drop(&mut self) {
// Set the flag before unparking so the woken worker observes the stop
// request.
self.stop.store(true, Ordering::Relaxed);
if let Some(join) = self.join.take() {
join.thread().unpark();
let _ = join.join();
}
}
}
/// Refreshes the index-run lock heartbeat without supplying a new progress
/// timestamp.
///
/// Forwards to `heartbeat_index_run_lock_with_lock_and_progress` with a
/// progress value of `0`.
fn heartbeat_index_run_lock_with_lock(
    data_dir: &Path,
    metadata_write_lock: Option<&Arc<Mutex<()>>>,
) -> Result<()> {
    // Progress value handed on when the caller has no progress to report.
    const NO_PROGRESS_AT_MS: i64 = 0;
    heartbeat_index_run_lock_with_lock_and_progress(
        data_dir,
        metadata_write_lock,
        NO_PROGRESS_AT_MS,
    )
}
/// Rewrites `<data_dir>/index-run.lock` with a refreshed `updated_at_ms` and,
/// when `last_progress_at_ms` is newer than the recorded value, a refreshed
/// `last_progress_at_ms`.
///
/// A missing or empty lock file is left untouched and reported as success.
/// All other lines of the file are copied through unchanged. When
/// `metadata_write_lock` is supplied it is held for the whole read and
/// rewrite.
///
/// # Errors
/// Fails if the mutex is poisoned, the file cannot be read (other than
/// "not found"), or the in-place rewrite fails.
fn heartbeat_index_run_lock_with_lock_and_progress(
    data_dir: &Path,
    metadata_write_lock: Option<&Arc<Mutex<()>>>,
    last_progress_at_ms: i64,
) -> Result<()> {
    const UPDATED_AT_KEY: &str = "updated_at_ms=";
    const PROGRESS_KEY: &str = "last_progress_at_ms=";

    // Keep the guard alive until the function returns.
    let _serialized = match metadata_write_lock {
        Some(lock) => Some(
            lock.lock()
                .map_err(|_| anyhow::anyhow!("index-run metadata write lock poisoned"))?,
        ),
        None => None,
    };

    let lock_path = data_dir.join("index-run.lock");
    let current = match fs::read_to_string(&lock_path) {
        Ok(contents) if contents.is_empty() => return Ok(()),
        Ok(contents) => contents,
        Err(err) if err.kind() == std::io::ErrorKind::NotFound => return Ok(()),
        Err(err) => {
            return Err(err).with_context(|| {
                format!("reading index-run lock heartbeat {}", lock_path.display())
            });
        }
    };

    let now_ms = FrankenStorage::now_millis();
    let mut now_buf = itoa::Buffer::new();
    let now_text = now_buf.format(now_ms);

    // Only the first progress line is consulted; an unparsable value counts as 0.
    let recorded_progress = current
        .lines()
        .find_map(|line| line.strip_prefix(PROGRESS_KEY))
        .and_then(|raw| raw.trim().parse::<i64>().ok())
        .unwrap_or(0);
    let advance_progress = last_progress_at_ms > recorded_progress;
    let mut progress_buf = itoa::Buffer::new();
    let progress_text = if advance_progress {
        progress_buf.format(last_progress_at_ms)
    } else {
        ""
    };

    let mut saw_updated_at = false;
    let mut saw_progress = false;
    let mut refreshed = String::with_capacity(current.len() + 64);
    for line in current.lines() {
        if line.starts_with(UPDATED_AT_KEY) {
            saw_updated_at = true;
            refreshed.push_str(UPDATED_AT_KEY);
            refreshed.push_str(now_text);
        } else if advance_progress && line.starts_with(PROGRESS_KEY) {
            saw_progress = true;
            refreshed.push_str(PROGRESS_KEY);
            refreshed.push_str(progress_text);
        } else {
            // Everything else, including a progress line we are not advancing,
            // is copied through untouched.
            refreshed.push_str(line);
        }
        refreshed.push('\n');
    }

    // Append the keys that the existing file did not carry.
    if !saw_updated_at {
        refreshed.push_str(UPDATED_AT_KEY);
        refreshed.push_str(now_text);
        refreshed.push('\n');
    }
    if advance_progress && !saw_progress {
        refreshed.push_str(PROGRESS_KEY);
        refreshed.push_str(progress_text);
        refreshed.push('\n');
    }

    write_index_run_lock_heartbeat_in_place(&lock_path, &refreshed)
}
/// Replaces the content of the existing lock file at `lock_path` with
/// `refreshed`, reusing the same file rather than creating a new one.
///
/// The file is opened for read and write, truncated, rewound, written,
/// flushed and synced, in that order.
///
/// # Errors
/// Returns the first I/O failure, with context naming the failed step and
/// the lock path.
fn write_index_run_lock_heartbeat_in_place(lock_path: &Path, refreshed: &str) -> Result<()> {
    // Every step reports "<action> index-run lock heartbeat <path>".
    let describe = |action: &str| {
        format!(
            "{} index-run lock heartbeat {}",
            action,
            lock_path.display()
        )
    };

    let mut file = OpenOptions::new()
        .read(true)
        .write(true)
        .open(lock_path)
        .with_context(|| describe("opening"))?;
    file.set_len(0).with_context(|| describe("truncating"))?;
    file.rewind().with_context(|| describe("rewinding"))?;
    file.write_all(refreshed.as_bytes())
        .with_context(|| describe("writing"))?;
    file.flush().with_context(|| describe("flushing"))?;
    file.sync_all().with_context(|| describe("syncing"))?;
    Ok(())
}
/// Refreshes the index-run lock heartbeat under `data_dir` without taking a
/// metadata write lock. Outside test builds, dead-code warnings for this
/// function are suppressed.
#[cfg_attr(not(test), allow(dead_code))]
fn heartbeat_index_run_lock(data_dir: &Path) -> Result<()> {
heartbeat_index_run_lock_with_lock(data_dir, None)
}
/// Returns the heartbeat interval for the index-run lock.
///
/// Reads `CASS_INDEX_RUN_LOCK_HEARTBEAT_EVERY_MS` through `dotenvy::var`. A
/// missing, unparsable, or zero value falls back to 1000 milliseconds.
fn index_run_lock_heartbeat_interval() -> Duration {
    const DEFAULT_INTERVAL_MS: u64 = 1_000;
    let configured_ms = match dotenvy::var("CASS_INDEX_RUN_LOCK_HEARTBEAT_EVERY_MS") {
        Ok(raw) => raw.parse::<u64>().ok().filter(|millis| *millis > 0),
        Err(_) => None,
    };
    Duration::from_millis(configured_ms.unwrap_or(DEFAULT_INTERVAL_MS))
}
/// Maps the tool-role flag to the role label handed to the noise filter:
/// `Some("tool")` for tool messages, `None` otherwise.
fn lexical_rebuild_noise_role(is_tool_role: bool) -> Option<&'static str> {
    if is_tool_role { Some("tool") } else { None }
}
/// Derives packet provenance for a canonical conversation row.
///
/// When `source_map` has an entry for the row's `source_id`, its kind and
/// host label are used and the mode is `SourceMapLookup`. Otherwise the kind
/// falls back to `Local` if the id equals `LOCAL_SOURCE_ID` and to `Ssh` if
/// not. The mode is `HostFallback` only when the source id is blank and the
/// row carries a non-blank origin host; in all other cases it is
/// `ConversationFields`.
///
/// The final source id, origin kind and origin host are produced by the
/// `crate::search::tantivy::normalized_index_*` helpers.
fn lexical_rebuild_packet_provenance_from_canonical(
    conv: &crate::storage::sqlite::LexicalRebuildConversationRow,
    source_map: &HashMap<String, (SourceKind, Option<String>)>,
) -> (
    LexicalRebuildPacketProvenance,
    LexicalRebuildPacketProvenanceMode,
) {
    let (kind, host_label, mode) = match source_map.get(&conv.source_id).cloned() {
        Some((kind, host_label)) => (
            kind,
            host_label,
            LexicalRebuildPacketProvenanceMode::SourceMapLookup,
        ),
        None => {
            let kind = if conv.source_id == LOCAL_SOURCE_ID {
                SourceKind::Local
            } else {
                SourceKind::Ssh
            };
            let source_id_is_blank = conv.source_id.trim().is_empty();
            let has_origin_host = conv
                .origin_host
                .as_deref()
                .is_some_and(|host| !host.trim().is_empty());
            let mode = if source_id_is_blank && has_origin_host {
                LexicalRebuildPacketProvenanceMode::HostFallback
            } else {
                LexicalRebuildPacketProvenanceMode::ConversationFields
            };
            (kind, None, mode)
        }
    };

    // The row's own origin host wins over the host label from the source map.
    let preferred_host = conv.origin_host.as_deref().or(host_label.as_deref());
    let origin_host = crate::search::tantivy::normalized_index_origin_host(preferred_host);
    let source_id = crate::search::tantivy::normalized_index_source_id(
        Some(&conv.source_id),
        Some(kind.as_str()),
        origin_host.as_deref(),
    );
    let origin_kind =
        crate::search::tantivy::normalized_index_origin_kind(&source_id, Some(kind.as_str()));

    let provenance = LexicalRebuildPacketProvenance {
        source_id,
        origin_kind,
        origin_host,
    };
    (provenance, mode)
}
/// Test-only: derives packet provenance from the `cass.origin` object inside
/// a normalized conversation's metadata.
///
/// The mode is `MetadataFields` when either the raw `source_id` or the raw
/// `kind` is present and non-blank, `HostFallback` when only a normalized
/// origin host is available, and `LocalDefault` otherwise.
#[cfg(test)]
fn lexical_rebuild_packet_provenance_from_metadata(
    conv: &NormalizedConversation,
) -> (
    LexicalRebuildPacketProvenance,
    LexicalRebuildPacketProvenanceMode,
) {
    let cass_origin = conv
        .metadata
        .get("cass")
        .and_then(|cass| cass.get("origin"));
    // Reads one string field from the origin object, if both exist.
    let origin_text = |key: &'static str| {
        cass_origin
            .and_then(|origin| origin.get(key))
            .and_then(|value| value.as_str())
    };
    let has_text = |value: Option<&str>| value.is_some_and(|text| !text.trim().is_empty());

    let raw_source_id = origin_text("source_id");
    let raw_origin_kind = origin_text("kind");
    let origin_host = crate::search::tantivy::normalized_index_origin_host(origin_text("host"));
    let source_id = crate::search::tantivy::normalized_index_source_id(
        raw_source_id,
        raw_origin_kind,
        origin_host.as_deref(),
    );
    let origin_kind =
        crate::search::tantivy::normalized_index_origin_kind(&source_id, raw_origin_kind);

    let mode = if has_text(raw_source_id) || has_text(raw_origin_kind) {
        LexicalRebuildPacketProvenanceMode::MetadataFields
    } else if origin_host.is_some() {
        LexicalRebuildPacketProvenanceMode::HostFallback
    } else {
        LexicalRebuildPacketProvenanceMode::LocalDefault
    };

    let provenance = LexicalRebuildPacketProvenance {
        source_id,
        origin_kind,
        origin_host,
    };
    (provenance, mode)
}
/// Test-only: converts a normalized message into a grouped rebuild row.
///
/// The tool flag is set when the message role equals `"tool"`, ignoring
/// ASCII case. The content is cloned.
#[cfg(test)]
fn lexical_rebuild_grouped_message_from_normalized(
    message: &NormalizedMessage,
) -> crate::storage::sqlite::LexicalRebuildGroupedMessageRow {
    let is_tool_role = message.role.eq_ignore_ascii_case("tool");
    let content = message.content.clone();
    crate::storage::sqlite::LexicalRebuildGroupedMessageRow {
        idx: message.idx,
        is_tool_role,
        created_at: message.created_at,
        content,
    }
}
fn lexical_rebuild_contract_provenance(
provenance: &LexicalRebuildPacketProvenance,
) -> ConversationPacketProvenance {
ConversationPacketProvenance {
source_id: provenance.source_id.clone(),
origin_kind: provenance.origin_kind.clone(),
origin_host: provenance.origin_host.clone(),
}
}
/// Builds a conversation packet contract from grouped rebuild message rows.
///
/// Each row becomes a canonical message with no id, no author, null
/// `extra_json` and no snippets. The role is `Tool` when the row is flagged
/// as a tool message and `Agent` otherwise. The result is then passed to
/// `lexical_rebuild_contract_from_canonical_messages`.
fn lexical_rebuild_contract_from_grouped_messages(
    conversation: &crate::storage::sqlite::LexicalRebuildConversationRow,
    provenance: &LexicalRebuildPacketProvenance,
    messages: &crate::storage::sqlite::LexicalRebuildGroupedMessageRows,
) -> ConversationPacket {
    use crate::model::types::{Message, MessageRole};

    let rows = messages.iter();
    let mut canonical_messages = Vec::with_capacity(rows.size_hint().0);
    for row in rows {
        let role = if row.is_tool_role {
            MessageRole::Tool
        } else {
            MessageRole::Agent
        };
        canonical_messages.push(Message {
            id: None,
            idx: row.idx,
            role,
            author: None,
            created_at: row.created_at,
            content: row.content.clone(),
            extra_json: serde_json::Value::Null,
            snippets: Vec::new(),
        });
    }
    lexical_rebuild_contract_from_canonical_messages(conversation, provenance, canonical_messages)
}
/// Builds a conversation packet contract from a conversation row and its
/// canonical messages.
///
/// A temporary canonical `Conversation` is assembled from the row, with no
/// token estimate, null metadata, and the source id and origin host taken
/// from `provenance`. It is then replayed through
/// `ConversationPacket::from_canonical_replay`.
fn lexical_rebuild_contract_from_canonical_messages(
    conversation: &crate::storage::sqlite::LexicalRebuildConversationRow,
    provenance: &LexicalRebuildPacketProvenance,
    messages: Vec<crate::model::types::Message>,
) -> ConversationPacket {
    use crate::model::types::Conversation;

    let replay_source = Conversation {
        id: conversation.id,
        agent_slug: conversation.agent_slug.clone(),
        workspace: conversation.workspace.clone(),
        external_id: conversation.external_id.clone(),
        title: conversation.title.clone(),
        source_path: conversation.source_path.clone(),
        started_at: conversation.started_at,
        ended_at: conversation.ended_at,
        approx_tokens: None,
        metadata_json: serde_json::Value::Null,
        messages,
        source_id: provenance.source_id.clone(),
        origin_host: provenance.origin_host.clone(),
    };
    let contract_provenance = lexical_rebuild_contract_provenance(provenance);
    ConversationPacket::from_canonical_replay(&replay_source, contract_provenance)
}
impl LexicalRebuildConversationPacket {
/// Builds a packet from a canonical conversation row and message rows that
/// are already grouped.
///
/// Provenance is resolved through `source_map`, the contract is derived
/// from the grouped rows, and `last_message_id` is stored as given.
fn from_canonical_replay(
conversation: crate::storage::sqlite::LexicalRebuildConversationRow,
messages: crate::storage::sqlite::LexicalRebuildGroupedMessageRows,
last_message_id: Option<i64>,
source_map: &HashMap<String, (SourceKind, Option<String>)>,
) -> Self {
let (provenance, provenance_mode) =
lexical_rebuild_packet_provenance_from_canonical(&conversation, source_map);
let contract =
lexical_rebuild_contract_from_grouped_messages(&conversation, &provenance, &messages);
Self::from_canonical_replay_parts(
conversation,
messages,
last_message_id,
provenance,
provenance_mode,
contract,
)
}
/// Builds a packet from a canonical conversation row and full canonical
/// messages.
///
/// The contract is built from a clone of `messages`; the messages are then
/// reduced to grouped rows. `last_message_id` becomes the largest message id
/// seen (never below 0), or `None` when there are no messages.
///
/// # Errors
/// Fails if any message has no id.
fn from_canonical_replay_messages(
conversation: crate::storage::sqlite::LexicalRebuildConversationRow,
messages: Vec<crate::model::types::Message>,
source_map: &HashMap<String, (SourceKind, Option<String>)>,
) -> Result<Self> {
let (provenance, provenance_mode) =
lexical_rebuild_packet_provenance_from_canonical(&conversation, source_map);
let contract = lexical_rebuild_contract_from_canonical_messages(
&conversation,
&provenance,
messages.clone(),
);
let mut grouped_rows = crate::storage::sqlite::LexicalRebuildGroupedMessageRows::new();
grouped_rows.reserve(messages.len());
let mut last_message_id = None;
for message in messages {
let message_id = message.id.ok_or_else(|| {
anyhow::anyhow!(
"lexical rebuild batch fetch returned message without id for conversation {}",
conversation.id.unwrap_or_default()
)
})?;
// Running maximum of the ids seen so far, starting from 0.
last_message_id = Some(last_message_id.unwrap_or(0).max(message_id));
grouped_rows.push(crate::storage::sqlite::LexicalRebuildGroupedMessageRow {
idx: message.idx,
is_tool_role: matches!(message.role, crate::model::types::MessageRole::Tool),
created_at: message.created_at,
content: message.content,
});
}
Ok(Self::from_canonical_replay_parts(
conversation,
grouped_rows,
last_message_id,
provenance,
provenance_mode,
contract,
))
}
/// Assembles a canonical-replay packet from parts that are already computed.
///
/// `message_count` comes from the contract payload and `message_bytes` from
/// the contract's lexical projection. `flow_reservation_bytes` starts at 0.
fn from_canonical_replay_parts(
conversation: crate::storage::sqlite::LexicalRebuildConversationRow,
messages: crate::storage::sqlite::LexicalRebuildGroupedMessageRows,
last_message_id: Option<i64>,
provenance: LexicalRebuildPacketProvenance,
provenance_mode: LexicalRebuildPacketProvenanceMode,
contract: ConversationPacket,
) -> Self {
let message_count = contract.payload.messages.len();
let message_bytes = contract.projections.lexical.total_content_bytes;
let contract_hashes = contract.hashes;
let contract_projections = contract.projections;
Self {
diagnostics: LexicalRebuildPacketDiagnostics {
version: LEXICAL_REBUILD_PACKET_VERSION,
source: LexicalRebuildPacketSource::CanonicalReplay,
provenance_mode,
missing_conversation_id: conversation.id.is_none(),
},
identity: LexicalRebuildPacketIdentity {
conversation_id: conversation.id,
external_id: conversation.external_id,
agent: conversation.agent_slug,
workspace: conversation
.workspace
.as_ref()
.map(|workspace| workspace.to_string_lossy().to_string()),
source_path: conversation.source_path.to_string_lossy().to_string(),
title: conversation.title,
started_at: conversation.started_at,
ended_at: conversation.ended_at,
},
provenance,
contract_hashes,
contract_projections,
messages,
message_count,
message_bytes,
flow_reservation_bytes: 0,
last_message_id,
}
}
/// Returns a borrowed view of the fields that feed the packet fingerprint.
fn fingerprint_input(&self) -> LexicalRebuildPacketFingerprintInput<'_> {
LexicalRebuildPacketFingerprintInput {
version: self.diagnostics.version,
agent: self.identity.agent.as_str(),
external_id: self.identity.external_id.as_deref(),
workspace: self.identity.workspace.as_deref(),
source_path: self.identity.source_path.as_str(),
title: self.identity.title.as_deref(),
started_at: self.identity.started_at,
ended_at: self.identity.ended_at,
source_id: self.provenance.source_id.as_str(),
origin_kind: self.provenance.origin_kind.as_str(),
origin_host: self.provenance.origin_host.as_deref(),
lexical_projected_content_bytes: self.contract_projections.lexical.total_content_bytes,
messages: &self.messages,
message_count: self.message_count,
message_bytes: self.message_bytes,
}
}
/// Builds borrowed lexical documents for the messages selected by the
/// contract's lexical projection.
///
/// Returns an empty list when the packet has no conversation id. Projection
/// indices with no matching message are skipped, as are messages rejected
/// by `is_hard_message_noise`.
fn prebuilt_docs(&self) -> Vec<frankensearch::lexical::CassDocumentRef<'_>> {
let Some(conversation_id) = self.identity.conversation_id else {
return Vec::new();
};
let mut docs = Vec::with_capacity(self.contract_projections.lexical.message_indices.len());
for message_index in &self.contract_projections.lexical.message_indices {
let Some(message) = self.messages.get(*message_index) else {
continue;
};
if is_hard_message_noise(
lexical_rebuild_noise_role(message.is_tool_role),
&message.content,
) {
continue;
}
docs.push(frankensearch::lexical::CassDocumentRef {
agent: self.identity.agent.as_str(),
workspace: self.identity.workspace.as_deref(),
workspace_original: None,
source_path: self.identity.source_path.as_str(),
// Negative message indices are clamped to 0 before the cast.
msg_idx: message.idx.max(0) as u64,
// Fall back to the conversation start time when the message has no
// timestamp.
created_at: message.created_at.or(self.identity.started_at),
title: self.identity.title.as_deref(),
content: message.content.as_str(),
source_id: self.provenance.source_id.as_str(),
origin_kind: self.provenance.origin_kind.as_str(),
origin_host: self.provenance.origin_host.as_deref(),
conversation_id: Some(conversation_id),
});
}
docs
}
/// Test-only: builds a packet directly from a normalized conversation.
///
/// The packet has no conversation id (`missing_conversation_id` is `true`),
/// no `last_message_id`, and its provenance comes from the conversation
/// metadata.
#[cfg(test)]
fn from_normalized_conversation(conv: &NormalizedConversation) -> Self {
let (provenance, provenance_mode) = lexical_rebuild_packet_provenance_from_metadata(conv);
let contract = ConversationPacket::from_normalized_conversation(
conv,
lexical_rebuild_contract_provenance(&provenance),
);
let messages = conv
.messages
.iter()
.map(lexical_rebuild_grouped_message_from_normalized)
.collect::<crate::storage::sqlite::LexicalRebuildGroupedMessageRows>();
let message_count = contract.payload.messages.len();
let message_bytes = contract.projections.lexical.total_content_bytes;
let contract_hashes = contract.hashes;
let contract_projections = contract.projections;
Self {
diagnostics: LexicalRebuildPacketDiagnostics {
version: LEXICAL_REBUILD_PACKET_VERSION,
source: LexicalRebuildPacketSource::NormalizedConversation,
provenance_mode,
missing_conversation_id: true,
},
identity: LexicalRebuildPacketIdentity {
conversation_id: None,
external_id: conv.external_id.clone(),
agent: conv.agent_slug.clone(),
workspace: conv
.workspace
.as_ref()
.map(|workspace| workspace.to_string_lossy().to_string()),
source_path: conv.source_path.to_string_lossy().to_string(),
title: conv.title.clone(),
started_at: conv.started_at,
ended_at: conv.ended_at,
},
provenance,
contract_hashes,
contract_projections,
messages,
message_count,
message_bytes,
flow_reservation_bytes: 0,
last_message_id: None,
}
}
/// Test-only: returns an owned, comparable snapshot of the packet. It
/// leaves out the conversation id, diagnostics other than the version,
/// contract hashes, flow reservation and last message id.
#[cfg(test)]
fn semantic_view(&self) -> LexicalRebuildPacketSemanticView {
LexicalRebuildPacketSemanticView {
version: self.diagnostics.version,
agent: self.identity.agent.clone(),
external_id: self.identity.external_id.clone(),
workspace: self.identity.workspace.clone(),
source_path: self.identity.source_path.clone(),
title: self.identity.title.clone(),
started_at: self.identity.started_at,
ended_at: self.identity.ended_at,
source_id: self.provenance.source_id.clone(),
origin_kind: self.provenance.origin_kind.clone(),
origin_host: self.provenance.origin_host.clone(),
contract_projections: self.contract_projections.clone(),
messages: self.messages.clone(),
message_count: self.message_count,
message_bytes: self.message_bytes,
}
}
}
/// Serializable summary produced by the lexical rebuild equivalence
/// accumulator.
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
pub(crate) struct LexicalRebuildEquivalenceEvidence {
/// Total number of documents absorbed.
pub document_count: u64,
/// Hex-encoded hash over all absorbed packets and their documents.
pub manifest_fingerprint: String,
/// Hex-encoded hash combining every probe, its hit count and its hit hash.
pub golden_query_digest: String,
/// Hit count per probe, in probe order.
pub golden_query_hit_counts: Vec<LexicalRebuildEquivalenceGoldenHit>,
}
/// Hit count recorded for a single probe string.
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
pub(crate) struct LexicalRebuildEquivalenceGoldenHit {
/// The probe substring that was searched for.
pub probe: String,
/// Number of documents in which the probe was found.
pub hit_count: u64,
}
/// Probe substrings used when an equivalence accumulator is created with
/// `new()`.
const LEXICAL_REBUILD_EQUIVALENCE_DEFAULT_PROBES: &[&str] =
&["error", "TODO", "function", "import", "test"];
/// Streaming accumulator that fingerprints rebuild packets and counts probe
/// hits.
struct LexicalRebuildEquivalenceAccumulator {
/// Running count of documents absorbed so far.
document_count: u64,
/// Hasher fed with every packet and document.
manifest_hasher: blake3::Hasher,
/// Probe substrings; same length and order as the two vectors below.
probes: Vec<String>,
/// One hasher per probe, fed with each matching document.
probe_hashers: Vec<blake3::Hasher>,
/// One hit counter per probe.
probe_counts: Vec<u64>,
}
impl LexicalRebuildEquivalenceAccumulator {
/// Creates an accumulator that uses the default probe list.
fn new() -> Self {
Self::with_probes(
LEXICAL_REBUILD_EQUIVALENCE_DEFAULT_PROBES
.iter()
.map(|probe| (*probe).to_string()),
)
}
/// Creates an accumulator for the given probes, with one fresh hasher and
/// a zeroed counter per probe.
fn with_probes<I>(probes: I) -> Self
where
I: IntoIterator<Item = String>,
{
let probes: Vec<String> = probes.into_iter().collect();
let probe_hashers = probes.iter().map(|_| blake3::Hasher::new()).collect();
let probe_counts = vec![0_u64; probes.len()];
Self {
document_count: 0,
manifest_hasher: blake3::Hasher::new(),
probes,
probe_hashers,
probe_counts,
}
}
/// Folds one packet into the running fingerprint and probe statistics.
///
/// The packet's fingerprint fields are hashed first, then every document
/// from `prebuilt_docs()`. A probe counts as a hit on a document when the
/// probe string is a substring of its content, title, workspace or source
/// path. The order in which bytes are fed to the hashers determines the
/// resulting digests.
fn absorb_packet(&mut self, packet: &LexicalRebuildConversationPacket) {
let fingerprint = packet.fingerprint_input();
// Packet-level section, introduced by the "pkt" tag.
self.manifest_hasher.update(b"pkt");
self.manifest_hasher
.update(&fingerprint.version.to_le_bytes());
lexical_rebuild_equivalence_update_opt_str(
&mut self.manifest_hasher,
Some(fingerprint.agent),
);
lexical_rebuild_equivalence_update_opt_str(
&mut self.manifest_hasher,
fingerprint.external_id,
);
lexical_rebuild_equivalence_update_opt_str(
&mut self.manifest_hasher,
fingerprint.workspace,
);
lexical_rebuild_equivalence_update_opt_str(
&mut self.manifest_hasher,
Some(fingerprint.source_path),
);
lexical_rebuild_equivalence_update_opt_str(&mut self.manifest_hasher, fingerprint.title);
// Missing timestamps are hashed as `i64::MIN`.
self.manifest_hasher
.update(&fingerprint.started_at.unwrap_or(i64::MIN).to_le_bytes());
self.manifest_hasher
.update(&fingerprint.ended_at.unwrap_or(i64::MIN).to_le_bytes());
lexical_rebuild_equivalence_update_opt_str(
&mut self.manifest_hasher,
Some(fingerprint.source_id),
);
lexical_rebuild_equivalence_update_opt_str(
&mut self.manifest_hasher,
Some(fingerprint.origin_kind),
);
lexical_rebuild_equivalence_update_opt_str(
&mut self.manifest_hasher,
fingerprint.origin_host,
);
self.manifest_hasher
.update(&(fingerprint.lexical_projected_content_bytes as u64).to_le_bytes());
self.manifest_hasher
.update(&(fingerprint.message_count as u64).to_le_bytes());
self.manifest_hasher
.update(&(fingerprint.message_bytes as u64).to_le_bytes());
let docs = packet.prebuilt_docs();
self.document_count = self.document_count.saturating_add(docs.len() as u64);
for doc in &docs {
// Document-level section, introduced by the "doc" tag.
self.manifest_hasher.update(b"doc");
lexical_rebuild_equivalence_update_opt_str(&mut self.manifest_hasher, Some(doc.agent));
lexical_rebuild_equivalence_update_opt_str(&mut self.manifest_hasher, doc.workspace);
lexical_rebuild_equivalence_update_opt_str(
&mut self.manifest_hasher,
Some(doc.source_path),
);
self.manifest_hasher.update(&doc.msg_idx.to_le_bytes());
self.manifest_hasher
.update(&doc.created_at.unwrap_or(i64::MIN).to_le_bytes());
lexical_rebuild_equivalence_update_opt_str(&mut self.manifest_hasher, doc.title);
// Content is length-prefixed before its bytes are hashed.
self.manifest_hasher
.update(&(doc.content.len() as u64).to_le_bytes());
self.manifest_hasher.update(doc.content.as_bytes());
lexical_rebuild_equivalence_update_opt_str(
&mut self.manifest_hasher,
Some(doc.source_id),
);
lexical_rebuild_equivalence_update_opt_str(
&mut self.manifest_hasher,
Some(doc.origin_kind),
);
lexical_rebuild_equivalence_update_opt_str(&mut self.manifest_hasher, doc.origin_host);
for ((probe, hasher), count) in self
.probes
.iter()
.zip(self.probe_hashers.iter_mut())
.zip(self.probe_counts.iter_mut())
{
let probe_str = probe.as_str();
// Plain case-sensitive substring match across the searchable fields.
let hit = doc.content.contains(probe_str)
|| doc
.title
.map(|value| value.contains(probe_str))
.unwrap_or(false)
|| doc
.workspace
.map(|value| value.contains(probe_str))
.unwrap_or(false)
|| doc.source_path.contains(probe_str);
if hit {
*count = count.saturating_add(1);
hasher.update(b"hit");
lexical_rebuild_equivalence_update_opt_str(hasher, Some(doc.source_path));
hasher.update(&doc.msg_idx.to_le_bytes());
hasher.update(&doc.created_at.unwrap_or(i64::MIN).to_le_bytes());
hasher.update(&(doc.content.len() as u64).to_le_bytes());
hasher.update(doc.content.as_bytes());
}
}
}
}
/// Consumes the accumulator and produces the final evidence record.
///
/// The manifest fingerprint is the hex digest of the manifest hasher. The
/// golden query digest hashes, for each probe in order, a "probe" tag, the
/// probe's length and bytes, its hit count, and its per-probe hit digest.
fn finalize(self) -> LexicalRebuildEquivalenceEvidence {
let manifest_fingerprint = self.manifest_hasher.finalize().to_hex().to_string();
let mut combined = blake3::Hasher::new();
let mut golden_query_hit_counts = Vec::with_capacity(self.probes.len());
for ((probe, hasher), count) in self
.probes
.into_iter()
.zip(self.probe_hashers)
.zip(self.probe_counts)
{
combined.update(b"probe");
combined.update(&(probe.len() as u64).to_le_bytes());
combined.update(probe.as_bytes());
combined.update(&count.to_le_bytes());
combined.update(hasher.finalize().as_bytes());
golden_query_hit_counts.push(LexicalRebuildEquivalenceGoldenHit {
probe,
hit_count: count,
});
}
LexicalRebuildEquivalenceEvidence {
document_count: self.document_count,
manifest_fingerprint,
golden_query_digest: combined.finalize().to_hex().to_string(),
golden_query_hit_counts,
}
}
}
/// Feeds an optional string into `hasher` in a self-delimiting form.
///
/// `None` contributes the single byte `0x00`. `Some(s)` contributes `0x01`,
/// the byte length of `s` as a little-endian `u64`, and then the bytes of `s`.
fn lexical_rebuild_equivalence_update_opt_str(hasher: &mut blake3::Hasher, value: Option<&str>) {
    let Some(text) = value else {
        hasher.update(&[0x00_u8]);
        return;
    };
    hasher.update(&[0x01_u8]);
    hasher.update(&(text.len() as u64).to_le_bytes());
    hasher.update(text.as_bytes());
}
/// Input for preparing one lexical rebuild packet.
#[derive(Debug)]
struct LexicalRebuildPacketPrepInput {
/// The canonical conversation row.
conversation: crate::storage::sqlite::LexicalRebuildConversationRow,
/// Canonical messages fetched for the conversation; `None` when none were
/// supplied for it.
messages: Option<Vec<crate::model::types::Message>>,
}
fn prepare_lexical_rebuild_packet_from_canonical(
input: LexicalRebuildPacketPrepInput,
source_map: &HashMap<String, (SourceKind, Option<String>)>,
) -> Result<LexicalRebuildConversationPacket> {
if let Some(messages) = input.messages {
return LexicalRebuildConversationPacket::from_canonical_replay_messages(
input.conversation,
messages,
source_map,
);
}
Ok(LexicalRebuildConversationPacket::from_canonical_replay(
input.conversation,
crate::storage::sqlite::LexicalRebuildGroupedMessageRows::new(),
None,
source_map,
))
}
/// Prepares one packet per conversation in `conversation_page`.
///
/// Each conversation takes ownership of its message list by removing it from
/// `grouped_messages`, keyed by conversation id. Conversations without an id
/// or without an entry get no messages. Packets are prepared on
/// `lexical_rebuild_worker_pool` in parallel when a pool is given, otherwise
/// sequentially on the calling thread.
///
/// # Errors
/// Returns an error if preparing any packet fails.
fn prepare_lexical_rebuild_packet_batch(
    conversation_page: Vec<crate::storage::sqlite::LexicalRebuildConversationRow>,
    mut grouped_messages: HashMap<i64, Vec<crate::model::types::Message>>,
    source_map: &HashMap<String, (SourceKind, Option<String>)>,
    lexical_rebuild_worker_pool: Option<&ThreadPool>,
) -> Result<Vec<LexicalRebuildConversationPacket>> {
    let mut inputs = Vec::with_capacity(conversation_page.len());
    for conversation in conversation_page {
        let messages = match conversation.id {
            Some(conversation_id) => grouped_messages.remove(&conversation_id),
            None => None,
        };
        inputs.push(LexicalRebuildPacketPrepInput {
            conversation,
            messages,
        });
    }

    let prepare = |input| prepare_lexical_rebuild_packet_from_canonical(input, source_map);
    if let Some(pool) = lexical_rebuild_worker_pool {
        pool.install(|| {
            inputs
                .into_par_iter()
                .map(prepare)
                .collect::<Result<Vec<_>>>()
        })
    } else {
        inputs.into_iter().map(prepare).collect::<Result<Vec<_>>>()
    }
}
/// Splits `reserved_bytes` across `packets` in proportion to each packet's
/// `message_bytes`, writing the shares into `flow_reservation_bytes`.
///
/// Nothing happens when there are no packets or nothing is reserved. If
/// every packet has zero message bytes, the first packet receives the whole
/// reservation. Otherwise each packet except the last receives its
/// proportional share of what is still unassigned, and the last packet
/// receives the remainder, so the shares always add up to `reserved_bytes`.
fn assign_lexical_rebuild_flow_reservation_bytes(
    packets: &mut [LexicalRebuildConversationPacket],
    reserved_bytes: usize,
) {
    if reserved_bytes == 0 || packets.is_empty() {
        return;
    }
    let total_message_bytes = packets
        .iter()
        .map(|packet| packet.message_bytes)
        .sum::<usize>();
    if total_message_bytes == 0 {
        packets[0].flow_reservation_bytes = reserved_bytes;
        return;
    }

    let Some((final_packet, leading_packets)) = packets.split_last_mut() else {
        return;
    };
    let mut unassigned_reserved = reserved_bytes;
    let mut unassigned_message_bytes = total_message_bytes;
    for packet in leading_packets {
        let share = if unassigned_message_bytes == 0 || packet.message_bytes == 0 {
            0
        } else {
            packet.message_bytes.saturating_mul(unassigned_reserved) / unassigned_message_bytes
        };
        packet.flow_reservation_bytes = share;
        unassigned_reserved = unassigned_reserved.saturating_sub(share);
        unassigned_message_bytes = unassigned_message_bytes.saturating_sub(packet.message_bytes);
    }
    // The last packet absorbs the rounding remainder.
    final_packet.flow_reservation_bytes = unassigned_reserved;
}
/// Guard over bytes reserved on a streaming byte limiter for one batch.
///
/// The reservation is handed back to the limiter at most once: either by an
/// explicit `release_now` call or when the guard is dropped.
struct StreamingBatchFlowReservation<'a> {
    /// Limiter the bytes are returned to, if any.
    limiter: Option<&'a StreamingByteLimiter>,
    /// Bytes still held; reset to 0 once released.
    reserved_bytes: usize,
}

impl<'a> StreamingBatchFlowReservation<'a> {
    /// Creates a guard for `reserved_bytes` held on `limiter`.
    fn new(limiter: Option<&'a StreamingByteLimiter>, reserved_bytes: usize) -> Self {
        Self {
            limiter,
            reserved_bytes,
        }
    }

    /// Returns any outstanding bytes to the limiter and zeroes the guard, so
    /// repeated calls (including the one made on drop) release nothing more.
    fn release_now(&mut self) {
        let outstanding = std::mem::take(&mut self.reserved_bytes);
        match self.limiter {
            Some(limiter) if outstanding != 0 => {
                limiter.release(outstanding);
            }
            _ => {}
        }
    }
}

impl Drop for StreamingBatchFlowReservation<'_> {
    /// Releases whatever is still reserved.
    fn drop(&mut self) {
        self.release_now();
    }
}
/// Test-only owned snapshot of a rebuild packet, comparable with `==`.
/// Produced by `LexicalRebuildConversationPacket::semantic_view`.
#[cfg(test)]
#[derive(Debug, Clone, PartialEq, Eq)]
struct LexicalRebuildPacketSemanticView {
/// Packet format version copied from the packet diagnostics.
version: u32,
agent: String,
external_id: Option<String>,
workspace: Option<String>,
source_path: String,
title: Option<String>,
started_at: Option<i64>,
ended_at: Option<i64>,
source_id: String,
origin_kind: String,
origin_host: Option<String>,
/// Clone of the packet's contract projections.
contract_projections: ConversationPacketSinkProjections,
/// Clone of the packet's grouped message rows.
messages: crate::storage::sqlite::LexicalRebuildGroupedMessageRows,
message_count: usize,
message_bytes: usize,
}
/// Indexes the pending streamed batch and resets the batch bookkeeping.
///
/// Does nothing when `pending_batch` is empty. Otherwise it:
/// 1. takes over the batch's flow reservation (the sum of each packet's
///    `flow_reservation_bytes`), so the bytes are released even if indexing
///    fails,
/// 2. builds the prebuilt document refs, using the worker pool if given,
/// 3. adds them to `t_index` and bumps `indexed_docs`,
/// 4. updates the optional perf profile and the since-commit counters,
/// 5. releases the flow reservation, logs the flush, clears the batch, zeroes
///    the pending counters and restores `current_batch_conversation_limit`
///    to `batch_conversation_limit`.
///
/// The prepare and add phases are timed separately: each duration is
/// captured as soon as its phase finishes, so `prepare_duration` no longer
/// includes the time spent adding documents to the index.
///
/// # Errors
/// Propagates a failure from adding documents to the index. In that case the
/// batch is left untouched and the flow reservation is released on drop.
#[allow(clippy::too_many_arguments)]
fn flush_streamed_lexical_rebuild_batch(
    pending_batch: &mut Vec<LexicalRebuildConversationPacket>,
    pending_batch_message_count: &mut usize,
    pending_batch_message_bytes: &mut usize,
    lexical_rebuild_flow_limiter: Option<&StreamingByteLimiter>,
    lexical_rebuild_worker_pool: Option<&ThreadPool>,
    t_index: &mut TantivyIndex,
    indexed_docs: &mut usize,
    messages_since_commit: &mut usize,
    message_bytes_since_commit: &mut usize,
    current_batch_conversation_limit: &mut usize,
    batch_conversation_limit: usize,
    page_size: i64,
    perf_profile: Option<&mut LexicalRebuildPerfProfile>,
) -> Result<()> {
    if pending_batch.is_empty() {
        return Ok(());
    }
    let batch_conversations = pending_batch.len();
    let chunk_message_count = *pending_batch_message_count;
    let chunk_message_bytes = *pending_batch_message_bytes;
    let chunk_missing_conversation_ids = pending_batch
        .iter()
        .filter(|packet| packet.diagnostics.missing_conversation_id)
        .count();
    let chunk_last_message_id = pending_batch
        .iter()
        .filter_map(|packet| packet.last_message_id)
        .max();
    let chunk_flow_reservation_bytes = pending_batch
        .iter()
        .map(|packet| packet.flow_reservation_bytes)
        .sum::<usize>();
    // Guard releases the reserved bytes on every exit path, including `?`.
    let mut flow_reservation = StreamingBatchFlowReservation::new(
        lexical_rebuild_flow_limiter,
        chunk_flow_reservation_bytes,
    );
    let chunk_limit = *current_batch_conversation_limit;

    // Timers only run when a profile was supplied.
    let prepare_started = perf_profile.as_ref().map(|_| Instant::now());
    let prepared_docs =
        lexical_rebuild_prepare_prebuilt_doc_refs(pending_batch, lexical_rebuild_worker_pool);
    // Capture now, before the add phase starts, so the two do not overlap.
    let prepare_elapsed = prepare_started.map(|started| started.elapsed());

    let add_started = perf_profile.as_ref().map(|_| Instant::now());
    let added_docs = t_index.add_prebuilt_document_refs_slice(&prepared_docs)?;
    let add_elapsed = add_started.map(|started| started.elapsed());
    *indexed_docs = (*indexed_docs).saturating_add(added_docs);

    if let Some(profile) = perf_profile {
        profile.batch_flushes = profile.batch_flushes.saturating_add(1);
        profile.batch_conversations = profile
            .batch_conversations
            .saturating_add(batch_conversations);
        profile.batch_messages = profile.batch_messages.saturating_add(chunk_message_count);
        profile.batch_message_bytes = profile
            .batch_message_bytes
            .saturating_add(chunk_message_bytes);
        if let Some(elapsed) = prepare_elapsed {
            profile.prepare_duration += elapsed;
        }
        if let Some(elapsed) = add_elapsed {
            profile.add_duration += elapsed;
        }
    }
    *messages_since_commit = (*messages_since_commit).saturating_add(chunk_message_count);
    *message_bytes_since_commit = (*message_bytes_since_commit).saturating_add(chunk_message_bytes);
    flow_reservation.release_now();
    tracing::info!(
        page_size,
        packet_version = pending_batch
            .first()
            .map(|packet| packet.diagnostics.version)
            .unwrap_or(LEXICAL_REBUILD_PACKET_VERSION),
        chunk_conversations = batch_conversations,
        chunk_messages = chunk_message_count,
        chunk_message_bytes = chunk_message_bytes,
        chunk_missing_conversation_ids,
        chunk_last_message_id,
        chunk_limit,
        "lexical rebuild flushed a streamed conversation batch"
    );
    pending_batch.clear();
    *pending_batch_message_count = 0;
    *pending_batch_message_bytes = 0;
    *current_batch_conversation_limit = batch_conversation_limit;
    Ok(())
}
/// Gathers the prebuilt document references of every packet in `batch` into a
/// single flat vector.
///
/// Each packet's `prebuilt_docs` call is driven through rayon's `par_iter`.
/// When `lexical_rebuild_worker_pool` is supplied the parallel work is
/// installed on that pool; otherwise it runs in whatever rayon context the
/// caller is already in.
fn lexical_rebuild_prepare_prebuilt_doc_refs<'a>(
    batch: &'a [LexicalRebuildConversationPacket],
    lexical_rebuild_worker_pool: Option<&ThreadPool>,
) -> Vec<frankensearch::lexical::CassDocumentRef<'a>> {
    let collect_shards = || -> Vec<_> {
        batch
            .par_iter()
            .map(LexicalRebuildConversationPacket::prebuilt_docs)
            .collect()
    };
    let shards = match lexical_rebuild_worker_pool {
        Some(pool) => pool.install(collect_shards),
        None => collect_shards(),
    };
    // Reserve the exact total up front so flattening never reallocates.
    let total_docs: usize = shards.iter().map(|shard| shard.len()).sum();
    let mut flattened = Vec::with_capacity(total_docs);
    shards
        .into_iter()
        .for_each(|shard| flattened.extend(shard));
    flattened
}
/// Builds and commits a shard index at `shard_index_path` from `batch` without
/// requesting a specific writer parallelism, returning how many documents
/// were indexed.
#[cfg_attr(not(test), allow(dead_code))]
fn build_lexical_rebuild_shard_index(
    shard_index_path: &Path,
    batch: &[LexicalRebuildConversationPacket],
    lexical_rebuild_worker_pool: Option<&ThreadPool>,
) -> Result<usize> {
    let summary = build_lexical_rebuild_shard_index_summary_with_writer_parallelism(
        shard_index_path,
        batch,
        lexical_rebuild_worker_pool,
        None,
    )?;
    Ok(summary.docs)
}
/// Opens (or creates) the shard index at `shard_index_path`, bulk-loads the
/// prebuilt documents of `batch` into it, commits, and reports the resulting
/// document and segment counts.
///
/// `writer_parallelism`, when present, is forwarded to the index opener;
/// otherwise the default opener is used. Every fallible step attaches the
/// shard path to its error context.
fn build_lexical_rebuild_shard_index_summary_with_writer_parallelism(
    shard_index_path: &Path,
    batch: &[LexicalRebuildConversationPacket],
    lexical_rebuild_worker_pool: Option<&ThreadPool>,
    writer_parallelism: Option<usize>,
) -> Result<SearchableIndexSummary> {
    let opened = match writer_parallelism {
        Some(parallelism) => {
            TantivyIndex::open_or_create_with_writer_parallelism(shard_index_path, parallelism)
        }
        None => TantivyIndex::open_or_create(shard_index_path),
    };
    let mut shard_index = opened.with_context(|| {
        format!(
            "opening lexical rebuild shard index at {}",
            shard_index_path.display()
        )
    })?;
    shard_index.configure_bulk_load_merge_policy();

    let doc_refs = lexical_rebuild_prepare_prebuilt_doc_refs(batch, lexical_rebuild_worker_pool);
    let docs = shard_index
        .add_prebuilt_document_refs_slice(&doc_refs)
        .with_context(|| {
            format!(
                "adding prebuilt lexical rebuild docs into shard index {}",
                shard_index_path.display()
            )
        })?;

    shard_index.commit().with_context(|| {
        format!(
            "committing lexical rebuild shard index at {}",
            shard_index_path.display()
        )
    })?;

    // Segment count is read only after the commit has succeeded.
    let segments = shard_index.segment_count();
    Ok(SearchableIndexSummary { docs, segments })
}
/// One unit of work handed to a staged shard-builder worker thread.
#[derive(Debug)]
struct LexicalRebuildShardBuildWork {
/// Planned shard this work item covers; moved into the build result on success.
shard: LexicalShardPlanShard,
/// Conversation packets whose prebuilt documents are indexed into the shard.
packets: Vec<LexicalRebuildConversationPacket>,
/// Message-byte total for the shard; the worker uses it as the denominator
/// of the amplification ratio.
message_bytes: usize,
/// Directory in which the shard index is opened or created.
shard_index_path: PathBuf,
/// Writer parallelism the worker passes when opening the shard index.
writer_parallelism: usize,
}
/// Outcome of one successful staged shard build, as reported by a builder worker.
#[derive(Debug)]
struct LexicalRebuildShardBuildResult {
/// Planned shard that was built.
shard: LexicalShardPlanShard,
/// Number of documents added to the shard index.
indexed_docs: usize,
/// Segment count reported by the shard index after commit.
segments: usize,
/// Directory holding the built shard index.
shard_index_path: PathBuf,
/// Message-byte total of the shard's input.
message_bytes: usize,
/// Best-effort on-disk size of the shard index directory.
index_size_bytes: u64,
/// Wall-clock build time in milliseconds (saturates at `u64::MAX`).
build_duration_ms: u64,
/// `index_size_bytes * 1000 / message_bytes`; `None` when `message_bytes` is zero.
amplification_milli: Option<u64>,
}
/// Plain-value copy of `LexicalRebuildShardBuildTelemetry` taken at one point in time.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
struct LexicalRebuildShardBuildTelemetrySnapshot {
/// Number of shard builds recorded so far.
completed_jobs: usize,
/// Shard index of the most recently recorded build; `None` until a build is recorded.
last_shard_index: Option<usize>,
/// Message bytes of the most recently recorded build.
last_message_bytes: usize,
/// Index size in bytes of the most recently recorded build.
last_index_size_bytes: u64,
/// Duration in milliseconds of the most recently recorded build.
last_duration_ms: u64,
/// Amplification of the most recently recorded build; `None` when the stored value is zero.
last_amplification_milli: Option<u64>,
/// Largest (floor-clamped) amplification recorded so far; `None` when the stored value is zero.
observed_amplification_milli: Option<u64>,
}
/// Atomic counters describing completed staged shard builds.
///
/// Each field is updated and read independently with relaxed ordering, so a
/// snapshot taken while a build is being recorded may mix values from two builds.
#[derive(Debug, Default)]
struct LexicalRebuildShardBuildTelemetry {
/// Count of recorded builds.
completed_jobs: AtomicUsize,
/// Shard index of the last recorded build (only meaningful once `completed_jobs > 0`).
last_shard_index: AtomicUsize,
/// Message bytes of the last recorded build.
last_message_bytes: AtomicUsize,
/// Index size in bytes of the last recorded build.
last_index_size_bytes: AtomicU64,
/// Duration in milliseconds of the last recorded build.
last_duration_ms: AtomicU64,
/// Amplification of the last recorded build; zero stands for "no value".
last_amplification_milli: AtomicU64,
/// Running maximum of floor-clamped amplification; zero stands for "no value".
observed_amplification_milli: AtomicU64,
}
impl LexicalRebuildShardBuildTelemetry {
    /// Folds one finished shard build into the counters.
    ///
    /// The "last ..." fields are overwritten. The observed amplification only
    /// ever grows: it becomes the maximum of its current value and this
    /// build's amplification clamped up to the configured floor. A build with
    /// no amplification value stores zero as the last amplification and leaves
    /// the observed maximum untouched.
    fn record(&self, result: &LexicalRebuildShardBuildResult) {
        self.completed_jobs.fetch_add(1, Ordering::Relaxed);
        self.last_shard_index
            .store(result.shard.shard_index, Ordering::Relaxed);
        self.last_message_bytes
            .store(result.message_bytes, Ordering::Relaxed);
        self.last_index_size_bytes
            .store(result.index_size_bytes, Ordering::Relaxed);
        self.last_duration_ms
            .store(result.build_duration_ms, Ordering::Relaxed);

        let last_amplification = result.amplification_milli.unwrap_or_default();
        self.last_amplification_milli
            .store(last_amplification, Ordering::Relaxed);

        let Some(amplification_milli) = result.amplification_milli else {
            return;
        };
        let clamped =
            amplification_milli.max(LEXICAL_REBUILD_STAGED_SHARD_BUILD_AMPLIFICATION_FLOOR_MILLI);
        self.observed_amplification_milli
            .fetch_max(clamped, Ordering::Relaxed);
    }

    /// Reads every counter into a plain snapshot.
    ///
    /// `last_shard_index` is reported only once at least one build has been
    /// recorded; the two amplification fields are reported only when non-zero.
    fn snapshot(&self) -> LexicalRebuildShardBuildTelemetrySnapshot {
        let nonzero = |value: u64| (value > 0).then_some(value);

        let completed_jobs = self.completed_jobs.load(Ordering::Relaxed);
        let raw_last_shard_index = self.last_shard_index.load(Ordering::Relaxed);
        let raw_last_amplification = self.last_amplification_milli.load(Ordering::Relaxed);
        let raw_observed_amplification =
            self.observed_amplification_milli.load(Ordering::Relaxed);
        let last_message_bytes = self.last_message_bytes.load(Ordering::Relaxed);
        let last_index_size_bytes = self.last_index_size_bytes.load(Ordering::Relaxed);
        let last_duration_ms = self.last_duration_ms.load(Ordering::Relaxed);

        LexicalRebuildShardBuildTelemetrySnapshot {
            completed_jobs,
            last_shard_index: (completed_jobs > 0).then_some(raw_last_shard_index),
            last_message_bytes,
            last_index_size_bytes,
            last_duration_ms,
            last_amplification_milli: nonzero(raw_last_amplification),
            observed_amplification_milli: nonzero(raw_observed_amplification),
        }
    }
}
/// Path-only description of a shard merge: one output directory and the input
/// index directories to combine into it.
// NOTE(review): nothing in the visible part of this file constructs this type;
// the dead-code lint is suppressed outside test builds — confirm it is test-only.
#[cfg_attr(not(test), allow(dead_code))]
#[derive(Debug, Clone)]
struct LexicalRebuildShardMergeWork {
/// Directory the merged index is written to.
output_path: PathBuf,
/// Directories of the indexes to merge.
input_paths: Vec<PathBuf>,
}
/// An index directory that covers a range of planned shards and is available
/// as input to a further merge.
#[derive(Debug, Clone, PartialEq, Eq)]
struct LexicalRebuildShardMergeArtifact {
/// First planned shard index covered by this artifact.
first_shard_index: usize,
/// Last planned shard index covered by this artifact.
last_shard_index: usize,
/// Directory holding the artifact's index.
index_path: PathBuf,
/// Document count; for merged artifacts this is the sum of the inputs' counts.
docs: usize,
/// Segment count reported by the index.
segments: usize,
}
/// Successful outcome of one merge job.
#[derive(Debug)]
struct LexicalRebuildShardMergeResult {
/// Level copied from the job; the coordinator queues the artifact at this level.
output_level: usize,
/// The merged artifact produced by the job.
artifact: LexicalRebuildShardMergeArtifact,
}
/// A merge request sent to the merge worker threads.
#[derive(Debug)]
struct LexicalRebuildShardMergeJob {
/// Level the merged artifact is reported at.
output_level: usize,
/// Directory the merged index is written to.
output_path: PathBuf,
/// Artifacts to merge; the first and last entries supply the reported shard range.
input_artifacts: Vec<LexicalRebuildShardMergeArtifact>,
}
/// Message sent from a merge worker back to the coordinator side.
#[derive(Debug)]
enum LexicalRebuildShardMergeMessage {
/// The merge finished and produced an artifact.
Built(LexicalRebuildShardMergeResult),
/// The merge failed; the worker that sends this stops processing further jobs.
Error {
/// Level of the failed job.
output_level: usize,
/// First shard index of the failed job's inputs (`usize::MAX` if it had none).
first_shard_index: usize,
/// Last shard index of the failed job's inputs (`usize::MAX` if it had none).
last_shard_index: usize,
/// Error rendered with its full context chain (`{err:#}`).
error: String,
},
}
/// Message sent from a shard-builder worker back to its consumer.
#[derive(Debug)]
enum LexicalRebuildShardBuildMessage {
/// The shard index was built and committed.
Built(LexicalRebuildShardBuildResult),
/// The build failed for `shard_index`; `error` is the error rendered with
/// its full context chain (`{err:#}`).
Error { shard_index: usize, error: String },
}
/// Computes the index-size to message-size ratio in thousandths.
///
/// Returns `None` when `message_bytes` is zero or when the ratio does not fit
/// in a `u64`; the quotient is truncated toward zero.
fn lexical_rebuild_amplification_milli(index_size_bytes: u64, message_bytes: usize) -> Option<u64> {
    let numerator = u128::from(index_size_bytes).saturating_mul(1_000);
    let denominator = message_bytes as u128;
    // `checked_div` yields `None` for a zero denominator.
    numerator
        .checked_div(denominator)
        .and_then(|ratio| u64::try_from(ratio).ok())
}
/// Spawns the staged shard-builder worker threads and returns their join handles.
///
/// At least one thread is spawned even when `worker_count` is zero. Each thread
/// receives work from `rx`, builds and commits the shard index, and reports a
/// `Built` or `Error` message on `tx`. A thread stops when `rx` disconnects,
/// when sending a `Built` message fails, or right after it reports a build error.
///
/// The caller's current tracing dispatcher is captured here and installed as
/// the default inside every worker thread.
fn spawn_lexical_rebuild_shard_builder_workers(
worker_count: usize,
rx: Receiver<LexicalRebuildShardBuildWork>,
tx: Sender<LexicalRebuildShardBuildMessage>,
flow_limiter: Arc<StreamingByteLimiter>,
lexical_rebuild_worker_pool: Option<Arc<ThreadPool>>,
) -> Vec<JoinHandle<()>> {
let tracing_dispatch = tracing::dispatcher::get_default(|dispatch| dispatch.clone());
(0..worker_count.max(1))
.map(|worker_idx| {
let rx = rx.clone();
let tx = tx.clone();
let flow_limiter = Arc::clone(&flow_limiter);
let lexical_rebuild_worker_pool = lexical_rebuild_worker_pool.clone();
let tracing_dispatch = tracing_dispatch.clone();
thread::spawn(move || {
tracing::dispatcher::with_default(&tracing_dispatch, || {
while let Ok(work) = rx.recv() {
// Total flow-control reservation carried by this work item's packets.
let flow_reservation_bytes = work
.packets
.iter()
.map(|packet| packet.flow_reservation_bytes)
.sum::<usize>();
let shard_message_bytes = work.message_bytes;
let build_started = Instant::now();
let result =
build_lexical_rebuild_shard_index_summary_with_writer_parallelism(
&work.shard_index_path,
&work.packets,
lexical_rebuild_worker_pool.as_deref(),
Some(work.writer_parallelism),
);
// The duration covers only the build call, not the size measurement below.
let build_duration_ms =
u64::try_from(build_started.elapsed().as_millis()).unwrap_or(u64::MAX);
let index_size_bytes =
directory_size_bytes_best_effort(&work.shard_index_path);
let amplification_milli = lexical_rebuild_amplification_milli(
index_size_bytes,
shard_message_bytes,
);
// The reservation is released whether the build succeeded or failed.
flow_limiter.release(flow_reservation_bytes);
match result {
Ok(summary) => {
tracing::info!(
worker_idx,
shard_index = work.shard.shard_index,
writer_parallelism = work.writer_parallelism,
indexed_docs = summary.docs,
shard_conversations = work.shard.conversation_count,
shard_message_bytes,
index_size_bytes,
build_duration_ms,
amplification_milli,
"built lexical rebuild shard index"
);
// A failed send means the receiver is gone, so this worker exits.
if tx
.send(LexicalRebuildShardBuildMessage::Built(
LexicalRebuildShardBuildResult {
shard: work.shard,
indexed_docs: summary.docs,
segments: summary.segments,
shard_index_path: work.shard_index_path,
message_bytes: shard_message_bytes,
index_size_bytes,
build_duration_ms,
amplification_milli,
},
))
.is_err()
{
return;
}
}
Err(err) => {
// Report the failure (ignoring a closed channel) and stop this worker.
let _ = tx.send(LexicalRebuildShardBuildMessage::Error {
shard_index: work.shard.shard_index,
error: format!("{err:#}"),
});
return;
}
}
}
});
})
})
.collect()
}
/// Spawns the staged shard-merge worker threads and returns their join handles.
///
/// At least one thread is spawned even when `worker_count` is zero. Each thread
/// receives merge jobs from `rx`, merges the input index directories into the
/// job's output path, and reports a `Built` or `Error` message on `tx`. A
/// thread stops when `rx` disconnects, when sending a `Built` message fails,
/// or right after it reports a merge error.
///
/// The caller's current tracing dispatcher is captured here and installed as
/// the default inside every worker thread.
fn spawn_lexical_rebuild_shard_merge_workers(
worker_count: usize,
rx: Receiver<LexicalRebuildShardMergeJob>,
tx: Sender<LexicalRebuildShardMergeMessage>,
) -> Vec<JoinHandle<()>> {
let tracing_dispatch = tracing::dispatcher::get_default(|dispatch| dispatch.clone());
(0..worker_count.max(1))
.map(|worker_idx| {
let rx = rx.clone();
let tx = tx.clone();
let tracing_dispatch = tracing_dispatch.clone();
thread::spawn(move || {
tracing::dispatcher::with_default(&tracing_dispatch, || {
while let Ok(work) = rx.recv() {
// The reported shard range runs from the first input's start to the last
// input's end; `usize::MAX` is used when the job has no inputs.
let first_shard_index = work
.input_artifacts
.first()
.map(|artifact| artifact.first_shard_index)
.unwrap_or(usize::MAX);
let last_shard_index = work
.input_artifacts
.last()
.map(|artifact| artifact.last_shard_index)
.unwrap_or(usize::MAX);
let input_paths = work
.input_artifacts
.iter()
.map(|artifact| artifact.index_path.clone())
.collect::<Vec<_>>();
let result =
crate::search::tantivy::TantivyIndex::merge_compatible_index_directories(
&work.output_path,
&input_paths,
);
match result {
Ok(merged_index) => {
// The document count is taken from the inputs rather than re-read from the
// merged index.
let docs = work
.input_artifacts
.iter()
.map(|artifact| artifact.docs)
.sum();
let segments = merged_index.segment_count();
// The merged index handle is dropped before the result is announced.
drop(merged_index);
tracing::info!(
worker_idx,
output_level = work.output_level,
first_shard_index,
last_shard_index,
input_count = input_paths.len(),
"merged staged lexical rebuild shard group"
);
// A failed send means the receiver is gone, so this worker exits.
if tx
.send(LexicalRebuildShardMergeMessage::Built(
LexicalRebuildShardMergeResult {
output_level: work.output_level,
artifact: LexicalRebuildShardMergeArtifact {
first_shard_index,
last_shard_index,
index_path: work.output_path,
docs,
segments,
},
},
))
.is_err()
{
return;
}
}
Err(err) => {
// Report the failure (ignoring a closed channel) and stop this worker.
let _ = tx.send(LexicalRebuildShardMergeMessage::Error {
output_level: work.output_level,
first_shard_index,
last_shard_index,
error: format!("{err:#}"),
});
return;
}
}
}
});
})
})
.collect()
}
/// Tracks merge artifacts by level and hands full groups to the merge workers.
#[derive(Debug)]
struct LexicalRebuildShardMergeCoordinator {
/// Root directory under which per-level `round-NNNNN` output directories are created.
stage_root: PathBuf,
/// Artifacts waiting to be merged, indexed by level (level 0 holds base artifacts).
ready_levels: Vec<VecDeque<LexicalRebuildShardMergeArtifact>>,
/// Next output sequence number for each level, used in `merge-NNNNN` directory names.
next_output_seq_by_level: Vec<usize>,
/// Merge jobs that have been sent to workers and not yet completed.
pending_merge_jobs: usize,
/// Upper bound on `pending_merge_jobs` when scheduling new merges.
allowed_pending_merge_jobs: usize,
}
impl LexicalRebuildShardMergeCoordinator {
const EAGER_MERGE_FAN_IN: usize = 8;
fn new(stage_root: PathBuf) -> Self {
Self {
stage_root,
ready_levels: Vec::new(),
next_output_seq_by_level: Vec::new(),
pending_merge_jobs: 0,
allowed_pending_merge_jobs: 0,
}
}
fn pending_merge_jobs(&self) -> usize {
self.pending_merge_jobs
}
fn ready_artifact_count(&self) -> usize {
self.ready_levels.iter().map(VecDeque::len).sum()
}
fn ready_merge_groups(&self) -> usize {
self.ready_levels
.iter()
.map(|level| level.len() / Self::EAGER_MERGE_FAN_IN)
.sum()
}
fn set_allowed_pending_merge_jobs(
&mut self,
allowed_pending_merge_jobs: usize,
merge_work_tx: &Sender<LexicalRebuildShardMergeJob>,
) -> Result<()> {
self.allowed_pending_merge_jobs = allowed_pending_merge_jobs;
self.schedule_ready_merges(merge_work_tx)
}
fn queue_base_artifact(
&mut self,
artifact: LexicalRebuildShardMergeArtifact,
merge_work_tx: &Sender<LexicalRebuildShardMergeJob>,
) -> Result<()> {
self.queue_artifact_at_level(0, artifact, merge_work_tx)
}
fn complete_merge(
&mut self,
result: LexicalRebuildShardMergeResult,
merge_work_tx: &Sender<LexicalRebuildShardMergeJob>,
) -> Result<()> {
self.pending_merge_jobs = self.pending_merge_jobs.saturating_sub(1);
self.queue_artifact_at_level(result.output_level, result.artifact, merge_work_tx)
}
fn final_merge_input_artifacts(&self) -> Vec<LexicalRebuildShardMergeArtifact> {
let mut artifacts = self
.ready_levels
.iter()
.flat_map(|level| level.iter().cloned())
.collect::<Vec<_>>();
artifacts.sort_by_key(|artifact| (artifact.first_shard_index, artifact.last_shard_index));
artifacts
}
fn queue_artifact_at_level(
&mut self,
level: usize,
artifact: LexicalRebuildShardMergeArtifact,
merge_work_tx: &Sender<LexicalRebuildShardMergeJob>,
) -> Result<()> {
self.ensure_level(level);
self.ready_levels[level].push_back(artifact);
self.schedule_ready_merges(merge_work_tx)
}
fn schedule_ready_merges(
&mut self,
merge_work_tx: &Sender<LexicalRebuildShardMergeJob>,
) -> Result<()> {
let mut level = 0usize;
while level < self.ready_levels.len() {
while self.pending_merge_jobs < self.allowed_pending_merge_jobs
&& self.ready_levels[level].len() >= Self::EAGER_MERGE_FAN_IN
{
let output_level = level.saturating_add(1);
self.ensure_level(output_level);
let inputs = (0..Self::EAGER_MERGE_FAN_IN)
.map(|_| {
self.ready_levels[level]
.pop_front()
.expect("merge coordinator should have enough artifacts ready")
})
.collect::<Vec<_>>();
let seq = self.next_output_seq_by_level[output_level];
self.next_output_seq_by_level[output_level] =
self.next_output_seq_by_level[output_level].saturating_add(1);
let round_dir = self.stage_root.join(format!("round-{output_level:05}"));
fs::create_dir_all(&round_dir).with_context(|| {
format!(
"creating eager staged lexical merge round directory {}",
round_dir.display()
)
})?;
let first_shard_index = inputs
.first()
.map(|artifact| artifact.first_shard_index)
.unwrap_or(usize::MAX);
let last_shard_index = inputs
.last()
.map(|artifact| artifact.last_shard_index)
.unwrap_or(usize::MAX);
let output_path = round_dir.join(format!("merge-{seq:05}"));
tracing::info!(
merge_level = output_level,
first_shard_index,
last_shard_index,
input_count = inputs.len(),
"queueing eager staged lexical rebuild merge job"
);
merge_work_tx
.send(LexicalRebuildShardMergeJob {
output_level,
output_path,
input_artifacts: inputs,
})
.map_err(|_| {
anyhow::anyhow!(
"staged lexical rebuild eager merge worker queue disconnected"
)
})?;
self.pending_merge_jobs = self.pending_merge_jobs.saturating_add(1);
}
level = level.saturating_add(1);
}
Ok(())
}
fn ensure_level(&mut self, level: usize) {
while self.ready_levels.len() <= level {
self.ready_levels.push(VecDeque::new());
}
while self.next_output_seq_by_level.len() <= level {
self.next_output_seq_by_level.push(0);
}
}
}
/// Largest final-frontier artifact count that is left as-is; anything above
/// this is flagged for reduction.
const LEXICAL_REBUILD_FINAL_FRONTIER_FEDERATED_SHARD_LIMIT: usize = 32;
/// Returns `true` when `frontier_artifacts` is strictly greater than
/// `LEXICAL_REBUILD_FINAL_FRONTIER_FEDERATED_SHARD_LIMIT`.
fn should_reduce_staged_lexical_final_frontier(frontier_artifacts: usize) -> bool {
    LEXICAL_REBUILD_FINAL_FRONTIER_FEDERATED_SHARD_LIMIT < frontier_artifacts
}
/// Repeatedly merges the final frontier through the merge workers until at
/// most one artifact remains, and returns what is left.
///
/// A frontier of zero or one artifact is returned untouched. Otherwise the
/// frontier is kept sorted by shard range; groups of up to
/// `EAGER_MERGE_FAN_IN` artifacts are taken from the front and sent as jobs
/// (all tagged with output level 0), with at most `max_parallel_jobs.max(1)`
/// jobs in flight. Outputs go under `stage_root/worker-final-frontier`.
///
/// # Errors
/// Fails if the reduction directory cannot be created, if either channel is
/// disconnected, or if a worker reports a merge error.
fn reduce_staged_lexical_final_merge_frontier_via_workers(
    mut frontier: Vec<LexicalRebuildShardMergeArtifact>,
    stage_root: &Path,
    max_parallel_jobs: usize,
    merge_work_tx: &Sender<LexicalRebuildShardMergeJob>,
    merge_result_rx: &Receiver<LexicalRebuildShardMergeMessage>,
) -> Result<Vec<LexicalRebuildShardMergeArtifact>> {
    if frontier.len() < 2 {
        return Ok(frontier);
    }

    let shard_range = |artifact: &LexicalRebuildShardMergeArtifact| {
        (artifact.first_shard_index, artifact.last_shard_index)
    };
    let worker_limit = max_parallel_jobs.max(1);
    let reduction_stage_root = stage_root.join("worker-final-frontier");
    fs::create_dir_all(&reduction_stage_root).with_context(|| {
        format!(
            "creating staged lexical final-frontier reduction directory {}",
            reduction_stage_root.display()
        )
    })?;
    frontier.sort_by_key(shard_range);
    tracing::info!(
        ready_artifacts = frontier.len(),
        worker_limit,
        "draining staged lexical rebuild final merge frontier via merge workers"
    );

    let mut in_flight = 0usize;
    let mut next_output_seq = 0usize;
    while frontier.len().saturating_add(in_flight) > 1 {
        // Fill the worker budget with as many groups as the frontier offers.
        while in_flight < worker_limit && frontier.len() > 1 {
            let group_size =
                LexicalRebuildShardMergeCoordinator::EAGER_MERGE_FAN_IN.min(frontier.len());
            let inputs: Vec<_> = frontier.drain(..group_size).collect();
            let first_shard_index = inputs
                .first()
                .map_or(usize::MAX, |artifact| artifact.first_shard_index);
            let last_shard_index = inputs
                .last()
                .map_or(usize::MAX, |artifact| artifact.last_shard_index);
            let output_path = reduction_stage_root.join(format!("merge-{next_output_seq:05}"));
            next_output_seq = next_output_seq.saturating_add(1);
            tracing::info!(
                first_shard_index,
                last_shard_index,
                input_count = inputs.len(),
                "queueing staged lexical rebuild final-frontier merge job"
            );
            let job = LexicalRebuildShardMergeJob {
                output_level: 0,
                output_path,
                input_artifacts: inputs,
            };
            if merge_work_tx.send(job).is_err() {
                return Err(anyhow::anyhow!(
                    "staged lexical rebuild eager merge worker queue disconnected during final-frontier reduction"
                ));
            }
            in_flight = in_flight.saturating_add(1);
        }

        // Block until one of the in-flight merges reports back.
        let Ok(message) = merge_result_rx.recv() else {
            return Err(anyhow::anyhow!(
                "staged lexical rebuild eager merge channel closed during final-frontier reduction"
            ));
        };
        match message {
            LexicalRebuildShardMergeMessage::Error {
                output_level,
                first_shard_index,
                last_shard_index,
                error,
            } => {
                return Err(anyhow::anyhow!(
                    "staged lexical final-frontier merge at level {output_level} for shard range {first_shard_index}..={last_shard_index} failed: {error}"
                ));
            }
            LexicalRebuildShardMergeMessage::Built(result) => {
                in_flight = in_flight.saturating_sub(1);
                frontier.push(result.artifact);
                frontier.sort_by_key(shard_range);
            }
        }
    }
    Ok(frontier)
}
/// Decision and observed state produced by the staged merge controller.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
struct LexicalRebuildStagedMergeRuntimeSnapshot {
/// Configured maximum number of merge workers.
workers_max: usize,
/// Number of merge jobs the controller currently permits in flight.
allowed_jobs: usize,
/// Merge jobs in flight when the decision was made.
active_jobs: usize,
/// Artifacts waiting across all levels when the decision was made.
ready_artifacts: usize,
/// Complete fan-in groups ready to merge when the decision was made.
ready_groups: usize,
/// Machine-readable explanation of why `allowed_jobs` was chosen.
controller_reason: String,
}
/// Decision and observed state produced by the staged shard-build controller.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
struct LexicalRebuildStagedShardBuildRuntimeSnapshot {
/// Configured maximum number of shard-build workers.
workers_max: usize,
/// Number of shard builds the controller currently permits in flight.
allowed_jobs: usize,
/// Shard builds in flight when the decision was made.
active_jobs: usize,
/// Shard builds waiting to start when the decision was made.
pending_jobs: usize,
/// Machine-readable explanation of why `allowed_jobs` was chosen.
controller_reason: String,
}
/// Chooses how many staged merge jobs may run concurrently.
#[derive(Debug)]
struct LexicalRebuildStagedMergeController {
/// Upper bound on concurrent merge jobs (always at least 1).
max_workers: usize,
/// One-minute load average, in thousandths, at or above which merges are
/// throttled; `None` disables the load check.
loadavg_high_watermark_1m_milli: Option<u32>,
}
impl LexicalRebuildStagedMergeController {
/// Creates a controller; `max_workers` is raised to at least 1.
fn new(max_workers: usize, loadavg_high_watermark_1m_milli: Option<u32>) -> Self {
Self {
max_workers: max_workers.max(1),
loadavg_high_watermark_1m_milli,
}
}
/// Decides how many merge jobs may be in flight right now.
///
/// The branches are evaluated in order and the first match wins. The chosen
/// budget is finally capped by the backlog (active jobs plus ready groups).
/// `controller_reason` records which branch fired and the values it saw.
fn decide(
&self,
producer_finished: bool,
runtime: &LexicalRebuildPipelineRuntimeSnapshot,
merge_coordinator: &LexicalRebuildShardMergeCoordinator,
) -> LexicalRebuildStagedMergeRuntimeSnapshot {
let active_jobs = merge_coordinator.pending_merge_jobs();
let ready_artifacts = merge_coordinator.ready_artifact_count();
let ready_groups = merge_coordinator.ready_merge_groups();
let backlog_groups = active_jobs.saturating_add(ready_groups);
if backlog_groups == 0 {
// Nothing running and no full group ready. Every field other than
// `workers_max` and the reason comes from `Default`, so `ready_artifacts`
// is reported as 0 here even if a partial group is waiting.
return LexicalRebuildStagedMergeRuntimeSnapshot {
workers_max: self.max_workers,
controller_reason: "no_staged_merge_backlog".to_string(),
..LexicalRebuildStagedMergeRuntimeSnapshot::default()
};
}
let (allowed_jobs, controller_reason) = if !producer_finished {
if let (Some(loadavg_1m_milli), Some(high_watermark_1m_milli)) = (
runtime.host_loadavg_1m_milli,
self.loadavg_high_watermark_1m_milli,
) && loadavg_1m_milli >= high_watermark_1m_milli
{
// Host load is at or above the watermark while the producer is still
// running: allow no merge jobs.
(
0,
format!(
"host_loadavg_1m_{}_reached_high_watermark_{}",
format_lexical_rebuild_loadavg_1m_milli(loadavg_1m_milli),
format_lexical_rebuild_loadavg_1m_milli(high_watermark_1m_milli)
),
)
} else if runtime.ordered_buffered_pages > 0 || runtime.queue_depth > 0 {
// Pages are buffered or queued. The budget is the larger of the jobs
// already active and the ready groups (clamped to 1..=max_workers),
// itself capped at max_workers.
let debt_budget = ready_groups.min(self.max_workers).max(1);
let trickle_budget = active_jobs.max(debt_budget).min(self.max_workers);
(
trickle_budget,
format!(
"builder_handoff_pressure_scaling_staged_merge_budget_{}_active_jobs_{}_ready_groups_{}_debt_budget_{}_buffered_pages_{}_queue_depth_{}",
trickle_budget,
active_jobs,
ready_groups,
debt_budget,
runtime.ordered_buffered_pages,
runtime.queue_depth
),
)
} else if runtime.page_prep_workers > 0
&& runtime.active_page_prep_jobs >= runtime.page_prep_workers
{
// Every page-prep worker is busy. New merges are limited to half the
// workers (rounded up, at least 1), but the budget never drops below the
// jobs already active (capped at max_workers).
let saturated_budget = ready_groups
.min(self.max_workers.div_ceil(2))
.max(1)
.max(active_jobs.min(self.max_workers));
(
saturated_budget,
// A budget of exactly 1 uses the shorter reason string.
if saturated_budget == 1 {
format!(
"page_prep_workers_saturated_{}_of_{}",
runtime.active_page_prep_jobs, runtime.page_prep_workers
)
} else {
format!(
"page_prep_workers_saturated_{}_of_{}_merge_budget_{}_active_jobs_{}_ready_groups_{}",
runtime.active_page_prep_jobs,
runtime.page_prep_workers,
saturated_budget,
active_jobs,
ready_groups
)
},
)
} else if runtime.active_page_prep_jobs > 0
|| runtime.pending_batch_conversations > 0
|| runtime.pending_batch_message_bytes > 0
{
// The pipeline still has page-prep or pending-batch work: give merges
// half the workers, rounded up.
(
self.max_workers.div_ceil(2).max(1),
format!(
"pipeline_active_page_prep_jobs_{}_pending_batch_conversations_{}_pending_batch_message_bytes_{}",
runtime.active_page_prep_jobs,
runtime.pending_batch_conversations,
runtime.pending_batch_message_bytes
),
)
} else {
// None of the pressure signals above is set: allow full parallelism.
(
self.max_workers,
"producer_idle_allowing_max_staged_merge_parallelism".to_string(),
)
}
} else if ready_artifacts.saturating_add(active_jobs)
<= LEXICAL_REBUILD_FINAL_FRONTIER_FEDERATED_SHARD_LIMIT
{
// Producer finished and the frontier already fits within the limit: the
// budget only covers jobs that are already active.
(
active_jobs.min(self.max_workers),
format!(
"producer_finished_final_frontier_within_federated_cap_{}_active_jobs_{}_ready_artifacts_{}",
LEXICAL_REBUILD_FINAL_FRONTIER_FEDERATED_SHARD_LIMIT,
active_jobs,
ready_artifacts
),
)
} else if let (Some(loadavg_1m_milli), Some(high_watermark_1m_milli)) = (
runtime.host_loadavg_1m_milli,
self.loadavg_high_watermark_1m_milli,
) && loadavg_1m_milli >= high_watermark_1m_milli
{
// Producer finished but host load is still high: allow a single job.
(
1.min(self.max_workers),
format!(
"producer_finished_but_host_loadavg_1m_{}_remains_high",
format_lexical_rebuild_loadavg_1m_milli(loadavg_1m_milli)
),
)
} else {
// Producer finished and load is acceptable: allow full parallelism.
(
self.max_workers,
"producer_finished_allowing_max_staged_merge_parallelism".to_string(),
)
};
LexicalRebuildStagedMergeRuntimeSnapshot {
workers_max: self.max_workers,
// Never allow more jobs than there is backlog to run.
allowed_jobs: allowed_jobs.min(backlog_groups),
active_jobs,
ready_artifacts,
ready_groups,
controller_reason,
}
}
}
/// Chooses how many staged shard builds may run concurrently, taking host
/// load, the merge budget and available memory into account.
#[derive(Debug)]
struct LexicalRebuildStagedShardBuildController {
/// Upper bound on concurrent shard builds (always at least 1).
max_workers: usize,
/// One-minute load average, in thousandths, at or above which builds are
/// limited to one; `None` disables the load check.
loadavg_high_watermark_1m_milli: Option<u32>,
/// Available-memory level at or below which builds are limited (at least 1).
memory_reserve_bytes: usize,
/// Lower available-memory level at or below which new builds are paused or
/// only a single small one is admitted (at least 1, never above `memory_reserve_bytes`).
emergency_memory_reserve_bytes: usize,
}
impl LexicalRebuildStagedShardBuildController {
/// Creates a controller using the memory reserves returned by the two
/// `lexical_rebuild_staged_shard_build_*_memory_reserve_bytes` helpers.
fn new(max_workers: usize, loadavg_high_watermark_1m_milli: Option<u32>) -> Self {
Self::new_with_memory_reserves(
max_workers,
loadavg_high_watermark_1m_milli,
lexical_rebuild_staged_shard_build_memory_reserve_bytes(),
lexical_rebuild_staged_shard_build_emergency_memory_reserve_bytes(),
)
}
/// Creates a controller with explicit memory reserves.
///
/// `max_workers` and both reserves are raised to at least 1, and the
/// emergency reserve is capped at the normal reserve.
fn new_with_memory_reserves(
max_workers: usize,
loadavg_high_watermark_1m_milli: Option<u32>,
memory_reserve_bytes: usize,
emergency_memory_reserve_bytes: usize,
) -> Self {
let memory_reserve_bytes = memory_reserve_bytes.max(1);
Self {
max_workers: max_workers.max(1),
loadavg_high_watermark_1m_milli,
memory_reserve_bytes,
emergency_memory_reserve_bytes: emergency_memory_reserve_bytes
.max(1)
.min(memory_reserve_bytes),
}
}
/// Decides how many shard builds may be in flight right now.
///
/// A CPU-side budget is chosen first (host load, then slots reserved for
/// merges), then narrowed by `apply_memory_admission`, and finally capped by
/// the backlog (active plus pending jobs).
fn decide(
&self,
runtime: &LexicalRebuildPipelineRuntimeSnapshot,
staged_merge_runtime: &LexicalRebuildStagedMergeRuntimeSnapshot,
active_jobs: usize,
pending_jobs: usize,
next_pending_job_message_bytes: Option<usize>,
) -> LexicalRebuildStagedShardBuildRuntimeSnapshot {
let backlog_jobs = active_jobs.saturating_add(pending_jobs);
if backlog_jobs == 0 {
return LexicalRebuildStagedShardBuildRuntimeSnapshot {
workers_max: self.max_workers,
allowed_jobs: 0,
active_jobs,
pending_jobs,
controller_reason: "no_staged_shard_build_backlog".to_string(),
};
}
let (allowed_jobs, controller_reason) =
if let (Some(loadavg_1m_milli), Some(high_watermark_1m_milli)) = (
runtime.host_loadavg_1m_milli,
self.loadavg_high_watermark_1m_milli,
) && loadavg_1m_milli >= high_watermark_1m_milli
{
// Host load is at or above the watermark: allow a single build.
(
1.min(self.max_workers),
format!(
"host_loadavg_1m_{}_reached_high_watermark_{}",
format_lexical_rebuild_loadavg_1m_milli(loadavg_1m_milli),
format_lexical_rebuild_loadavg_1m_milli(high_watermark_1m_milli)
),
)
} else {
// Slots the merge controller currently allows are subtracted from the
// build budget, which always keeps at least one slot.
let merge_reserved_slots = staged_merge_runtime.allowed_jobs.min(self.max_workers);
let shard_budget = self.max_workers.saturating_sub(merge_reserved_slots).max(1);
if merge_reserved_slots > 0 {
(
shard_budget,
format!(
"reserving_{}_slots_for_staged_merge_active_jobs_{}_ready_groups_{}",
merge_reserved_slots,
staged_merge_runtime.active_jobs,
staged_merge_runtime.ready_groups
),
)
} else {
(
self.max_workers,
"staged_merge_idle_lending_full_budget_to_shard_builders".to_string(),
)
}
};
let (allowed_jobs, controller_reason) = self.apply_memory_admission(
runtime,
active_jobs,
backlog_jobs,
next_pending_job_message_bytes,
allowed_jobs,
controller_reason,
);
LexicalRebuildStagedShardBuildRuntimeSnapshot {
workers_max: self.max_workers,
// Never allow more builds than there is backlog to run.
allowed_jobs: allowed_jobs.min(backlog_jobs),
active_jobs,
pending_jobs,
controller_reason,
}
}
/// Narrows `allowed_jobs` according to the host's available memory.
///
/// Returns the incoming budget and reason unchanged when no memory reading is
/// available or when memory does not constrain the budget further; otherwise
/// returns the tighter budget with a reason describing the memory limit.
fn apply_memory_admission(
&self,
runtime: &LexicalRebuildPipelineRuntimeSnapshot,
active_jobs: usize,
backlog_jobs: usize,
next_pending_job_message_bytes: Option<usize>,
allowed_jobs: usize,
controller_reason: String,
) -> (usize, String) {
let Some(available_memory_bytes) = runtime.host_available_memory_bytes else {
return (allowed_jobs, controller_reason);
};
let available_memory_bytes = usize_from_u64_saturating(available_memory_bytes);
let estimated_builder_bytes =
self.estimated_builder_memory_bytes(runtime, next_pending_job_message_bytes);
if available_memory_bytes <= self.emergency_memory_reserve_bytes {
// At or below the emergency reserve. If nothing is running, one build is
// still admitted provided its estimate fits in three quarters of the
// available memory.
if active_jobs == 0 && backlog_jobs > 0 {
let probe_ceiling =
available_memory_bytes.saturating_sub(available_memory_bytes / 4);
if estimated_builder_bytes <= probe_ceiling {
return (
1.min(allowed_jobs).min(backlog_jobs),
format!(
"host_available_memory_bytes_{}_below_emergency_reserve_{}_admitting_single_small_staged_shard_build_estimated_builder_bytes_{}",
available_memory_bytes,
self.emergency_memory_reserve_bytes,
estimated_builder_bytes
),
);
}
}
// Otherwise hold the budget at the builds already running so no new one starts.
return (
active_jobs.min(backlog_jobs),
format!(
"host_available_memory_bytes_{}_below_emergency_reserve_{}_pausing_new_staged_shard_builds",
available_memory_bytes, self.emergency_memory_reserve_bytes
),
);
}
if available_memory_bytes <= self.memory_reserve_bytes {
// At or below the normal reserve: keep what is running, or admit exactly
// one build when nothing is running; never exceed the incoming budget.
let allowed_under_reserve = if active_jobs == 0 && backlog_jobs > 0 {
1
} else {
active_jobs
}
.min(allowed_jobs)
.min(backlog_jobs);
return (
allowed_under_reserve,
format!(
"host_available_memory_bytes_{}_below_reserve_{}_limiting_staged_shard_builds_to_{}",
available_memory_bytes, self.memory_reserve_bytes, allowed_under_reserve
),
);
}
// Above the reserve: work out how many additional builds fit in the
// headroom after subtracting the estimate for the builds already active.
let memory_headroom = available_memory_bytes.saturating_sub(self.memory_reserve_bytes);
let active_estimated_bytes = estimated_builder_bytes.saturating_mul(active_jobs);
let additional_jobs =
memory_headroom.saturating_sub(active_estimated_bytes) / estimated_builder_bytes.max(1);
// The final `.max(...)` keeps the result from dropping below the builds
// already running (capped by the backlog).
let memory_allowed_jobs = active_jobs
.saturating_add(additional_jobs)
.min(self.max_workers)
.min(backlog_jobs)
.max(active_jobs.min(backlog_jobs));
if memory_allowed_jobs < allowed_jobs {
(
memory_allowed_jobs,
format!(
"host_available_memory_bytes_{}_reserve_{}_estimated_builder_bytes_{}_limiting_staged_shard_builds_to_{}",
available_memory_bytes,
self.memory_reserve_bytes,
estimated_builder_bytes,
memory_allowed_jobs
),
)
} else {
(allowed_jobs, controller_reason)
}
}
/// Estimates the memory one shard build needs, in bytes.
///
/// The observed amplification (or the floor when none has been observed, and
/// never less than the floor) is scaled by the headroom factor and applied to
/// the next pending job's message bytes. The result is never below
/// `LEXICAL_REBUILD_STAGED_SHARD_BUILD_MIN_ESTIMATED_BYTES` and never zero.
fn estimated_builder_memory_bytes(
&self,
runtime: &LexicalRebuildPipelineRuntimeSnapshot,
next_pending_job_message_bytes: Option<usize>,
) -> usize {
let amplification_milli = runtime
.staged_shard_build_observed_amplification_milli
.unwrap_or(LEXICAL_REBUILD_STAGED_SHARD_BUILD_AMPLIFICATION_FLOOR_MILLI)
.max(LEXICAL_REBUILD_STAGED_SHARD_BUILD_AMPLIFICATION_FLOOR_MILLI)
.saturating_mul(LEXICAL_REBUILD_STAGED_SHARD_BUILD_AMPLIFICATION_HEADROOM_MILLI)
/ 1_000;
// Computed in u128 so the multiplication cannot overflow; saturates to
// `usize::MAX` if the result does not fit.
let amplified_message_bytes = next_pending_job_message_bytes
.map(|message_bytes| {
let scaled =
(message_bytes as u128).saturating_mul(u128::from(amplification_milli)) / 1_000;
usize::try_from(scaled).unwrap_or(usize::MAX)
})
.unwrap_or(0);
amplified_message_bytes
.max(usize_from_u64_saturating(
LEXICAL_REBUILD_STAGED_SHARD_BUILD_MIN_ESTIMATED_BYTES,
))
.max(1)
}
}
/// Copies a staged-merge controller decision into `latest_runtime` and, when
/// a progress handle is supplied, into its matching shared counters.
///
/// If the progress reason mutex cannot be locked, the reason text is left as it was.
fn apply_staged_merge_runtime_snapshot(
    latest_runtime: &mut LexicalRebuildPipelineRuntimeSnapshot,
    progress: Option<&Arc<IndexingProgress>>,
    staged_merge_runtime: &LexicalRebuildStagedMergeRuntimeSnapshot,
) {
    let merge = staged_merge_runtime;

    latest_runtime.staged_merge_workers_max = merge.workers_max;
    latest_runtime.staged_merge_allowed_jobs = merge.allowed_jobs;
    latest_runtime.staged_merge_active_jobs = merge.active_jobs;
    latest_runtime.staged_merge_ready_artifacts = merge.ready_artifacts;
    latest_runtime.staged_merge_ready_groups = merge.ready_groups;
    latest_runtime.staged_merge_controller_reason = merge.controller_reason.clone();

    if let Some(progress) = progress {
        progress
            .rebuild_pipeline_staged_merge_workers_max
            .store(merge.workers_max, Ordering::Relaxed);
        progress
            .rebuild_pipeline_staged_merge_allowed_jobs
            .store(merge.allowed_jobs, Ordering::Relaxed);
        progress
            .rebuild_pipeline_staged_merge_active_jobs
            .store(merge.active_jobs, Ordering::Relaxed);
        progress
            .rebuild_pipeline_staged_merge_ready_artifacts
            .store(merge.ready_artifacts, Ordering::Relaxed);
        progress
            .rebuild_pipeline_staged_merge_ready_groups
            .store(merge.ready_groups, Ordering::Relaxed);
        if let Ok(mut reason_slot) = progress
            .rebuild_pipeline_staged_merge_controller_reason
            .lock()
        {
            *reason_slot = merge.controller_reason.clone();
        }
    }
}
/// Copies a staged shard-build controller decision into `latest_runtime` and,
/// when a progress handle is supplied, into its matching shared counters.
///
/// If the progress reason mutex cannot be locked, the reason text is left as it was.
fn apply_staged_shard_build_runtime_snapshot(
    latest_runtime: &mut LexicalRebuildPipelineRuntimeSnapshot,
    progress: Option<&Arc<IndexingProgress>>,
    staged_shard_build_runtime: &LexicalRebuildStagedShardBuildRuntimeSnapshot,
) {
    let build = staged_shard_build_runtime;

    latest_runtime.staged_shard_build_workers_max = build.workers_max;
    latest_runtime.staged_shard_build_allowed_jobs = build.allowed_jobs;
    latest_runtime.staged_shard_build_active_jobs = build.active_jobs;
    latest_runtime.staged_shard_build_pending_jobs = build.pending_jobs;
    latest_runtime.staged_shard_build_controller_reason = build.controller_reason.clone();

    if let Some(progress) = progress {
        progress
            .rebuild_pipeline_staged_shard_build_workers_max
            .store(build.workers_max, Ordering::Relaxed);
        progress
            .rebuild_pipeline_staged_shard_build_allowed_jobs
            .store(build.allowed_jobs, Ordering::Relaxed);
        progress
            .rebuild_pipeline_staged_shard_build_active_jobs
            .store(build.active_jobs, Ordering::Relaxed);
        progress
            .rebuild_pipeline_staged_shard_build_pending_jobs
            .store(build.pending_jobs, Ordering::Relaxed);
        if let Ok(mut reason_slot) = progress
            .rebuild_pipeline_staged_shard_build_controller_reason
            .lock()
        {
            *reason_slot = build.controller_reason.clone();
        }
    }
}
/// Takes a snapshot of the shard-build telemetry and publishes it, together
/// with the two memory reserve values, into `latest_runtime` and (when a
/// progress handle is supplied) into its matching shared fields.
///
/// Mutex-guarded progress fields that cannot be locked are left unchanged.
fn apply_staged_shard_build_telemetry_snapshot(
    latest_runtime: &mut LexicalRebuildPipelineRuntimeSnapshot,
    progress: Option<&Arc<IndexingProgress>>,
    memory_reserve_bytes: usize,
    emergency_memory_reserve_bytes: usize,
    telemetry: &LexicalRebuildShardBuildTelemetry,
) {
    let stats = telemetry.snapshot();

    latest_runtime.staged_shard_build_memory_reserve_bytes = memory_reserve_bytes;
    latest_runtime.staged_shard_build_emergency_memory_reserve_bytes =
        emergency_memory_reserve_bytes;
    latest_runtime.staged_shard_build_completed_jobs = stats.completed_jobs;
    latest_runtime.staged_shard_build_last_shard_index = stats.last_shard_index;
    latest_runtime.staged_shard_build_last_message_bytes = stats.last_message_bytes;
    latest_runtime.staged_shard_build_last_index_size_bytes = stats.last_index_size_bytes;
    latest_runtime.staged_shard_build_last_duration_ms = stats.last_duration_ms;
    latest_runtime.staged_shard_build_last_amplification_milli = stats.last_amplification_milli;
    latest_runtime.staged_shard_build_observed_amplification_milli =
        stats.observed_amplification_milli;

    if let Some(progress) = progress {
        progress
            .rebuild_pipeline_staged_shard_build_memory_reserve_bytes
            .store(memory_reserve_bytes, Ordering::Relaxed);
        progress
            .rebuild_pipeline_staged_shard_build_emergency_memory_reserve_bytes
            .store(emergency_memory_reserve_bytes, Ordering::Relaxed);
        progress
            .rebuild_pipeline_staged_shard_build_completed_jobs
            .store(stats.completed_jobs, Ordering::Relaxed);
        if let Ok(mut shard_index_slot) = progress
            .rebuild_pipeline_staged_shard_build_last_shard_index
            .lock()
        {
            *shard_index_slot = stats.last_shard_index;
        }
        progress
            .rebuild_pipeline_staged_shard_build_last_message_bytes
            .store(stats.last_message_bytes, Ordering::Relaxed);
        progress
            .rebuild_pipeline_staged_shard_build_last_index_size_bytes
            .store(stats.last_index_size_bytes, Ordering::Relaxed);
        progress
            .rebuild_pipeline_staged_shard_build_last_duration_ms
            .store(stats.last_duration_ms, Ordering::Relaxed);
        if let Ok(mut last_amplification_slot) = progress
            .rebuild_pipeline_staged_shard_build_last_amplification_milli
            .lock()
        {
            *last_amplification_slot = stats.last_amplification_milli;
        }
        if let Ok(mut observed_amplification_slot) = progress
            .rebuild_pipeline_staged_shard_build_observed_amplification_milli
            .lock()
        {
            *observed_amplification_slot = stats.observed_amplification_milli;
        }
    }
}
/// Flushes the pending batch when the current conversation closes a planned
/// shard, keeping the caller's batch conversation limit unchanged.
///
/// Returns `Ok(false)` without doing anything when the shard is not finished
/// or there is nothing pending; returns `Ok(true)` after a successful flush.
/// The value of `*current_batch_conversation_limit` is captured before the
/// flush and written back afterwards, so whatever the inner flush does to it
/// is discarded on the success path. If the inner flush fails, the error is
/// propagated and the limit is not restored.
#[allow(clippy::too_many_arguments)]
fn flush_streamed_lexical_rebuild_batch_for_planned_shard_boundary(
    planned_shard_index: Option<usize>,
    finishes_planned_shard: bool,
    pending_batch: &mut Vec<LexicalRebuildConversationPacket>,
    pending_batch_message_count: &mut usize,
    pending_batch_message_bytes: &mut usize,
    lexical_rebuild_flow_limiter: Option<&StreamingByteLimiter>,
    lexical_rebuild_worker_pool: Option<&ThreadPool>,
    t_index: &mut TantivyIndex,
    indexed_docs: &mut usize,
    messages_since_commit: &mut usize,
    message_bytes_since_commit: &mut usize,
    current_batch_conversation_limit: &mut usize,
    batch_conversation_limit: usize,
    page_size: i64,
    perf_profile: Option<&mut LexicalRebuildPerfProfile>,
) -> Result<bool> {
    let boundary_flush_needed = finishes_planned_shard && !pending_batch.is_empty();
    if !boundary_flush_needed {
        return Ok(false);
    }

    let limit_before_flush = *current_batch_conversation_limit;
    tracing::info!(
        planned_shard_index,
        shard_conversations = pending_batch.len(),
        shard_message_bytes = *pending_batch_message_bytes,
        "lexical rebuild flushing pending batch at planned shard boundary"
    );
    flush_streamed_lexical_rebuild_batch(
        pending_batch,
        pending_batch_message_count,
        pending_batch_message_bytes,
        lexical_rebuild_flow_limiter,
        lexical_rebuild_worker_pool,
        t_index,
        indexed_docs,
        messages_since_commit,
        message_bytes_since_commit,
        current_batch_conversation_limit,
        batch_conversation_limit,
        page_size,
        perf_profile,
    )?;

    // Undo any limit adjustment made by the generic flush path.
    *current_batch_conversation_limit = limit_before_flush;
    Ok(true)
}
/// Runs one commit cycle of a lexical rebuild and records per-step timings.
///
/// Steps, in order:
/// 1. persist the pending progress record,
/// 2. commit the Tantivy index,
/// 3. compute the index meta fingerprint,
/// 4. finalize the commit on `rebuild_state` and, if
///    `persist_finalized_checkpoint` is set, persist the finalized state.
///
/// Timers are only started when `perf_profile` is `Some`. The checkpoint
/// timer starts before `finalize_commit`, and its duration is only recorded
/// when the checkpoint is actually persisted.
///
/// # Errors
/// Propagates the first failure from any of the fallible steps.
#[allow(clippy::too_many_arguments)]
fn commit_lexical_rebuild_progress(
    index_path: &Path,
    rebuild_state: &mut LexicalRebuildState,
    next_conversation_id: Option<i64>,
    processed_conversations: usize,
    indexed_docs: usize,
    runtime: &LexicalRebuildPipelineRuntimeSnapshot,
    t_index: &mut TantivyIndex,
    persist_finalized_checkpoint: bool,
    mut perf_profile: Option<&mut LexicalRebuildPerfProfile>,
) -> Result<()> {
    let profiling_enabled = perf_profile.is_some();
    let start_timer = || profiling_enabled.then(Instant::now);

    let pending_timer = start_timer();
    persist_pending_lexical_rebuild_progress(
        index_path,
        rebuild_state,
        next_conversation_id,
        processed_conversations,
        indexed_docs,
        runtime,
    )?;
    if let Some((profile, started)) = perf_profile.as_deref_mut().zip(pending_timer) {
        profile.pending_progress_duration += started.elapsed();
    }

    let commit_timer = start_timer();
    t_index.commit()?;
    if let Some((profile, started)) = perf_profile.as_deref_mut().zip(commit_timer) {
        profile.commit_count = profile.commit_count.saturating_add(1);
        profile.commit_duration += started.elapsed();
    }

    let fingerprint_timer = start_timer();
    let meta_fingerprint = index_meta_fingerprint(index_path)?;
    if let Some((profile, started)) = perf_profile.as_deref_mut().zip(fingerprint_timer) {
        profile.meta_fingerprint_duration += started.elapsed();
    }

    let checkpoint_timer = start_timer();
    rebuild_state.finalize_commit(meta_fingerprint);
    if !persist_finalized_checkpoint {
        return Ok(());
    }
    persist_lexical_rebuild_state(index_path, rebuild_state)?;
    if let Some((profile, started)) = perf_profile.as_deref_mut().zip(checkpoint_timer) {
        profile.checkpoint_persist_duration += started.elapsed();
    }
    Ok(())
}
/// Maps a maintenance mode to the job kind recorded for the run.
///
/// The mode is currently ignored: every mode yields
/// `SearchMaintenanceJobKind::LexicalRefresh`.
fn maintenance_job_kind_for_mode(_mode: SearchMaintenanceMode) -> SearchMaintenanceJobKind {
    let job_kind = SearchMaintenanceJobKind::LexicalRefresh;
    job_kind
}
/// Accumulated counters and timings for one lexical rebuild, used only for
/// the opt-in profile summary printed by [`LexicalRebuildPerfProfile::log_summary`].
#[derive(Debug, Default)]
struct LexicalRebuildPerfProfile {
    total_duration: Duration,
    batch_flushes: usize,
    commit_count: usize,
    heartbeat_persist_count: usize,
    batch_conversations: usize,
    batch_messages: usize,
    batch_message_bytes: usize,
    conversation_list_duration: Duration,
    message_stream_duration: Duration,
    finish_conversation_duration: Duration,
    prepare_duration: Duration,
    add_duration: Duration,
    commit_duration: Duration,
    pending_progress_duration: Duration,
    heartbeat_progress_duration: Duration,
    checkpoint_persist_duration: Duration,
    meta_fingerprint_duration: Duration,
}

impl LexicalRebuildPerfProfile {
    /// Returns a zeroed profile when `CASS_TANTIVY_REBUILD_PROFILE` is set in
    /// the process environment (any value), and `None` otherwise.
    fn from_env() -> Option<Self> {
        std::env::var_os("CASS_TANTIVY_REBUILD_PROFILE")
            .is_some()
            .then(Self::default)
    }

    /// Converts a duration to fractional milliseconds.
    fn millis(duration: Duration) -> f64 {
        let seconds = duration.as_secs_f64();
        seconds * 1000.0
    }

    /// Prints a single `CASS_REBUILD_PROFILE` line to stderr.
    ///
    /// Averages divide by at least 1 so an empty profile prints zeros rather
    /// than NaN. `residual_ms` is the total minus the accounted phases;
    /// `finish_conversation_duration` is reported but is not part of the
    /// accounted sum.
    fn log_summary(&self) {
        let ms = Self::millis;

        let flush_divisor = self.batch_flushes.max(1) as f64;
        let commit_divisor = self.commit_count.max(1) as f64;
        let heartbeat_divisor = self.heartbeat_persist_count.max(1) as f64;

        let accounted = self.conversation_list_duration
            + self.message_stream_duration
            + self.prepare_duration
            + self.add_duration
            + self.commit_duration
            + self.pending_progress_duration
            + self.heartbeat_progress_duration
            + self.checkpoint_persist_duration
            + self.meta_fingerprint_duration;
        let residual = self.total_duration.saturating_sub(accounted);

        let avg_prepare_per_flush = ms(self.prepare_duration) / flush_divisor;
        let avg_add_per_flush = ms(self.add_duration) / flush_divisor;
        let avg_commit_per_commit = ms(self.commit_duration) / commit_divisor;
        let avg_pending_per_commit = ms(self.pending_progress_duration) / commit_divisor;
        let avg_heartbeat = ms(self.heartbeat_progress_duration) / heartbeat_divisor;

        eprintln!(
            concat!(
                "CASS_REBUILD_PROFILE ",
                "flushes={} commits={} heartbeat_persists={} ",
                "batch_conversations={} batch_messages={} batch_message_bytes={} ",
                "total_ms={:.3} conversation_list_ms={:.3} message_stream_ms={:.3} ",
                "finish_conversation_ms={:.3} residual_ms={:.3} ",
                "prepare_ms={:.3} add_ms={:.3} commit_ms={:.3} ",
                "pending_progress_ms={:.3} heartbeat_progress_ms={:.3} ",
                "checkpoint_persist_ms={:.3} meta_fingerprint_ms={:.3} ",
                "avg_prepare_ms_per_flush={:.3} avg_add_ms_per_flush={:.3} ",
                "avg_commit_ms_per_commit={:.3} avg_pending_progress_ms_per_commit={:.3} ",
                "avg_heartbeat_progress_ms={:.3}"
            ),
            self.batch_flushes,
            self.commit_count,
            self.heartbeat_persist_count,
            self.batch_conversations,
            self.batch_messages,
            self.batch_message_bytes,
            ms(self.total_duration),
            ms(self.conversation_list_duration),
            ms(self.message_stream_duration),
            ms(self.finish_conversation_duration),
            ms(residual),
            ms(self.prepare_duration),
            ms(self.add_duration),
            ms(self.commit_duration),
            ms(self.pending_progress_duration),
            ms(self.heartbeat_progress_duration),
            ms(self.checkpoint_persist_duration),
            ms(self.meta_fingerprint_duration),
            avg_prepare_per_flush,
            avg_add_per_flush,
            avg_commit_per_commit,
            avg_pending_per_commit,
            avg_heartbeat,
        );
    }
}
/// Emits one prep-profile sample to stderr and to `tracing`.
///
/// Does nothing when `rebuild_started` is `None`, which is how callers
/// signal that prep profiling is off. `step_ms` is the time since
/// `step_started`; `total_ms` is the time since `rebuild_started`.
///
/// Millisecond values are converted from `u128` with a saturating
/// `u64::try_from` rather than an `as` cast, so an out-of-range value would
/// clamp to `u64::MAX` instead of silently wrapping.
fn log_lexical_rebuild_prep_profile_step(
    rebuild_started: Option<Instant>,
    step_started: Instant,
    step: &str,
) {
    let Some(rebuild_started) = rebuild_started else {
        return;
    };
    let step_ms = u64::try_from(step_started.elapsed().as_millis()).unwrap_or(u64::MAX);
    let total_ms = u64::try_from(rebuild_started.elapsed().as_millis()).unwrap_or(u64::MAX);
    eprintln!(
        "CASS_PREP_PROFILE step={step} step_ms={} total_ms={}",
        step_ms, total_ms
    );
    tracing::info!(
        component = "main",
        step,
        step_ms,
        total_ms,
        "lexical rebuild prep profile"
    );
}
/// Description of the source database stored inside `LexicalRebuildState`.
///
/// `LexicalRebuildState::matches_run` compares the stored value with the
/// current one to decide whether persisted rebuild state still applies.
/// Deserialization rejects unknown keys (`deny_unknown_fields`).
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
#[serde(deny_unknown_fields)]
struct LexicalRebuildDbState {
// Path of the database, stored as a string.
db_path: String,
total_conversations: usize,
// Defaults to 0 when the key is missing from the serialized form.
#[serde(default)]
total_messages: usize,
// `matches_run` treats fingerprints starting with "content-v1:" differently
// from any other (legacy) fingerprint.
storage_fingerprint: String,
}
/// Progress recorded by `LexicalRebuildState::record_pending_commit` and
/// later promoted into the committed fields by `finalize_commit`.
///
/// Deserialization rejects unknown keys (`deny_unknown_fields`).
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
#[serde(deny_unknown_fields)]
struct PendingLexicalCommit {
// Set by `record_pending_commit` to `processed_conversations` converted to
// i64 (saturating at i64::MAX).
next_offset: i64,
// Defaults to `None` when the key is missing from the serialized form.
#[serde(default)]
next_conversation_id: Option<i64>,
processed_conversations: usize,
indexed_docs: usize,
// NOTE(review): presumably the index meta fingerprint observed before the
// pending commit started — confirm against the caller that supplies it.
base_meta_fingerprint: Option<String>,
}
/// Point-in-time runtime telemetry of the lexical rebuild pipeline.
///
/// Stored in `LexicalRebuildState::runtime`. Every field falls back to its
/// `Default` value when missing from the serialized form (`#[serde(default)]`
/// on the struct), and the all-default value is what
/// `LexicalRebuildState::clear_runtime` resets to.
#[derive(Debug, Clone, Default, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
#[serde(default)]
pub(crate) struct LexicalRebuildPipelineRuntimeSnapshot {
// --- Streaming pipeline gauges ---
pub queue_depth: usize,
pub inflight_message_bytes: usize,
// Not consulted by `is_observed`.
pub max_message_bytes_in_flight: usize,
pub pending_batch_conversations: usize,
pub pending_batch_message_bytes: usize,
pub page_prep_workers: usize,
pub active_page_prep_jobs: usize,
pub ordered_buffered_pages: usize,
pub budget_generation: usize,
pub producer_budget_wait_count: usize,
pub producer_budget_wait_ms: usize,
pub producer_handoff_wait_count: usize,
pub producer_handoff_wait_ms: usize,
// --- Host / process readings; `None` when no value was recorded ---
pub host_loadavg_1m_milli: Option<u32>,
pub host_available_memory_bytes: Option<u64>,
pub process_rss_bytes: Option<u64>,
// --- Controller state ---
pub controller_mode: String,
pub controller_reason: String,
// --- Staged merge ---
pub staged_merge_workers_max: usize,
pub staged_merge_allowed_jobs: usize,
pub staged_merge_active_jobs: usize,
pub staged_merge_ready_artifacts: usize,
pub staged_merge_ready_groups: usize,
pub staged_merge_controller_reason: String,
// --- Staged shard build: controller view ---
pub staged_shard_build_workers_max: usize,
pub staged_shard_build_allowed_jobs: usize,
pub staged_shard_build_active_jobs: usize,
pub staged_shard_build_pending_jobs: usize,
pub staged_shard_build_controller_reason: String,
// --- Staged shard build: telemetry written by
// `apply_staged_shard_build_telemetry_snapshot` ---
pub staged_shard_build_memory_reserve_bytes: usize,
pub staged_shard_build_emergency_memory_reserve_bytes: usize,
pub staged_shard_build_completed_jobs: usize,
pub staged_shard_build_last_shard_index: Option<usize>,
pub staged_shard_build_last_message_bytes: usize,
pub staged_shard_build_last_index_size_bytes: u64,
pub staged_shard_build_last_duration_ms: u64,
pub staged_shard_build_last_amplification_milli: Option<u64>,
pub staged_shard_build_observed_amplification_milli: Option<u64>,
// `is_observed` treats any value greater than 0 as "observed".
pub updated_at_ms: i64,
}
impl LexicalRebuildPipelineRuntimeSnapshot {
    /// Reports whether this snapshot carries any recorded signal.
    ///
    /// A snapshot counts as observed when `updated_at_ms` is positive, any of
    /// the checked counters is non-zero, any optional reading is `Some`, or
    /// any reason/mode string is non-empty. `max_message_bytes_in_flight` is
    /// deliberately left out of the check, matching the existing behavior.
    fn is_observed(&self) -> bool {
        if self.updated_at_ms > 0 {
            return true;
        }

        let usize_counters = [
            self.queue_depth,
            self.inflight_message_bytes,
            self.pending_batch_conversations,
            self.pending_batch_message_bytes,
            self.page_prep_workers,
            self.active_page_prep_jobs,
            self.ordered_buffered_pages,
            self.budget_generation,
            self.producer_budget_wait_count,
            self.producer_budget_wait_ms,
            self.producer_handoff_wait_count,
            self.producer_handoff_wait_ms,
            self.staged_merge_workers_max,
            self.staged_merge_allowed_jobs,
            self.staged_merge_active_jobs,
            self.staged_merge_ready_artifacts,
            self.staged_merge_ready_groups,
            self.staged_shard_build_workers_max,
            self.staged_shard_build_allowed_jobs,
            self.staged_shard_build_active_jobs,
            self.staged_shard_build_pending_jobs,
            self.staged_shard_build_memory_reserve_bytes,
            self.staged_shard_build_emergency_memory_reserve_bytes,
            self.staged_shard_build_completed_jobs,
            self.staged_shard_build_last_message_bytes,
        ];
        if usize_counters.iter().any(|&count| count > 0) {
            return true;
        }

        let u64_counters = [
            self.staged_shard_build_last_index_size_bytes,
            self.staged_shard_build_last_duration_ms,
        ];
        if u64_counters.iter().any(|&count| count > 0) {
            return true;
        }

        let labels = [
            &self.controller_mode,
            &self.controller_reason,
            &self.staged_merge_controller_reason,
            &self.staged_shard_build_controller_reason,
        ];
        if labels.iter().any(|label| !label.is_empty()) {
            return true;
        }

        self.host_loadavg_1m_milli.is_some()
            || self.host_available_memory_bytes.is_some()
            || self.process_rss_bytes.is_some()
            || self.staged_shard_build_last_shard_index.is_some()
            || self.staged_shard_build_last_amplification_milli.is_some()
            || self
                .staged_shard_build_observed_amplification_milli
                .is_some()
    }
}
/// Execution strategy recorded on a lexical rebuild's persisted state.
///
/// Serialized in snake_case (`shared_writer`, `staged_shard_build`);
/// `SharedWriter` is the default.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
#[serde(rename_all = "snake_case")]
enum LexicalRebuildExecutionMode {
// Default mode; `LexicalRebuildState::mark_completed` resets to this.
#[default]
SharedWriter,
// The only mode for which `requires_restart_from_zero_on_resume` is true.
StagedShardBuild,
}
impl LexicalRebuildExecutionMode {
    /// Returns `true` only for `StagedShardBuild`.
    fn requires_restart_from_zero_on_resume(self) -> bool {
        match self {
            Self::StagedShardBuild => true,
            Self::SharedWriter => false,
        }
    }

    /// Returns the snake_case name of the mode.
    fn as_str(self) -> &'static str {
        match self {
            Self::StagedShardBuild => "staged_shard_build",
            Self::SharedWriter => "shared_writer",
        }
    }
}
/// Serde `skip_serializing_if` predicate: true when `mode` is `SharedWriter`,
/// so the default execution mode is omitted from serialized state.
fn lexical_rebuild_execution_mode_is_default(mode: &LexicalRebuildExecutionMode) -> bool {
    matches!(*mode, LexicalRebuildExecutionMode::SharedWriter)
}
/// Persisted checkpoint of a lexical rebuild.
///
/// Holds the committed progress, an optional in-flight `pending` commit, and
/// the latest runtime snapshot. Deserialization rejects unknown keys
/// (`deny_unknown_fields`).
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
#[serde(deny_unknown_fields)]
struct LexicalRebuildState {
// Compared against LEXICAL_REBUILD_STATE_VERSION in `matches_run`.
version: u8,
// Compared against the Tantivy SCHEMA_HASH in `matches_run`.
schema_hash: String,
db: LexicalRebuildDbState,
page_size: i64,
// `finalize_commit` sets this to the pending `processed_conversations`
// converted to i64 (saturating).
committed_offset: i64,
// Defaults to `None` when missing from the serialized form.
#[serde(default)]
committed_conversation_id: Option<i64>,
processed_conversations: usize,
indexed_docs: usize,
committed_meta_fingerprint: Option<String>,
// Set by `record_pending_commit`; consumed by `finalize_commit`.
pending: Option<PendingLexicalCommit>,
completed: bool,
updated_at_ms: i64,
// Omitted from the serialized form when it is the default (`SharedWriter`).
#[serde(
default,
skip_serializing_if = "lexical_rebuild_execution_mode_is_default"
)]
execution_mode: LexicalRebuildExecutionMode,
// Defaults to an empty snapshot when missing from the serialized form.
#[serde(default)]
runtime: LexicalRebuildPipelineRuntimeSnapshot,
}
impl LexicalRebuildState {
    /// Creates fresh, not-yet-started rebuild state for `db`.
    fn new(db: LexicalRebuildDbState, page_size: i64) -> Self {
        let created_at_ms = FrankenStorage::now_millis();
        Self {
            version: LEXICAL_REBUILD_STATE_VERSION,
            schema_hash: crate::search::tantivy::SCHEMA_HASH.to_string(),
            db,
            page_size,
            committed_offset: 0,
            committed_conversation_id: None,
            processed_conversations: 0,
            indexed_docs: 0,
            committed_meta_fingerprint: None,
            pending: None,
            completed: false,
            updated_at_ms: created_at_ms,
            execution_mode: LexicalRebuildExecutionMode::SharedWriter,
            runtime: LexicalRebuildPipelineRuntimeSnapshot::default(),
        }
    }

    /// Decides whether this persisted state belongs to the current run.
    ///
    /// The database comparison uses the content-based matcher only when both
    /// fingerprints carry the `content-v1:` prefix; otherwise the legacy
    /// matcher is used. The caller's `_page_size` is ignored; only the stored
    /// page size is checked for compatibility.
    fn matches_run(&self, db: &LexicalRebuildDbState, _page_size: i64) -> bool {
        const CONTENT_PREFIX: &str = "content-v1:";
        let both_content_fingerprints = self.db.storage_fingerprint.starts_with(CONTENT_PREFIX)
            && db.storage_fingerprint.starts_with(CONTENT_PREFIX);
        let same_database = match both_content_fingerprints {
            true => lexical_rebuild_db_state_matches(&self.db, db),
            false => lexical_rebuild_db_state_matches_legacy(&self.db, db),
        };

        let same_format = self.version == LEXICAL_REBUILD_STATE_VERSION
            && self.schema_hash == crate::search::tantivy::SCHEMA_HASH;
        same_format && same_database && lexical_rebuild_page_size_is_compatible(self.page_size)
    }

    /// Stores an in-flight commit and marks the rebuild as not completed.
    fn record_pending_commit(
        &mut self,
        next_conversation_id: Option<i64>,
        processed_conversations: usize,
        indexed_docs: usize,
        base_meta_fingerprint: Option<String>,
    ) {
        // The offset mirrors the processed count, saturating at i64::MAX.
        let next_offset = i64::try_from(processed_conversations).unwrap_or(i64::MAX);
        self.pending = Some(PendingLexicalCommit {
            next_offset,
            next_conversation_id,
            processed_conversations,
            indexed_docs,
            base_meta_fingerprint,
        });
        self.completed = false;
        self.updated_at_ms = FrankenStorage::now_millis();
    }

    /// Promotes the pending commit (if any) into the committed fields and
    /// records the new meta fingerprint. The rebuild stays not-completed.
    fn finalize_commit(&mut self, committed_meta_fingerprint: Option<String>) {
        if let Some(PendingLexicalCommit {
            next_conversation_id,
            processed_conversations,
            indexed_docs,
            ..
        }) = self.pending.take()
        {
            self.committed_offset = i64::try_from(processed_conversations).unwrap_or(i64::MAX);
            self.committed_conversation_id = next_conversation_id;
            self.processed_conversations = processed_conversations;
            self.indexed_docs = indexed_docs;
        }
        self.committed_meta_fingerprint = committed_meta_fingerprint;
        self.completed = false;
        self.updated_at_ms = FrankenStorage::now_millis();
    }

    /// Drops the pending commit without touching committed progress.
    fn clear_pending(&mut self) {
        self.pending = None;
        self.updated_at_ms = FrankenStorage::now_millis();
    }

    /// Replaces the runtime snapshot with a copy of `runtime`.
    /// Does not update `updated_at_ms`.
    fn set_runtime(&mut self, runtime: &LexicalRebuildPipelineRuntimeSnapshot) {
        self.runtime = runtime.clone();
    }

    /// Resets the runtime snapshot to its default (empty) value.
    fn clear_runtime(&mut self) {
        self.runtime = LexicalRebuildPipelineRuntimeSnapshot::default();
    }

    /// Records the execution mode and refreshes the update timestamp.
    fn set_execution_mode(&mut self, execution_mode: LexicalRebuildExecutionMode) {
        self.execution_mode = execution_mode;
        self.updated_at_ms = FrankenStorage::now_millis();
    }

    /// Marks the rebuild as completed: stores the final fingerprint, drops
    /// pending and runtime data, and resets the execution mode to
    /// `SharedWriter`.
    fn mark_completed(&mut self, committed_meta_fingerprint: Option<String>) {
        self.committed_meta_fingerprint = committed_meta_fingerprint;
        self.pending = None;
        self.clear_runtime();
        self.completed = true;
        self.updated_at_ms = FrankenStorage::now_millis();
        self.execution_mode = LexicalRebuildExecutionMode::SharedWriter;
    }

    /// True until `mark_completed` has been called.
    fn is_incomplete(&self) -> bool {
        !self.completed
    }

    /// Processed-conversation count to report: the pending value when a
    /// commit is in flight, otherwise the committed value.
    fn reported_processed_conversations(&self) -> usize {
        match &self.pending {
            Some(pending) => pending.processed_conversations,
            None => self.processed_conversations,
        }
    }

    /// Conversation id to report: the pending id when one is set, otherwise
    /// the committed id.
    fn reported_committed_conversation_id(&self) -> Option<i64> {
        if let Some(pending) = &self.pending {
            if pending.next_conversation_id.is_some() {
                return pending.next_conversation_id;
            }
        }
        self.committed_conversation_id
    }

    /// Indexed-document count to report: the pending value when a commit is
    /// in flight, otherwise the committed value.
    fn reported_indexed_docs(&self) -> usize {
        match &self.pending {
            Some(pending) => pending.indexed_docs,
            None => self.indexed_docs,
        }
    }
}
/// Takes the exclusive `index-run.lock` file lock inside `data_dir` and
/// returns a guard describing the run.
///
/// The data directory is created if needed and the lock file is opened
/// without truncation. The guard's `job_id` is
/// `<job kind lock value>-<start ms>-<pid>`, and its metadata is written
/// before the guard is returned.
///
/// # Errors
/// Fails when the directory or lock file cannot be created/opened, when
/// another process already holds the lock (`WouldBlock`), when locking fails
/// for any other reason, or when writing the guard metadata fails.
fn acquire_index_run_lock(
    data_dir: &Path,
    db_path: &Path,
    mode: SearchMaintenanceMode,
) -> Result<IndexRunLockGuard> {
    fs::create_dir_all(data_dir)
        .with_context(|| format!("creating cass data directory {}", data_dir.display()))?;

    let lock_path = data_dir.join("index-run.lock");
    let mut open_options = OpenOptions::new();
    open_options
        .create(true)
        .truncate(false)
        .read(true)
        .write(true);
    let file = open_options
        .open(&lock_path)
        .with_context(|| format!("opening index-run lock file {}", lock_path.display()))?;

    match file.try_lock_exclusive() {
        Ok(_) => {}
        Err(err) if err.kind() == std::io::ErrorKind::WouldBlock => {
            anyhow::bail!(
                "another cass index process already holds {}",
                lock_path.display()
            );
        }
        Err(err) => {
            return Err(err)
                .with_context(|| format!("acquiring index-run lock {}", lock_path.display()));
        }
    }

    let acquired_at_ms = FrankenStorage::now_millis();
    let mut guard = IndexRunLockGuard {
        file,
        _path: lock_path,
        started_at_ms: acquired_at_ms,
        updated_at_ms: acquired_at_ms,
        last_progress_at_ms: acquired_at_ms,
        last_progress_at_ms_atomic: Arc::new(AtomicI64::new(acquired_at_ms)),
        db_path: crate::normalize_path_identity(db_path),
        job_id: String::new(),
        job_kind: maintenance_job_kind_for_mode(mode),
        metadata_write_lock: Arc::new(Mutex::new(())),
    };
    // The id is derived from fields of the guard itself, so fill it in after
    // construction.
    let pid = std::process::id();
    guard.job_id = format!(
        "{}-{}-{}",
        guard.job_kind.as_lock_value(),
        guard.started_at_ms,
        pid
    );
    guard.write_metadata(mode)?;
    Ok(guard)
}
/// Location of the rebuild-state sidecar inside the index directory.
fn lexical_rebuild_state_path(index_path: &Path) -> PathBuf {
    let mut sidecar = index_path.to_path_buf();
    sidecar.push(".lexical-rebuild-state.json");
    sidecar
}
/// Location of the rebuild equivalence-evidence sidecar inside the index
/// directory.
fn lexical_rebuild_equivalence_evidence_path(index_path: &Path) -> PathBuf {
    let mut sidecar = index_path.to_path_buf();
    sidecar.push(".lexical-rebuild-equivalence.json");
    sidecar
}
/// Location of the refresh-ledger sidecar inside the index directory.
fn lexical_refresh_ledger_path(index_path: &Path) -> PathBuf {
    let mut sidecar = index_path.to_path_buf();
    sidecar.push(".lexical-refresh-ledger.json");
    sidecar
}
/// Location of the refresh-evidence sidecar inside the index directory.
fn lexical_refresh_evidence_path(index_path: &Path) -> PathBuf {
    let mut sidecar = index_path.to_path_buf();
    sidecar.push(".lexical-refresh-evidence.json");
    sidecar
}
fn persist_lexical_rebuild_equivalence_evidence(
index_path: &Path,
evidence: &LexicalRebuildEquivalenceEvidence,
) -> Result<()> {
let path = lexical_rebuild_equivalence_evidence_path(index_path);
if let Some(parent) = path.parent() {
fs::create_dir_all(parent).with_context(|| {
format!(
"creating lexical rebuild equivalence evidence parent directory {}",
parent.display()
)
})?;
}
write_json_pretty_atomically(&path, evidence).with_context(|| {
format!(
"persisting lexical rebuild equivalence evidence to {}",
path.display()
)
})
}
/// Persists the refresh ledger and its derived evidence sidecar, then logs a
/// comparison against the previously stored evidence when one exists.
///
/// Order of operations:
/// 1. create the ledger's parent directory if the path has one,
/// 2. load the prior evidence sidecar (before it is overwritten),
/// 3. atomically write the ledger,
/// 4. derive the current evidence summary and write it to the sidecar,
/// 5. compare current evidence to the prior evidence and emit the summary.
///
/// The sidecar write is best-effort: a failure is logged as a warning and
/// does not fail the call, because the evidence is derived from the ledger
/// that was already written.
///
/// # Errors
/// Returns an error only when the parent directory cannot be created or the
/// ledger itself cannot be written.
fn persist_lexical_refresh_ledger(index_path: &Path, ledger: &RefreshLedger) -> Result<()> {
    let path = lexical_refresh_ledger_path(index_path);
    if let Some(parent) = path.parent() {
        fs::create_dir_all(parent).with_context(|| {
            format!(
                "creating lexical refresh ledger parent directory {}",
                parent.display()
            )
        })?;
    }
    // Read the previous sidecar first; step 4 below replaces it.
    let prior_evidence = load_prior_refresh_evidence_for_comparison(index_path);
    write_json_pretty_atomically(&path, ledger)
        .with_context(|| format!("persisting lexical refresh ledger to {}", path.display()))?;
    let current_evidence = ledger.evidence_summary();
    // The original line held a mangled token here; the intended argument is a
    // borrow of `current_evidence`, which is used again for the comparison.
    if let Err(err) = persist_lexical_refresh_evidence(index_path, &current_evidence) {
        tracing::warn!(
            target: "cass::indexer::lexical_refresh",
            error = %err,
            "failed to persist .lexical-refresh-evidence.json sidecar; raw ledger \
             persisted OK and evidence can be re-derived offline via \
             RefreshLedger::evidence_summary()"
        );
    }
    if let Some(prior) = prior_evidence {
        let comparison = current_evidence.compare_to(&prior);
        comparison.emit_tracing_summary();
    }
    Ok(())
}
/// Loads the previously persisted refresh-evidence sidecar, if any.
///
/// Returns `None` both when the file cannot be read and when it cannot be
/// parsed; each case is logged at debug level and never treated as an error.
fn load_prior_refresh_evidence_for_comparison(index_path: &Path) -> Option<RefreshLedgerEvidence> {
    let sidecar = lexical_refresh_evidence_path(index_path);

    let contents = fs::read_to_string(&sidecar)
        .map_err(|err| {
            tracing::debug!(
                target: "cass::indexer::lexical_refresh",
                path = %sidecar.display(),
                error = %err,
                "no prior .lexical-refresh-evidence.json sidecar to compare against \
                 (first publish on this data dir, or sidecar pruned)"
            );
        })
        .ok()?;

    serde_json::from_str::<RefreshLedgerEvidence>(&contents)
        .map_err(|err| {
            tracing::debug!(
                target: "cass::indexer::lexical_refresh",
                path = %sidecar.display(),
                error = %err,
                "prior .lexical-refresh-evidence.json sidecar present but unparseable; \
                 skipping cross-run comparison for this publish"
            );
        })
        .ok()
}
fn persist_lexical_refresh_evidence(
index_path: &Path,
evidence: &RefreshLedgerEvidence,
) -> Result<()> {
let path = lexical_refresh_evidence_path(index_path);
if let Some(parent) = path.parent() {
fs::create_dir_all(parent).with_context(|| {
format!(
"creating lexical refresh evidence parent directory {}",
parent.display()
)
})?;
}
write_json_pretty_atomically(&path, evidence)
.with_context(|| format!("persisting lexical refresh evidence to {}", path.display()))
}
/// Inputs consumed by `build_authoritative_lexical_refresh_ledger`.
struct AuthoritativeLexicalRefreshLedgerInput<'a> {
// Stored verbatim under the ledger's "publish_mode" tag.
publish_mode: &'static str,
// Duration reported for the LexicalRebuild phase.
lexical_duration: Duration,
// Duration reported for the Publish phase.
publish_duration: Duration,
processed_conversations: usize,
// `total - processed` (saturating) is reported as skipped items.
total_conversations: usize,
final_observed_messages: usize,
indexed_docs: usize,
// Supplies the golden-query digest and the probe count.
equivalence_evidence: &'a LexicalRebuildEquivalenceEvidence,
}
fn build_authoritative_lexical_refresh_ledger(
input: AuthoritativeLexicalRefreshLedgerInput<'_>,
) -> RefreshLedger {
let AuthoritativeLexicalRefreshLedgerInput {
publish_mode,
lexical_duration,
publish_duration,
processed_conversations,
total_conversations,
final_observed_messages,
indexed_docs,
equivalence_evidence,
} = input;
let lexical_duration_ms = u64::try_from(lexical_duration.as_millis()).unwrap_or(u64::MAX);
let publish_duration_ms = u64::try_from(publish_duration.as_millis()).unwrap_or(u64::MAX);
let total_duration_ms = lexical_duration_ms.saturating_add(publish_duration_ms);
let completed_at_ms = FrankenStorage::now_millis();
let started_at_ms =
completed_at_ms.saturating_sub(i64::try_from(total_duration_ms).unwrap_or(i64::MAX));
let processed_conversations_u64 = u64::try_from(processed_conversations).unwrap_or(u64::MAX);
let total_conversations_u64 = u64::try_from(total_conversations).unwrap_or(u64::MAX);
let final_observed_messages_u64 = u64::try_from(final_observed_messages).unwrap_or(u64::MAX);
let indexed_docs_u64 = u64::try_from(indexed_docs).unwrap_or(u64::MAX);
let equivalence_probe_count =
u64::try_from(equivalence_evidence.golden_query_hit_counts.len()).unwrap_or(u64::MAX);
RefreshLedger {
version: 1,
started_at_ms,
completed_at_ms,
total_duration_ms,
full_rebuild: true,
corpus_family: "authoritative_canonical_packet_replay".to_string(),
phases: vec![
PhaseRecord {
phase: RefreshPhase::LexicalRebuild,
duration_ms: lexical_duration_ms,
items_processed: processed_conversations_u64,
items_skipped: total_conversations_u64.saturating_sub(processed_conversations_u64),
errors: 0,
counters: BTreeMap::from([
("indexed_docs".to_string(), indexed_docs_u64),
("observed_messages".to_string(), final_observed_messages_u64),
("total_conversations".to_string(), total_conversations_u64),
(
"equivalence_probe_count".to_string(),
equivalence_probe_count,
),
]),
success: true,
error_message: None,
},
PhaseRecord {
phase: RefreshPhase::Publish,
duration_ms: publish_duration_ms,
items_processed: 1,
items_skipped: 0,
errors: 0,
counters: BTreeMap::from([
("indexed_docs".to_string(), indexed_docs_u64),
("observed_messages".to_string(), final_observed_messages_u64),
("published_generations".to_string(), 1),
]),
success: true,
error_message: None,
},
],
equivalence: RefreshEquivalenceArtifacts {
conversation_count: total_conversations_u64,
message_count: final_observed_messages_u64,
lexical_doc_count: indexed_docs_u64,
lexical_fingerprint: None,
semantic_manifest_fingerprint: None,
search_hit_digest: Some(equivalence_evidence.golden_query_digest.clone()),
peak_rss_bytes: None,
db_size_bytes: None,
lexical_index_size_bytes: None,
},
tags: BTreeMap::from([
("dataflow".to_string(), "conversation_packet".to_string()),
("publish_mode".to_string(), publish_mode.to_string()),
("source".to_string(), "canonical_db".to_string()),
]),
}
}
/// Logs an info-level summary of a published refresh ledger, including the
/// readiness milestones derived from it. A missing failed phase is logged as
/// an empty string.
fn log_lexical_refresh_ledger_published(ledger: &RefreshLedger) {
    let readiness = ledger.readiness_milestones();
    tracing::info!(
        corpus_family = ledger.corpus_family.as_str(),
        total_duration_ms = ledger.total_duration_ms,
        time_to_lexical_ready_ms = readiness.time_to_lexical_ready_ms,
        time_to_search_ready_ms = readiness.time_to_search_ready_ms,
        time_to_full_settled_ms = readiness.time_to_full_settled_ms,
        failed_phase = readiness.failed_phase.as_deref().unwrap_or(""),
        search_readiness_state = ?readiness.search_readiness_state,
        "lexical refresh ledger published"
    );
}
fn build_lexical_rebuild_generation_manifest(
source_db_fingerprint: &str,
total_conversations: usize,
final_observed_messages: usize,
indexed_docs: usize,
equivalence_evidence: &LexicalRebuildEquivalenceEvidence,
) -> lexical_generation::LexicalGenerationManifest {
let manifest_now_ms = lexical_generation::now_ms();
let generation_fingerprint_head = source_db_fingerprint
.get(..16)
.unwrap_or(source_db_fingerprint);
let generation_id = format!("gen-{manifest_now_ms:016x}-{generation_fingerprint_head}");
let attempt_id = format!("attempt-{manifest_now_ms:016x}");
let mut generation_manifest = lexical_generation::LexicalGenerationManifest::new_scratch(
generation_id,
attempt_id,
source_db_fingerprint.to_string(),
manifest_now_ms,
);
generation_manifest.conversation_count = u64::try_from(total_conversations).unwrap_or(u64::MAX);
generation_manifest.message_count = u64::try_from(final_observed_messages).unwrap_or(u64::MAX);
generation_manifest.indexed_doc_count = u64::try_from(indexed_docs).unwrap_or(u64::MAX);
generation_manifest.equivalence_manifest_fingerprint =
Some(equivalence_evidence.manifest_fingerprint.clone());
generation_manifest.transition_build(
lexical_generation::LexicalGenerationBuildState::Built,
manifest_now_ms,
);
generation_manifest.transition_build(
lexical_generation::LexicalGenerationBuildState::Validated,
manifest_now_ms,
);
generation_manifest.transition_publish(
lexical_generation::LexicalGenerationPublishState::Published,
manifest_now_ms,
);
generation_manifest
}
/// Logs an info-level summary of a published generation manifest together
/// with the fingerprint of the equivalence evidence it was validated against.
fn log_lexical_generation_manifest_published(
    generation_manifest: &lexical_generation::LexicalGenerationManifest,
    equivalence_evidence: &LexicalRebuildEquivalenceEvidence,
) {
    let manifest = generation_manifest;
    tracing::info!(
        generation_id = manifest.generation_id.as_str(),
        attempt_id = manifest.attempt_id.as_str(),
        conversation_count = manifest.conversation_count,
        message_count = manifest.message_count,
        indexed_doc_count = manifest.indexed_doc_count,
        source_db_fingerprint = manifest.source_db_fingerprint.as_str(),
        equivalence_manifest_fingerprint = equivalence_evidence.manifest_fingerprint.as_str(),
        "lexical generation manifest published"
    );
}
/// Persists the artifacts that accompany a published generation: the
/// equivalence-evidence sidecar and the generation manifest.
///
/// Logs the equivalence evidence first, writes the sidecar, then builds and
/// stores the manifest, which is returned to the caller.
///
/// # Errors
/// Fails when either the evidence sidecar or the manifest cannot be written.
fn persist_lexical_rebuild_generation_artifacts(
    generation_dir: &Path,
    source_db_fingerprint: &str,
    processed_conversations: usize,
    total_conversations: usize,
    final_observed_messages: usize,
    indexed_docs: usize,
    equivalence_evidence: &LexicalRebuildEquivalenceEvidence,
) -> Result<lexical_generation::LexicalGenerationManifest> {
    let evidence = equivalence_evidence;
    tracing::info!(
        document_count = evidence.document_count,
        manifest_fingerprint = evidence.manifest_fingerprint.as_str(),
        golden_query_digest = evidence.golden_query_digest.as_str(),
        golden_probe_count = evidence.golden_query_hit_counts.len(),
        golden_query_hit_total = evidence
            .golden_query_hit_counts
            .iter()
            .map(|hit| hit.hit_count)
            .sum::<u64>(),
        indexed_docs,
        total_conversations,
        processed_conversations,
        "lexical rebuild authoritative equivalence evidence"
    );

    persist_lexical_rebuild_equivalence_evidence(generation_dir, evidence)?;

    let manifest = build_lexical_rebuild_generation_manifest(
        source_db_fingerprint,
        total_conversations,
        final_observed_messages,
        indexed_docs,
        evidence,
    );
    let stored = lexical_generation::store_manifest(generation_dir, &manifest);
    stored.with_context(|| {
        format!(
            "persisting lexical generation manifest for published generation {} at {}",
            manifest.generation_id,
            generation_dir.display()
        )
    })?;
    Ok(manifest)
}
/// Steady-state commit interval in conversations.
///
/// Read from `CASS_TANTIVY_REBUILD_COMMIT_EVERY_CONVERSATIONS`; falls back
/// to 10 000 when the variable is unset, unparseable, or zero.
fn lexical_rebuild_commit_interval_conversations() -> usize {
    const FALLBACK: usize = 10_000;
    match dotenvy::var("CASS_TANTIVY_REBUILD_COMMIT_EVERY_CONVERSATIONS") {
        Ok(raw) => match raw.parse::<usize>() {
            Ok(parsed) if parsed > 0 => parsed,
            _ => FALLBACK,
        },
        Err(_) => FALLBACK,
    }
}
/// Commit interval in conversations used before any progress is recorded.
///
/// Read from `CASS_TANTIVY_REBUILD_INITIAL_COMMIT_EVERY_CONVERSATIONS`;
/// falls back to 2 048 when the variable is unset, unparseable, or zero.
fn lexical_rebuild_initial_commit_interval_conversations() -> usize {
    const FALLBACK: usize = 2_048;
    match dotenvy::var("CASS_TANTIVY_REBUILD_INITIAL_COMMIT_EVERY_CONVERSATIONS") {
        Ok(raw) => match raw.parse::<usize>() {
            Ok(parsed) if parsed > 0 => parsed,
            _ => FALLBACK,
        },
        Err(_) => FALLBACK,
    }
}
/// Steady-state commit interval in messages.
///
/// Read from `CASS_TANTIVY_REBUILD_COMMIT_EVERY_MESSAGES`; falls back to
/// 800 000 when the variable is unset, unparseable, or zero.
fn lexical_rebuild_commit_interval_messages() -> usize {
    const FALLBACK: usize = 800_000;
    match dotenvy::var("CASS_TANTIVY_REBUILD_COMMIT_EVERY_MESSAGES") {
        Ok(raw) => match raw.parse::<usize>() {
            Ok(parsed) if parsed > 0 => parsed,
            _ => FALLBACK,
        },
        Err(_) => FALLBACK,
    }
}
/// Commit interval in messages used before any progress is recorded.
///
/// Read from `CASS_TANTIVY_REBUILD_INITIAL_COMMIT_EVERY_MESSAGES`; falls
/// back to 800 000 when the variable is unset, unparseable, or zero.
fn lexical_rebuild_initial_commit_interval_messages() -> usize {
    const FALLBACK: usize = 800_000;
    match dotenvy::var("CASS_TANTIVY_REBUILD_INITIAL_COMMIT_EVERY_MESSAGES") {
        Ok(raw) => match raw.parse::<usize>() {
            Ok(parsed) if parsed > 0 => parsed,
            _ => FALLBACK,
        },
        Err(_) => FALLBACK,
    }
}
/// Steady-state commit interval in message bytes.
///
/// Read from `CASS_TANTIVY_REBUILD_COMMIT_EVERY_MESSAGE_BYTES`; falls back
/// to 512 MiB when the variable is unset, unparseable, or zero.
fn lexical_rebuild_commit_interval_message_bytes() -> usize {
    const FALLBACK: usize = 512 * 1024 * 1024;
    match dotenvy::var("CASS_TANTIVY_REBUILD_COMMIT_EVERY_MESSAGE_BYTES") {
        Ok(raw) => match raw.parse::<usize>() {
            Ok(parsed) if parsed > 0 => parsed,
            _ => FALLBACK,
        },
        Err(_) => FALLBACK,
    }
}
/// Commit interval in message bytes used before any progress is recorded.
///
/// Read from `CASS_TANTIVY_REBUILD_INITIAL_COMMIT_EVERY_MESSAGE_BYTES`;
/// falls back to 128 MiB when the variable is unset, unparseable, or zero.
fn lexical_rebuild_initial_commit_interval_message_bytes() -> usize {
    const FALLBACK: usize = 128 * 1024 * 1024;
    match dotenvy::var("CASS_TANTIVY_REBUILD_INITIAL_COMMIT_EVERY_MESSAGE_BYTES") {
        Ok(raw) => match raw.parse::<usize>() {
            Ok(parsed) if parsed > 0 => parsed,
            _ => FALLBACK,
        },
        Err(_) => FALLBACK,
    }
}
/// Chooses the `(conversations, messages, message_bytes)` commit intervals
/// for the given rebuild state.
///
/// Once any conversations have been processed the steady-state intervals
/// apply. Before that, each initial interval is used, capped at its
/// steady-state counterpart.
fn lexical_rebuild_commit_intervals_for_state(
    rebuild_state: &LexicalRebuildState,
) -> (usize, usize, usize) {
    let steady = (
        lexical_rebuild_commit_interval_conversations(),
        lexical_rebuild_commit_interval_messages(),
        lexical_rebuild_commit_interval_message_bytes(),
    );
    if rebuild_state.processed_conversations > 0 {
        steady
    } else {
        let (steady_conversations, steady_messages, steady_message_bytes) = steady;
        (
            lexical_rebuild_initial_commit_interval_conversations().min(steady_conversations),
            lexical_rebuild_initial_commit_interval_messages().min(steady_messages),
            lexical_rebuild_initial_commit_interval_message_bytes().min(steady_message_bytes),
        )
    }
}
/// Progress-heartbeat interval in conversations.
///
/// Read from `CASS_TANTIVY_REBUILD_PROGRESS_HEARTBEAT_EVERY_CONVERSATIONS`;
/// falls back to 2 000 when the variable is unset, unparseable, or zero.
fn lexical_rebuild_progress_heartbeat_interval_conversations() -> usize {
    const FALLBACK: usize = 2_000;
    match dotenvy::var("CASS_TANTIVY_REBUILD_PROGRESS_HEARTBEAT_EVERY_CONVERSATIONS") {
        Ok(raw) => match raw.parse::<usize>() {
            Ok(parsed) if parsed > 0 => parsed,
            _ => FALLBACK,
        },
        Err(_) => FALLBACK,
    }
}
/// Progress-heartbeat interval as a wall-clock duration.
///
/// Read in milliseconds from
/// `CASS_TANTIVY_REBUILD_PROGRESS_HEARTBEAT_EVERY_MS`; falls back to
/// 10 000 ms when the variable is unset, unparseable, or zero.
fn lexical_rebuild_progress_heartbeat_interval() -> Duration {
    const FALLBACK_MS: u64 = 10_000;
    let interval_ms = match dotenvy::var("CASS_TANTIVY_REBUILD_PROGRESS_HEARTBEAT_EVERY_MS") {
        Ok(raw) => match raw.parse::<u64>() {
            Ok(parsed) if parsed > 0 => parsed,
            _ => FALLBACK_MS,
        },
        Err(_) => FALLBACK_MS,
    };
    Duration::from_millis(interval_ms)
}
/// Number of conversations fetched per batch, never more than the page size.
///
/// A positive `CASS_TANTIVY_REBUILD_BATCH_FETCH_CONVERSATIONS` wins;
/// otherwise the default derived from the page size and the Tantivy writer
/// parallelism hint is used. The result is capped at `page_size` (treated as
/// at least 1).
fn lexical_rebuild_batch_fetch_conversation_limit(page_size: i64) -> usize {
    let configured = dotenvy::var("CASS_TANTIVY_REBUILD_BATCH_FETCH_CONVERSATIONS")
        .ok()
        .and_then(|raw| raw.parse::<usize>().ok())
        .filter(|limit| *limit > 0);
    let uncapped = match configured {
        Some(limit) => limit,
        None => lexical_rebuild_default_batch_fetch_conversation_limit(
            page_size,
            crate::search::tantivy::tantivy_writer_parallelism_hint(),
        ),
    };
    let page_cap = usize::try_from(page_size.max(1)).unwrap_or(usize::MAX);
    uncapped.min(page_cap)
}
/// Default batch fetch size: the larger of 512 and 128 per writer thread
/// (at least one thread), capped at the page size (at least 1).
fn lexical_rebuild_default_batch_fetch_conversation_limit(
    page_size: i64,
    tantivy_writer_parallelism: usize,
) -> usize {
    // Non-positive page sizes count as 1; values beyond usize saturate.
    let page_cap = if page_size < 1 {
        1
    } else {
        usize::try_from(page_size).unwrap_or(usize::MAX)
    };
    let writer_scaled = tantivy_writer_parallelism.max(1).saturating_mul(128);
    let preferred = std::cmp::max(512, writer_scaled);
    std::cmp::min(preferred, page_cap)
}
/// Startup batch-fetch size, never larger than `default_limit`.
///
/// Reads `CASS_TANTIVY_REBUILD_INITIAL_BATCH_FETCH_CONVERSATIONS`; a missing,
/// unparsable, or zero value falls back to 32. The chosen value is capped at
/// `default_limit` (treated as at least 1).
fn lexical_rebuild_initial_batch_fetch_conversation_limit(default_limit: usize) -> usize {
    let ceiling = default_limit.max(1);
    let requested = dotenvy::var("CASS_TANTIVY_REBUILD_INITIAL_BATCH_FETCH_CONVERSATIONS")
        .ok()
        .and_then(|raw| raw.parse::<usize>().ok())
        .filter(|limit| *limit > 0)
        .unwrap_or(32);
    requested.min(ceiling)
}
/// Parallelism reported by the standard library, or 1 when it cannot be
/// determined. The result is always at least 1.
fn lexical_rebuild_available_parallelism() -> usize {
    match std::thread::available_parallelism() {
        Ok(cores) => cores.get().max(1),
        Err(_) => 1,
    }
}
/// Default number of cores held back from the rebuild for a given core count.
///
/// 0–1 cores reserve nothing, 2–4 reserve one, 5–15 reserve two, and larger
/// hosts reserve one eighth of their cores, clamped to the range 2..=8.
fn lexical_rebuild_default_reserved_cores_for_available(available_parallelism: usize) -> usize {
    if available_parallelism <= 1 {
        0
    } else if available_parallelism <= 4 {
        1
    } else if available_parallelism <= 15 {
        2
    } else {
        (available_parallelism / 8).clamp(2, 8)
    }
}
/// Cores to hold back from the rebuild, honouring the environment override.
///
/// Any parsable `CASS_TANTIVY_REBUILD_RESERVED_CORES` value (including 0) is
/// used; otherwise the default for `available_parallelism` applies. The result
/// is capped at `available_parallelism - 1` (saturating).
fn lexical_rebuild_reserved_cores_for_available(available_parallelism: usize) -> usize {
    let ceiling = available_parallelism.saturating_sub(1);
    let explicit = dotenvy::var("CASS_TANTIVY_REBUILD_RESERVED_CORES")
        .ok()
        .and_then(|raw| raw.parse::<usize>().ok());
    let requested = match explicit {
        Some(reserved) => reserved,
        None => lexical_rebuild_default_reserved_cores_for_available(available_parallelism),
    };
    requested.min(ceiling)
}
/// Worker count left after subtracting `reserved_cores` from
/// `available_parallelism` (treated as at least 1), bounded to 1..=64.
fn lexical_rebuild_worker_parallelism_for_available_and_reserved(
    available_parallelism: usize,
    reserved_cores: usize,
) -> usize {
    let usable = available_parallelism
        .max(1)
        .saturating_sub(reserved_cores);
    usable.max(1).min(64)
}
/// Test-only helper: worker count for `available_parallelism` using the
/// default reserved-core policy (no environment override).
#[cfg(test)]
fn lexical_rebuild_default_worker_parallelism_for_available(available_parallelism: usize) -> usize {
    let reserved_cores =
        lexical_rebuild_default_reserved_cores_for_available(available_parallelism);
    lexical_rebuild_worker_parallelism_for_available_and_reserved(
        available_parallelism,
        reserved_cores,
    )
}
/// Worker count for `available_parallelism` using the configured
/// reserved-core count (environment override or default).
fn lexical_rebuild_configured_worker_parallelism_for_available(
    available_parallelism: usize,
) -> usize {
    let reserved_cores = lexical_rebuild_reserved_cores_for_available(available_parallelism);
    lexical_rebuild_worker_parallelism_for_available_and_reserved(
        available_parallelism,
        reserved_cores,
    )
}
/// Number of rebuild workers to use.
///
/// A positive `CASS_TANTIVY_REBUILD_WORKERS` value is taken as-is; otherwise
/// the count is derived from the host's available parallelism and the
/// configured reserved cores.
fn lexical_rebuild_worker_parallelism() -> usize {
    let explicit = dotenvy::var("CASS_TANTIVY_REBUILD_WORKERS")
        .ok()
        .and_then(|raw| raw.parse::<usize>().ok())
        .filter(|workers| *workers > 0);
    match explicit {
        Some(workers) => workers,
        None => lexical_rebuild_configured_worker_parallelism_for_available(
            lexical_rebuild_available_parallelism(),
        ),
    }
}
/// Builds the rayon pool used by rebuild workers.
///
/// Returns `Ok(None)` when the effective worker count is 1 or less, so no
/// dedicated pool is created.
///
/// # Errors
///
/// Returns an error, with the requested thread count as context, when the
/// pool cannot be built.
fn build_lexical_rebuild_worker_pool() -> Result<Option<ThreadPool>> {
    let requested = lexical_rebuild_worker_parallelism();
    let parallelism = responsiveness::effective_worker_count(requested).max(1);
    if parallelism > 1 {
        let pool = ThreadPoolBuilder::new()
            .num_threads(parallelism)
            .thread_name(|idx| format!("cass-lexical-rebuild-{idx}"))
            .build()
            .map_err(anyhow::Error::new)
            .with_context(|| {
                format!("building lexical rebuild worker pool with {parallelism} threads")
            })?;
        Ok(Some(pool))
    } else {
        Ok(None)
    }
}
/// Serializable snapshot of the lexical rebuild pipeline's tuning values, as
/// assembled by `lexical_rebuild_pipeline_settings_snapshot_inner`.
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize)]
pub(crate) struct LexicalRebuildPipelineSettingsSnapshot {
/// Result of `lexical_rebuild_worker_parallelism()`.
pub workers: usize,
/// Host parallelism used as the basis for the derived values.
pub available_parallelism: usize,
/// Cores held back from the rebuild.
pub reserved_cores: usize,
/// Tantivy writer thread count (governed or raw, depending on the snapshot kind).
pub tantivy_writer_threads: usize,
pub staged_shard_builders: usize,
pub staged_merge_workers: usize,
/// String form of the responsiveness policy (`auto`, `steady`, `conservative`).
pub controller_mode: String,
pub controller_restore_clear_samples: usize,
pub controller_restore_hold_ms: u64,
/// 1-minute load average thresholds, in thousandths; `None` when not configured.
pub controller_loadavg_high_watermark_1m_milli: Option<u32>,
pub controller_loadavg_low_watermark_1m_milli: Option<u32>,
pub page_size: i64,
pub steady_batch_fetch_conversations: usize,
pub startup_batch_fetch_conversations: usize,
pub steady_commit_every_conversations: usize,
pub startup_commit_every_conversations: usize,
pub steady_commit_every_messages: usize,
pub startup_commit_every_messages: usize,
pub steady_commit_every_message_bytes: usize,
pub startup_commit_every_message_bytes: usize,
pub pipeline_channel_size: usize,
pub page_prep_workers: usize,
pub pipeline_max_message_bytes_in_flight: usize,
}
/// Policy that decides whether the responsiveness controller may change
/// budgets at runtime. Serialized with snake_case variant names.
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize)]
#[serde(rename_all = "snake_case")]
enum LexicalRebuildResponsivenessPolicy {
/// The controller reacts to runtime observations.
Auto,
/// The controller is pinned to the steady budget.
Steady,
/// The controller is pinned to the startup (conservative) budget.
Conservative,
}
impl LexicalRebuildResponsivenessPolicy {
fn as_str(self) -> &'static str {
match self {
Self::Auto => "auto",
Self::Steady => "steady",
Self::Conservative => "conservative",
}
}
}
/// Resolves the controller policy from the environment.
///
/// When `CASS_TANTIVY_REBUILD_CONTROLLER_MODE` is set, its trimmed,
/// lowercased value selects the policy; any unrecognised value (including an
/// empty one) maps to `Auto`. When the variable is not readable, the policy is
/// `Steady` if responsiveness is disabled via the environment and `Auto`
/// otherwise.
fn lexical_rebuild_responsiveness_policy() -> LexicalRebuildResponsivenessPolicy {
    match dotenvy::var("CASS_TANTIVY_REBUILD_CONTROLLER_MODE") {
        Ok(raw) => match raw.trim().to_ascii_lowercase().as_str() {
            "steady" | "pinned_steady" | "off" | "disabled" => {
                LexicalRebuildResponsivenessPolicy::Steady
            }
            "conservative" | "startup" | "pinned_conservative" => {
                LexicalRebuildResponsivenessPolicy::Conservative
            }
            _ => LexicalRebuildResponsivenessPolicy::Auto,
        },
        Err(_) => {
            if responsiveness::disabled_via_env() {
                LexicalRebuildResponsivenessPolicy::Steady
            } else {
                LexicalRebuildResponsivenessPolicy::Auto
            }
        }
    }
}
/// Number of clear samples the controller requires before it restores the
/// steady budget.
///
/// Reads `CASS_TANTIVY_REBUILD_CONTROLLER_RESTORE_CLEAR_SAMPLES`; a missing,
/// unparsable, or zero value falls back to 3.
fn lexical_rebuild_controller_restore_clear_samples() -> usize {
    const DEFAULT_CLEAR_SAMPLES: usize = 3;
    match dotenvy::var("CASS_TANTIVY_REBUILD_CONTROLLER_RESTORE_CLEAR_SAMPLES") {
        Ok(raw) => match raw.parse::<usize>() {
            Ok(samples) if samples > 0 => samples,
            _ => DEFAULT_CLEAR_SAMPLES,
        },
        Err(_) => DEFAULT_CLEAR_SAMPLES,
    }
}
/// Minimum time the controller stays on the conservative budget after a
/// pressure demotion.
///
/// Reads `CASS_TANTIVY_REBUILD_CONTROLLER_RESTORE_HOLD_MS` (milliseconds); a
/// missing, unparsable, or zero value falls back to 5,000 ms.
fn lexical_rebuild_controller_restore_hold() -> Duration {
    let hold_millis = dotenvy::var("CASS_TANTIVY_REBUILD_CONTROLLER_RESTORE_HOLD_MS")
        .ok()
        .and_then(|raw| raw.parse::<u64>().ok())
        .filter(|millis| *millis > 0)
        .unwrap_or(5_000);
    Duration::from_millis(hold_millis)
}
/// Parses the first whitespace-separated token of `contents` as a load
/// average and returns it in thousandths, rounded to the nearest integer.
///
/// Returns `None` when there is no token, the token is not a number, the
/// number is negative or non-finite, or the scaled value does not fit in `u32`.
fn parse_lexical_rebuild_loadavg_1m_milli(contents: &str) -> Option<u32> {
    let load = contents.split_whitespace().next()?.parse::<f64>().ok()?;
    if load.is_finite() && load >= 0.0 {
        let scaled = (load * 1000.0).round();
        let upper = f64::from(u32::MAX);
        if scaled >= 0.0 && scaled <= upper {
            Some(scaled as u32)
        } else {
            None
        }
    } else {
        None
    }
}
/// Reads the host's 1-minute load average from `/proc/loadavg`, in
/// thousandths. Returns `None` when the file cannot be read or parsed.
#[cfg(target_os = "linux")]
fn lexical_rebuild_host_loadavg_1m_milli() -> Option<u32> {
    let contents = fs::read_to_string("/proc/loadavg").ok()?;
    parse_lexical_rebuild_loadavg_1m_milli(&contents)
}
/// Non-Linux build of the host load average probe: no reading is taken, so
/// this always returns `None`.
#[cfg(not(target_os = "linux"))]
fn lexical_rebuild_host_loadavg_1m_milli() -> Option<u32> {
None
}
/// Default high watermark for the 1-minute load average, in thousandths.
///
/// Returns `None` on non-Linux targets or when `available_parallelism` is 1
/// or less. Otherwise the watermark is
/// `available_parallelism - (max(reserved_cores, 1) - 1)` slots (at least 1),
/// multiplied by 1000; `None` if that does not fit in `u32`.
fn lexical_rebuild_default_controller_loadavg_high_watermark_1m_milli_for_available_and_reserved(
    available_parallelism: usize,
    reserved_cores: usize,
) -> Option<u32> {
    if cfg!(target_os = "linux") && available_parallelism > 1 {
        // The first reserved core does not lower the watermark; only the
        // cores beyond it do.
        let extra_reserved = reserved_cores.max(1).saturating_sub(1);
        let slots = available_parallelism.saturating_sub(extra_reserved).max(1);
        let milli = slots.saturating_mul(1000);
        u32::try_from(milli).ok()
    } else {
        None
    }
}
/// Reads the environment variable `var` and parses it as a load average in
/// thousandths. Returns `None` when the variable is unreadable or unparsable.
fn parse_lexical_rebuild_loadavg_override_milli(var: &str) -> Option<u32> {
    let raw = dotenvy::var(var).ok()?;
    parse_lexical_rebuild_loadavg_1m_milli(&raw)
}
/// Effective high watermark for the 1-minute load average, in thousandths.
///
/// Precedence: a parsable
/// `CASS_TANTIVY_REBUILD_CONTROLLER_LOADAVG_HIGH_WATERMARK_1M` override wins;
/// otherwise `None` when responsiveness is disabled via the environment;
/// otherwise the default derived from the core counts.
fn lexical_rebuild_controller_loadavg_high_watermark_1m_milli_for_available_and_reserved(
    available_parallelism: usize,
    reserved_cores: usize,
) -> Option<u32> {
    if let Some(explicit) = parse_lexical_rebuild_loadavg_override_milli(
        "CASS_TANTIVY_REBUILD_CONTROLLER_LOADAVG_HIGH_WATERMARK_1M",
    ) {
        return Some(explicit);
    }
    if responsiveness::disabled_via_env() {
        return None;
    }
    lexical_rebuild_default_controller_loadavg_high_watermark_1m_milli_for_available_and_reserved(
        available_parallelism,
        reserved_cores,
    )
}
/// Effective low watermark for the 1-minute load average, in thousandths.
///
/// A parsable `CASS_TANTIVY_REBUILD_CONTROLLER_LOADAVG_LOW_WATERMARK_1M`
/// override wins; otherwise the low watermark is the high watermark minus
/// 1000 (saturating), or `None` when there is no high watermark.
fn lexical_rebuild_controller_loadavg_low_watermark_1m_milli_from_high(
    high_watermark_1m_milli: Option<u32>,
) -> Option<u32> {
    let explicit = parse_lexical_rebuild_loadavg_override_milli(
        "CASS_TANTIVY_REBUILD_CONTROLLER_LOADAVG_LOW_WATERMARK_1M",
    );
    match explicit {
        Some(low) => Some(low),
        None => high_watermark_1m_milli.map(|high| high.saturating_sub(1000)),
    }
}
/// Settings snapshot with the responsiveness governor applied to the worker
/// counts (delegates to the inner builder with `true`).
pub(crate) fn lexical_rebuild_pipeline_settings_snapshot() -> LexicalRebuildPipelineSettingsSnapshot
{
lexical_rebuild_pipeline_settings_snapshot_inner(true)
}
/// Settings snapshot that reports the configured worker counts without
/// applying the responsiveness governor to them (delegates to the inner
/// builder with `false`).
pub(crate) fn lexical_rebuild_pipeline_settings_snapshot_passive()
-> LexicalRebuildPipelineSettingsSnapshot {
lexical_rebuild_pipeline_settings_snapshot_inner(false)
}
/// Collects the rebuild pipeline's tuning values into one snapshot.
///
/// When `apply_responsiveness_governor` is `true`, the Tantivy writer thread
/// count is passed through `responsiveness::effective_worker_count` and the
/// staged shard builder, staged merge worker, and page prep worker counts come
/// from their governed helpers. When `false`, the raw writer count and the
/// `_configured` helpers are used instead.
fn lexical_rebuild_pipeline_settings_snapshot_inner(
apply_responsiveness_governor: bool,
) -> LexicalRebuildPipelineSettingsSnapshot {
// Batch-fetch sizes: the startup size is capped by the steady size.
let steady_batch_fetch_conversations =
lexical_rebuild_batch_fetch_conversation_limit(LEXICAL_REBUILD_PAGE_SIZE);
let startup_batch_fetch_conversations =
lexical_rebuild_initial_batch_fetch_conversation_limit(steady_batch_fetch_conversations);
// Commit intervals: each startup interval is capped by its steady counterpart.
let steady_commit_every_conversations = lexical_rebuild_commit_interval_conversations();
let startup_commit_every_conversations =
lexical_rebuild_initial_commit_interval_conversations()
.min(steady_commit_every_conversations);
let steady_commit_every_messages = lexical_rebuild_commit_interval_messages();
let startup_commit_every_messages =
lexical_rebuild_initial_commit_interval_messages().min(steady_commit_every_messages);
let steady_commit_every_message_bytes = lexical_rebuild_commit_interval_message_bytes();
let startup_commit_every_message_bytes =
lexical_rebuild_initial_commit_interval_message_bytes()
.min(steady_commit_every_message_bytes);
let pipeline_channel_size = lexical_rebuild_pipeline_channel_size();
// The in-flight byte ceiling is derived from the startup commit byte interval.
let pipeline_max_message_bytes_in_flight = lexical_rebuild_pipeline_max_message_bytes_in_flight(
startup_commit_every_message_bytes,
pipeline_channel_size,
);
let available_parallelism = lexical_rebuild_available_parallelism();
let reserved_cores = lexical_rebuild_reserved_cores_for_available(available_parallelism);
let tantivy_writer_threads_raw =
crate::search::tantivy::tantivy_writer_parallelism_hint_for_available(
available_parallelism,
);
let tantivy_writer_threads = if apply_responsiveness_governor {
responsiveness::effective_worker_count(tantivy_writer_threads_raw).max(1)
} else {
tantivy_writer_threads_raw
};
let controller_restore_hold = lexical_rebuild_controller_restore_hold();
// The low watermark is derived from the resolved high watermark unless overridden.
let controller_loadavg_high_watermark_1m_milli =
lexical_rebuild_controller_loadavg_high_watermark_1m_milli_for_available_and_reserved(
available_parallelism,
reserved_cores,
);
let controller_loadavg_low_watermark_1m_milli =
lexical_rebuild_controller_loadavg_low_watermark_1m_milli_from_high(
controller_loadavg_high_watermark_1m_milli,
);
LexicalRebuildPipelineSettingsSnapshot {
workers: lexical_rebuild_worker_parallelism(),
available_parallelism,
reserved_cores,
tantivy_writer_threads,
staged_shard_builders: if apply_responsiveness_governor {
lexical_rebuild_staged_shard_builder_parallelism()
} else {
lexical_rebuild_staged_shard_builder_parallelism_configured()
},
staged_merge_workers: if apply_responsiveness_governor {
lexical_rebuild_staged_merge_worker_parallelism()
} else {
lexical_rebuild_staged_merge_worker_parallelism_configured()
},
controller_mode: lexical_rebuild_responsiveness_policy().as_str().to_string(),
controller_restore_clear_samples: lexical_rebuild_controller_restore_clear_samples(),
// The hold was built from a u64 millisecond count, so this cast does not truncate.
controller_restore_hold_ms: controller_restore_hold.as_millis() as u64,
controller_loadavg_high_watermark_1m_milli,
controller_loadavg_low_watermark_1m_milli,
page_size: LEXICAL_REBUILD_PAGE_SIZE,
steady_batch_fetch_conversations,
startup_batch_fetch_conversations,
steady_commit_every_conversations,
startup_commit_every_conversations,
steady_commit_every_messages,
startup_commit_every_messages,
steady_commit_every_message_bytes,
startup_commit_every_message_bytes,
pipeline_channel_size,
page_prep_workers: if apply_responsiveness_governor {
lexical_rebuild_page_prep_worker_parallelism()
} else {
lexical_rebuild_page_prep_worker_parallelism_configured()
},
pipeline_max_message_bytes_in_flight,
}
}
/// Current state of the responsiveness controller.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum LexicalRebuildResponsivenessState {
    /// Conservative budget in use until the first durable commit.
    Startup,
    /// Steady budget in use; the controller may demote on pressure.
    Steady,
    /// Conservative budget in use after pressure was detected.
    PressureLimited,
    /// Steady budget fixed by the controller mode.
    PinnedSteady,
    /// Conservative budget fixed by the controller mode.
    PinnedConservative,
}

impl LexicalRebuildResponsivenessState {
    /// Stable snake_case label for this state.
    fn as_str(self) -> &'static str {
        match self {
            LexicalRebuildResponsivenessState::Startup => "startup",
            LexicalRebuildResponsivenessState::Steady => "steady",
            LexicalRebuildResponsivenessState::PressureLimited => "pressure_limited",
            LexicalRebuildResponsivenessState::PinnedSteady => "pinned_steady",
            LexicalRebuildResponsivenessState::PinnedConservative => "pinned_conservative",
        }
    }
}
/// Describes one budget change produced by the responsiveness controller.
#[derive(Debug)]
struct LexicalRebuildBudgetTransition {
/// Budget in effect before the change.
old_budget: LexicalRebuildPipelineBudgetSnapshot,
/// Budget in effect after the change.
new_budget: LexicalRebuildPipelineBudgetSnapshot,
/// Label of the controller state after the change.
mode: &'static str,
/// Machine-readable explanation of why the change happened.
reason: String,
}
/// State machine that chooses between the startup (conservative) and steady
/// pipeline budgets based on policy and runtime observations.
#[derive(Debug)]
struct LexicalRebuildResponsivenessController {
policy: LexicalRebuildResponsivenessPolicy,
/// Budget used in the startup, pressure-limited, and pinned-conservative states.
startup_budget: LexicalRebuildPipelineBudgetSnapshot,
/// Budget used in the steady and pinned-steady states.
steady_budget: LexicalRebuildPipelineBudgetSnapshot,
/// Channel capacity compared against the observed queue depth (at least 1).
pipeline_channel_size: usize,
/// Load average (thousandths) at or above which pressure is reported.
loadavg_high_watermark_1m_milli: Option<u32>,
/// Load average (thousandths) at or below which the host counts as clear.
loadavg_low_watermark_1m_milli: Option<u32>,
/// Consecutive clear samples required before restoring the steady budget.
restore_clear_samples: usize,
/// Minimum time to stay pressure-limited before clear samples are counted.
restore_hold: Duration,
memory_reserve_bytes: usize,
emergency_memory_reserve_bytes: usize,
state: LexicalRebuildResponsivenessState,
/// Explanation of the most recent decision.
reason: String,
/// Clear samples counted so far while pressure-limited.
clear_samples: usize,
/// When the state last changed (also set at construction).
last_transition_at: Instant,
/// Last producer handoff wait count seen, used to detect new waits.
last_observed_producer_handoff_wait_count: usize,
}
impl LexicalRebuildResponsivenessController {
/// In-flight bytes at or above this percentage of the limit count as pressure.
const INFLIGHT_HIGH_WATERMARK_PERCENT: usize = 90;
/// In-flight bytes must be at or below this percentage of the limit to count as clear.
const INFLIGHT_LOW_WATERMARK_PERCENT: usize = 50;
/// Creates a controller in the state implied by `policy`.
///
/// `Steady` and `Conservative` policies pin the corresponding state. Under
/// `Auto`, the controller starts in `Startup` when `start_conservative` is
/// set and in `Steady` otherwise. Restore settings and memory reserves are
/// read from their helper functions, and `pipeline_channel_size` is raised
/// to at least 1.
fn new(
policy: LexicalRebuildResponsivenessPolicy,
startup_budget: LexicalRebuildPipelineBudgetSnapshot,
steady_budget: LexicalRebuildPipelineBudgetSnapshot,
pipeline_channel_size: usize,
start_conservative: bool,
loadavg_high_watermark_1m_milli: Option<u32>,
loadavg_low_watermark_1m_milli: Option<u32>,
) -> Self {
let (state, reason) = match policy {
LexicalRebuildResponsivenessPolicy::Steady => (
LexicalRebuildResponsivenessState::PinnedSteady,
"pinned_steady_by_controller_mode".to_string(),
),
LexicalRebuildResponsivenessPolicy::Conservative => (
LexicalRebuildResponsivenessState::PinnedConservative,
"pinned_conservative_by_controller_mode".to_string(),
),
LexicalRebuildResponsivenessPolicy::Auto if start_conservative => (
LexicalRebuildResponsivenessState::Startup,
"startup_safe_budget_before_first_durable_commit".to_string(),
),
LexicalRebuildResponsivenessPolicy::Auto => (
LexicalRebuildResponsivenessState::Steady,
"steady_budget_selected_for_resumed_or_pinned_run".to_string(),
),
};
Self {
policy,
startup_budget,
steady_budget,
pipeline_channel_size: pipeline_channel_size.max(1),
loadavg_high_watermark_1m_milli,
loadavg_low_watermark_1m_milli,
restore_clear_samples: lexical_rebuild_controller_restore_clear_samples(),
restore_hold: lexical_rebuild_controller_restore_hold(),
memory_reserve_bytes: lexical_rebuild_staged_shard_build_memory_reserve_bytes(),
emergency_memory_reserve_bytes:
lexical_rebuild_staged_shard_build_emergency_memory_reserve_bytes(),
state,
reason,
clear_samples: 0,
last_transition_at: Instant::now(),
last_observed_producer_handoff_wait_count: 0,
}
}
/// Budget that applies in the current state: the startup budget for
/// `Startup`, `PressureLimited`, and `PinnedConservative`; the steady
/// budget for `Steady` and `PinnedSteady`.
fn current_budget(&self) -> LexicalRebuildPipelineBudgetSnapshot {
match self.state {
LexicalRebuildResponsivenessState::Startup
| LexicalRebuildResponsivenessState::PressureLimited
| LexicalRebuildResponsivenessState::PinnedConservative => self.startup_budget,
LexicalRebuildResponsivenessState::Steady
| LexicalRebuildResponsivenessState::PinnedSteady => self.steady_budget,
}
}
/// Label of the current state.
fn mode(&self) -> &'static str {
self.state.as_str()
}
/// Explanation of the most recent decision.
fn reason(&self) -> &str {
&self.reason
}
/// True only under the `Auto` policy, while still in `Startup`, and when
/// the startup and steady budgets actually differ.
fn waits_for_first_durable_commit(&self) -> bool {
self.policy == LexicalRebuildResponsivenessPolicy::Auto
&& self.state == LexicalRebuildResponsivenessState::Startup
&& self.startup_budget != self.steady_budget
}
/// Promotes `Startup` to `Steady`.
///
/// Does nothing outside `Startup`. Returns a transition only when the
/// promotion changes the effective budget.
fn record_first_durable_commit(&mut self) -> Option<LexicalRebuildBudgetTransition> {
if self.state != LexicalRebuildResponsivenessState::Startup {
return None;
}
let old_budget = self.current_budget();
self.state = LexicalRebuildResponsivenessState::Steady;
self.reason = "first_durable_commit_promoted_steady_budget".to_string();
self.clear_samples = 0;
self.last_transition_at = Instant::now();
let new_budget = self.current_budget();
if old_budget == new_budget {
return None;
}
Some(LexicalRebuildBudgetTransition {
old_budget,
new_budget,
mode: self.mode(),
reason: self.reason.clone(),
})
}
/// Feeds one runtime sample to the controller.
///
/// Returns a transition when the sample causes the effective budget to
/// change (demotion on pressure, or restoration after enough clear
/// samples); otherwise returns `None`, possibly after updating `reason`.
fn observe_runtime(
&mut self,
runtime: &LexicalRebuildPipelineRuntimeSnapshot,
) -> Option<LexicalRebuildBudgetTransition> {
// Track the handoff wait counter on every sample, even when the
// controller is pinned or still in startup.
let new_producer_handoff_wait =
self.observe_new_producer_handoff_wait(runtime.producer_handoff_wait_count);
if self.policy != LexicalRebuildResponsivenessPolicy::Auto
|| self.state == LexicalRebuildResponsivenessState::Startup
{
return None;
}
if let Some(reason) = self.detect_pressure(runtime, new_producer_handoff_wait) {
self.clear_samples = 0;
self.reason = reason.clone();
// The transition timestamp is only refreshed on the initial demotion,
// not on repeated pressure while already limited.
if self.state != LexicalRebuildResponsivenessState::PressureLimited {
let old_budget = self.current_budget();
self.state = LexicalRebuildResponsivenessState::PressureLimited;
self.last_transition_at = Instant::now();
let new_budget = self.current_budget();
if old_budget != new_budget {
return Some(LexicalRebuildBudgetTransition {
old_budget,
new_budget,
mode: self.mode(),
reason,
});
}
}
return None;
}
if self.state == LexicalRebuildResponsivenessState::PressureLimited {
// Stay conservative for at least `restore_hold` after the demotion.
let held_for = self.last_transition_at.elapsed();
if held_for < self.restore_hold {
self.reason = format!(
"holding_conservative_budget_after_pressure_demote_for_{}ms",
self.restore_hold.as_millis()
);
self.clear_samples = 0;
return None;
}
if self.runtime_is_clear(runtime) {
self.clear_samples = self.clear_samples.saturating_add(1);
if self.clear_samples >= self.restore_clear_samples {
let old_budget = self.current_budget();
self.state = LexicalRebuildResponsivenessState::Steady;
self.reason = format!(
"restored_steady_budget_after_{}_clear_samples",
self.restore_clear_samples
);
self.clear_samples = 0;
self.last_transition_at = Instant::now();
let new_budget = self.current_budget();
if old_budget != new_budget {
return Some(LexicalRebuildBudgetTransition {
old_budget,
new_budget,
mode: self.mode(),
reason: self.reason.clone(),
});
}
} else {
self.reason = format!(
"awaiting_clear_pressure_window_{}/{}",
self.clear_samples, self.restore_clear_samples
);
}
} else {
// A non-clear sample restarts the count of consecutive clear samples.
self.clear_samples = 0;
self.reason = "pressure_signals_not_yet_clear".to_string();
}
} else {
self.reason = "steady_budget_with_headroom".to_string();
}
None
}
/// Records `observed_count` and reports whether it is higher than the
/// previously recorded count.
fn observe_new_producer_handoff_wait(&mut self, observed_count: usize) -> bool {
let new_wait_observed = observed_count > self.last_observed_producer_handoff_wait_count;
self.last_observed_producer_handoff_wait_count = observed_count;
new_wait_observed
}
/// Returns the reason for the first pressure signal found, or `None`.
///
/// Signals are checked in this order: host load average, available memory
/// (emergency reserve, then regular reserve), a new producer handoff wait,
/// queue depth at channel capacity, buffered ordered pages with a
/// non-empty queue, in-flight bytes near the limit, and pending batch
/// conversations or bytes at their limits.
fn detect_pressure(
&self,
runtime: &LexicalRebuildPipelineRuntimeSnapshot,
new_producer_handoff_wait: bool,
) -> Option<String> {
let current_budget = self.current_budget();
if let (Some(loadavg_1m_milli), Some(high_watermark_1m_milli)) = (
runtime.host_loadavg_1m_milli,
self.loadavg_high_watermark_1m_milli,
) && loadavg_1m_milli >= high_watermark_1m_milli
{
return Some(format!(
"host_loadavg_1m_{}_reached_high_watermark_{}",
format_lexical_rebuild_loadavg_1m_milli(loadavg_1m_milli),
format_lexical_rebuild_loadavg_1m_milli(high_watermark_1m_milli)
));
}
if let Some(available_memory_bytes) = runtime.host_available_memory_bytes {
let available_memory_bytes = usize_from_u64_saturating(available_memory_bytes);
if available_memory_bytes <= self.emergency_memory_reserve_bytes {
return Some(format!(
"host_available_memory_bytes_{}_below_emergency_reserve_{}",
available_memory_bytes, self.emergency_memory_reserve_bytes
));
}
if available_memory_bytes <= self.memory_reserve_bytes {
return Some(format!(
"host_available_memory_bytes_{}_below_reserve_{}",
available_memory_bytes, self.memory_reserve_bytes
));
}
}
if new_producer_handoff_wait {
return Some(format!(
"producer_handoff_wait_count_{}_observed_consumer_backpressure",
runtime.producer_handoff_wait_count
));
}
if runtime.queue_depth >= self.pipeline_channel_size {
return Some(format!(
"queue_depth_{}_reached_pipeline_capacity_{}",
runtime.queue_depth, self.pipeline_channel_size
));
}
if runtime.ordered_buffered_pages > 0 && runtime.queue_depth > 0 {
return Some(format!(
"ordered_barrier_buffered_{}_pages_while_queue_depth_is_{}",
runtime.ordered_buffered_pages, runtime.queue_depth
));
}
// Compare as percentages using saturating multiplication to avoid overflow.
if current_budget.max_message_bytes_in_flight > 0
&& runtime.inflight_message_bytes.saturating_mul(100)
>= current_budget
.max_message_bytes_in_flight
.saturating_mul(Self::INFLIGHT_HIGH_WATERMARK_PERCENT)
{
return Some(format!(
"inflight_message_bytes_{}_near_limit_{}",
runtime.inflight_message_bytes, current_budget.max_message_bytes_in_flight
));
}
if runtime.pending_batch_conversations >= current_budget.page_conversation_limit {
return Some(format!(
"pending_batch_conversations_{}_reached_limit_{}",
runtime.pending_batch_conversations, current_budget.page_conversation_limit
));
}
if runtime.pending_batch_message_bytes >= current_budget.batch_fetch_message_bytes_limit {
return Some(format!(
"pending_batch_message_bytes_{}_reached_limit_{}",
runtime.pending_batch_message_bytes, current_budget.batch_fetch_message_bytes_limit
));
}
None
}
/// Reports whether the sample shows no remaining pressure.
///
/// Requires an empty queue, no buffered or pending work, in-flight bytes at
/// or below the low watermark percentage, load average at or below the low
/// watermark, and available memory above the reserve. A missing load
/// average reading, a missing low watermark, or a missing memory reading
/// is treated as clear.
fn runtime_is_clear(&self, runtime: &LexicalRebuildPipelineRuntimeSnapshot) -> bool {
let current_budget = self.current_budget();
let loadavg_is_clear = match (
runtime.host_loadavg_1m_milli,
self.loadavg_low_watermark_1m_milli,
) {
(_, None) => true,
(Some(loadavg_1m_milli), Some(low_watermark_1m_milli)) => {
loadavg_1m_milli <= low_watermark_1m_milli
}
(None, Some(_)) => true,
};
let memory_is_clear = runtime
.host_available_memory_bytes
.map(|available_memory_bytes| {
usize_from_u64_saturating(available_memory_bytes) > self.memory_reserve_bytes
})
.unwrap_or(true);
runtime.queue_depth == 0
&& runtime.ordered_buffered_pages == 0
&& runtime.pending_batch_conversations == 0
&& runtime.pending_batch_message_bytes == 0
&& current_budget.max_message_bytes_in_flight > 0
&& runtime.inflight_message_bytes.saturating_mul(100)
<= current_budget
.max_message_bytes_in_flight
.saturating_mul(Self::INFLIGHT_LOW_WATERMARK_PERCENT)
&& loadavg_is_clear
&& memory_is_clear
}
}
/// Formats a load average given in thousandths as a decimal string with
/// three fractional digits (for example `1500` becomes `"1.500"`).
fn format_lexical_rebuild_loadavg_1m_milli(loadavg_1m_milli: u32) -> String {
    let load = f64::from(loadavg_1m_milli) / 1000.0;
    format!("{load:.3}")
}
/// Immutable set of pipeline limits and commit intervals that make up one
/// budget.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
struct LexicalRebuildPipelineBudgetSnapshot {
    page_conversation_limit: usize,
    batch_fetch_message_limit: usize,
    batch_fetch_message_bytes_limit: usize,
    max_message_bytes_in_flight: usize,
    commit_interval_conversations: usize,
    commit_interval_messages: usize,
    commit_interval_message_bytes: usize,
}

impl LexicalRebuildPipelineBudgetSnapshot {
    /// Builds a budget, raising every field to at least 1 so that no limit
    /// or interval can be zero.
    #[allow(clippy::too_many_arguments)]
    fn new(
        page_conversation_limit: usize,
        batch_fetch_message_limit: usize,
        batch_fetch_message_bytes_limit: usize,
        max_message_bytes_in_flight: usize,
        commit_interval_conversations: usize,
        commit_interval_messages: usize,
        commit_interval_message_bytes: usize,
    ) -> Self {
        let at_least_one = |value: usize| std::cmp::max(value, 1);
        Self {
            page_conversation_limit: at_least_one(page_conversation_limit),
            batch_fetch_message_limit: at_least_one(batch_fetch_message_limit),
            batch_fetch_message_bytes_limit: at_least_one(batch_fetch_message_bytes_limit),
            max_message_bytes_in_flight: at_least_one(max_message_bytes_in_flight),
            commit_interval_conversations: at_least_one(commit_interval_conversations),
            commit_interval_messages: at_least_one(commit_interval_messages),
            commit_interval_message_bytes: at_least_one(commit_interval_message_bytes),
        }
    }
}
/// Shared, updatable copy of the active pipeline budget.
///
/// Each budget field is stored in its own atomic. A generation counter is
/// bumped on every update and mirrored under `generation_lock` so waiters can
/// block on `generation_cv` until a newer budget is published.
#[derive(Debug)]
struct LexicalRebuildPipelineBudgetController {
page_conversation_limit: AtomicUsize,
batch_fetch_message_limit: AtomicUsize,
batch_fetch_message_bytes_limit: AtomicUsize,
max_message_bytes_in_flight: AtomicUsize,
commit_interval_conversations: AtomicUsize,
commit_interval_messages: AtomicUsize,
commit_interval_message_bytes: AtomicUsize,
/// Incremented once per `update` call.
generation: AtomicUsize,
/// Copy of the latest generation, guarded for use with the condvar.
generation_lock: Mutex<usize>,
/// Notified after every update.
generation_cv: Condvar,
}
impl LexicalRebuildPipelineBudgetController {
/// Creates a controller holding `snapshot` at generation 0.
fn new(snapshot: LexicalRebuildPipelineBudgetSnapshot) -> Self {
Self {
page_conversation_limit: AtomicUsize::new(snapshot.page_conversation_limit),
batch_fetch_message_limit: AtomicUsize::new(snapshot.batch_fetch_message_limit),
batch_fetch_message_bytes_limit: AtomicUsize::new(
snapshot.batch_fetch_message_bytes_limit,
),
max_message_bytes_in_flight: AtomicUsize::new(snapshot.max_message_bytes_in_flight),
commit_interval_conversations: AtomicUsize::new(snapshot.commit_interval_conversations),
commit_interval_messages: AtomicUsize::new(snapshot.commit_interval_messages),
commit_interval_message_bytes: AtomicUsize::new(snapshot.commit_interval_message_bytes),
generation: AtomicUsize::new(0),
generation_lock: Mutex::new(0),
generation_cv: Condvar::new(),
}
}
/// Reads the current budget.
///
/// Each field is loaded separately with relaxed ordering, so the fields are
/// not read as a single atomic unit.
fn snapshot(&self) -> LexicalRebuildPipelineBudgetSnapshot {
LexicalRebuildPipelineBudgetSnapshot {
page_conversation_limit: self.page_conversation_limit.load(Ordering::Relaxed),
batch_fetch_message_limit: self.batch_fetch_message_limit.load(Ordering::Relaxed),
batch_fetch_message_bytes_limit: self
.batch_fetch_message_bytes_limit
.load(Ordering::Relaxed),
max_message_bytes_in_flight: self.max_message_bytes_in_flight.load(Ordering::Relaxed),
commit_interval_conversations: self
.commit_interval_conversations
.load(Ordering::Relaxed),
commit_interval_messages: self.commit_interval_messages.load(Ordering::Relaxed),
commit_interval_message_bytes: self
.commit_interval_message_bytes
.load(Ordering::Relaxed),
}
}
/// Current generation number (acquire load).
fn generation(&self) -> usize {
self.generation.load(Ordering::Acquire)
}
/// Publishes `snapshot` as the new budget and wakes all waiters.
fn update(&self, snapshot: LexicalRebuildPipelineBudgetSnapshot) {
self.page_conversation_limit
.store(snapshot.page_conversation_limit, Ordering::Relaxed);
self.batch_fetch_message_limit
.store(snapshot.batch_fetch_message_limit, Ordering::Relaxed);
self.batch_fetch_message_bytes_limit
.store(snapshot.batch_fetch_message_bytes_limit, Ordering::Relaxed);
self.max_message_bytes_in_flight
.store(snapshot.max_message_bytes_in_flight, Ordering::Relaxed);
self.commit_interval_conversations
.store(snapshot.commit_interval_conversations, Ordering::Relaxed);
self.commit_interval_messages
.store(snapshot.commit_interval_messages, Ordering::Relaxed);
self.commit_interval_message_bytes
.store(snapshot.commit_interval_message_bytes, Ordering::Relaxed);
// Bump the generation only after all field stores have been issued.
let new_generation = self.generation.fetch_add(1, Ordering::AcqRel) + 1;
// A poisoned lock is recovered rather than propagated as a panic.
let mut guard = self
.generation_lock
.lock()
.unwrap_or_else(|poisoned| poisoned.into_inner());
*guard = new_generation;
self.generation_cv.notify_all();
}
/// Blocks until the generation exceeds `observed_generation` or `timeout`
/// elapses.
///
/// Returns `true` when a newer generation is visible, `false` on timeout.
fn wait_for_update_after(&self, observed_generation: usize, timeout: Duration) -> bool {
// Fast path: an update has already happened, so no locking is needed.
if self.generation() > observed_generation {
return true;
}
let deadline = Instant::now() + timeout;
let mut guard = self
.generation_lock
.lock()
.unwrap_or_else(|poisoned| poisoned.into_inner());
// Loop to tolerate spurious wakeups; each pass waits only for the time
// remaining until the deadline.
while *guard <= observed_generation {
let now = Instant::now();
if now >= deadline {
return self.generation() > observed_generation;
}
let remaining = deadline.saturating_duration_since(now);
let (next_guard, timeout_result) = self
.generation_cv
.wait_timeout(guard, remaining)
.unwrap_or_else(|poisoned| poisoned.into_inner());
guard = next_guard;
if timeout_result.timed_out() {
// Re-check the atomic so an update racing with the timeout is still reported.
return self.generation() > observed_generation;
}
}
true
}
}
/// Point-in-time copy of the producer telemetry counters.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
struct LexicalRebuildProducerTelemetrySnapshot {
page_prep_workers: usize,
active_page_prep_jobs: usize,
ordered_buffered_pages: usize,
/// Number of recorded budget waits and their accumulated duration in milliseconds.
budget_wait_count: usize,
budget_wait_ms: usize,
/// Number of recorded handoff waits and their accumulated duration in milliseconds.
handoff_wait_count: usize,
handoff_wait_ms: usize,
}
/// Atomic counters describing producer activity; all start at zero via
/// `Default`.
#[derive(Debug, Default)]
struct LexicalRebuildProducerTelemetry {
// Gauges overwritten by `record`.
page_prep_workers: AtomicUsize,
active_page_prep_jobs: AtomicUsize,
ordered_buffered_pages: AtomicUsize,
// Saturating accumulators updated by `record_budget_wait` / `record_handoff_wait`.
budget_wait_count: AtomicUsize,
budget_wait_ms: AtomicUsize,
handoff_wait_count: AtomicUsize,
handoff_wait_ms: AtomicUsize,
}
impl LexicalRebuildProducerTelemetry {
    /// Adds `value` to `counter`, saturating at `usize::MAX` instead of wrapping.
    fn saturating_add(counter: &AtomicUsize, value: usize) {
        // The closure always yields `Some`, so the update cannot fail and the
        // returned previous value is not needed.
        let _ = counter.fetch_update(Ordering::Relaxed, Ordering::Relaxed, |previous| {
            Some(previous.saturating_add(value))
        });
    }

    /// Converts `duration` to whole milliseconds.
    ///
    /// A zero duration maps to 0; any non-zero duration maps to at least 1,
    /// and values too large for `usize` saturate.
    fn duration_millis(duration: Duration) -> usize {
        if duration.is_zero() {
            0
        } else {
            let millis = usize::try_from(duration.as_millis()).unwrap_or(usize::MAX);
            millis.max(1)
        }
    }

    /// Copies every counter into a plain snapshot using relaxed loads.
    fn snapshot(&self) -> LexicalRebuildProducerTelemetrySnapshot {
        let read = |counter: &AtomicUsize| counter.load(Ordering::Relaxed);
        LexicalRebuildProducerTelemetrySnapshot {
            page_prep_workers: read(&self.page_prep_workers),
            active_page_prep_jobs: read(&self.active_page_prep_jobs),
            ordered_buffered_pages: read(&self.ordered_buffered_pages),
            budget_wait_count: read(&self.budget_wait_count),
            budget_wait_ms: read(&self.budget_wait_ms),
            handoff_wait_count: read(&self.handoff_wait_count),
            handoff_wait_ms: read(&self.handoff_wait_ms),
        }
    }

    /// Overwrites the three gauge values with the supplied readings.
    fn record(
        &self,
        page_prep_workers: usize,
        active_page_prep_jobs: usize,
        ordered_buffered_pages: usize,
    ) {
        let gauges = [
            (&self.page_prep_workers, page_prep_workers),
            (&self.active_page_prep_jobs, active_page_prep_jobs),
            (&self.ordered_buffered_pages, ordered_buffered_pages),
        ];
        for (gauge, reading) in gauges {
            gauge.store(reading, Ordering::Relaxed);
        }
    }

    /// Counts one budget wait and adds its duration in milliseconds.
    fn record_budget_wait(&self, duration: Duration) {
        let waited_ms = Self::duration_millis(duration);
        Self::saturating_add(&self.budget_wait_count, 1);
        Self::saturating_add(&self.budget_wait_ms, waited_ms);
    }

    /// Counts one handoff wait and adds its duration in milliseconds.
    fn record_handoff_wait(&self, duration: Duration) {
        let waited_ms = Self::duration_millis(duration);
        Self::saturating_add(&self.handoff_wait_count, 1);
        Self::saturating_add(&self.handoff_wait_ms, waited_ms);
    }
}
/// Builds a runtime budget from the given limits.
///
/// The batch byte limit is raised to at least 1, and the in-flight byte
/// ceiling is derived from that limit and `pipeline_channel_size`.
fn lexical_rebuild_runtime_pipeline_budget_snapshot(
    page_conversation_limit: usize,
    batch_fetch_message_limit: usize,
    batch_fetch_message_bytes_limit: usize,
    pipeline_channel_size: usize,
    commit_interval_conversations: usize,
    commit_interval_messages: usize,
    commit_interval_message_bytes: usize,
) -> LexicalRebuildPipelineBudgetSnapshot {
    let byte_limit = batch_fetch_message_bytes_limit.max(1);
    let max_bytes_in_flight =
        lexical_rebuild_pipeline_max_message_bytes_in_flight(byte_limit, pipeline_channel_size);
    LexicalRebuildPipelineBudgetSnapshot::new(
        page_conversation_limit,
        batch_fetch_message_limit,
        byte_limit,
        max_bytes_in_flight,
        commit_interval_conversations,
        commit_interval_messages,
        commit_interval_message_bytes,
    )
}
/// Capacity of the rebuild pipeline channel.
///
/// Reads `CASS_TANTIVY_REBUILD_PIPELINE_CHANNEL_SIZE`; a missing, unparsable,
/// or zero value falls back to 4.
fn lexical_rebuild_pipeline_channel_size() -> usize {
    const DEFAULT_CHANNEL_SIZE: usize = 4;
    match dotenvy::var("CASS_TANTIVY_REBUILD_PIPELINE_CHANNEL_SIZE") {
        Ok(raw) => match raw.parse::<usize>() {
            Ok(size) if size > 0 => size,
            _ => DEFAULT_CHANNEL_SIZE,
        },
        Err(_) => DEFAULT_CHANNEL_SIZE,
    }
}
/// Default number of page prep workers for a given rebuild worker count.
///
/// One worker or fewer yields 1; otherwise half the worker count (rounded
/// up), bounded to 2..=8.
fn lexical_rebuild_default_page_prep_worker_parallelism_for_workers(
    worker_parallelism: usize,
) -> usize {
    match worker_parallelism {
        0 | 1 => 1,
        workers => workers.div_ceil(2).clamp(2, 8),
    }
}
/// Page prep worker count after the responsiveness governor has been applied
/// to the configured value; always at least 1.
fn lexical_rebuild_page_prep_worker_parallelism() -> usize {
    let governed = responsiveness::effective_worker_count(
        lexical_rebuild_page_prep_worker_parallelism_configured(),
    );
    governed.max(1)
}
/// Configured page prep worker count, before any responsiveness governing.
///
/// A positive `CASS_TANTIVY_REBUILD_PAGE_PREP_WORKERS` value wins; otherwise
/// the default for the current rebuild worker count is used. The result is at
/// least 1.
fn lexical_rebuild_page_prep_worker_parallelism_configured() -> usize {
    let explicit = dotenvy::var("CASS_TANTIVY_REBUILD_PAGE_PREP_WORKERS")
        .ok()
        .and_then(|raw| raw.parse::<usize>().ok())
        .filter(|workers| *workers > 0);
    let workers = match explicit {
        Some(workers) => workers,
        None => lexical_rebuild_default_page_prep_worker_parallelism_for_workers(
            lexical_rebuild_worker_parallelism(),
        ),
    };
    workers.max(1)
}
/// How long to wait for the first budget promotion.
///
/// Reads `CASS_TANTIVY_REBUILD_FIRST_BUDGET_PROMOTION_WAIT_MS`
/// (milliseconds); a missing, unparsable, or zero value falls back to
/// 5,000 ms.
fn lexical_rebuild_first_budget_promotion_wait() -> Duration {
    let wait_millis = dotenvy::var("CASS_TANTIVY_REBUILD_FIRST_BUDGET_PROMOTION_WAIT_MS")
        .ok()
        .and_then(|raw| raw.parse::<u64>().ok())
        .filter(|millis| *millis > 0)
        .unwrap_or(5_000);
    Duration::from_millis(wait_millis)
}
/// Ceiling on message bytes allowed in flight through the pipeline.
///
/// A positive `CASS_TANTIVY_REBUILD_PIPELINE_MAX_MESSAGE_BYTES_IN_FLIGHT`
/// value wins; otherwise the ceiling is the batch byte limit (at least 1)
/// multiplied by `pipeline_channel_size + 1`, saturating. The chosen value is
/// then passed through the responsiveness byte limiter and kept at least 1.
fn lexical_rebuild_pipeline_max_message_bytes_in_flight(
    batch_fetch_message_bytes_limit: usize,
    pipeline_channel_size: usize,
) -> usize {
    let explicit = dotenvy::var("CASS_TANTIVY_REBUILD_PIPELINE_MAX_MESSAGE_BYTES_IN_FLIGHT")
        .ok()
        .and_then(|raw| raw.parse::<usize>().ok())
        .filter(|limit| *limit > 0);
    let desired = match explicit {
        Some(limit) => limit,
        None => {
            let batches_in_flight = pipeline_channel_size.saturating_add(1).max(1);
            batch_fetch_message_bytes_limit
                .max(1)
                .saturating_mul(batches_in_flight)
        }
    };
    responsiveness::effective_inflight_byte_limit(desired).max(1)
}
/// Reports whether a commit is due: true as soon as any of the three counters
/// (conversations, messages, message bytes) has reached its interval.
fn should_commit_lexical_rebuild(
    conversations_since_commit: usize,
    messages_since_commit: usize,
    message_bytes_since_commit: usize,
    commit_interval_conversations: usize,
    commit_interval_messages: usize,
    commit_interval_message_bytes: usize,
) -> bool {
    let progress_and_threshold = [
        (conversations_since_commit, commit_interval_conversations),
        (messages_since_commit, commit_interval_messages),
        (message_bytes_since_commit, commit_interval_message_bytes),
    ];
    progress_and_threshold
        .iter()
        .any(|&(since_commit, interval)| since_commit >= interval)
}
/// Applies a pipeline budget transition to the live rebuild controls and logs it.
///
/// The new budget's in-flight byte ceiling is pushed into `flow_limiter`, the
/// whole new budget is handed to `pipeline_budget_controller`, and
/// `current_batch_conversation_limit` is overwritten with the new page
/// conversation limit. When `active_commit_intervals` is `Some`, the three
/// referenced values (conversations, messages, message bytes) are overwritten
/// with the new budget's commit intervals; when it is `None` they are left
/// untouched. An info event records the old and new value of every budget field.
fn apply_lexical_rebuild_budget_transition(
transition: LexicalRebuildBudgetTransition,
flow_limiter: &StreamingByteLimiter,
pipeline_budget_controller: &LexicalRebuildPipelineBudgetController,
current_batch_conversation_limit: &mut usize,
active_commit_intervals: Option<(&mut usize, &mut usize, &mut usize)>,
) {
flow_limiter.update_max_bytes_in_flight(transition.new_budget.max_message_bytes_in_flight);
pipeline_budget_controller.update(transition.new_budget);
*current_batch_conversation_limit = transition.new_budget.page_conversation_limit;
// Commit intervals are only rewritten when the caller passed them in.
if let Some((
commit_interval_conversations,
commit_interval_messages,
commit_interval_message_bytes,
)) = active_commit_intervals
{
*commit_interval_conversations = transition.new_budget.commit_interval_conversations;
*commit_interval_messages = transition.new_budget.commit_interval_messages;
*commit_interval_message_bytes = transition.new_budget.commit_interval_message_bytes;
}
tracing::info!(
controller_mode = transition.mode,
controller_reason = %transition.reason,
old_page_conversation_limit = transition.old_budget.page_conversation_limit,
new_page_conversation_limit = transition.new_budget.page_conversation_limit,
old_batch_fetch_message_limit = transition.old_budget.batch_fetch_message_limit,
new_batch_fetch_message_limit = transition.new_budget.batch_fetch_message_limit,
old_batch_fetch_message_bytes_limit = transition.old_budget.batch_fetch_message_bytes_limit,
new_batch_fetch_message_bytes_limit = transition.new_budget.batch_fetch_message_bytes_limit,
old_max_message_bytes_in_flight = transition.old_budget.max_message_bytes_in_flight,
new_max_message_bytes_in_flight = transition.new_budget.max_message_bytes_in_flight,
old_commit_interval_conversations =
transition.old_budget.commit_interval_conversations,
new_commit_interval_conversations =
transition.new_budget.commit_interval_conversations,
old_commit_interval_messages = transition.old_budget.commit_interval_messages,
new_commit_interval_messages = transition.new_budget.commit_interval_messages,
old_commit_interval_message_bytes =
transition.old_budget.commit_interval_message_bytes,
new_commit_interval_message_bytes =
transition.new_budget.commit_interval_message_bytes,
"updated lexical rebuild pipeline budgets"
);
}
/// Reports whether a progress heartbeat is due.
///
/// A heartbeat is due once enough conversations have been processed since the
/// last persist, or once at least `progress_heartbeat_interval` has elapsed.
fn should_persist_lexical_rebuild_progress(
    conversations_since_progress_persist: usize,
    progress_heartbeat_interval_conversations: usize,
    time_since_last_progress_persist: Duration,
    progress_heartbeat_interval: Duration,
) -> bool {
    if conversations_since_progress_persist >= progress_heartbeat_interval_conversations {
        return true;
    }
    time_since_last_progress_persist >= progress_heartbeat_interval
}
/// Serializes `value` as pretty JSON and installs it at `path` via a temp file.
///
/// The parent directory is created if needed, the JSON is written to a freshly
/// created temp file that is flushed and `sync_all`ed, and the temp file is
/// then handed to `replace_file_from_temp` to take the place of `path`.
///
/// # Errors
///
/// Returns an error (with context naming the failing step) if the parent
/// directory cannot be created, the temp file cannot be created, or
/// serialization, flushing, syncing, or the final replacement fails. When
/// writing the temp file fails after it was created, the partial temp file is
/// removed on a best-effort basis so failed writes do not accumulate
/// uniquely-named leftovers next to `path`.
fn write_json_pretty_atomically<T: serde::Serialize>(path: &Path, value: &T) -> Result<()> {
    if let Some(parent) = path.parent() {
        fs::create_dir_all(parent)
            .with_context(|| format!("creating parent directory for {}", path.display()))?;
    }
    let temp_path = unique_atomic_temp_path(path);
    // A creation failure is propagated without cleanup: if the file could not
    // be created by this call, it is not ours to delete.
    let file = create_new_atomic_sidecar_file(&temp_path)
        .with_context(|| format!("creating temporary file {}", temp_path.display()))?;
    // The closure owns the file handle, so it is closed before any cleanup or
    // replacement below touches the temp path.
    let write_result = (|| -> Result<()> {
        let mut writer = BufWriter::new(file);
        serde_json::to_writer_pretty(&mut writer, value)
            .with_context(|| format!("serializing {}", path.display()))?;
        writer
            .flush()
            .with_context(|| format!("flushing temporary file {}", temp_path.display()))?;
        writer
            .get_ref()
            .sync_all()
            .with_context(|| format!("syncing temporary file {}", temp_path.display()))
    })();
    if let Err(err) = write_result {
        // Best effort: the original write error is what the caller needs to see.
        let _ = fs::remove_file(&temp_path);
        return Err(err);
    }
    replace_file_from_temp(&temp_path, path)
        .with_context(|| format!("replacing {} from temp file", path.display()))
}
/// Opens the directory containing `path` and calls `sync_all` on it.
///
/// Returns `Ok(())` without doing anything when `path` has no parent. A bare
/// relative file name such as `state.json` has an *empty* parent, which cannot
/// be opened; in that case the current directory (`.`) is synced instead.
///
/// # Errors
///
/// Returns an error if the parent directory cannot be opened or synced.
#[cfg(not(windows))]
fn sync_parent_directory(path: &Path) -> Result<()> {
    let Some(parent) = path.parent() else {
        return Ok(());
    };
    // `Path::parent` yields `Some("")` for single-component relative paths, and
    // `File::open("")` fails, so resolve the empty parent to the working directory.
    let parent = if parent.as_os_str().is_empty() {
        Path::new(".")
    } else {
        parent
    };
    let directory = File::open(parent)
        .with_context(|| format!("opening parent directory {} for sync", parent.display()))?;
    directory
        .sync_all()
        .with_context(|| format!("syncing parent directory {}", parent.display()))
}
/// Windows build of `sync_parent_directory`: does nothing and always returns `Ok(())`.
#[cfg(windows)]
fn sync_parent_directory(_path: &Path) -> Result<()> {
Ok(())
}
fn load_lexical_rebuild_state(index_path: &Path) -> Result<Option<LexicalRebuildState>> {
let path = lexical_rebuild_state_path(index_path);
let bytes = match fs::read(&path) {
Ok(bytes) => bytes,
Err(err) if err.kind() == std::io::ErrorKind::NotFound => return Ok(None),
Err(err) => {
return Err(err)
.with_context(|| format!("reading lexical rebuild state {}", path.display()));
}
};
match serde_json::from_slice::<LexicalRebuildState>(&bytes) {
Ok(state) => Ok(Some(state)),
Err(err) => {
tracing::warn!(
path = %path.display(),
error = %err,
"ignoring malformed lexical rebuild checkpoint"
);
Ok(None)
}
}
}
/// Writes `state` as pretty JSON to the checkpoint path derived from `index_path`.
///
/// # Errors
///
/// Propagates any error from `write_json_pretty_atomically`.
fn persist_lexical_rebuild_state(index_path: &Path, state: &LexicalRebuildState) -> Result<()> {
    let state_path = lexical_rebuild_state_path(index_path);
    write_json_pretty_atomically(&state_path, state)
}
fn clear_lexical_rebuild_state(index_path: &Path) -> Result<()> {
let path = lexical_rebuild_state_path(index_path);
match fs::remove_file(&path) {
Ok(()) => Ok(()),
Err(err) if err.kind() == std::io::ErrorKind::NotFound => Ok(()),
Err(err) => {
Err(err).with_context(|| format!("removing lexical rebuild state {}", path.display()))
}
}
}
/// Returns the fingerprint reported by
/// `crate::search::tantivy::searchable_index_fingerprint` for `index_path`.
///
/// This is a direct pass-through; `None` and errors are whatever that helper returns.
fn index_meta_fingerprint(index_path: &Path) -> Result<Option<String>> {
crate::search::tantivy::searchable_index_fingerprint(index_path)
}
/// Returns the meta fingerprint to associate with a completed rebuild.
///
/// The fingerprint recorded in `state.committed_meta_fingerprint` is preferred;
/// only when none is recorded is the fingerprint read from the index at
/// `index_path`.
fn completed_lexical_rebuild_meta_fingerprint(
    state: &LexicalRebuildState,
    index_path: &Path,
) -> Result<Option<String>> {
    if let Some(recorded) = state.committed_meta_fingerprint.as_ref() {
        return Ok(Some(recorded.clone()));
    }
    index_meta_fingerprint(index_path)
}
/// Returns the document count from the searchable index summary at `index_path`.
///
/// Yields `Ok(None)` when no summary is available. A failure to obtain the
/// summary is logged at debug level and also reported as `Ok(None)`, so this
/// function never returns `Err` itself.
fn live_tantivy_doc_count(index_path: &Path) -> Result<Option<usize>> {
    let summary_lookup = crate::search::tantivy::searchable_index_summary(index_path);
    match summary_lookup {
        Ok(found) => Ok(found.map(|summary| summary.docs)),
        Err(lookup_err) => {
            tracing::debug!(
                path = %index_path.display(),
                error = %lookup_err,
                "live Tantivy reader unavailable while refreshing lexical checkpoint"
            );
            Ok(None)
        }
    }
}
/// Validates a freshly built shard index and converts it into a merge artifact.
///
/// Checks performed, in order:
/// 1. the shard index passes `validate_searchable_index_contract`;
/// 2. a document count can be read back via `live_tantivy_doc_count`
///    (a `None` count is treated as "not searchable");
/// 3. the observed document count equals `result.indexed_docs`;
/// 4. when the shard plan's message count is known, the observed document
///    count does not exceed it.
///
/// Indexing fewer documents than planned messages is not an error; it is only
/// logged at debug level.
///
/// # Errors
///
/// Returns an error when any of the checks above fails.
fn validate_lexical_rebuild_shard_build_result(
result: &LexicalRebuildShardBuildResult,
) -> Result<LexicalRebuildShardMergeArtifact> {
crate::search::tantivy::validate_searchable_index_contract(&result.shard_index_path)
.with_context(|| {
format!(
"validating built lexical rebuild shard {} at {}",
result.shard.shard_index,
result.shard_index_path.display()
)
})?;
// `live_tantivy_doc_count` reports reader failures as `None`, so a missing
// count here covers both "no summary" and "reader could not be opened".
let observed_docs = live_tantivy_doc_count(&result.shard_index_path)?.ok_or_else(|| {
anyhow::anyhow!(
"built lexical rebuild shard {} at {} is not searchable",
result.shard.shard_index,
result.shard_index_path.display()
)
})?;
if observed_docs != result.indexed_docs {
return Err(anyhow::anyhow!(
"built lexical rebuild shard {} reported {} docs but a fresh Tantivy reader sees {}",
result.shard.shard_index,
result.indexed_docs,
observed_docs
));
}
// More documents than source messages is a hard failure.
if lexical_shard_message_count_is_known(result.shard.message_count)
&& observed_docs > result.shard.message_count
{
return Err(anyhow::anyhow!(
"built lexical rebuild shard {} indexed {} docs which EXCEEDS its shard plan's \
{} source messages — the lexical sink should never produce more docs than \
rows; investigate fan-out or duplicate-document bugs",
result.shard.shard_index,
observed_docs,
result.shard.message_count
));
}
if lexical_shard_message_count_is_known(result.shard.message_count) {
// Fewer documents than messages is tolerated and only logged.
let filtered = result.shard.message_count.saturating_sub(observed_docs);
if filtered > 0 {
tracing::debug!(
target: "cass::indexer::lexical_rebuild",
shard_index = result.shard.shard_index,
observed_docs,
planned_message_count = result.shard.message_count,
filtered_messages = filtered,
"lexical rebuild shard indexed fewer docs than the shard plan's message count; \
gap is hard-noise/empty-content filtering applied by cass_document_for_message"
);
}
} else {
tracing::debug!(
target: "cass::indexer::lexical_rebuild",
shard_index = result.shard.shard_index,
observed_docs,
"lexical rebuild shard used conversation-only planning; source message count validation deferred to exact indexed-doc accounting"
);
}
// A single-shard artifact covers exactly one shard index.
Ok(LexicalRebuildShardMergeArtifact {
first_shard_index: result.shard.shard_index,
last_shard_index: result.shard.shard_index,
index_path: result.shard_index_path.clone(),
docs: observed_docs,
segments: result.segments,
})
}
/// Checks that `artifacts` line up one-to-one, in order, with the planned shards.
///
/// The artifact count must equal the planned shard count, and the artifact at
/// each position must cover exactly the shard planned at that position (its
/// first and last shard index both equal the planned shard index). When a
/// planned shard's message count is known, the artifact's document count must
/// not exceed it; having fewer documents is only logged at debug level.
///
/// # Errors
///
/// Returns an error on a count mismatch, an order/range mismatch, or an
/// artifact holding more documents than its shard's known message count.
fn validate_complete_lexical_rebuild_shard_artifacts(
shard_plan: &LexicalShardPlan,
artifacts: &[LexicalRebuildShardMergeArtifact],
) -> Result<()> {
if artifacts.len() != shard_plan.shards.len() {
return Err(anyhow::anyhow!(
"staged lexical rebuild validated {} shard artifacts but planned {} shards",
artifacts.len(),
shard_plan.shards.len()
));
}
// Lengths are equal at this point, so `zip` visits every planned shard.
for (expected, artifact) in shard_plan.shards.iter().zip(artifacts.iter()) {
if artifact.first_shard_index != expected.shard_index
|| artifact.last_shard_index != expected.shard_index
{
return Err(anyhow::anyhow!(
"staged lexical rebuild shard artifact order mismatch: expected shard {} but got range {}..={}",
expected.shard_index,
artifact.first_shard_index,
artifact.last_shard_index
));
}
let observed_docs = artifact.docs;
if lexical_shard_message_count_is_known(expected.message_count)
&& observed_docs > expected.message_count
{
return Err(anyhow::anyhow!(
"validated lexical rebuild shard {} has {} docs which EXCEEDS its shard plan's \
{} source messages — investigate fan-out or duplicate-document bugs",
expected.shard_index,
observed_docs,
expected.message_count
));
}
// Without a known message count there is nothing further to compare.
if !lexical_shard_message_count_is_known(expected.message_count) {
continue;
}
let filtered = expected.message_count.saturating_sub(observed_docs);
if filtered > 0 {
tracing::debug!(
target: "cass::indexer::lexical_rebuild",
shard_index = expected.shard_index,
observed_docs,
planned_message_count = expected.message_count,
filtered_messages = filtered,
"validated lexical rebuild shard has fewer docs than the shard plan's message \
count; gap is hard-noise/empty-content filtering"
);
}
}
Ok(())
}
/// Decides whether a pending commit made it into the index.
///
/// The commit is considered landed when the index currently has a fingerprint
/// and either there was no fingerprint before the commit or the current one
/// differs from it. With no current fingerprint the answer is always `false`.
fn pending_commit_landed(
    base_meta_fingerprint: Option<&str>,
    current_meta_fingerprint: Option<&str>,
) -> bool {
    let Some(current) = current_meta_fingerprint else {
        return false;
    };
    match base_meta_fingerprint {
        None => true,
        Some(base) => base != current,
    }
}
/// Resolves a pending commit left in `state` against the index on disk.
///
/// With no pending commit the state is returned untouched and nothing is
/// persisted. Otherwise the current index fingerprint is compared with the
/// pending commit's base fingerprint: if the commit landed it is finalized
/// with the current fingerprint, if not the pending entry is cleared. In both
/// cases the runtime section is cleared and the state is persisted before it
/// is returned.
fn reconcile_pending_lexical_commit(
    index_path: &Path,
    mut state: LexicalRebuildState,
) -> Result<LexicalRebuildState> {
    let Some(base_meta_fingerprint) = state
        .pending
        .as_ref()
        .map(|pending| pending.base_meta_fingerprint.clone())
    else {
        return Ok(state);
    };
    let current_meta_fingerprint = index_meta_fingerprint(index_path)?;
    let landed = pending_commit_landed(
        base_meta_fingerprint.as_deref(),
        current_meta_fingerprint.as_deref(),
    );
    if landed {
        state.finalize_commit(current_meta_fingerprint);
    } else {
        state.clear_pending();
    }
    state.clear_runtime();
    persist_lexical_rebuild_state(index_path, &state)?;
    Ok(state)
}
/// Clears runtime data left in an incomplete checkpoint before a new run.
///
/// Only acts when the state is incomplete *and* its runtime section reports
/// `is_observed()`; in that case the runtime section is cleared and the state
/// persisted. Otherwise this is a no-op.
fn prepare_lexical_rebuild_state_for_active_run(
    index_path: &Path,
    state: &mut LexicalRebuildState,
) -> Result<()> {
    let has_stale_runtime = state.is_incomplete() && state.runtime.is_observed();
    if has_stale_runtime {
        state.clear_runtime();
        persist_lexical_rebuild_state(index_path, state)
    } else {
        Ok(())
    }
}
/// Persists `state` for `index_path`.
///
/// Currently a direct call to `persist_lexical_rebuild_state` with no extra work.
fn persist_lexical_rebuild_state_for_active_run_start(
index_path: &Path,
state: &LexicalRebuildState,
) -> Result<()> {
persist_lexical_rebuild_state(index_path, state)
}
/// Aligns the checkpoint's page size with `LEXICAL_REBUILD_PAGE_SIZE`.
///
/// Nothing happens when the page size already matches. A differing but
/// compatible page size is overwritten and the state persisted.
///
/// # Errors
///
/// Fails when the recorded page size is not accepted by
/// `lexical_rebuild_page_size_is_compatible`, or when persisting fails.
fn normalize_lexical_rebuild_state_for_current_run(
    index_path: &Path,
    state: &mut LexicalRebuildState,
) -> Result<()> {
    let recorded_page_size = state.page_size;
    if recorded_page_size != LEXICAL_REBUILD_PAGE_SIZE {
        if !lexical_rebuild_page_size_is_compatible(recorded_page_size) {
            anyhow::bail!(
                "refusing to normalize incompatible lexical rebuild checkpoint page_size={} at {}",
                recorded_page_size,
                index_path.display()
            );
        }
        state.page_size = LEXICAL_REBUILD_PAGE_SIZE;
        return persist_lexical_rebuild_state(index_path, state);
    }
    Ok(())
}
/// Looks up the conversation id at ordinal position `committed_offset - 1`.
///
/// Returns `Ok(None)` for a non-positive offset, when the single-row lookup
/// yields no row, or when the row found carries no id.
///
/// # Errors
///
/// Returns an error when the storage lookup fails.
fn resolve_legacy_lexical_rebuild_conversation_id_from_offset(
    storage: &FrankenStorage,
    committed_offset: i64,
) -> Result<Option<i64>> {
    if committed_offset <= 0 {
        return Ok(None);
    }
    // The conversation just before the offset is the one to resolve.
    let ordinal = committed_offset.saturating_sub(1);
    let rows = storage
        .list_conversations_for_lexical_rebuild_by_offset(
            1,
            ordinal,
            &HashMap::new(),
            &HashMap::new(),
        )
        .with_context(|| {
            format!(
                "resolving legacy lexical rebuild cursor at ordinal offset {}",
                ordinal
            )
        })?;
    let first_row = rows.into_iter().next();
    Ok(first_row.and_then(|conversation| conversation.id))
}
/// Upgrades legacy ordinal-offset resume cursors in `state` to conversation ids.
///
/// Completed states are left alone. Two cursors are examined:
///
/// * the committed cursor (`committed_offset` > 0 with no
///   `committed_conversation_id`), and
/// * the pending cursor (`pending.next_offset` > 0 with no
///   `pending.next_conversation_id`).
///
/// Each is resolved through
/// `resolve_legacy_lexical_rebuild_conversation_id_from_offset`. If the
/// committed cursor cannot be resolved, the whole state is replaced by a fresh
/// one (same `db`, current page size), persisted, and the function returns. If
/// the pending cursor cannot be resolved, only the pending entry is cleared,
/// the state is persisted, and the function returns. When at least one cursor
/// was upgraded the state is persisted once at the end.
fn upgrade_lexical_rebuild_state_resume_cursor_if_needed(
storage: &FrankenStorage,
index_path: &Path,
state: &mut LexicalRebuildState,
) -> Result<()> {
if state.completed {
return Ok(());
}
// Tracks whether any cursor was upgraded so the state is written once.
let mut changed = false;
if state.committed_offset > 0 && state.committed_conversation_id.is_none() {
let Some(conversation_id) = resolve_legacy_lexical_rebuild_conversation_id_from_offset(
storage,
state.committed_offset,
)?
else {
tracing::warn!(
path = %index_path.display(),
committed_offset = state.committed_offset,
"discarding incomplete lexical rebuild checkpoint because the legacy ordinal cursor could not be upgraded to a stable conversation id"
);
// Unresolvable committed cursor: start over with a fresh state.
*state = LexicalRebuildState::new(state.db.clone(), LEXICAL_REBUILD_PAGE_SIZE);
persist_lexical_rebuild_state(index_path, state)?;
return Ok(());
};
tracing::info!(
path = %index_path.display(),
committed_offset = state.committed_offset,
committed_conversation_id = conversation_id,
"upgraded lexical rebuild checkpoint from ordinal offset resume to stable conversation-id resume"
);
state.committed_conversation_id = Some(conversation_id);
changed = true;
}
if let Some(pending) = state.pending.as_mut()
&& pending.next_offset > 0
&& pending.next_conversation_id.is_none()
{
let Some(conversation_id) = resolve_legacy_lexical_rebuild_conversation_id_from_offset(
storage,
pending.next_offset,
)?
else {
tracing::warn!(
path = %index_path.display(),
next_offset = pending.next_offset,
"dropping legacy pending lexical rebuild progress because the ordinal cursor could not be upgraded to a stable conversation id"
);
// Unresolvable pending cursor: drop only the pending entry. Any
// committed-cursor upgrade made above is persisted here too.
state.clear_pending();
persist_lexical_rebuild_state(index_path, state)?;
return Ok(());
};
tracing::info!(
path = %index_path.display(),
next_offset = pending.next_offset,
next_conversation_id = conversation_id,
"upgraded pending lexical rebuild checkpoint from ordinal offset resume to stable conversation-id resume"
);
pending.next_conversation_id = Some(conversation_id);
changed = true;
}
if changed {
persist_lexical_rebuild_state(index_path, state)?;
}
Ok(())
}
/// Records pending rebuild progress in `state` and persists it.
///
/// The base meta fingerprint stored with the pending entry is
/// `base_meta_fingerprint_override` when given, otherwise the fingerprint read
/// from the index at `index_path`.
///
/// Three outcomes are possible:
///
/// * **Stale update** – the requested processed-conversation or indexed-doc
///   count is lower than what `state` already reports. The progress fields are
///   left alone; only the runtime snapshot is updated (and persisted) if it
///   differs from the stored one.
/// * **Already recorded** – the pending entry and runtime snapshot already
///   match every requested value. Nothing is written.
/// * **New progress** – the pending entry and runtime snapshot are updated
///   and the state is persisted.
#[allow(clippy::too_many_arguments)]
fn persist_pending_lexical_rebuild_progress_with_base_meta_fingerprint(
index_path: &Path,
state: &mut LexicalRebuildState,
next_conversation_id: Option<i64>,
processed_conversations: usize,
indexed_docs: usize,
runtime: &LexicalRebuildPipelineRuntimeSnapshot,
base_meta_fingerprint_override: Option<&str>,
) -> Result<()> {
let base_meta_fingerprint = match base_meta_fingerprint_override {
Some(base_meta_fingerprint) => Some(base_meta_fingerprint.to_string()),
None => index_meta_fingerprint(index_path)?,
};
let current_processed_conversations = state.reported_processed_conversations();
let current_indexed_docs = state.reported_indexed_docs();
// Never move the reported counters backwards.
if processed_conversations < current_processed_conversations
|| indexed_docs < current_indexed_docs
{
tracing::warn!(
requested_next_conversation_id = ?next_conversation_id,
requested_processed_conversations = processed_conversations,
current_processed_conversations,
requested_indexed_docs = indexed_docs,
current_indexed_docs,
"ignoring stale lexical rebuild progress checkpoint update while preserving runtime telemetry"
);
if state.runtime != *runtime {
state.set_runtime(runtime);
persist_lexical_rebuild_state(index_path, state)?;
}
return Ok(());
}
// Skip the write when nothing would change on disk.
let already_recorded = state.pending.as_ref().is_some_and(|pending| {
pending.next_conversation_id == next_conversation_id
&& pending.processed_conversations == processed_conversations
&& pending.indexed_docs == indexed_docs
&& pending.base_meta_fingerprint == base_meta_fingerprint
&& state.runtime == *runtime
});
if already_recorded {
return Ok(());
}
state.record_pending_commit(
next_conversation_id,
processed_conversations,
indexed_docs,
base_meta_fingerprint,
);
state.set_runtime(runtime);
persist_lexical_rebuild_state(index_path, state)
}
/// Records and persists pending rebuild progress without a fingerprint override.
///
/// Forwards to
/// `persist_pending_lexical_rebuild_progress_with_base_meta_fingerprint` with
/// `None` as the override, so the base fingerprint is read from the index at
/// `index_path`.
fn persist_pending_lexical_rebuild_progress(
index_path: &Path,
state: &mut LexicalRebuildState,
next_conversation_id: Option<i64>,
processed_conversations: usize,
indexed_docs: usize,
runtime: &LexicalRebuildPipelineRuntimeSnapshot,
) -> Result<()> {
persist_pending_lexical_rebuild_progress_with_base_meta_fingerprint(
index_path,
state,
next_conversation_id,
processed_conversations,
indexed_docs,
runtime,
None,
)
}
/// Persists staged rebuild progress when a heartbeat is due or `force` is set.
///
/// Returns `Ok(false)` without touching anything when `force` is `false` and
/// `should_persist_lexical_rebuild_progress` says no heartbeat is due.
/// Otherwise the progress is persisted, the index-run lock progress is bumped
/// via `bump_index_run_lock_progress_if_present`, the optional perf profile's
/// heartbeat counter and duration are updated, the two heartbeat trackers
/// (`conversations_since_progress_persist`, `last_progress_persist`) are
/// reset, and `Ok(true)` is returned.
///
/// # Errors
///
/// Propagates a failure from persisting the progress; in that case the
/// heartbeat trackers are not reset.
#[allow(clippy::too_many_arguments)]
fn maybe_persist_staged_lexical_rebuild_progress(
index_path: &Path,
state: &mut LexicalRebuildState,
next_conversation_id: Option<i64>,
processed_conversations: usize,
indexed_docs: usize,
runtime: &LexicalRebuildPipelineRuntimeSnapshot,
base_meta_fingerprint_override: Option<&str>,
force: bool,
conversations_since_progress_persist: &mut usize,
progress_heartbeat_interval_conversations: usize,
last_progress_persist: &mut Instant,
progress_heartbeat_interval: Duration,
progress_bump: Option<&Arc<AtomicI64>>,
mut perf_profile: Option<&mut LexicalRebuildPerfProfile>,
) -> Result<bool> {
if !force
&& !should_persist_lexical_rebuild_progress(
*conversations_since_progress_persist,
progress_heartbeat_interval_conversations,
last_progress_persist.elapsed(),
progress_heartbeat_interval,
)
{
return Ok(false);
}
// Only start the timer when a profile is attached.
let heartbeat_progress_started = perf_profile.as_ref().map(|_| Instant::now());
persist_pending_lexical_rebuild_progress_with_base_meta_fingerprint(
index_path,
state,
next_conversation_id,
processed_conversations,
indexed_docs,
runtime,
base_meta_fingerprint_override,
)?;
bump_index_run_lock_progress_if_present(progress_bump);
if let (Some(profile), Some(started)) = (perf_profile.as_mut(), heartbeat_progress_started) {
profile.heartbeat_persist_count = profile.heartbeat_persist_count.saturating_add(1);
profile.heartbeat_progress_duration += started.elapsed();
}
// Restart both heartbeat trackers from this persist.
*conversations_since_progress_persist = 0;
*last_progress_persist = Instant::now();
Ok(true)
}
/// Formats the content fingerprint string.
///
/// The result is `content-v1:` followed by the conversation count, the
/// maximum conversation id, and the maximum message id, separated by colons.
fn lexical_rebuild_content_fingerprint_value(
    total_conversations: usize,
    max_conversation_id: i64,
    max_message_id: i64,
) -> String {
    let parts = [
        String::from("content-v1"),
        total_conversations.to_string(),
        max_conversation_id.to_string(),
        max_message_id.to_string(),
    ];
    parts.join(":")
}
/// Formats the deferred content fingerprint: `content-pending-v1:` followed by
/// the conversation count.
fn lexical_rebuild_deferred_content_fingerprint(total_conversations: usize) -> String {
    let mut fingerprint = String::from("content-pending-v1:");
    fingerprint.push_str(&total_conversations.to_string());
    fingerprint
}
/// Computes the content fingerprint of the canonical database.
///
/// Queries the maximum `conversations.id` and the maximum `messages.id`
/// (each defaulting to 0 on an empty table) and combines them with
/// `total_conversations` via `lexical_rebuild_content_fingerprint_value`.
/// When the `CASS_PREP_PROFILE` environment variable is set, the duration of
/// each query is printed to stderr.
///
/// # Errors
///
/// Returns an error if either query fails.
fn lexical_rebuild_content_fingerprint(
    storage: &FrankenStorage,
    total_conversations: usize,
) -> Result<String> {
    let profiling = std::env::var_os("CASS_PREP_PROFILE").is_some();

    let conversation_timer = Instant::now();
    let newest_conversation_id: i64 = storage
        .raw()
        .query_row_map(
            "SELECT COALESCE(MAX(id), 0) FROM conversations",
            &[] as &[ParamValue],
            |row| row.get_typed(0),
        )
        .context("computing lexical rebuild conversation fingerprint")?;
    if profiling {
        let elapsed_ms = conversation_timer.elapsed().as_millis();
        eprintln!(
            "CASS_PREP_PROFILE step=fingerprint_conversations step_ms={}",
            elapsed_ms
        );
    }

    let message_timer = Instant::now();
    let newest_message_id: i64 = storage
        .raw()
        .query_row_map(
            "SELECT COALESCE(MAX(id), 0) FROM messages",
            &[] as &[ParamValue],
            |row| row.get_typed(0),
        )
        .context("computing lexical rebuild message fingerprint")?;
    if profiling {
        let elapsed_ms = message_timer.elapsed().as_millis();
        eprintln!(
            "CASS_PREP_PROFILE step=fingerprint_messages step_ms={}",
            elapsed_ms
        );
    }

    let fingerprint = lexical_rebuild_content_fingerprint_value(
        total_conversations,
        newest_conversation_id,
        newest_message_id,
    );
    Ok(fingerprint)
}
/// Opens `db_path` read-only and computes its lexical content fingerprint.
///
/// The fingerprint combines the exact conversation count with the values
/// produced by `lexical_rebuild_content_fingerprint`.
///
/// # Errors
///
/// Returns an error if the database cannot be opened or if counting or
/// fingerprinting fails. The storage handle is closed (best effort) on the
/// failure path as well as on success, so a failed query does not skip the
/// explicit close.
fn lexical_rebuild_storage_fingerprint(db_path: &Path) -> Result<String> {
    let mut storage = FrankenStorage::open_readonly(db_path).with_context(|| {
        format!(
            "opening readonly storage to compute lexical fingerprint for {}",
            db_path.display()
        )
    })?;
    // Hold the outcome instead of using `?` so the close below always runs.
    let fingerprint = count_total_conversations_exact(&storage).and_then(|total_conversations| {
        lexical_rebuild_content_fingerprint(&storage, total_conversations)
    });
    storage.close_best_effort_in_place();
    fingerprint
}
/// Counts the rows in the `conversations` table.
///
/// A negative count is treated as 0, and a count that does not fit in `usize`
/// is reported as `usize::MAX`.
///
/// # Errors
///
/// Returns an error if the count query fails.
fn count_total_conversations_exact(storage: &FrankenStorage) -> Result<usize> {
    let row_count: i64 = storage
        .raw()
        .query_row_map(
            "SELECT COUNT(*) FROM conversations",
            &[] as &[ParamValue],
            |row| row.get_typed(0),
        )
        .context("counting canonical conversations for lexical rebuild state")?;
    let non_negative = row_count.max(0);
    match usize::try_from(non_negative) {
        Ok(count) => Ok(count),
        Err(_) => Ok(usize::MAX),
    }
}
/// Returns the current wall-clock time as milliseconds since the Unix epoch.
///
/// Yields 0 when the system clock reads earlier than the epoch and saturates
/// at `i64::MAX` if the millisecond count does not fit in an `i64`.
fn semantic_indexing_now_ms() -> i64 {
    match SystemTime::now().duration_since(UNIX_EPOCH) {
        Ok(since_epoch) => i64::try_from(since_epoch.as_millis()).unwrap_or(i64::MAX),
        Err(_) => 0,
    }
}
/// Converts `time` to milliseconds since the Unix epoch.
///
/// Returns `None` when `time` lies before the epoch. The value saturates at
/// `i64::MAX` if the millisecond count does not fit in an `i64`.
fn system_time_to_epoch_millis(time: SystemTime) -> Option<i64> {
    let since_epoch = time.duration_since(UNIX_EPOCH).ok()?;
    Some(i64::try_from(since_epoch.as_millis()).unwrap_or(i64::MAX))
}
/// Maps an embedder id to its semantic tier.
///
/// `"minilm-384"` is the quality tier and `"fnv1a-384"` the fast tier; any
/// other id yields `None`.
fn semantic_tier_for_embedder_id(embedder_id: &str) -> Option<SemanticTierKind> {
    let tier = if embedder_id == "minilm-384" {
        SemanticTierKind::Quality
    } else if embedder_id == "fnv1a-384" {
        SemanticTierKind::Fast
    } else {
        return None;
    };
    Some(tier)
}
/// Returns the model revision string recorded for an embedder id.
///
/// `"fnv1a-384"` reports the literal revision `"hash"`. Every other id reports
/// the revision of the `minilm_v2` model manifest.
fn semantic_model_revision_for_embedder_id(embedder_id: &str) -> String {
    match embedder_id {
        "fnv1a-384" => String::from("hash"),
        _ => {
            let manifest = crate::search::model_download::ModelManifest::minilm_v2();
            manifest.revision.clone()
        }
    }
}
/// Publishes a semantic index artifact record to the semantic manifest.
///
/// Does nothing (and returns `Ok(())`) when `embedder_id` does not map to a
/// known tier. Otherwise it gathers the exact conversation count, the content
/// fingerprint of `storage`, and the size of the file at `index_path`; loads
/// the manifest from `data_dir` (or a default one); publishes an
/// `ArtifactRecord` marked `ready`; refreshes the manifest backlog; and saves
/// the manifest back to `data_dir`.
///
/// # Errors
///
/// Returns an error if counting or fingerprinting fails, if `index_path`
/// cannot be stat'ed, or if the manifest cannot be loaded or saved.
#[allow(clippy::too_many_arguments)]
fn publish_direct_semantic_artifact(
storage: &FrankenStorage,
data_dir: &Path,
index_path: &Path,
embedder_id: &str,
embedder_dimension: usize,
embedded_doc_count: u64,
build_started_at_ms: i64,
) -> Result<()> {
let Some(tier) = semantic_tier_for_embedder_id(embedder_id) else {
tracing::debug!(
embedder = embedder_id,
"skipping direct semantic manifest publish: unknown embedder tier"
);
return Ok(());
};
let total_conversations_raw = count_total_conversations_exact(storage)?;
let db_fingerprint = lexical_rebuild_content_fingerprint(storage, total_conversations_raw)?;
let total_conversations = u64::try_from(total_conversations_raw).unwrap_or(u64::MAX);
let size_bytes = fs::metadata(index_path)
.with_context(|| {
format!(
"stat published semantic index {} for direct manifest publish",
index_path.display()
)
})?
.len();
// Store the index path relative to `data_dir` when it lives underneath it;
// otherwise the path is recorded unchanged.
let relative_index_path = index_path
.strip_prefix(data_dir)
.unwrap_or(index_path)
.to_string_lossy()
.into_owned();
let model_revision = semantic_model_revision_for_embedder_id(embedder_id);
let mut manifest = SemanticManifest::load_or_default(data_dir).map_err(|err| {
anyhow::anyhow!("loading semantic manifest for direct artifact publish: {err}")
})?;
let now = semantic_indexing_now_ms();
manifest.publish_artifact(ArtifactRecord {
tier,
embedder_id: embedder_id.to_string(),
model_revision,
schema_version: SEMANTIC_SCHEMA_VERSION,
chunking_version: CHUNKING_STRATEGY_VERSION,
dimension: embedder_dimension,
doc_count: embedded_doc_count,
conversation_count: total_conversations,
db_fingerprint: db_fingerprint.clone(),
index_path: relative_index_path,
size_bytes,
started_at_ms: build_started_at_ms,
completed_at_ms: now,
ready: true,
});
manifest.refresh_backlog(total_conversations, &db_fingerprint);
manifest
.save(data_dir)
.map_err(|err| anyhow::anyhow!("saving semantic manifest after direct publish: {err}"))?;
tracing::info!(
embedder = embedder_id,
tier = tier.as_str(),
doc_count = embedded_doc_count,
conversation_count = total_conversations,
"published direct semantic artifact to manifest"
);
Ok(())
}
/// Counts the rows in the `messages` table.
///
/// A negative count is treated as 0, and a count that does not fit in `usize`
/// is reported as `usize::MAX`.
///
/// # Errors
///
/// Returns an error if the count query fails.
fn count_total_messages_exact(storage: &FrankenStorage) -> Result<usize> {
    let row_count: i64 = storage
        .raw()
        .query_row_map(
            "SELECT COUNT(*) FROM messages",
            &[] as &[ParamValue],
            |row| row.get_typed(0),
        )
        .context("counting canonical messages for lexical rebuild state")?;
    let non_negative = row_count.max(0);
    match usize::try_from(non_negative) {
        Ok(count) => Ok(count),
        Err(_) => Ok(usize::MAX),
    }
}
/// Returns the largest `conversations.id`, or `None` when that maximum is not
/// positive (which includes an empty table, where the query yields 0).
///
/// # Errors
///
/// Returns an error if the query fails.
fn max_conversation_id_exact(storage: &FrankenStorage) -> Result<Option<i64>> {
    let newest_id: i64 = storage
        .raw()
        .query_row_map(
            "SELECT COALESCE(MAX(id), 0) FROM conversations",
            &[] as &[ParamValue],
            |row| row.get_typed(0),
        )
        .context("computing lexical rebuild max conversation id")?;
    if newest_id > 0 {
        Ok(Some(newest_id))
    } else {
        Ok(None)
    }
}
/// Repair plan produced by `choose_incremental_canonical_lexical_repair_plan`
/// when the lexical index should be repaired from the canonical database.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
struct IncrementalCanonicalLexicalRepairPlan {
/// Canonical message count copied from the repair context.
canonical_messages: usize,
/// Document count observed in the live index, when one was available.
observed_tantivy_docs: Option<usize>,
/// Static identifier describing why the repair was chosen.
reason: &'static str,
}
/// Inputs consulted when deciding whether an incremental index run should
/// first repair the lexical index from the canonical database.
///
/// Any of `full_refresh`, `force_rebuild`, `resume_lexical_rebuild`,
/// `targeted_watch_once_only`, or a non-zero `salvage_messages_imported`
/// rules the repair out.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
struct IncrementalCanonicalLexicalRepairContext {
full_refresh: bool,
force_rebuild: bool,
resume_lexical_rebuild: bool,
targeted_watch_once_only: bool,
salvage_messages_imported: usize,
/// Message count of the canonical database; zero rules the repair out.
canonical_messages: usize,
/// When set, a repair is planned without comparing document counts.
tantivy_requires_rebuild: bool,
/// Document count observed in the live index, if it could be read.
observed_tantivy_docs: Option<usize>,
/// Only affects logging: a sparse index is reported with a warning when set.
published_index_validated_for_current_data: bool,
}
/// Summary of a lexical rebuild.
// NOTE(review): field meanings below are inferred from their names and types
// only; the code that fills this struct is outside this section — confirm.
#[derive(Debug, Clone, PartialEq, Eq)]
pub(crate) struct LexicalRebuildOutcome {
/// Presumably the number of documents written to the index.
pub indexed_docs: usize,
/// Presumably the number of source messages seen, when it was tracked.
pub observed_messages: Option<usize>,
pub exact_checkpoint_persisted: bool,
pub equivalence: Option<LexicalRebuildEquivalenceEvidence>,
}
/// Startup switches for a lexical rebuild; every switch defaults to `false`.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
struct LexicalRebuildStartupOptions {
// NOTE(review): presumably selects the deferred `content-pending-v1`
// fingerprint at startup instead of querying the database — confirm at the
// call site, which is outside this section.
defer_initial_content_fingerprint: bool,
}
/// Reports whether an incremental canonical lexical repair is worth evaluating.
///
/// Evaluation is skipped for full refreshes, forced rebuilds, resumed lexical
/// rebuilds, targeted watch-once runs, and runs that imported salvaged messages.
fn should_evaluate_incremental_canonical_lexical_repair(
    context: &IncrementalCanonicalLexicalRepairContext,
) -> bool {
    let ruled_out = context.full_refresh
        || context.force_rebuild
        || context.resume_lexical_rebuild
        || context.targeted_watch_once_only
        || context.salvage_messages_imported != 0;
    !ruled_out
}
fn choose_incremental_canonical_lexical_repair_plan(
context: IncrementalCanonicalLexicalRepairContext,
) -> Option<IncrementalCanonicalLexicalRepairPlan> {
if context.full_refresh
|| context.force_rebuild
|| context.resume_lexical_rebuild
|| context.targeted_watch_once_only
|| context.salvage_messages_imported > 0
|| context.canonical_messages == 0
{
return None;
}
if context.tantivy_requires_rebuild {
return Some(IncrementalCanonicalLexicalRepairPlan {
canonical_messages: context.canonical_messages,
observed_tantivy_docs: context.observed_tantivy_docs,
reason: "incremental_index_repairs_missing_or_invalid_tantivy_from_authoritative_canonical_db_before_scan",
});
}
let observed_tantivy_docs = context.observed_tantivy_docs?;
if observed_tantivy_docs < context.canonical_messages {
if context.published_index_validated_for_current_data {
tracing::warn!(
canonical_messages = context.canonical_messages,
observed_tantivy_docs,
"completed lexical checkpoint matches the canonical DB, but the live lexical index is sparse; repairing derived search assets from SQLite"
);
}
return Some(IncrementalCanonicalLexicalRepairPlan {
canonical_messages: context.canonical_messages,
observed_tantivy_docs: Some(observed_tantivy_docs),
reason: "incremental_index_repairs_sparse_tantivy_from_authoritative_canonical_db_before_scan",
});
}
None
}
/// Reports whether historical databases should be salvaged.
///
/// Never for a canonical-only full rebuild. Otherwise salvage runs when the
/// canonical storage was rebuilt, when it held no sessions before salvage, or
/// when historical bundles are still pending.
fn should_salvage_historical_databases(
    canonical_storage_rebuilt: bool,
    canonical_sessions_before_salvage: usize,
    has_pending_historical_bundles: bool,
    canonical_only_full_rebuild: bool,
) -> bool {
    let salvage_needed = canonical_storage_rebuilt
        || canonical_sessions_before_salvage == 0
        || has_pending_historical_bundles;
    !canonical_only_full_rebuild && salvage_needed
}
/// Reports whether the run should be limited to the targeted watch-once paths.
///
/// Requires watch-once paths, watch mode off, and no full rebuild. Given
/// those, the targeted run applies when no rebuild is needed or when the
/// canonical store held no sessions before salvage.
fn should_run_targeted_watch_once_only(
    has_watch_once_paths: bool,
    watch_enabled: bool,
    full_rebuild: bool,
    needs_rebuild: bool,
    canonical_sessions_before_salvage: usize,
) -> bool {
    let eligible = has_watch_once_paths && !watch_enabled && !full_rebuild;
    eligible && (!needs_rebuild || canonical_sessions_before_salvage == 0)
}
/// Reports whether every explicit watch-once path is confirmed absent.
///
/// Always `false` when watch, full, force-rebuild, semantic, or HNSW options
/// are set, or when no watch-once paths were given. A path counts as absent
/// only when `try_exists` positively reports `Ok(false)`; an existence check
/// that errors does not count.
fn should_skip_absent_explicit_watch_once_paths(opts: &IndexOptions) -> bool {
    let other_work_requested =
        opts.watch || opts.full || opts.force_rebuild || opts.semantic || opts.build_hnsw;
    if other_work_requested {
        return false;
    }
    match opts.watch_once_paths.as_ref() {
        Some(paths) if !paths.is_empty() => paths
            .iter()
            .all(|path| matches!(path.try_exists(), Ok(false))),
        _ => false,
    }
}
fn can_skip_absent_explicit_watch_once_index_run(opts: &IndexOptions) -> bool {
if !should_skip_absent_explicit_watch_once_paths(opts) {
return false;
}
let Ok(storage) = FrankenStorage::open_readonly(&opts.db_path) else {
return false;
};
let db_schema_current = matches!(
storage.schema_version(),
Ok(crate::storage::sqlite::CURRENT_SCHEMA_VERSION)
);
let _ = storage.close();
if !db_schema_current {
return false;
}
let index_path = crate::search::tantivy::expected_index_dir(&opts.data_dir);
let schema_hash_path = index_path.join("schema_hash.json");
let schema_matches = schema_hash_path.exists()
&& std::fs::read_to_string(&schema_hash_path)
.ok()
.and_then(|content| serde_json::from_str::<serde_json::Value>(&content).ok())
.and_then(|json| {
json.get("schema_hash")
.and_then(|value| value.as_str())
.map(schema_hash_matches)
})
.unwrap_or(false);
if !schema_matches {
return false;
}
matches!(
crate::search::tantivy::searchable_index_summary(&index_path),
Ok(Some(_))
)
}
fn current_searchable_index_summary_available(index_path: &Path) -> bool {
let schema_hash_path = index_path.join("schema_hash.json");
let schema_matches = schema_hash_path.exists()
&& std::fs::read_to_string(&schema_hash_path)
.ok()
.and_then(|content| serde_json::from_str::<serde_json::Value>(&content).ok())
.and_then(|json| {
json.get("schema_hash")
.and_then(|value| value.as_str())
.map(schema_hash_matches)
})
.unwrap_or(false);
schema_matches
&& matches!(
crate::search::tantivy::searchable_index_summary(index_path),
Ok(Some(_))
)
}
/// Reports whether all explicit watch-once paths are unchanged since the last index.
///
/// Returns `Ok(false)` when watch, full, force-rebuild, semantic, or HNSW
/// options are set, or when no watch-once paths were given. The paths are
/// classified against `roots`; if any resulting root is not reported unchanged
/// by `explicit_watch_once_root_unchanged_after_last_index`, the answer is
/// `Ok(false)`. When classification yields nothing, or every root is
/// unchanged, the answer is `Ok(true)`.
fn should_skip_unchanged_explicit_watch_once_paths(
    opts: &IndexOptions,
    storage: &FrankenStorage,
    roots: &[(ConnectorKind, ScanRoot)],
) -> Result<bool> {
    let other_work_requested =
        opts.watch || opts.full || opts.force_rebuild || opts.semantic || opts.build_hnsw;
    if other_work_requested {
        return Ok(false);
    }
    let paths = match opts.watch_once_paths.as_ref() {
        Some(paths) if !paths.is_empty() => paths,
        _ => return Ok(false),
    };
    // An empty classification falls straight through to `Ok(true)`.
    for (_, root, _, _) in classify_paths(paths.clone(), roots, true) {
        let unchanged = explicit_watch_once_root_unchanged_after_last_index(storage, &root)?;
        if !unchanged {
            return Ok(false);
        }
    }
    Ok(true)
}
/// Reports whether an index run over unchanged watch-once paths can be skipped.
///
/// Requires explicit watch-once paths with none of the watch, full,
/// force-rebuild, semantic, or HNSW options set; every path unchanged against
/// the watch roots built from the additional scan roots; and a current,
/// searchable index at `index_path`.
fn can_skip_unchanged_explicit_watch_once_index_run(
    opts: &IndexOptions,
    storage: &FrankenStorage,
    index_path: &Path,
) -> Result<bool> {
    let other_work_requested =
        opts.watch || opts.full || opts.force_rebuild || opts.semantic || opts.build_hnsw;
    if other_work_requested {
        return Ok(false);
    }
    let has_explicit_paths = opts
        .watch_once_paths
        .as_ref()
        .is_some_and(|paths| !paths.is_empty());
    if !has_explicit_paths {
        return Ok(false);
    }
    let watch_roots =
        build_watch_roots(additional_scan_roots_for_scan_or_watch(storage, &opts.data_dir));
    let paths_unchanged =
        should_skip_unchanged_explicit_watch_once_paths(opts, storage, &watch_roots)?;
    if !paths_unchanged {
        return Ok(false);
    }
    Ok(current_searchable_index_summary_available(index_path))
}
/// Reports whether the broad scan can be skipped after an authoritative repair.
///
/// True only for a watch-once run (paths given, watch mode off, no full
/// rebuild) in which the index was repaired from the canonical database.
fn should_skip_broad_scan_after_watch_once_authoritative_repair(
    has_watch_once_paths: bool,
    watch_enabled: bool,
    full_rebuild: bool,
    repaired_from_authoritative_canonical_db: bool,
) -> bool {
    if !has_watch_once_paths || watch_enabled || full_rebuild {
        return false;
    }
    repaired_from_authoritative_canonical_db
}
/// Reports whether fallback FTS should be repaired after an index run.
///
/// Only full rebuilds qualify, and canonical-only full rebuilds are excluded.
fn should_repair_fallback_fts_after_full_index_run(
    full_rebuild: bool,
    canonical_only_full_rebuild: bool,
) -> bool {
    if canonical_only_full_rebuild {
        false
    } else {
        full_rebuild
    }
}
/// Result of `repair_fallback_fts_after_full_index_run` when a repair was attempted.
#[derive(Debug, Clone, PartialEq, Eq)]
enum FallbackFtsRepairOutcome {
/// The fallback FTS was already recorded as healthy for this archive
/// fingerprint, so the consistency repair was not run.
SkippedKnownHealthyForFingerprint { archive_fingerprint: String },
/// The consistency repair ran; carries its report.
Repaired(FtsConsistencyRepair),
}
/// Result of a daily-stats repair step.
// NOTE(review): the code producing these variants is outside this section;
// the variant descriptions below are inferred from names only — confirm.
#[derive(Debug, Clone, PartialEq, Eq)]
enum DailyStatsRepairOutcome {
/// Presumably: the check was skipped because the stats were already known
/// healthy for this archive fingerprint.
SkippedKnownHealthyForFingerprint {
archive_fingerprint: String,
},
/// Presumably: the stats were checked and needed no rebuild.
AlreadyHealthy,
/// Presumably: the stats were rebuilt, with the resulting counts.
Rebuilt {
rows_created: i64,
total_sessions: i64,
},
}
/// Runs the fallback FTS consistency repair after a full index run.
///
/// Returns `Ok(None)` when
/// [`should_repair_fallback_fts_after_full_index_run`] says no repair is due.
/// Otherwise a fresh storage connection is opened on `db_path` (10 second
/// timeout) and used for all further work; `_storage` is not touched.
///
/// When `known_archive_fingerprint` is supplied and storage already reports
/// the fallback FTS as healthy for it, the repair is skipped. If a repair does
/// run, the fingerprint is recorded afterwards.
///
/// # Errors
/// Propagates failures from opening the connection, the health probe, the
/// repair itself, and recording the fingerprint.
fn repair_fallback_fts_after_full_index_run(
    _storage: &FrankenStorage,
    db_path: &Path,
    full_rebuild: bool,
    canonical_only_full_rebuild: bool,
    known_archive_fingerprint: Option<&str>,
) -> Result<Option<FallbackFtsRepairOutcome>> {
    let repair_due =
        should_repair_fallback_fts_after_full_index_run(full_rebuild, canonical_only_full_rebuild);
    if !repair_due {
        return Ok(None);
    }

    let open_timeout = std::time::Duration::from_secs(10);
    let fresh_storage =
        crate::storage::sqlite::open_franken_storage_with_timeout(db_path, open_timeout)
            .with_context(|| {
                format!(
                    "opening fresh frankensqlite connection for fallback FTS repair at {}",
                    db_path.display()
                )
            })?;

    let outcome = match known_archive_fingerprint {
        Some(archive_fingerprint) => {
            let already_healthy = fresh_storage
                .fallback_fts_is_known_healthy_for_archive_fingerprint(archive_fingerprint)?;
            if already_healthy {
                FallbackFtsRepairOutcome::SkippedKnownHealthyForFingerprint {
                    archive_fingerprint: archive_fingerprint.to_string(),
                }
            } else {
                let repair = fresh_storage.ensure_search_fallback_fts_consistency()?;
                fresh_storage
                    .record_search_fallback_fts_archive_fingerprint(archive_fingerprint)?;
                FallbackFtsRepairOutcome::Repaired(repair)
            }
        }
        None => {
            FallbackFtsRepairOutcome::Repaired(
                fresh_storage.ensure_search_fallback_fts_consistency()?,
            )
        }
    };
    Ok(Some(outcome))
}
/// Reports whether a full rebuild requires a historical restart.
///
/// All three arguments are currently ignored: this always returns
/// `Ok(false)`.
fn full_rebuild_requires_historical_restart(
_storage: &FrankenStorage,
_db_path: &Path,
_canonical_sessions_before_salvage: usize,
) -> Result<bool> {
Ok(false)
}
/// Builds the [`LexicalRebuildDbState`] for `db_path` when only the
/// conversation total is known.
///
/// The message total is recorded as `0` and the storage fingerprint comes from
/// `lexical_rebuild_content_fingerprint`. When the `CASS_PREP_PROFILE`
/// environment variable is set, the duration of each step is printed to
/// stderr.
///
/// # Errors
/// Propagates any failure from computing the content fingerprint.
fn lexical_rebuild_db_state_with_total_conversations(
    storage: &FrankenStorage,
    db_path: &Path,
    total_conversations: usize,
) -> Result<LexicalRebuildDbState> {
    let profiling_enabled = std::env::var_os("CASS_PREP_PROFILE").is_some();

    let normalize_timer = Instant::now();
    let path_identity = crate::normalize_path_identity(db_path);
    let normalized_db_path = path_identity.to_string_lossy().into_owned();
    if profiling_enabled {
        eprintln!(
            "CASS_PREP_PROFILE step=normalize_db_path step_ms={}",
            normalize_timer.elapsed().as_millis()
        );
    }

    let fingerprint_timer = Instant::now();
    let storage_fingerprint = lexical_rebuild_content_fingerprint(storage, total_conversations)?;
    if profiling_enabled {
        eprintln!(
            "CASS_PREP_PROFILE step=compute_storage_fingerprint step_ms={}",
            fingerprint_timer.elapsed().as_millis()
        );
    }

    let db_state = lexical_rebuild_db_state_from_storage_fingerprint(
        normalized_db_path.as_str(),
        total_conversations,
        0,
        storage_fingerprint,
    );
    Ok(db_state)
}
/// Assembles a [`LexicalRebuildDbState`] from an already-normalized database
/// path, the two totals, and a precomputed storage fingerprint.
fn lexical_rebuild_db_state_from_storage_fingerprint(
    normalized_db_path: &str,
    total_conversations: usize,
    total_messages: usize,
    storage_fingerprint: String,
) -> LexicalRebuildDbState {
    let db_path = String::from(normalized_db_path);
    LexicalRebuildDbState {
        db_path,
        total_conversations,
        total_messages,
        storage_fingerprint,
    }
}
/// Builds the [`LexicalRebuildDbState`] for `db_path` using caller-supplied
/// conversation and message totals.
///
/// # Errors
/// Propagates any failure from computing the content fingerprint.
fn lexical_rebuild_db_state_with_exact_totals(
    storage: &FrankenStorage,
    db_path: &Path,
    total_conversations: usize,
    total_messages: usize,
) -> Result<LexicalRebuildDbState> {
    let path_identity = crate::normalize_path_identity(db_path);
    let normalized_db_path = path_identity.to_string_lossy().into_owned();
    let storage_fingerprint = lexical_rebuild_content_fingerprint(storage, total_conversations)?;
    let db_state = lexical_rebuild_db_state_from_storage_fingerprint(
        normalized_db_path.as_str(),
        total_conversations,
        total_messages,
        storage_fingerprint,
    );
    Ok(db_state)
}
/// Builds a [`LexicalRebuildDbState`] without touching storage.
///
/// The message total is `0` and the fingerprint is the deferred placeholder
/// from `lexical_rebuild_deferred_content_fingerprint`.
fn deferred_lexical_rebuild_db_state(
    db_path: &Path,
    total_conversations: usize,
) -> LexicalRebuildDbState {
    let path_identity = crate::normalize_path_identity(db_path);
    let normalized_db_path = path_identity.to_string_lossy().into_owned();
    let storage_fingerprint = lexical_rebuild_deferred_content_fingerprint(total_conversations);
    LexicalRebuildDbState {
        db_path: normalized_db_path,
        total_conversations,
        total_messages: 0,
        storage_fingerprint,
    }
}
/// Test-only convenience: counts the conversations in `storage` exactly and
/// builds the matching [`LexicalRebuildDbState`].
///
/// # Errors
/// Propagates failures from the count and from building the state.
#[cfg(test)]
fn lexical_rebuild_db_state(
    storage: &FrankenStorage,
    db_path: &Path,
) -> Result<LexicalRebuildDbState> {
    let conversation_total = count_total_conversations_exact(storage)?;
    let db_state =
        lexical_rebuild_db_state_with_total_conversations(storage, db_path, conversation_total)?;
    Ok(db_state)
}
/// Test-only, serializable summary of lexical rebuild progress.
///
/// NOTE(review): nothing in this excerpt constructs this type; the fields
/// presumably mirror the same-named fields of `LexicalRebuildCheckpoint`
/// (which additionally carries `schema_hash` and `page_size`) — confirm.
#[cfg(test)]
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize)]
pub(crate) struct LexicalRebuildSnapshot {
pub db_path: String,
pub total_conversations: usize,
pub storage_fingerprint: String,
pub committed_offset: i64,
pub committed_conversation_id: Option<i64>,
pub processed_conversations: usize,
pub indexed_docs: usize,
pub completed: bool,
pub updated_at_ms: i64,
}
/// Serializable view of the persisted lexical rebuild state, as produced by
/// `load_lexical_rebuild_checkpoint`.
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize)]
pub(crate) struct LexicalRebuildCheckpoint {
/// Database path stored in the rebuild state.
pub db_path: String,
/// Conversation total stored in the rebuild state.
pub total_conversations: usize,
/// Storage fingerprint stored in the rebuild state.
pub storage_fingerprint: String,
/// Committed offset copied verbatim from the rebuild state.
pub committed_offset: i64,
/// Value of the state's `reported_committed_conversation_id()`.
pub committed_conversation_id: Option<i64>,
/// Value of the state's `reported_processed_conversations()`.
pub processed_conversations: usize,
/// Value of the state's `reported_indexed_docs()`.
pub indexed_docs: usize,
/// Schema hash copied verbatim from the rebuild state.
pub schema_hash: String,
/// Page size copied verbatim from the rebuild state.
pub page_size: i64,
/// Whether the rebuild state is marked completed.
pub completed: bool,
/// Last-update timestamp copied from the rebuild state (milliseconds, per
/// the field name).
pub updated_at_ms: i64,
}
/// Version stamped into every `LexicalShardPlan` as `planner_version` and
/// fed as the first byte into the plan-id hash.
const LEXICAL_SHARD_PLAN_VERSION: u8 = 1;
/// Per-shard ceilings used by `plan_lexical_rebuild_shards`.
///
/// The planner closes the current shard before adding a conversation that
/// would push any of these three totals over its ceiling.
#[cfg_attr(not(test), allow(dead_code))]
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize)]
pub(crate) struct LexicalShardPlannerBudgets {
/// Maximum number of conversations in one shard.
pub max_conversations_per_shard: usize,
/// Maximum summed `message_count` in one shard.
pub max_messages_per_shard: usize,
/// Maximum summed `message_bytes` in one shard.
pub max_message_bytes_per_shard: usize,
}
impl LexicalShardPlannerBudgets {
    /// Returns a copy in which every ceiling is at least `1`, so a zero
    /// budget can never make a shard unfillable.
    fn normalized(self) -> Self {
        let Self {
            max_conversations_per_shard,
            max_messages_per_shard,
            max_message_bytes_per_shard,
        } = self;
        Self {
            max_conversations_per_shard: max_conversations_per_shard.max(1),
            max_messages_per_shard: max_messages_per_shard.max(1),
            max_message_bytes_per_shard: max_message_bytes_per_shard.max(1),
        }
    }
}
/// One conversation as seen by the shard planner: its id plus the message
/// count and message byte total that are charged against the shard budgets.
#[cfg_attr(not(test), allow(dead_code))]
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize)]
pub(crate) struct LexicalShardPlannerConversation {
pub conversation_id: i64,
pub message_count: usize,
pub message_bytes: usize,
}
/// One shard of a `LexicalShardPlan`: a run of conversations taken in
/// ascending conversation-id order.
#[cfg_attr(not(test), allow(dead_code))]
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize)]
pub(crate) struct LexicalShardPlanShard {
/// Position of the shard in the plan (its index when it was pushed).
pub shard_index: usize,
/// Id of the first conversation placed in the shard.
pub first_conversation_id: i64,
/// Id of the last conversation placed in the shard.
pub last_conversation_id: i64,
/// Number of conversations in the shard.
pub conversation_count: usize,
/// Summed message count; the storage-backed planners overwrite this with
/// `LEXICAL_SHARD_UNKNOWN_MESSAGE_COUNT`.
pub message_count: usize,
/// Summed message bytes of the shard's conversations.
pub message_bytes: usize,
/// Hex BLAKE3 hash over the shard's conversation ids.
pub conversation_id_fingerprint: String,
/// `true` when the shard holds exactly one conversation that by itself
/// exceeds the message or byte budget.
pub oversized_single_conversation: bool,
}
/// Result of shard planning: the normalized budgets, overall totals, and the
/// ordered shard list.
#[cfg_attr(not(test), allow(dead_code))]
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize)]
pub(crate) struct LexicalShardPlan {
/// Always `LEXICAL_SHARD_PLAN_VERSION` when built by the planner.
pub planner_version: u8,
/// Hex BLAKE3 hash over the version, budgets, totals, shards, and oversized
/// ids (see `lexical_shard_plan_id`).
pub plan_id: String,
/// Budgets after normalization (every ceiling at least 1).
pub budgets: LexicalShardPlannerBudgets,
pub total_conversations: usize,
pub total_messages: usize,
pub total_message_bytes: usize,
/// Ids of conversations that exceeded the message or byte budget on their
/// own, in planning order.
pub oversized_conversation_ids: Vec<i64>,
pub shards: Vec<LexicalShardPlanShard>,
}
/// Sentinel stored in a shard's `message_count` when the real count is not
/// known.
const LEXICAL_SHARD_UNKNOWN_MESSAGE_COUNT: usize = usize::MAX;

/// Returns `true` unless `message_count` is the
/// [`LEXICAL_SHARD_UNKNOWN_MESSAGE_COUNT`] sentinel.
fn lexical_shard_message_count_is_known(message_count: usize) -> bool {
    !matches!(message_count, LEXICAL_SHARD_UNKNOWN_MESSAGE_COUNT)
}
/// Partitions `conversations` into shards that respect `budgets`.
///
/// Conversations are processed in ascending `conversation_id` order (the
/// input slice is copied and sorted). A shard is closed as soon as adding the
/// next conversation would exceed any of the three budgets. A conversation
/// whose own message count or byte size exceeds the per-shard budget is placed
/// alone in a shard flagged `oversized_single_conversation`, and its id is
/// recorded in `oversized_conversation_ids`.
///
/// Budgets are normalized first so that every ceiling is at least 1. The
/// returned plan carries overall totals and a plan id derived from the
/// budgets, totals, shards, and oversized ids.
#[cfg_attr(not(test), allow(dead_code))]
pub(crate) fn plan_lexical_rebuild_shards(
conversations: &[LexicalShardPlannerConversation],
budgets: LexicalShardPlannerBudgets,
) -> LexicalShardPlan {
let budgets = budgets.normalized();
let mut ordered = conversations.to_vec();
ordered.sort_by_key(|conversation| conversation.conversation_id);
let total_conversations = ordered.len();
let total_messages = ordered
.iter()
.map(|conversation| conversation.message_count)
.sum();
let total_message_bytes = ordered
.iter()
.map(|conversation| conversation.message_bytes)
.sum();
let mut shards = Vec::new();
let mut oversized_conversation_ids = Vec::new();
// Accumulators describing the shard currently being filled.
let mut first_conversation_id = None;
let mut last_conversation_id = None;
let mut conversation_count = 0usize;
let mut message_count = 0usize;
let mut message_bytes = 0usize;
let mut conversation_ids = Vec::new();
// Emits the shard described by the accumulators (if it is non-empty) and
// resets them. All state is passed in explicitly, so the closure captures
// nothing mutable and can be called while the accumulators are in use.
let flush_current_shard = |shards: &mut Vec<LexicalShardPlanShard>,
first_conversation_id: &mut Option<i64>,
last_conversation_id: &mut Option<i64>,
conversation_count: &mut usize,
message_count: &mut usize,
message_bytes: &mut usize,
conversation_ids: &mut Vec<i64>,
oversized_single_conversation: bool| {
if *conversation_count == 0 {
return;
}
debug_assert_eq!(*conversation_count, conversation_ids.len());
shards.push(LexicalShardPlanShard {
shard_index: shards.len(),
first_conversation_id: (*first_conversation_id)
.expect("non-empty shard should have a first conversation id"),
last_conversation_id: (*last_conversation_id)
.expect("non-empty shard should have a last conversation id"),
conversation_count: *conversation_count,
message_count: *message_count,
message_bytes: *message_bytes,
conversation_id_fingerprint: lexical_shard_conversation_ids_fingerprint(
conversation_ids,
),
oversized_single_conversation,
});
*first_conversation_id = None;
*last_conversation_id = None;
*conversation_count = 0;
*message_count = 0;
*message_bytes = 0;
conversation_ids.clear();
};
for conversation in ordered {
// Oversized: this conversation alone is larger than a whole shard budget.
let exceeds_budget = conversation.message_count > budgets.max_messages_per_shard
|| conversation.message_bytes > budgets.max_message_bytes_per_shard;
// Adding it to the open (non-empty) shard would break one of the budgets.
let would_overflow_current = conversation_count > 0
&& (conversation_count.saturating_add(1) > budgets.max_conversations_per_shard
|| message_count.saturating_add(conversation.message_count)
> budgets.max_messages_per_shard
|| message_bytes.saturating_add(conversation.message_bytes)
> budgets.max_message_bytes_per_shard);
if would_overflow_current {
flush_current_shard(
&mut shards,
&mut first_conversation_id,
&mut last_conversation_id,
&mut conversation_count,
&mut message_count,
&mut message_bytes,
&mut conversation_ids,
false,
);
}
if exceeds_budget {
oversized_conversation_ids.push(conversation.conversation_id);
// Close whatever is still open (a no-op if it was just flushed above) so the
// oversized conversation gets a shard of its own.
flush_current_shard(
&mut shards,
&mut first_conversation_id,
&mut last_conversation_id,
&mut conversation_count,
&mut message_count,
&mut message_bytes,
&mut conversation_ids,
false,
);
first_conversation_id = Some(conversation.conversation_id);
last_conversation_id = Some(conversation.conversation_id);
conversation_count = 1;
message_count = conversation.message_count;
message_bytes = conversation.message_bytes;
conversation_ids.push(conversation.conversation_id);
// Emit the single-conversation shard immediately, flagged as oversized.
flush_current_shard(
&mut shards,
&mut first_conversation_id,
&mut last_conversation_id,
&mut conversation_count,
&mut message_count,
&mut message_bytes,
&mut conversation_ids,
true,
);
continue;
}
// Regular case: append the conversation to the open shard.
if first_conversation_id.is_none() {
first_conversation_id = Some(conversation.conversation_id);
}
last_conversation_id = Some(conversation.conversation_id);
conversation_count = conversation_count.saturating_add(1);
message_count = message_count.saturating_add(conversation.message_count);
message_bytes = message_bytes.saturating_add(conversation.message_bytes);
conversation_ids.push(conversation.conversation_id);
}
// Emit the trailing, partially filled shard (if any).
flush_current_shard(
&mut shards,
&mut first_conversation_id,
&mut last_conversation_id,
&mut conversation_count,
&mut message_count,
&mut message_bytes,
&mut conversation_ids,
false,
);
let plan_id = lexical_shard_plan_id(
budgets,
&shards,
total_conversations,
total_messages,
total_message_bytes,
&oversized_conversation_ids,
);
LexicalShardPlan {
planner_version: LEXICAL_SHARD_PLAN_VERSION,
plan_id,
budgets,
total_conversations,
total_messages,
total_message_bytes,
oversized_conversation_ids,
shards,
}
}
/// Hashes the little-endian bytes of each conversation id, in slice order,
/// with BLAKE3 and returns the digest as a hex string.
fn lexical_shard_conversation_ids_fingerprint(conversation_ids: &[i64]) -> String {
    let mut digest = blake3::Hasher::new();
    conversation_ids.iter().for_each(|id| {
        digest.update(&id.to_le_bytes());
    });
    let hex = digest.finalize().to_hex();
    hex.to_string()
}
/// Derives the plan id: a hex BLAKE3 digest over the planner version, the
/// three budgets, the three totals, every shard's fields (in shard order),
/// and finally the oversized conversation ids.
///
/// All integers are fed as little-endian bytes; the field order is part of
/// the id and must not change.
fn lexical_shard_plan_id(
    budgets: LexicalShardPlannerBudgets,
    shards: &[LexicalShardPlanShard],
    total_conversations: usize,
    total_messages: usize,
    total_message_bytes: usize,
    oversized_conversation_ids: &[i64],
) -> String {
    let mut digest = blake3::Hasher::new();
    digest.update(&[LEXICAL_SHARD_PLAN_VERSION]);

    // Budgets first, then totals.
    let header = [
        budgets.max_conversations_per_shard,
        budgets.max_messages_per_shard,
        budgets.max_message_bytes_per_shard,
        total_conversations,
        total_messages,
        total_message_bytes,
    ];
    for value in header {
        digest.update(&value.to_le_bytes());
    }

    for shard in shards {
        digest
            .update(&shard.shard_index.to_le_bytes())
            .update(&shard.first_conversation_id.to_le_bytes())
            .update(&shard.last_conversation_id.to_le_bytes())
            .update(&shard.conversation_count.to_le_bytes())
            .update(&shard.message_count.to_le_bytes())
            .update(&shard.message_bytes.to_le_bytes())
            .update(shard.conversation_id_fingerprint.as_bytes())
            .update(&[u8::from(shard.oversized_single_conversation)]);
    }

    for oversized_id in oversized_conversation_ids {
        digest.update(&oversized_id.to_le_bytes());
    }

    digest.finalize().to_hex().to_string()
}
/// Target number of shards: four per lane, where the lane count is the larger
/// of the two parallelism figures limited to `1..=64`. The result is limited
/// to `1..=256`.
fn lexical_rebuild_target_shard_count(
    worker_parallelism: usize,
    tantivy_writer_threads: usize,
) -> usize {
    let widest = std::cmp::max(worker_parallelism, tantivy_writer_threads);
    let lanes = widest.clamp(1, 64);
    let shards = lanes.saturating_mul(4);
    shards.clamp(1, 256)
}
/// Spreads `total` evenly (rounding up) over `target_shards` and limits the
/// per-shard share to `min_budget..=max_budget`.
///
/// Zero inputs are tolerated: `total`, `target_shards`, and `min_budget` are
/// each treated as at least 1, and `max_budget` is raised to `min_budget` if
/// it is smaller.
fn lexical_rebuild_default_shard_budget(
    total: usize,
    target_shards: usize,
    min_budget: usize,
    max_budget: usize,
) -> usize {
    let floor = min_budget.max(1);
    let ceiling = max_budget.max(floor);
    let shard_count = target_shards.max(1);
    let even_share = total.max(1).div_ceil(shard_count);
    even_share.clamp(floor, ceiling)
}
/// Smallest default per-shard message byte budget (16 MiB).
const LEXICAL_REBUILD_STAGED_SHARD_MESSAGE_BYTES_FLOOR: usize = 16 * 1024 * 1024;
/// Default per-shard message byte budget when available memory is unknown
/// (64 MiB).
const LEXICAL_REBUILD_STAGED_SHARD_MESSAGE_BYTES_DEFAULT: usize = 64 * 1024 * 1024;
/// Largest default per-shard message byte budget (128 MiB).
const LEXICAL_REBUILD_STAGED_SHARD_MESSAGE_BYTES_CEILING: usize = 128 * 1024 * 1024;
/// Divisor applied to available memory to obtain the per-shard byte budget.
const LEXICAL_REBUILD_STAGED_SHARD_MESSAGE_BYTES_MEMORY_FRACTION: u64 = 2_048;

/// Default per-shard message byte budget for a machine with
/// `available_memory_bytes` of free memory.
///
/// With no memory reading the fixed default is returned. Otherwise the budget
/// is available memory divided by the memory fraction, limited to the
/// floor..=ceiling range; a share that does not fit in `usize` counts as the
/// ceiling.
fn lexical_rebuild_default_staged_shard_max_message_bytes_for_available_memory(
    available_memory_bytes: Option<u64>,
) -> usize {
    match available_memory_bytes {
        None => LEXICAL_REBUILD_STAGED_SHARD_MESSAGE_BYTES_DEFAULT,
        Some(available) => {
            let share = available / LEXICAL_REBUILD_STAGED_SHARD_MESSAGE_BYTES_MEMORY_FRACTION;
            let share = match usize::try_from(share) {
                Ok(share) => share,
                Err(_) => LEXICAL_REBUILD_STAGED_SHARD_MESSAGE_BYTES_CEILING,
            };
            share.clamp(
                LEXICAL_REBUILD_STAGED_SHARD_MESSAGE_BYTES_FLOOR,
                LEXICAL_REBUILD_STAGED_SHARD_MESSAGE_BYTES_CEILING,
            )
        }
    }
}
/// Effective per-shard message byte ceiling for staged shard builds.
///
/// A positive integer in `CASS_TANTIVY_REBUILD_STAGED_SHARD_MAX_MESSAGE_BYTES`
/// wins; otherwise the memory-derived default is used. Either way the value is
/// capped at the steady-state commit byte interval and is never below 1.
fn lexical_rebuild_staged_shard_max_message_bytes(
    settings: &LexicalRebuildPipelineSettingsSnapshot,
) -> usize {
    let configured = dotenvy::var("CASS_TANTIVY_REBUILD_STAGED_SHARD_MAX_MESSAGE_BYTES")
        .ok()
        .and_then(|raw| raw.parse::<usize>().ok())
        .filter(|bytes| *bytes > 0);
    let requested = match configured {
        Some(bytes) => bytes,
        None => lexical_rebuild_default_staged_shard_max_message_bytes_for_available_memory(
            responsiveness::available_memory_bytes(),
        ),
    };
    let commit_cap = settings.steady_commit_every_message_bytes.max(1);
    requested.min(commit_cap).max(1)
}
/// Byte ceiling for pending shard builds.
///
/// A positive integer in
/// `CASS_TANTIVY_REBUILD_PENDING_SHARD_BUILD_MAX_MESSAGE_BYTES` wins;
/// otherwise the larger of the pipeline in-flight byte limit and the staged
/// shard byte ceiling is used. Never below 1.
fn lexical_rebuild_pending_shard_build_max_message_bytes(
    settings: &LexicalRebuildPipelineSettingsSnapshot,
) -> usize {
    let configured = dotenvy::var("CASS_TANTIVY_REBUILD_PENDING_SHARD_BUILD_MAX_MESSAGE_BYTES")
        .ok()
        .and_then(|raw| raw.parse::<usize>().ok())
        .filter(|bytes| *bytes > 0);
    let ceiling = match configured {
        Some(bytes) => bytes,
        None => {
            let staged_ceiling = lexical_rebuild_staged_shard_max_message_bytes(settings);
            settings
                .pipeline_max_message_bytes_in_flight
                .max(staged_ceiling)
        }
    };
    ceiling.max(1)
}
/// Maximum number of pending shard build jobs, limited to `1..=256`.
///
/// A positive integer in `CASS_TANTIVY_REBUILD_PENDING_SHARD_BUILD_MAX_JOBS`
/// wins; otherwise 32 jobs per configured staged shard builder.
fn lexical_rebuild_pending_shard_build_max_jobs(
    settings: &LexicalRebuildPipelineSettingsSnapshot,
) -> usize {
    let configured = dotenvy::var("CASS_TANTIVY_REBUILD_PENDING_SHARD_BUILD_MAX_JOBS")
        .ok()
        .and_then(|raw| raw.parse::<usize>().ok())
        .filter(|jobs| *jobs > 0);
    let jobs = match configured {
        Some(jobs) => jobs,
        None => settings.staged_shard_builders.max(1).saturating_mul(32),
    };
    jobs.clamp(1, 256)
}
/// Derives default planner budgets from the pipeline settings and the overall
/// totals.
///
/// Each budget is the matching total spread over the target shard count,
/// limited to a settings-derived floor and ceiling (see
/// [`lexical_rebuild_default_shard_budget`]). For bytes, the ceiling is the
/// staged shard byte ceiling and the floor is the startup commit byte interval
/// capped at that ceiling.
fn lexical_rebuild_default_shard_planner_budgets_for_totals(
    settings: &LexicalRebuildPipelineSettingsSnapshot,
    total_conversations: usize,
    total_messages: usize,
    total_message_bytes: usize,
) -> LexicalShardPlannerBudgets {
    let target_shards =
        lexical_rebuild_target_shard_count(settings.workers, settings.tantivy_writer_threads);
    let byte_ceiling = lexical_rebuild_staged_shard_max_message_bytes(settings);
    let byte_floor = settings
        .startup_commit_every_message_bytes
        .min(byte_ceiling)
        .max(1);

    let max_conversations_per_shard = lexical_rebuild_default_shard_budget(
        total_conversations,
        target_shards,
        settings.startup_batch_fetch_conversations,
        settings.steady_commit_every_conversations,
    );
    let max_messages_per_shard = lexical_rebuild_default_shard_budget(
        total_messages,
        target_shards,
        settings.startup_commit_every_messages,
        settings.steady_commit_every_messages,
    );
    let max_message_bytes_per_shard = lexical_rebuild_default_shard_budget(
        total_message_bytes,
        target_shards,
        byte_floor,
        byte_ceiling,
    );

    LexicalShardPlannerBudgets {
        max_conversations_per_shard,
        max_messages_per_shard,
        max_message_bytes_per_shard,
    }
}
/// Loads the conversation footprints from storage and converts each into the
/// planner's input record.
///
/// # Errors
/// Fails (with added context) when the footprint listing fails.
fn lexical_rebuild_shard_planner_conversations_from_storage(
    storage: &FrankenStorage,
) -> Result<Vec<LexicalShardPlannerConversation>> {
    let footprints = storage
        .list_conversation_footprints_for_lexical_rebuild()
        .with_context(|| "listing canonical lexical rebuild conversation footprints")?;
    let mut planner_inputs = Vec::new();
    for footprint in footprints {
        planner_inputs.push(LexicalShardPlannerConversation {
            conversation_id: footprint.conversation_id,
            message_count: footprint.message_count,
            message_bytes: footprint.message_bytes,
        });
    }
    Ok(planner_inputs)
}
/// Upper bound on conversations per shard when planning from ids only.
const LEXICAL_REBUILD_ID_ONLY_MAX_CONVERSATIONS_PER_SHARD: usize = 64;
/// Plans shards from conversation ids alone, using averaged estimates in
/// place of per-conversation footprints.
///
/// Every conversation is given the same estimated message count
/// (`total_messages` divided by the number of listed ids, rounded up, at
/// least 1) and an estimated byte size of that count times
/// `LEXICAL_REBUILD_PLANNER_ESTIMATED_BYTES_PER_MESSAGE`. Because the
/// per-shard counts are estimates, each shard's `message_count` is replaced by
/// `LEXICAL_SHARD_UNKNOWN_MESSAGE_COUNT` and the plan totals and plan id are
/// recomputed afterwards.
///
/// `total_conversations` is only logged; the plan uses the number of ids
/// actually listed.
///
/// # Errors
/// Fails (with added context) when listing the conversation ids fails.
fn plan_lexical_rebuild_shards_from_conversation_ids_with_settings(
storage: &FrankenStorage,
settings: &LexicalRebuildPipelineSettingsSnapshot,
total_conversations: usize,
total_messages: usize,
) -> Result<LexicalShardPlan> {
let conversation_ids = storage
.list_conversation_ids_for_lexical_rebuild()
.with_context(|| "listing canonical conversation ids for id-only lexical shard planning")?;
let exact_total_conversations = conversation_ids.len();
// Average messages per conversation, rounded up; 0 only when there are no
// conversations at all.
let average_messages_per_conversation = if exact_total_conversations == 0 {
0
} else {
total_messages.div_ceil(exact_total_conversations).max(1)
};
let estimated_message_bytes_per_conversation = average_messages_per_conversation
.saturating_mul(LEXICAL_REBUILD_PLANNER_ESTIMATED_BYTES_PER_MESSAGE);
let max_message_bytes_per_shard = lexical_rebuild_staged_shard_max_message_bytes(settings);
// How many average-sized conversations fit in the byte ceiling (at least 1).
let max_conversations_by_average_bytes = if estimated_message_bytes_per_conversation == 0 {
LEXICAL_REBUILD_ID_ONLY_MAX_CONVERSATIONS_PER_SHARD
} else {
max_message_bytes_per_shard
.saturating_div(estimated_message_bytes_per_conversation)
.max(1)
};
// Conversation budget: the steady commit interval, limited to the id-only
// cap and to what fits in the byte ceiling.
let max_conversations_per_shard = settings
.steady_commit_every_conversations
.clamp(1, LEXICAL_REBUILD_ID_ONLY_MAX_CONVERSATIONS_PER_SHARD)
.min(max_conversations_by_average_bytes.max(1))
.max(1);
let max_messages_per_shard = average_messages_per_conversation
.saturating_mul(max_conversations_per_shard)
.max(1);
let estimated_message_bytes_per_shard = estimated_message_bytes_per_conversation
.saturating_mul(max_conversations_per_shard)
.max(1)
.min(max_message_bytes_per_shard.max(1));
tracing::info!(
total_conversations,
exact_total_conversations,
total_messages,
average_messages_per_conversation,
estimated_message_bytes_per_conversation,
max_conversations_per_shard,
max_messages_per_shard,
max_message_bytes_per_shard = estimated_message_bytes_per_shard,
"using conservative conversation-id-only lexical shard plan because canonical DB has no tail footprint metadata"
);
// Give every conversation the same estimated footprint.
let conversations = conversation_ids
.into_iter()
.map(|conversation_id| LexicalShardPlannerConversation {
conversation_id,
message_count: average_messages_per_conversation,
message_bytes: estimated_message_bytes_per_conversation,
})
.collect::<Vec<_>>();
let budgets = LexicalShardPlannerBudgets {
max_conversations_per_shard,
max_messages_per_shard,
max_message_bytes_per_shard: estimated_message_bytes_per_shard,
};
let mut plan = plan_lexical_rebuild_shards(&conversations, budgets);
// The per-shard message counts were built from averages, so mark them unknown.
for shard in &mut plan.shards {
shard.message_count = LEXICAL_SHARD_UNKNOWN_MESSAGE_COUNT;
}
plan.total_conversations = exact_total_conversations;
plan.total_messages = total_messages;
plan.total_message_bytes =
total_messages.saturating_mul(LEXICAL_REBUILD_PLANNER_ESTIMATED_BYTES_PER_MESSAGE);
// Shards and totals changed after planning, so the id must be recomputed.
plan.plan_id = lexical_shard_plan_id(
plan.budgets,
&plan.shards,
plan.total_conversations,
plan.total_messages,
plan.total_message_bytes,
&plan.oversized_conversation_ids,
);
Ok(plan)
}
/// Plans lexical rebuild shards from what storage can provide.
///
/// When storage has no tail footprint metadata, the exact message total is
/// counted and planning falls back to the id-only planner. Otherwise the
/// per-conversation footprints are loaded, default budgets are derived from
/// their totals, and the footprint-based planner runs.
///
/// In both paths each shard's `message_count` ends up as
/// `LEXICAL_SHARD_UNKNOWN_MESSAGE_COUNT` and the plan id is recomputed to
/// match.
///
/// # Errors
/// Propagates failures from the metadata probe, the message count, the
/// footprint listing, and the id-only planner.
fn plan_lexical_rebuild_shards_from_storage_with_settings(
storage: &FrankenStorage,
settings: &LexicalRebuildPipelineSettingsSnapshot,
total_conversations: usize,
) -> Result<LexicalShardPlan> {
if !storage.lexical_rebuild_has_tail_footprint_metadata()? {
let total_messages = count_total_messages_exact(storage)?;
return plan_lexical_rebuild_shards_from_conversation_ids_with_settings(
storage,
settings,
total_conversations,
total_messages,
);
}
let conversations = lexical_rebuild_shard_planner_conversations_from_storage(storage)?;
// From here on the caller-supplied total is shadowed by the footprint count.
let total_conversations = conversations.len();
let total_messages = conversations
.iter()
.map(|conversation| conversation.message_count)
.sum();
let total_message_bytes = conversations
.iter()
.map(|conversation| conversation.message_bytes)
.sum();
let budgets = lexical_rebuild_default_shard_planner_budgets_for_totals(
settings,
total_conversations,
total_messages,
total_message_bytes,
);
let mut plan = plan_lexical_rebuild_shards(&conversations, budgets);
// NOTE(review): the shard message counts are overwritten with the unknown
// sentinel even though this path planned from per-conversation footprint
// counts — presumably the footprint counts are not trusted as exact; confirm
// this is intended.
for shard in &mut plan.shards {
shard.message_count = LEXICAL_SHARD_UNKNOWN_MESSAGE_COUNT;
}
// The shards changed after planning, so the id must be recomputed.
plan.plan_id = lexical_shard_plan_id(
plan.budgets,
&plan.shards,
plan.total_conversations,
plan.total_messages,
plan.total_message_bytes,
&plan.oversized_conversation_ids,
);
Ok(plan)
}
/// Cursor over an ordered list of planned shards.
#[derive(Debug, Clone)]
struct LexicalRebuildPlannedShardCursor {
/// Planned shards, sorted by `shard_index` when built via `for_resume`.
shards: Vec<LexicalShardPlanShard>,
/// Index into `shards` of the shard returned by `current()`; may equal
/// `shards.len()` once every shard has been passed.
next_shard_index: usize,
}
impl LexicalRebuildPlannedShardCursor {
    /// Builds a cursor positioned at the first shard that still has work
    /// after `resume_after_conversation_id` (treated as `0` when absent).
    ///
    /// Shards are sorted by `shard_index` first. Returns `None` when there are
    /// no shards or every shard ends at or before the resume point.
    fn for_resume(
        mut shards: Vec<LexicalShardPlanShard>,
        resume_after_conversation_id: Option<i64>,
    ) -> Option<Self> {
        shards.sort_by_key(|shard| shard.shard_index);
        let resume_floor = resume_after_conversation_id.unwrap_or(0);
        // `position` yields `None` both for an empty list and when nothing is left.
        let next_shard_index = shards
            .iter()
            .position(|shard| shard.last_conversation_id > resume_floor)?;
        Some(Self {
            shards,
            next_shard_index,
        })
    }

    /// The shard the cursor currently points at, if any remain.
    fn current(&self) -> Option<&LexicalShardPlanShard> {
        self.shards.get(self.next_shard_index)
    }

    /// Advances past every shard whose last conversation id is at or below
    /// `last_conversation_id`.
    fn skip_completed(&mut self, last_conversation_id: i64) {
        while self
            .current()
            .is_some_and(|shard| shard.last_conversation_id <= last_conversation_id)
        {
            self.advance();
        }
    }

    /// Moves the cursor to the next shard.
    fn advance(&mut self) {
        self.next_shard_index = self.next_shard_index.saturating_add(1);
    }
}
/// Concurrency limits for staged shard builders, as computed by
/// `lexical_rebuild_staged_shard_builder_settings`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
struct LexicalRebuildShardBuilderSettings {
/// Maximum number of builders; never above the planned shard count, the
/// configured builder count, or the writer parallelism budget.
max_builders: usize,
/// Writer thread budget taken from the pipeline settings (at least 1).
writer_parallelism_budget: usize,
}
/// Memory charged per staged shard builder when sizing default parallelism
/// (32 GiB).
const LEXICAL_REBUILD_STAGED_SHARD_BUILDER_MEMORY_SLOT_BYTES: u64 = 32 * 1024 * 1024 * 1024;
/// Numerator of the share of available memory that builders may use.
const LEXICAL_REBUILD_STAGED_SHARD_BUILDER_MEMORY_BUDGET_NUMERATOR: u64 = 2;
/// Denominator of the share of available memory that builders may use.
const LEXICAL_REBUILD_STAGED_SHARD_BUILDER_MEMORY_BUDGET_DENOMINATOR: u64 = 3;
/// Divisor applied to total memory to obtain the default build reserve.
const LEXICAL_REBUILD_STAGED_SHARD_BUILD_RESERVE_FRACTION: u64 = 8;
/// Default build reserve when total memory is unknown (16 GiB).
const LEXICAL_REBUILD_STAGED_SHARD_BUILD_RESERVE_FALLBACK_BYTES: u64 = 16 * 1024 * 1024 * 1024;
/// Lower bound of the default build reserve (4 GiB).
const LEXICAL_REBUILD_STAGED_SHARD_BUILD_RESERVE_FLOOR_BYTES: u64 = 4 * 1024 * 1024 * 1024;
/// Upper bound of the default build reserve (32 GiB).
const LEXICAL_REBUILD_STAGED_SHARD_BUILD_RESERVE_CEILING_BYTES: u64 = 32 * 1024 * 1024 * 1024;
/// Divisor applied to total memory to obtain the default emergency reserve.
const LEXICAL_REBUILD_STAGED_SHARD_BUILD_EMERGENCY_RESERVE_FRACTION: u64 = 32;
/// Default emergency reserve when total memory is unknown (4 GiB).
const LEXICAL_REBUILD_STAGED_SHARD_BUILD_EMERGENCY_FALLBACK_BYTES: u64 = 4 * 1024 * 1024 * 1024;
/// Lower bound of the default emergency reserve (1 GiB).
const LEXICAL_REBUILD_STAGED_SHARD_BUILD_EMERGENCY_FLOOR_BYTES: u64 = 1024 * 1024 * 1024;
/// Upper bound of the default emergency reserve (8 GiB).
const LEXICAL_REBUILD_STAGED_SHARD_BUILD_EMERGENCY_CEILING_BYTES: u64 = 8 * 1024 * 1024 * 1024;
// NOTE(review): the next three constants are not used in the visible part of
// the file; going by their names they tune the estimated build memory
// amplification (in thousandths) and a minimum build estimate — confirm.
const LEXICAL_REBUILD_STAGED_SHARD_BUILD_AMPLIFICATION_FLOOR_MILLI: u64 = 8_000;
const LEXICAL_REBUILD_STAGED_SHARD_BUILD_AMPLIFICATION_HEADROOM_MILLI: u64 = 1_500;
const LEXICAL_REBUILD_STAGED_SHARD_BUILD_MIN_ESTIMATED_BYTES: u64 = 512 * 1024 * 1024;

/// Default number of staged shard builders for `workers` worker threads and
/// an optional reading of available memory.
///
/// The CPU-side ceiling is `workers` limited to `1..=8`. When a memory reading
/// is present, the result is further limited by how many builder memory slots
/// fit into the builder share of available memory (also limited to `1..=8`).
fn lexical_rebuild_default_staged_shard_builder_parallelism_for_workers_and_memory(
    workers: usize,
    available_memory_bytes: Option<u64>,
) -> usize {
    let cpu_ceiling = workers.clamp(1, 8);
    match available_memory_bytes {
        None => cpu_ceiling,
        Some(available) => {
            let builder_share = available
                .saturating_mul(LEXICAL_REBUILD_STAGED_SHARD_BUILDER_MEMORY_BUDGET_NUMERATOR)
                / LEXICAL_REBUILD_STAGED_SHARD_BUILDER_MEMORY_BUDGET_DENOMINATOR.max(1);
            let slots =
                builder_share / LEXICAL_REBUILD_STAGED_SHARD_BUILDER_MEMORY_SLOT_BYTES.max(1);
            let memory_ceiling = match usize::try_from(slots) {
                Ok(slots) => slots,
                Err(_) => usize::MAX,
            }
            .clamp(1, 8);
            std::cmp::min(cpu_ceiling, memory_ceiling).max(1)
        }
    }
}
/// Default staged shard builder count for `workers`, using the current
/// available-memory reading.
fn lexical_rebuild_default_staged_shard_builder_parallelism_for_workers(workers: usize) -> usize {
    let available_memory = responsiveness::available_memory_bytes();
    lexical_rebuild_default_staged_shard_builder_parallelism_for_workers_and_memory(
        workers,
        available_memory,
    )
}
/// Converts `value` to `usize`, saturating at `usize::MAX` when it does not
/// fit.
fn usize_from_u64_saturating(value: u64) -> usize {
    match usize::try_from(value) {
        Ok(converted) => converted,
        Err(_) => usize::MAX,
    }
}
/// Default memory reserve for staged shard builds.
///
/// Total memory divided by the reserve fraction, or the fallback when total
/// memory is unknown; in both cases limited to the reserve floor..=ceiling and
/// converted to `usize` with saturation.
fn lexical_rebuild_default_staged_shard_build_memory_reserve_bytes_for_total_memory(
    total_memory_bytes: Option<u64>,
) -> usize {
    let unclamped = match total_memory_bytes {
        Some(total) => total / LEXICAL_REBUILD_STAGED_SHARD_BUILD_RESERVE_FRACTION.max(1),
        None => LEXICAL_REBUILD_STAGED_SHARD_BUILD_RESERVE_FALLBACK_BYTES,
    };
    let reserve = unclamped.clamp(
        LEXICAL_REBUILD_STAGED_SHARD_BUILD_RESERVE_FLOOR_BYTES,
        LEXICAL_REBUILD_STAGED_SHARD_BUILD_RESERVE_CEILING_BYTES,
    );
    usize_from_u64_saturating(reserve)
}
/// Memory reserve for staged shard builds.
///
/// A positive integer in
/// `CASS_TANTIVY_REBUILD_STAGED_SHARD_BUILD_MEMORY_RESERVE_BYTES` wins;
/// otherwise the default derived from total memory is used.
fn lexical_rebuild_staged_shard_build_memory_reserve_bytes() -> usize {
    let configured = dotenvy::var("CASS_TANTIVY_REBUILD_STAGED_SHARD_BUILD_MEMORY_RESERVE_BYTES")
        .ok()
        .and_then(|raw| raw.parse::<usize>().ok())
        .filter(|bytes| *bytes > 0);
    match configured {
        Some(bytes) => bytes,
        None => lexical_rebuild_default_staged_shard_build_memory_reserve_bytes_for_total_memory(
            responsiveness::total_memory_bytes(),
        ),
    }
}
/// Default emergency memory reserve for staged shard builds.
///
/// Total memory divided by the emergency fraction, or the emergency fallback
/// when total memory is unknown; in both cases limited to the emergency
/// floor..=ceiling and converted to `usize` with saturation.
fn lexical_rebuild_default_staged_shard_build_emergency_memory_reserve_bytes_for_total_memory(
    total_memory_bytes: Option<u64>,
) -> usize {
    let unclamped = match total_memory_bytes {
        Some(total) => {
            total / LEXICAL_REBUILD_STAGED_SHARD_BUILD_EMERGENCY_RESERVE_FRACTION.max(1)
        }
        None => LEXICAL_REBUILD_STAGED_SHARD_BUILD_EMERGENCY_FALLBACK_BYTES,
    };
    let reserve = unclamped.clamp(
        LEXICAL_REBUILD_STAGED_SHARD_BUILD_EMERGENCY_FLOOR_BYTES,
        LEXICAL_REBUILD_STAGED_SHARD_BUILD_EMERGENCY_CEILING_BYTES,
    );
    usize_from_u64_saturating(reserve)
}
/// Emergency memory reserve for staged shard builds.
///
/// A positive integer in
/// `CASS_TANTIVY_REBUILD_STAGED_SHARD_BUILD_EMERGENCY_MEMORY_RESERVE_BYTES`
/// wins; otherwise the default derived from total memory is used.
fn lexical_rebuild_staged_shard_build_emergency_memory_reserve_bytes() -> usize {
    let configured =
        dotenvy::var("CASS_TANTIVY_REBUILD_STAGED_SHARD_BUILD_EMERGENCY_MEMORY_RESERVE_BYTES")
            .ok()
            .and_then(|raw| raw.parse::<usize>().ok())
            .filter(|bytes| *bytes > 0);
    match configured {
        Some(bytes) => bytes,
        None => {
            lexical_rebuild_default_staged_shard_build_emergency_memory_reserve_bytes_for_total_memory(
                responsiveness::total_memory_bytes(),
            )
        }
    }
}
/// Staged shard builder count after passing the configured value through
/// `responsiveness::effective_worker_count`; never below 1.
fn lexical_rebuild_staged_shard_builder_parallelism() -> usize {
    let configured = lexical_rebuild_staged_shard_builder_parallelism_configured();
    let effective = responsiveness::effective_worker_count(configured);
    effective.max(1)
}
/// Configured staged shard builder count.
///
/// A positive integer in `CASS_TANTIVY_REBUILD_STAGED_SHARD_BUILDERS` wins;
/// otherwise the default for the current worker parallelism is used.
fn lexical_rebuild_staged_shard_builder_parallelism_configured() -> usize {
    let configured = dotenvy::var("CASS_TANTIVY_REBUILD_STAGED_SHARD_BUILDERS")
        .ok()
        .and_then(|raw| raw.parse::<usize>().ok())
        .filter(|builders| *builders > 0);
    match configured {
        Some(builders) => builders,
        None => lexical_rebuild_default_staged_shard_builder_parallelism_for_workers(
            lexical_rebuild_worker_parallelism(),
        ),
    }
}
/// Default merge worker count: what is left of `workers` after setting aside
/// three quarters of them (rounded up), limited to `1..=8`.
fn lexical_rebuild_default_staged_merge_worker_parallelism_for_workers(workers: usize) -> usize {
    let three_quarters = workers.saturating_mul(3).div_ceil(4);
    let remainder = workers.saturating_sub(three_quarters);
    remainder.clamp(1, 8)
}
/// Staged merge worker count after passing the configured value through
/// `responsiveness::effective_worker_count`; never below 1.
fn lexical_rebuild_staged_merge_worker_parallelism() -> usize {
    let configured = lexical_rebuild_staged_merge_worker_parallelism_configured();
    let effective = responsiveness::effective_worker_count(configured);
    effective.max(1)
}
/// Configured staged merge worker count.
///
/// A positive integer in `CASS_TANTIVY_REBUILD_STAGED_MERGE_WORKERS` wins;
/// otherwise the default for the current worker parallelism is used.
fn lexical_rebuild_staged_merge_worker_parallelism_configured() -> usize {
    let configured = dotenvy::var("CASS_TANTIVY_REBUILD_STAGED_MERGE_WORKERS")
        .ok()
        .and_then(|raw| raw.parse::<usize>().ok())
        .filter(|merge_workers| *merge_workers > 0);
    match configured {
        Some(merge_workers) => merge_workers,
        None => lexical_rebuild_default_staged_merge_worker_parallelism_for_workers(
            lexical_rebuild_worker_parallelism(),
        ),
    }
}
/// Computes builder concurrency limits for a plan with `planned_shard_count`
/// shards.
///
/// The builder count is the smallest of the shard count, the configured
/// builder count, and the Tantivy writer thread budget, each treated as at
/// least 1.
fn lexical_rebuild_staged_shard_builder_settings(
    settings: &LexicalRebuildPipelineSettingsSnapshot,
    planned_shard_count: usize,
) -> LexicalRebuildShardBuilderSettings {
    let shard_slots = planned_shard_count.max(1);
    let configured_builders = settings.staged_shard_builders.max(1);
    let writer_parallelism_budget = settings.tantivy_writer_threads.max(1);
    let max_builders = shard_slots
        .min(configured_builders)
        .min(writer_parallelism_budget)
        .max(1);
    LexicalRebuildShardBuilderSettings {
        max_builders,
        writer_parallelism_budget,
    }
}
/// Writer thread count for the job in `dispatch_slot_index`, given
/// `allowed_jobs` concurrent jobs sharing `writer_parallelism_budget` threads.
///
/// The budget is split evenly and the first `budget % jobs` slots get one
/// extra thread. Slots beyond `allowed_jobs`, and every slot when there are at
/// least as many jobs as threads, get exactly 1. Zero inputs count as 1.
fn lexical_rebuild_staged_shard_builder_writer_parallelism_for_dispatch(
    writer_parallelism_budget: usize,
    allowed_jobs: usize,
    dispatch_slot_index: usize,
) -> usize {
    let budget = writer_parallelism_budget.max(1);
    let jobs = allowed_jobs.max(1);
    let slot_out_of_range = dispatch_slot_index >= jobs;
    let nothing_to_share = jobs >= budget;
    if slot_out_of_range || nothing_to_share {
        return 1;
    }
    let even_share = budget / jobs;
    let leftover = budget % jobs;
    let bonus = if dispatch_slot_index < leftover { 1 } else { 0 };
    even_share.saturating_add(bonus).max(1)
}
/// Concurrency settings for staged shard merging, as computed by
/// `lexical_rebuild_staged_shard_merge_settings`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
struct LexicalRebuildShardMergeSettings {
/// Number of merge workers (at least 1).
workers: usize,
}
/// Computes the merge worker count for a plan with `planned_shard_count`
/// shards.
///
/// The number of eager merge groups is the shard count divided by the
/// coordinator's eager merge fan-in. With no full group a single worker is
/// used; otherwise the configured worker count, capped at the group count and
/// never below 1.
fn lexical_rebuild_staged_shard_merge_settings(
    settings: &LexicalRebuildPipelineSettingsSnapshot,
    planned_shard_count: usize,
) -> LexicalRebuildShardMergeSettings {
    let workers =
        match planned_shard_count / LexicalRebuildShardMergeCoordinator::EAGER_MERGE_FAN_IN {
            0 => 1,
            eager_merge_groups => settings.staged_merge_workers.min(eager_merge_groups).max(1),
        };
    LexicalRebuildShardMergeSettings { workers }
}
/// Loads the persisted lexical rebuild state for `index_path` and projects it
/// into a [`LexicalRebuildCheckpoint`].
///
/// Returns `Ok(None)` when no state is stored.
///
/// # Errors
/// Propagates any failure from loading the state.
pub(crate) fn load_lexical_rebuild_checkpoint(
    index_path: &Path,
) -> Result<Option<LexicalRebuildCheckpoint>> {
    let state = match load_lexical_rebuild_state(index_path)? {
        Some(state) => state,
        None => return Ok(None),
    };

    // Read the derived values first: building the checkpoint moves fields out
    // of `state`.
    let processed_conversations = state.reported_processed_conversations();
    let committed_conversation_id = state.reported_committed_conversation_id();
    let indexed_docs = state.reported_indexed_docs();

    let checkpoint = LexicalRebuildCheckpoint {
        db_path: state.db.db_path,
        total_conversations: state.db.total_conversations,
        storage_fingerprint: state.db.storage_fingerprint,
        committed_offset: state.committed_offset,
        committed_conversation_id,
        processed_conversations,
        indexed_docs,
        schema_hash: state.schema_hash,
        page_size: state.page_size,
        completed: state.completed,
        updated_at_ms: state.updated_at_ms,
    };
    Ok(Some(checkpoint))
}
/// Returns the runtime snapshot of an in-progress lexical rebuild for
/// `db_path`, if there is one.
///
/// Yields `Ok(None)` when no state is stored, the stored rebuild is completed,
/// it belongs to a different database path, or its runtime snapshot is not
/// observed.
///
/// # Errors
/// Propagates any failure from loading the state.
pub(crate) fn load_active_lexical_rebuild_pipeline_runtime(
    index_path: &Path,
    db_path: &Path,
) -> Result<Option<LexicalRebuildPipelineRuntimeSnapshot>> {
    let state = match load_lexical_rebuild_state(index_path)? {
        Some(state) => state,
        None => return Ok(None),
    };
    let in_progress_for_this_db =
        !state.completed && crate::stored_path_identity_matches(&state.db.db_path, db_path);
    if in_progress_for_this_db && state.runtime.is_observed() {
        Ok(Some(state.runtime))
    } else {
        Ok(None)
    }
}
/// Crate-visible entry point for the lexical storage fingerprint of the
/// database at `db_path`; forwards to `lexical_rebuild_storage_fingerprint`
/// and returns its result unchanged.
pub(crate) fn lexical_storage_fingerprint_for_db(db_path: &Path) -> Result<String> {
lexical_rebuild_storage_fingerprint(db_path)
}
/// Checks whether the published lexical index at `index_path` is backed by a
/// completed checkpoint that matches the current contents of `db_path`.
///
/// Returns `true` only when a checkpoint exists, is completed, refers to the
/// same database path, and its storage fingerprint equals the freshly computed
/// one. Load or fingerprint failures are logged at debug level and count as
/// "not validated".
fn published_lexical_index_validated_for_current_data(index_path: &Path, db_path: &Path) -> bool {
    let checkpoint = match load_lexical_rebuild_checkpoint(index_path) {
        Err(err) => {
            tracing::debug!(
                error = %err,
                "could not load lexical rebuild checkpoint while validating published index"
            );
            return false;
        }
        Ok(None) => return false,
        Ok(Some(checkpoint)) => checkpoint,
    };

    let completed_for_this_db = checkpoint.completed
        && crate::stored_path_identity_matches(&checkpoint.db_path, db_path);
    if !completed_for_this_db {
        return false;
    }

    let current_fingerprint = match lexical_storage_fingerprint_for_db(db_path) {
        Ok(current_fingerprint) => current_fingerprint,
        Err(err) => {
            tracing::debug!(
                error = %err,
                "could not compute current storage fingerprint while validating published index"
            );
            return false;
        }
    };
    current_fingerprint == checkpoint.storage_fingerprint
}
/// Persists a "completed" lexical rebuild checkpoint derived from what the
/// live Tantivy index and the canonical database currently report.
///
/// * `index_path` — directory of the Tantivy index whose checkpoint is updated.
/// * `db_state` — database identity, totals, and fingerprint to record.
/// * `total_messages` — canonical message count; must equal the index's
///   document count for the checkpoint to be written.
/// * `max_conversation_id` — recorded as the committed conversation id.
/// * `observed_tantivy_docs` — document count already observed by the caller;
///   when `None` it is read from the live index, and nothing is written if
///   that read yields no count.
///
/// An existing state is reused only when its version, schema hash, and
/// database path match; otherwise a fresh state replaces it. If the stored
/// state already equals the target completed state, nothing is rewritten.
///
/// # Errors
/// Propagates failures from reading the live document count, computing the
/// index meta fingerprint, and loading or persisting the state.
fn persist_completed_lexical_rebuild_checkpoint_from_observations(
    index_path: &Path,
    db_state: LexicalRebuildDbState,
    total_messages: usize,
    max_conversation_id: Option<i64>,
    observed_tantivy_docs: Option<usize>,
) -> Result<()> {
    let total_conversations = db_state.total_conversations;
    let observed_tantivy_docs = match observed_tantivy_docs {
        Some(observed_tantivy_docs) => observed_tantivy_docs,
        None => {
            let Some(observed_tantivy_docs) = live_tantivy_doc_count(index_path)? else {
                return Ok(());
            };
            observed_tantivy_docs
        }
    };
    // Only a live index that holds exactly the canonical messages may be
    // declared complete.
    if observed_tantivy_docs != total_messages {
        tracing::debug!(
            path = %index_path.display(),
            observed_tantivy_docs,
            canonical_messages = total_messages,
            "skipping lexical checkpoint refresh because the live Tantivy index does not match the canonical message count"
        );
        return Ok(());
    }
    let committed_meta_fingerprint = index_meta_fingerprint(index_path)?;
    let mut state = match load_lexical_rebuild_state(index_path)? {
        Some(state)
            if state.version == LEXICAL_REBUILD_STATE_VERSION
                && state.schema_hash == crate::search::tantivy::SCHEMA_HASH
                && lexical_rebuild_db_paths_match(&state.db.db_path, &db_state.db_path) =>
        {
            state
        }
        Some(state) => {
            tracing::info!(
                path = %index_path.display(),
                existing_db_path = %state.db.db_path,
                existing_completed = state.completed,
                "replacing stale lexical rebuild checkpoint from the live Tantivy index"
            );
            LexicalRebuildState::new(db_state.clone(), LEXICAL_REBUILD_PAGE_SIZE)
        }
        None => {
            tracing::info!(
                path = %index_path.display(),
                total_conversations,
                total_messages,
                "bootstrapping missing lexical rebuild checkpoint from the live Tantivy index"
            );
            LexicalRebuildState::new(db_state.clone(), LEXICAL_REBUILD_PAGE_SIZE)
        }
    };
    let target_committed_offset = i64::try_from(total_conversations).unwrap_or(i64::MAX);
    // Skip the rewrite when the stored state is already the target state.
    if state.db == db_state
        && state.page_size == LEXICAL_REBUILD_PAGE_SIZE
        && state.committed_offset == target_committed_offset
        && state.committed_conversation_id == max_conversation_id
        && state.processed_conversations == total_conversations
        && state.indexed_docs == total_messages
        && state.committed_meta_fingerprint == committed_meta_fingerprint
        && state.pending.is_none()
        && state.completed
        && state.execution_mode == LexicalRebuildExecutionMode::SharedWriter
        && state.runtime == LexicalRebuildPipelineRuntimeSnapshot::default()
    {
        tracing::debug!(
            path = %index_path.display(),
            total_conversations,
            total_messages,
            "skipping lexical checkpoint rewrite because the completed state already matches the live Tantivy index"
        );
        return Ok(());
    }
    state.db = db_state;
    state.page_size = LEXICAL_REBUILD_PAGE_SIZE;
    state.committed_offset = target_committed_offset;
    state.committed_conversation_id = max_conversation_id;
    state.processed_conversations = total_conversations;
    state.indexed_docs = total_messages;
    state.pending = None;
    state.completed = true;
    state.committed_meta_fingerprint = committed_meta_fingerprint;
    // Normalise the two fields the short-circuit above also checks. Without
    // this, a reused state carrying another execution mode or an observed
    // runtime snapshot would be persisted as-is and every later refresh would
    // rewrite the checkpoint again.
    state.execution_mode = LexicalRebuildExecutionMode::SharedWriter;
    state.runtime = LexicalRebuildPipelineRuntimeSnapshot::default();
    state.updated_at_ms = FrankenStorage::now_millis();
    persist_lexical_rebuild_state(index_path, &state)
}
/// Recomputes exact conversation/message totals from `storage` and refreshes
/// the completed lexical rebuild checkpoint stored under `data_dir`'s index
/// directory.
///
/// The live Tantivy doc count is not supplied here, so the persist step reads
/// it from the index itself.
///
/// # Errors
/// Propagates any failure from the exact-count queries, index directory
/// resolution, db-state construction, or the checkpoint persist step.
fn refresh_completed_lexical_rebuild_checkpoint(
    storage: &FrankenStorage,
    db_path: &Path,
    data_dir: &Path,
) -> Result<()> {
    let conversation_total = count_total_conversations_exact(storage)?;
    let lexical_index_path = index_dir(data_dir)?;
    let message_total = count_total_messages_exact(storage)?;
    let newest_conversation_id = max_conversation_id_exact(storage)?;
    let observed_db_state = lexical_rebuild_db_state_with_exact_totals(
        storage,
        db_path,
        conversation_total,
        message_total,
    )?;
    // No pre-observed doc count: let the persist step inspect the live index.
    let preobserved_doc_count = None;
    persist_completed_lexical_rebuild_checkpoint_from_observations(
        &lexical_index_path,
        observed_db_state,
        message_total,
        newest_conversation_id,
        preobserved_doc_count,
    )
}
fn refresh_completed_lexical_rebuild_checkpoint_for_final_state(
storage: &mut FrankenStorage,
db_path: &Path,
data_dir: &Path,
keep_storage_open: bool,
exact_counts: Option<(usize, usize)>,
) -> Result<()> {
if let Some((total_conversations, total_messages)) = exact_counts {
let max_conversation_id = max_conversation_id_exact(storage)?;
let db_state = lexical_rebuild_db_state_with_exact_totals(
storage,
db_path,
total_conversations,
total_messages,
)?;
let index_path = index_dir(data_dir)?;
return persist_completed_lexical_rebuild_checkpoint_from_observations(
&index_path,
db_state,
total_messages,
max_conversation_id,
Some(total_messages),
);
}
if keep_storage_open {
return refresh_completed_lexical_rebuild_checkpoint(storage, db_path, data_dir);
}
storage.close_best_effort_in_place();
let mut settled = FrankenStorage::open_readonly(db_path).with_context(|| {
format!(
"reopening readonly storage to refresh settled lexical checkpoint for {}",
db_path.display()
)
})?;
let total_conversations = count_total_conversations_exact(&settled)?;
let total_messages = count_total_messages_exact(&settled)?;
let max_conversation_id = max_conversation_id_exact(&settled)?;
let db_state = lexical_rebuild_db_state_with_exact_totals(
&settled,
db_path,
total_conversations,
total_messages,
)?;
settled.close_best_effort_in_place();
let index_path = index_dir(data_dir)?;
persist_completed_lexical_rebuild_checkpoint_from_observations(
&index_path,
db_state,
total_messages,
max_conversation_id,
None,
)
}
/// Records the end-of-run status markers (`last_indexed_at`, and
/// `last_scan_ts` when a scan was performed) through an ephemeral writer
/// wrapped in the concurrent-retry helper.
///
/// Logging and failure handling are delegated to
/// `persist_final_index_run_metadata_with_writer`.
fn persist_final_index_run_metadata(
    storage: &FrankenStorage,
    db_path: &Path,
    performed_scan: bool,
    scan_start_ts: i64,
    now_ms: i64,
) -> Result<()> {
    let write_status_markers = || {
        persist::with_concurrent_retry(persist::begin_concurrent_retry_limit(), || {
            persist::with_ephemeral_writer(
                storage,
                false,
                "updating final index run metadata",
                |writer| {
                    // The scan watermark only moves when this run actually scanned.
                    if performed_scan {
                        writer.set_last_scan_ts(scan_start_ts)?;
                    }
                    writer.set_last_indexed_at(now_ms)
                },
            )
        })
    };
    persist_final_index_run_metadata_with_writer(
        db_path,
        performed_scan,
        scan_start_ts,
        now_ms,
        write_status_markers,
    )
}
/// Runs `writer_fn` to persist the final index-run metadata and logs the
/// outcome.
///
/// A writer failure is logged as a warning and swallowed, so this function
/// returns `Ok(())` in both cases.
fn persist_final_index_run_metadata_with_writer<F>(
    db_path: &Path,
    performed_scan: bool,
    scan_start_ts: i64,
    now_ms: i64,
    writer_fn: F,
) -> Result<()>
where
    F: FnOnce() -> Result<()>,
{
    if let Err(err) = writer_fn() {
        tracing::warn!(
            db_path = %db_path.display(),
            performed_scan,
            scan_start_ts,
            now_ms,
            error = %format!("{err:#}"),
            "deferred final index-run metadata update after retries exhausted; \
            index and lexical artifacts are committed, status markers will be \
            rewritten on the next incremental run once peer contention clears"
        );
        return Ok(());
    }

    if performed_scan {
        tracing::info!(
            scan_start_ts,
            "updated last_scan_ts for incremental indexing"
        );
    } else {
        tracing::info!(
            db_path = %db_path.display(),
            "preserving last_scan_ts because this run only resumed the lexical rebuild"
        );
    }
    tracing::info!(now_ms, "updated last_indexed_at for status display");
    Ok(())
}
/// Test helper: loads the stored lexical rebuild state for `index_path` and
/// exposes it as a snapshot, but only while the rebuild is still in progress
/// for the database at `db_path`.
///
/// Returns `Ok(None)` when no state exists, when the state is already
/// completed, or when it was recorded for a different database path.
#[cfg(test)]
pub(crate) fn load_lexical_rebuild_snapshot(
    index_path: &Path,
    db_path: &Path,
) -> Result<Option<LexicalRebuildSnapshot>> {
    let state = match load_lexical_rebuild_state(index_path)? {
        Some(state) => state,
        None => return Ok(None),
    };
    if state.completed {
        return Ok(None);
    }
    if !crate::stored_path_identity_matches(&state.db.db_path, db_path) {
        return Ok(None);
    }
    // Read the "reported" progress values before the state's fields are moved.
    let snapshot = LexicalRebuildSnapshot {
        processed_conversations: state.reported_processed_conversations(),
        committed_conversation_id: state.reported_committed_conversation_id(),
        indexed_docs: state.reported_indexed_docs(),
        committed_offset: state.committed_offset,
        completed: state.completed,
        updated_at_ms: state.updated_at_ms,
        total_conversations: state.db.total_conversations,
        storage_fingerprint: state.db.storage_fingerprint,
        db_path: state.db.db_path,
    };
    Ok(Some(snapshot))
}
/// Checks the `daily_stats` materialization and rebuilds it when it is missing
/// or has drifted.
///
/// * When `known_archive_fingerprint` is supplied and storage already marks
///   `daily_stats` healthy for it, the check is skipped entirely.
/// * When the health probe reports a populated table with zero drift, the
///   fingerprint (if any) is recorded and `AlreadyHealthy` is returned.
/// * Otherwise the table is rebuilt from conversation packets; if that fails
///   with an out-of-memory error, the storage-level rebuild is used instead.
///
/// # Errors
/// Propagates failures from the health probe, either rebuild path, and from
/// recording the archive fingerprint.
fn repair_daily_stats_if_drifted(
    storage: &FrankenStorage,
    db_path: &Path,
    known_archive_fingerprint: Option<&str>,
) -> Result<DailyStatsRepairOutcome> {
    if let Some(fingerprint) = known_archive_fingerprint
        && storage.daily_stats_is_known_healthy_for_archive_fingerprint(fingerprint)?
    {
        return Ok(DailyStatsRepairOutcome::SkippedKnownHealthyForFingerprint {
            archive_fingerprint: fingerprint.to_owned(),
        });
    }

    // Shared by the "already healthy" and "rebuilt" exits.
    let stamp_healthy_fingerprint = || -> Result<()> {
        if let Some(fingerprint) = known_archive_fingerprint {
            storage.record_daily_stats_archive_fingerprint(fingerprint)?;
        }
        Ok(())
    };

    let health = storage.daily_stats_health().with_context(|| {
        format!(
            "checking daily_stats health before index planning for {}",
            db_path.display()
        )
    })?;
    let needs_rebuild = !health.populated || health.drift != 0;
    if !needs_rebuild {
        stamp_healthy_fingerprint()?;
        return Ok(DailyStatsRepairOutcome::AlreadyHealthy);
    }

    tracing::warn!(
        db_path = %db_path.display(),
        populated = health.populated,
        row_count = health.row_count,
        conversation_count = health.conversation_count,
        materialized_total = health.materialized_total,
        drift = health.drift,
        "daily_stats is missing or drifted; rebuilding from canonical conversations"
    );

    let rebuilt = match rebuild_daily_stats_from_conversation_packets(storage, db_path) {
        Ok(result) => result,
        Err(packet_error) if error_is_out_of_memory(&packet_error) => {
            tracing::warn!(
                db_path = %db_path.display(),
                error = %packet_error,
                "packet daily_stats rebuild ran out of memory; falling back to bounded storage rebuild"
            );
            storage.rebuild_daily_stats().with_context(|| {
                format!(
                    "rebuilding daily_stats with bounded fallback before index planning for {}",
                    db_path.display()
                )
            })?
        }
        Err(packet_error) => {
            return Err(packet_error).with_context(|| {
                format!(
                    "rebuilding daily_stats before index planning for {}",
                    db_path.display()
                )
            });
        }
    };

    tracing::info!(
        db_path = %db_path.display(),
        rows_created = rebuilt.rows_created,
        total_sessions = rebuilt.total_sessions,
        "rebuilt daily_stats before index planning"
    );
    stamp_healthy_fingerprint()?;
    Ok(DailyStatsRepairOutcome::Rebuilt {
        rows_created: rebuilt.rows_created,
        total_sessions: rebuilt.total_sessions,
    })
}
/// Number of conversations requested per page while rebuilding `daily_stats`
/// from conversation packets.
const PACKET_DAILY_STATS_REBUILD_BATCH_SIZE: i64 = 256;
/// Builds the packet provenance for a canonical conversation row.
///
/// The origin kind is `Local` when the row's source id equals
/// `LOCAL_SOURCE_ID`, and `Ssh` for every other source id.
fn packet_daily_stats_provenance(
    conversation: &crate::storage::sqlite::LexicalRebuildConversationRow,
) -> LexicalRebuildPacketProvenance {
    let is_local = conversation.source_id == LOCAL_SOURCE_ID;
    let origin = match is_local {
        true => SourceKind::Local,
        false => SourceKind::Ssh,
    };
    LexicalRebuildPacketProvenance {
        source_id: conversation.source_id.clone(),
        origin_kind: origin.as_str().to_string(),
        origin_host: conversation.origin_host.clone(),
    }
}
/// Sums the per-role message counters of the packet's analytics projection
/// and converts the total to `i64`, saturating at `i64::MAX` when the
/// conversion fails.
fn packet_daily_stats_message_count(projections: &ConversationPacketSinkProjections) -> i64 {
    let counts = &projections.analytics;
    let mut total = counts.user_messages;
    total += counts.assistant_messages;
    total += counts.tool_messages;
    total += counts.system_messages;
    total += counts.other_messages;
    match i64::try_from(total) {
        Ok(total) => total,
        Err(_) => i64::MAX,
    }
}
/// Returns the lexical projection's `total_content_bytes` as an `i64`,
/// saturating at `i64::MAX` when the conversion fails.
fn packet_daily_stats_total_chars(projections: &ConversationPacketSinkProjections) -> i64 {
    let content_bytes = projections.lexical.total_content_bytes;
    match i64::try_from(content_bytes) {
        Ok(chars) => chars,
        Err(_) => i64::MAX,
    }
}
/// Upserts one `daily_stats` row per entry inside the caller's transaction,
/// adding each delta onto any existing row for the same
/// `(day_id, agent_slug, source_id)` key.
///
/// Returns the sum of the affected-row counts reported by each statement;
/// an empty `entries` slice returns `0` without touching the database.
fn packet_update_daily_stats_batched_in_tx(
    tx: &FrankenTransaction<'_>,
    entries: &[(i64, String, String, StatsDelta)],
) -> Result<usize> {
    // The statement text is kept exactly as it was, including its line breaks.
    const UPSERT_DAILY_STATS_SQL: &str = "INSERT INTO daily_stats (day_id, agent_slug, source_id, session_count, message_count, total_chars, last_updated)
VALUES(?1,?2,?3,?4,?5,?6,?7)
ON CONFLICT(day_id, agent_slug, source_id) DO UPDATE SET
session_count = session_count + excluded.session_count,
message_count = message_count + excluded.message_count,
total_chars = total_chars + excluded.total_chars,
last_updated = excluded.last_updated";

    if entries.is_empty() {
        return Ok(0);
    }
    // One timestamp for the whole batch.
    let stamped_at = FrankenStorage::now_millis();
    entries.iter().try_fold(
        0usize,
        |affected_so_far, (day_id, agent_slug, source_id, delta)| -> Result<usize> {
            let affected = tx.execute_compat(
                UPSERT_DAILY_STATS_SQL,
                &[
                    ParamValue::from(*day_id),
                    ParamValue::from(agent_slug.as_str()),
                    ParamValue::from(source_id.as_str()),
                    ParamValue::from(delta.session_count_delta),
                    ParamValue::from(delta.message_count_delta),
                    ParamValue::from(delta.total_chars_delta),
                    ParamValue::from(stamped_at),
                ],
            )?;
            Ok(affected_so_far + affected)
        },
    )
}
/// Rebuilds the `daily_stats` table from canonical conversations.
///
/// Inside one transaction the table is cleared, conversations are paged in
/// ascending id order (`PACKET_DAILY_STATS_REBUILD_BATCH_SIZE` per page), each
/// conversation is projected through the lexical rebuild packet contract to
/// obtain its message count and content size, and the per-page aggregate is
/// upserted. The transaction is committed only after the final row and
/// session totals have been read back.
///
/// Returns the number of `daily_stats` rows and the session total read from
/// the `('all', 'all')` rows.
///
/// # Errors
/// Propagates lookup, paging, message-fetch, SQL and commit failures, and
/// fails if a listed conversation row has no id.
fn rebuild_daily_stats_from_conversation_packets(
storage: &FrankenStorage,
db_path: &Path,
) -> Result<DailyStatsRebuildResult> {
// Test-only hook: setting this variable makes the rebuild fail with an
// "out of memory" error.
#[cfg(test)]
if dotenvy::var("CASS_TEST_PACKET_DAILY_STATS_REBUILD_OOM").is_ok() {
anyhow::bail!("out of memory");
}
let (agent_slugs, workspace_paths) = storage
.build_lexical_rebuild_lookups()
.with_context(|| format!("building packet rebuild lookups for {}", db_path.display()))?;
let mut tx = storage.raw().transaction().with_context(|| {
format!(
"opening packet daily_stats rebuild transaction for {}",
db_path.display()
)
})?;
// Start from an empty table; the upserts below accumulate onto it.
tx.execute("DELETE FROM daily_stats").with_context(|| {
format!(
"clearing daily_stats before packet rebuild for {}",
db_path.display()
)
})?;
// Paging cursor: each page lists conversations after this id.
let mut last_conversation_id = 0_i64;
// Counters below feed the progress and summary log events only.
let mut conversation_batches = 0usize;
let mut conversations_processed = 0usize;
let mut messages_projected = 0usize;
let mut raw_entries_flushed = 0usize;
let mut expanded_entries_flushed = 0usize;
loop {
let conversation_rows = storage
.list_conversations_for_lexical_rebuild_after_id(
PACKET_DAILY_STATS_REBUILD_BATCH_SIZE,
last_conversation_id,
&agent_slugs,
&workspace_paths,
)
.with_context(|| {
format!(
"listing canonical conversations for packet daily_stats rebuild after id {}",
last_conversation_id
)
})?;
// An empty page ends the scan.
if conversation_rows.is_empty() {
break;
}
// Collect the page's ids up front so all messages are fetched in one call.
let conversation_ids = conversation_rows
.iter()
.map(|conversation| {
conversation.id.ok_or_else(|| {
anyhow::anyhow!(
"packet daily_stats rebuild encountered conversation without id after {}",
last_conversation_id
)
})
})
.collect::<Result<Vec<_>>>()?;
let mut grouped_messages = storage
.fetch_messages_for_lexical_rebuild_batch(&conversation_ids, None, None)
.with_context(|| {
format!(
"fetching canonical message batch for packet daily_stats rebuild after id {}",
last_conversation_id
)
})?;
let mut aggregate = StatsAggregator::new();
for conversation in conversation_rows {
let conversation_id = conversation.id.ok_or_else(|| {
anyhow::anyhow!(
"packet daily_stats rebuild encountered conversation without id after {}",
last_conversation_id
)
})?;
// Advance the cursor so the next page starts after this conversation.
last_conversation_id = conversation_id;
// A conversation absent from the fetched map is treated as having no messages.
let canonical_messages = grouped_messages
.remove(&conversation_id)
.unwrap_or_default();
let provenance = packet_daily_stats_provenance(&conversation);
let packet = lexical_rebuild_contract_from_canonical_messages(
&conversation,
&provenance,
canonical_messages,
);
let message_count = packet_daily_stats_message_count(&packet.projections);
let total_chars = packet_daily_stats_total_chars(&packet.projections);
// Conversations without a start timestamp are recorded under day id 0.
let day_id = conversation
.started_at
.map(FrankenStorage::day_id_from_millis)
.unwrap_or(0);
aggregate.record(
&conversation.agent_slug,
&conversation.source_id,
day_id,
message_count,
total_chars,
);
conversations_processed += 1;
messages_projected = messages_projected.saturating_add(message_count.max(0) as usize);
}
conversation_batches += 1;
raw_entries_flushed += aggregate.raw_entry_count();
let entries = aggregate.expand();
expanded_entries_flushed += entries.len();
if !entries.is_empty() {
packet_update_daily_stats_batched_in_tx(&tx, &entries)?;
}
// Emit a progress event every 25 pages.
if conversation_batches.is_multiple_of(25) {
tracing::info!(
target: "cass::perf::daily_stats",
conversation_batches,
batch_size = PACKET_DAILY_STATS_REBUILD_BATCH_SIZE,
last_conversation_id,
conversations_processed,
messages_projected,
"packet daily_stats rebuild progress"
);
}
}
// Read the totals back through the same transaction, before committing.
let rows_created: i64 = tx.query_row_map(
"SELECT COUNT(*) FROM daily_stats",
&[] as &[ParamValue],
|row| row.get_typed(0),
)?;
let total_sessions: i64 = tx.query_row_map(
"SELECT COALESCE(SUM(session_count), 0) FROM daily_stats WHERE agent_slug = 'all' AND source_id = 'all'",
&[] as &[ParamValue],
|row| row.get_typed(0),
)?;
tx.commit()?;
tracing::info!(
target: "cass::perf::daily_stats",
db_path = %db_path.display(),
rows_created,
total_sessions,
conversation_batches,
batch_size = PACKET_DAILY_STATS_REBUILD_BATCH_SIZE,
conversations_processed,
messages_projected,
raw_entries_flushed,
expanded_entries_flushed,
"daily_stats rebuilt from canonical ConversationPacket projections"
);
Ok(DailyStatsRebuildResult {
rows_created,
total_sessions,
})
}
/// Decides whether `daily_stats` needs a repair pass after historical salvage.
///
/// A repair is needed when salvage imported at least one message, or when
/// this is a full refresh that is not a canonical-only rebuild and the
/// pre-scan check was not performed.
fn should_repair_daily_stats_after_historical_salvage(
    checked_pre_scan: bool,
    full_refresh: bool,
    rebuild_from_canonical_only: bool,
    salvage_messages_imported: usize,
) -> bool {
    if salvage_messages_imported > 0 {
        return true;
    }
    if checked_pre_scan || rebuild_from_canonical_only {
        return false;
    }
    full_refresh
}
/// Messages sent from connector producer threads to the streaming consumer.
pub enum IndexMessage {
/// A batch of normalized conversations produced by one connector.
Batch {
/// Name of the connector that produced the batch.
connector_name: &'static str,
/// Conversations contained in the batch.
conversations: Vec<NormalizedConversation>,
/// Whether this batch marks the connector as discovered.
is_discovered: bool,
/// Total number of messages across `conversations`.
message_count: usize,
/// Bytes reserved from the streaming byte limiter for this batch.
/// NOTE(review): presumably the consumer releases these after processing — confirm.
byte_reservation: usize,
},
/// A connector scan or flush failed; `error` carries the rendered message.
ScanError {
connector_name: &'static str,
error: String,
},
/// A producer finished scanning.
Done {
connector_name: &'static str,
/// Wall-clock duration of the producer's scan, in milliseconds.
scan_ms: u64,
/// Whether the connector was discovered during the scan.
is_discovered: bool,
},
}
/// Channel size used for streaming indexing; also a factor of
/// `STREAMING_MAX_BYTES_IN_FLIGHT` below.
/// NOTE(review): presumably the bounded producer/consumer channel capacity — confirm at the creation site.
const STREAMING_CHANNEL_SIZE: usize = 32;
/// Upper bounds that decide when a streaming batch is considered full.
#[derive(Debug, Clone, Copy)]
struct StreamingBatchLimits {
/// Maximum number of conversations per batch.
max_conversations: usize,
/// Maximum number of messages per batch.
max_messages: usize,
/// Maximum summed message content length per batch (measured with `str::len`, i.e. bytes).
max_chars: usize,
}
/// Default batch limits: 64 conversations, 2,000 messages, 4 MiB of content.
const DEFAULT_STREAMING_BATCH_LIMITS: StreamingBatchLimits = StreamingBatchLimits {
max_conversations: 64,
max_messages: 2_000,
max_chars: 4 * 1024 * 1024,
};
/// Default byte ceiling for the streaming limiter: one full-size batch per channel slot.
const STREAMING_MAX_BYTES_IN_FLIGHT: usize =
STREAMING_CHANNEL_SIZE * DEFAULT_STREAMING_BATCH_LIMITS.max_chars;
/// Mutex-protected state of a `StreamingByteLimiter`.
#[derive(Debug)]
struct StreamingByteLimiterState {
/// Bytes currently reserved and not yet released.
bytes_in_flight: usize,
/// Once set, waiting and future acquisitions fail.
closed: bool,
}
/// Byte budget that blocks reservations while the in-flight total would
/// exceed the configured ceiling.
#[derive(Debug)]
struct StreamingByteLimiter {
/// Current ceiling; stored as at least 1.
max_bytes_in_flight: AtomicUsize,
state: Mutex<StreamingByteLimiterState>,
/// Notified on release, ceiling updates, and close.
cv: Condvar,
}
impl StreamingByteLimiter {
    /// Creates a limiter with the given ceiling (clamped to at least 1 byte).
    fn new(max_bytes_in_flight: usize) -> Self {
        debug_assert!(max_bytes_in_flight > 0);
        let ceiling = max_bytes_in_flight.max(1);
        let initial_state = StreamingByteLimiterState {
            bytes_in_flight: 0,
            closed: false,
        };
        Self {
            max_bytes_in_flight: AtomicUsize::new(ceiling),
            state: Mutex::new(initial_state),
            cv: Condvar::new(),
        }
    }

    /// Reserves capacity for `requested_bytes`, blocking until it fits, and
    /// returns the number of bytes actually reserved.
    fn acquire(&self, requested_bytes: usize) -> Result<usize> {
        let (reservation, _wait_duration, _waited) = self.acquire_with_wait(requested_bytes)?;
        Ok(reservation)
    }

    /// Like `acquire`, but also reports how long the caller waited and
    /// whether it had to wait at all.
    ///
    /// A request larger than the ceiling is clamped to the ceiling, so a
    /// single oversized request can still proceed. Fails once the limiter has
    /// been closed.
    fn acquire_with_wait(&self, requested_bytes: usize) -> Result<(usize, Duration, bool)> {
        if requested_bytes == 0 {
            return Ok((0, Duration::ZERO, false));
        }
        // A poisoned mutex is recovered rather than propagated.
        let mut guard = self
            .state
            .lock()
            .unwrap_or_else(std::sync::PoisonError::into_inner);
        let wait_started = Instant::now();
        let mut waited = false;
        loop {
            if guard.closed {
                anyhow::bail!("streaming byte limiter closed while waiting for capacity");
            }
            // Re-read the ceiling every pass: it may change while we wait.
            let ceiling = self.max_bytes_in_flight.load(Ordering::Acquire).max(1);
            let reservation = requested_bytes.min(ceiling);
            let fits = guard.bytes_in_flight.saturating_add(reservation) <= ceiling;
            if !fits {
                waited = true;
                guard = self
                    .cv
                    .wait(guard)
                    .unwrap_or_else(std::sync::PoisonError::into_inner);
                continue;
            }
            guard.bytes_in_flight += reservation;
            let wait_duration = match waited {
                true => wait_started.elapsed(),
                false => Duration::default(),
            };
            return Ok((reservation, wait_duration, waited));
        }
    }

    /// Returns `reserved_bytes` to the budget and wakes all waiters.
    fn release(&self, reserved_bytes: usize) {
        if reserved_bytes == 0 {
            return;
        }
        let mut guard = self
            .state
            .lock()
            .unwrap_or_else(std::sync::PoisonError::into_inner);
        guard.bytes_in_flight = guard.bytes_in_flight.saturating_sub(reserved_bytes);
        self.cv.notify_all();
    }

    /// Bytes currently reserved.
    fn bytes_in_flight(&self) -> usize {
        let guard = self
            .state
            .lock()
            .unwrap_or_else(std::sync::PoisonError::into_inner);
        guard.bytes_in_flight
    }

    /// Replaces the ceiling (clamped to at least 1) and wakes all waiters so
    /// they re-evaluate against the new value.
    fn update_max_bytes_in_flight(&self, max_bytes_in_flight: usize) {
        // The store happens while the state lock is held.
        let _state_guard = self
            .state
            .lock()
            .unwrap_or_else(std::sync::PoisonError::into_inner);
        let ceiling = max_bytes_in_flight.max(1);
        self.max_bytes_in_flight.store(ceiling, Ordering::Release);
        self.cv.notify_all();
    }

    /// Current ceiling.
    fn max_bytes_in_flight(&self) -> usize {
        self.max_bytes_in_flight.load(Ordering::Acquire)
    }

    /// Marks the limiter closed and wakes all waiters so they fail.
    fn close(&self) {
        let mut guard = self
            .state
            .lock()
            .unwrap_or_else(std::sync::PoisonError::into_inner);
        guard.closed = true;
        self.cv.notify_all();
    }
}
/// Mutex-protected state of a `LexicalRebuildReservationOrder`.
#[derive(Debug)]
struct LexicalRebuildReservationOrderState {
/// Sequence number whose turn it currently is.
next_sequence: u64,
/// Once set, every waiter fails instead of waiting.
closed: bool,
}
/// Turnstile that lets callers proceed strictly in ascending sequence order.
#[derive(Debug)]
struct LexicalRebuildReservationOrder {
state: Mutex<LexicalRebuildReservationOrderState>,
/// Notified when a turn finishes or the order is closed.
cv: Condvar,
}
impl LexicalRebuildReservationOrder {
    /// Creates an open order whose first turn is sequence 0.
    fn new() -> Self {
        let initial_state = LexicalRebuildReservationOrderState {
            next_sequence: 0,
            closed: false,
        };
        Self {
            state: Mutex::new(initial_state),
            cv: Condvar::new(),
        }
    }

    /// Blocks until it is `sequence`'s turn.
    ///
    /// Fails if the order has been closed, or if `sequence` is lower than the
    /// current turn (its turn has already passed).
    fn wait_for_turn(&self, sequence: u64) -> Result<()> {
        let mut guard = self
            .state
            .lock()
            .unwrap_or_else(std::sync::PoisonError::into_inner);
        loop {
            if guard.closed {
                anyhow::bail!(
                    "lexical rebuild reservation order closed while waiting for sequence {}",
                    sequence
                );
            }
            match sequence.cmp(&guard.next_sequence) {
                std::cmp::Ordering::Less => anyhow::bail!(
                    "lexical rebuild page sequence {} tried to reserve after sequence {}",
                    sequence,
                    guard.next_sequence
                ),
                std::cmp::Ordering::Equal => return Ok(()),
                std::cmp::Ordering::Greater => {
                    guard = self
                        .cv
                        .wait(guard)
                        .unwrap_or_else(std::sync::PoisonError::into_inner);
                }
            }
        }
    }

    /// Ends `sequence`'s turn. The turn counter advances only when `sequence`
    /// is the current turn; waiters are woken either way.
    fn finish_turn(&self, sequence: u64) {
        let mut guard = self
            .state
            .lock()
            .unwrap_or_else(std::sync::PoisonError::into_inner);
        let is_current_turn = guard.next_sequence == sequence;
        if is_current_turn {
            guard.next_sequence = guard.next_sequence.saturating_add(1);
        }
        self.cv.notify_all();
    }

    /// Closes the order and wakes every waiter so it fails.
    fn close(&self) {
        let mut guard = self
            .state
            .lock()
            .unwrap_or_else(std::sync::PoisonError::into_inner);
        guard.closed = true;
        self.cv.notify_all();
    }
}
/// Reserves byte budget for a rebuild page, with reservations granted in
/// page-sequence order.
///
/// Waits for `sequence`'s turn, then acquires `page_message_bytes` from the
/// flow limiter. On success the turn is passed on; on failure the reservation
/// order is closed so other waiters fail too.
///
/// Returns the limiter's `(reserved_bytes, wait_duration, waited)` tuple.
fn acquire_ordered_lexical_rebuild_page_budget(
    reservation_order: &LexicalRebuildReservationOrder,
    flow_limiter: &StreamingByteLimiter,
    sequence: u64,
    page_message_bytes: usize,
) -> Result<(usize, Duration, bool)> {
    reservation_order.wait_for_turn(sequence)?;
    let outcome = flow_limiter.acquire_with_wait(page_message_bytes);
    if outcome.is_ok() {
        reservation_order.finish_turn(sequence);
    } else {
        reservation_order.close();
    }
    outcome
}
/// Returns `(message_count, content_len)` for a conversation, where
/// `content_len` is the summed `len()` of every message's content.
fn conversation_batch_footprint(conv: &NormalizedConversation) -> (usize, usize) {
    let messages = &conv.messages;
    let content_len = messages
        .iter()
        .fold(0usize, |running, message| running + message.content.len());
    (messages.len(), content_len)
}
/// Test helper: pulls the next batch off `conversations`, respecting `limits`.
///
/// The first conversation is always taken, even if it alone exceeds the
/// limits; further conversations are added only while the batch stays within
/// every limit. Returns the batch and its total message count, or `None` when
/// the iterator is exhausted.
#[cfg(test)]
fn next_streaming_batch(
    conversations: &mut Peekable<std::vec::IntoIter<NormalizedConversation>>,
    limits: StreamingBatchLimits,
) -> Option<(Vec<NormalizedConversation>, usize)> {
    let head = conversations.next()?;
    let (mut total_messages, mut total_chars) = conversation_batch_footprint(&head);
    let mut batch = vec![head];
    loop {
        let Some(candidate) = conversations.peek() else {
            break;
        };
        let (candidate_messages, candidate_chars) = conversation_batch_footprint(candidate);
        let fits = batch.len() < limits.max_conversations
            && total_messages.saturating_add(candidate_messages) <= limits.max_messages
            && total_chars.saturating_add(candidate_chars) <= limits.max_chars;
        if !fits {
            break;
        }
        let accepted = conversations
            .next()
            .expect("peek indicated another conversation existed");
        total_messages += candidate_messages;
        total_chars += candidate_chars;
        batch.push(accepted);
    }
    Some((batch, total_messages))
}
/// Accumulates conversations for one connector and sends them to the consumer
/// as `IndexMessage::Batch` values.
struct StreamingBatchSender<'a> {
/// Channel to the streaming consumer.
tx: &'a Sender<IndexMessage>,
/// Shared byte budget; bytes are reserved as conversations are pushed.
flow_limiter: Arc<StreamingByteLimiter>,
connector_name: &'static str,
/// Value of `is_discovered` for the next batch sent; cleared after a successful send.
next_batch_is_discovered: bool,
/// Conversations buffered for the next batch.
conversations: Vec<NormalizedConversation>,
/// Message total of the buffered conversations.
message_count: usize,
/// Summed content length of the buffered conversations.
char_count: usize,
/// Bytes reserved from `flow_limiter` for the buffered conversations.
byte_reservation: usize,
}
/// Appends `connector_name` to `discovered_names` unless it is already
/// present, preserving first-seen order.
fn remember_discovered_connector(discovered_names: &mut Vec<String>, connector_name: &'static str) {
    let already_known = discovered_names
        .iter()
        .map(String::as_str)
        .any(|known| known == connector_name);
    if already_known {
        return;
    }
    discovered_names.push(connector_name.to_owned());
}
impl<'a> StreamingBatchSender<'a> {
    /// Creates an empty sender for `connector_name`; `is_discovered` is the
    /// discovery flag carried by the first batch sent.
    fn new(
        tx: &'a Sender<IndexMessage>,
        flow_limiter: Arc<StreamingByteLimiter>,
        connector_name: &'static str,
        is_discovered: bool,
    ) -> Self {
        Self {
            tx,
            flow_limiter,
            connector_name,
            next_batch_is_discovered: is_discovered,
            conversations: Vec::new(),
            message_count: 0,
            char_count: 0,
            byte_reservation: 0,
        }
    }

    /// Flags the next batch sent as a discovery batch.
    fn mark_next_batch_discovered(&mut self) {
        self.next_batch_is_discovered = true;
    }

    /// Buffers `conversation`, flushing first when adding it would push a
    /// non-empty batch past `DEFAULT_STREAMING_BATCH_LIMITS`, and flushing
    /// immediately afterwards when it alone exceeds the message or size limit.
    ///
    /// Byte budget for the conversation's content is reserved before it is
    /// buffered; a limiter failure is reported as
    /// `StreamingConsumerDisconnected`.
    fn push(&mut self, conversation: NormalizedConversation) -> Result<()> {
        let limits = DEFAULT_STREAMING_BATCH_LIMITS;
        let (incoming_messages, incoming_chars) = conversation_batch_footprint(&conversation);

        if !self.conversations.is_empty() {
            let too_many_conversations = self.conversations.len() >= limits.max_conversations;
            let too_many_messages =
                self.message_count.saturating_add(incoming_messages) > limits.max_messages;
            let too_many_chars = self.char_count.saturating_add(incoming_chars) > limits.max_chars;
            if too_many_conversations || too_many_messages || too_many_chars {
                self.flush()?;
            }
        }

        let reserved = match self.flow_limiter.acquire(incoming_chars) {
            Ok(reserved) => reserved,
            Err(_) => {
                return Err(anyhow::Error::new(StreamingConsumerDisconnected {
                    connector_name: self.connector_name,
                }));
            }
        };
        self.message_count += incoming_messages;
        self.char_count += incoming_chars;
        self.byte_reservation = self.byte_reservation.saturating_add(reserved);
        self.conversations.push(conversation);

        // A lone conversation that already exceeds the limits goes out on its own.
        let lone_oversized = self.conversations.len() == 1
            && (self.message_count > limits.max_messages || self.char_count > limits.max_chars);
        if lone_oversized {
            return self.flush();
        }
        Ok(())
    }

    /// Sends the buffered conversations as one batch and resets the counters.
    ///
    /// With nothing buffered, any leftover byte reservation is released. When
    /// the send fails, the batch's reservation is released here and
    /// `StreamingConsumerDisconnected` is returned; on success the reservation
    /// travels with the batch message.
    fn flush(&mut self) -> Result<()> {
        if self.conversations.is_empty() {
            let leftover = std::mem::take(&mut self.byte_reservation);
            if leftover > 0 {
                self.flow_limiter.release(leftover);
            }
            return Ok(());
        }

        let batch = IndexMessage::Batch {
            connector_name: self.connector_name,
            conversations: std::mem::take(&mut self.conversations),
            is_discovered: self.next_batch_is_discovered,
            message_count: self.message_count,
            byte_reservation: self.byte_reservation,
        };
        // Both outcomes reset the running totals, so do it once up front.
        let batch_reservation = std::mem::take(&mut self.byte_reservation);
        self.message_count = 0;
        self.char_count = 0;

        if self.tx.send(batch).is_err() {
            self.flow_limiter.release(batch_reservation);
            return Err(anyhow::Error::new(StreamingConsumerDisconnected {
                connector_name: self.connector_name,
            }));
        }
        self.next_batch_is_discovered = false;
        Ok(())
    }
}
impl Drop for StreamingBatchSender<'_> {
    /// Releases any byte reservation still held for unsent conversations.
    fn drop(&mut self) {
        let leftover = std::mem::take(&mut self.byte_reservation);
        if leftover > 0 {
            self.flow_limiter.release(leftover);
        }
    }
}
/// Test helper: streams `conversations` through a fresh
/// `StreamingBatchSender` (with its own default-sized limiter) and flushes the
/// remainder. Panics if the receiver is gone.
#[cfg(test)]
fn send_conversation_batches(
    tx: &Sender<IndexMessage>,
    connector_name: &'static str,
    conversations: Vec<NormalizedConversation>,
    is_discovered: bool,
) {
    let limiter = Arc::new(StreamingByteLimiter::new(STREAMING_MAX_BYTES_IN_FLIGHT));
    let mut sender = StreamingBatchSender::new(tx, limiter, connector_name, is_discovered);
    conversations.into_iter().for_each(|conversation| {
        sender
            .push(conversation)
            .expect("test batch sender should deliver to in-memory receiver")
    });
    sender
        .flush()
        .expect("test batch sender should flush to in-memory receiver");
}
/// Reports whether streaming indexing is enabled.
///
/// Enabled by default; `CASS_STREAMING_INDEX` set to exactly `0` or to
/// `false` (any ASCII case) disables it. An unreadable variable counts as
/// enabled.
pub fn streaming_index_enabled() -> bool {
    match dotenvy::var("CASS_STREAMING_INDEX") {
        Ok(value) => value != "0" && !value.eq_ignore_ascii_case("false"),
        Err(_) => true,
    }
}
/// Returns true when `value` contains at least one non-blank entry after
/// splitting on commas and newlines.
fn scan_path_exclusions_value_active(value: Option<&str>) -> bool {
    match value {
        None => false,
        Some(raw) => raw
            .split(|separator: char| separator == ',' || separator == '\n')
            .any(|entry| !entry.trim().is_empty()),
    }
}
/// Reports whether `CASS_EXCLUDE_PATHS` currently holds at least one
/// non-blank exclusion entry.
fn scan_path_exclusions_active() -> bool {
    let configured = dotenvy::var("CASS_EXCLUDE_PATHS").ok();
    scan_path_exclusions_value_active(configured.as_deref())
}
/// Reads the `ACTIVE_SESSION_SOURCE_SKIP_OBSERVED` flag with relaxed ordering.
/// NOTE(review): presumably set when a source is skipped for belonging to an active session — confirm at the writer.
fn active_session_source_skip_observed() -> bool {
ACTIVE_SESSION_SOURCE_SKIP_OBSERVED.load(Ordering::Relaxed)
}
/// True when path exclusions are configured or an active-session source skip
/// has been observed.
fn scan_watermark_preservation_active() -> bool {
    if scan_path_exclusions_active() {
        return true;
    }
    active_session_source_skip_observed()
}
/// Extracts a readable message from a panic payload.
///
/// `String` and `&'static str` payloads are returned as text; any other
/// payload type yields `"non-string panic payload"`.
fn panic_payload_message(payload: Box<dyn Any + Send>) -> String {
    let payload = match payload.downcast::<String>() {
        Ok(owned) => return *owned,
        Err(other) => other,
    };
    payload
        .downcast::<&'static str>()
        .map(|literal| (*literal).to_string())
        .unwrap_or_else(|_| "non-string panic payload".to_string())
}
/// Error marker raised when a producer can no longer hand batches to the
/// streaming consumer.
#[derive(Debug)]
struct StreamingConsumerDisconnected {
    /// Connector whose batch could not be delivered.
    connector_name: &'static str,
}

impl std::fmt::Display for StreamingConsumerDisconnected {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let Self { connector_name } = self;
        write!(
            f,
            "streaming consumer disconnected while sending batch for {}",
            connector_name
        )
    }
}

impl std::error::Error for StreamingConsumerDisconnected {}
fn is_streaming_consumer_disconnected(error: &anyhow::Error) -> bool {
error
.downcast_ref::<StreamingConsumerDisconnected>()
.is_some()
}
/// Settings handed to each connector producer thread.
#[derive(Clone)]
struct StreamingProducerConfig {
/// Byte budget shared with the batch senders the producer creates.
flow_limiter: Arc<StreamingByteLimiter>,
/// Data directory used for the local scan context and raw-mirror capture.
data_dir: PathBuf,
/// Extra roots scanned after the local scan, each with its own origin.
additional_scan_roots: Vec<ScanRoot>,
/// Passed to every scan context as its `since` timestamp.
since_ts: Option<i64>,
/// Optional progress sink; its `discovered_agents` counter is bumped on discovery.
progress: Option<Arc<IndexingProgress>>,
/// Filter consulted to skip conversations from active-session sources.
active_source_filter: Arc<ActiveSessionSourceFilter>,
}
/// Spawns a producer thread that scans one connector and streams its
/// conversations to the consumer over `tx`.
///
/// The thread builds the connector via `factory`, runs a local scan when the
/// connector is detected, then scans each of `config.additional_scan_roots`.
/// Conversations are pushed through a `StreamingBatchSender`; scan and flush
/// failures are reported as `IndexMessage::ScanError`. A final
/// `IndexMessage::Done` is sent unless the consumer was found to be
/// disconnected, in which case the thread returns early without it.
fn spawn_connector_producer(
name: &'static str,
factory: fn() -> Box<dyn Connector + Send>,
tx: Sender<IndexMessage>,
config: StreamingProducerConfig,
) -> JoinHandle<()> {
thread::spawn(move || {
let scan_start = std::time::Instant::now();
let conn = factory();
let detect = conn.detect();
let was_detected = detect.detected;
let mut is_discovered = false;
// Phase 1: local scan, only when the connector is detected on this machine.
if detect.detected {
if let Some(p) = &config.progress {
p.discovered_agents.fetch_add(1, Ordering::Relaxed);
}
is_discovered = true;
let ctx = crate::connectors::ScanContext::local_default(
config.data_dir.clone(),
config.since_ts,
);
let local_origin = Origin::local();
let mut batch_sender =
StreamingBatchSender::new(&tx, config.flow_limiter.clone(), name, is_discovered);
// The detected root paths serve as fallback roots for the pre-parse capture.
let fallback_roots: Vec<ScanRoot> = detect
.root_paths
.iter()
.cloned()
.map(ScanRoot::local)
.collect();
capture_connector_sources_before_parse(
conn.as_ref(),
&ctx,
&config.data_dir,
name,
&fallback_roots,
config.since_ts,
config.active_source_filter.as_ref(),
);
match conn.scan_with_callback(&ctx, &mut |mut conversation| {
// Conversations from active-session sources are dropped without error.
if should_skip_active_session_source(
config.active_source_filter.as_ref(),
LOCAL_SOURCE_ID,
&conversation.source_path,
) {
return Ok(());
}
inject_provenance(&mut conversation, &local_origin);
compact_large_connector_extras(name, &mut conversation);
attach_raw_mirror_capture(&config.data_dir, &mut conversation);
batch_sender.push(conversation)
}) {
Ok(()) => {
// Scan succeeded: deliver whatever is still buffered.
if let Err(error) = batch_sender.flush() {
if is_streaming_consumer_disconnected(&error) {
tracing::info!(
connector = name,
"streaming consumer disconnected; stopping producer"
);
return;
}
tracing::warn!(connector = name, "local flush failed: {}", error);
let _ = tx.send(IndexMessage::ScanError {
connector_name: name,
error: format!("local flush failed: {error}"),
});
}
}
Err(e) => {
// Scan failed: still try to deliver what was buffered before the failure.
if let Err(flush_error) = batch_sender.flush()
&& !is_streaming_consumer_disconnected(&flush_error)
{
tracing::warn!(connector = name, "local flush failed: {}", flush_error);
}
if is_streaming_consumer_disconnected(&e) {
tracing::info!(
connector = name,
"streaming consumer disconnected; stopping producer"
);
return;
}
tracing::warn!(connector = name, "local scan failed: {}", e);
let _ = tx.send(IndexMessage::ScanError {
connector_name: name,
error: e.to_string(),
});
}
}
}
// Phase 2: scan every additional root with its own context and batch sender.
for root in &config.additional_scan_roots {
let ctx = crate::connectors::ScanContext::with_roots(
root.path.clone(),
vec![root.clone()],
config.since_ts,
);
let mut batch_sender =
StreamingBatchSender::new(&tx, config.flow_limiter.clone(), name, is_discovered);
capture_connector_sources_before_parse(
conn.as_ref(),
&ctx,
&config.data_dir,
name,
std::slice::from_ref(root),
config.since_ts,
config.active_source_filter.as_ref(),
);
match conn.scan_with_callback(&ctx, &mut |mut conversation| {
if should_skip_active_session_source(
config.active_source_filter.as_ref(),
&root.origin.source_id,
&conversation.source_path,
) {
return Ok(());
}
inject_provenance(&mut conversation, &root.origin);
apply_workspace_rewrite(&mut conversation, root);
compact_large_connector_extras(name, &mut conversation);
attach_raw_mirror_capture(&config.data_dir, &mut conversation);
// A connector not detected locally counts as discovered once its first
// conversation from an additional root survives the skip filter.
if !was_detected && !is_discovered {
if let Some(p) = &config.progress {
p.discovered_agents.fetch_add(1, Ordering::Relaxed);
}
is_discovered = true;
batch_sender.mark_next_batch_discovered();
}
batch_sender.push(conversation)
}) {
Ok(()) => {
if let Err(error) = batch_sender.flush() {
if is_streaming_consumer_disconnected(&error) {
tracing::info!(
connector = name,
"streaming consumer disconnected; stopping producer"
);
return;
}
tracing::warn!(
connector = name,
root = %root.path.display(),
"remote flush failed: {}",
error
);
let _ = tx.send(IndexMessage::ScanError {
connector_name: name,
error: format!(
"remote flush failed for {}: {}",
root.path.display(),
error
),
});
}
}
Err(e) => {
if let Err(flush_error) = batch_sender.flush()
&& !is_streaming_consumer_disconnected(&flush_error)
{
tracing::warn!(
connector = name,
root = %root.path.display(),
"remote flush failed: {}",
flush_error
);
}
if is_streaming_consumer_disconnected(&e) {
tracing::info!(
connector = name,
"streaming consumer disconnected; stopping producer"
);
return;
}
tracing::warn!(
connector = name,
root = %root.path.display(),
"remote scan failed: {}", e
);
let _ = tx.send(IndexMessage::ScanError {
connector_name: name,
error: format!("remote scan failed for {}: {}", root.path.display(), e),
});
}
}
}
// Elapsed time covers connector construction, detection and every scan above.
let scan_ms = scan_start.elapsed().as_millis() as u64;
tracing::info!(
connector = name,
discovered = is_discovered,
scan_ms,
"streaming_scan_complete"
);
// A failed send here is ignored.
let _ = tx.send(IndexMessage::Done {
connector_name: name,
scan_ms,
is_discovered,
});
})
}
/// Base commit interval in seconds for the streaming consumer.
///
/// Reads `CASS_STREAMING_CONSUMER_COMMIT_SECS`; anything missing,
/// unparsable, or zero falls back to 5.
fn streaming_consumer_commit_base_secs() -> u64 {
    const DEFAULT_COMMIT_SECS: u64 = 5;
    let Ok(raw) = dotenvy::var("CASS_STREAMING_CONSUMER_COMMIT_SECS") else {
        return DEFAULT_COMMIT_SECS;
    };
    match raw.parse::<u64>() {
        Ok(secs) if secs > 0 => secs,
        _ => DEFAULT_COMMIT_SECS,
    }
}
/// Commit interval for the streaming consumer: the base seconds scaled by the
/// current responsiveness capacity percentage (clamped to 1..=100), never
/// shorter than one second.
fn streaming_consumer_commit_interval() -> Duration {
    let base_secs = streaming_consumer_commit_base_secs();
    let capacity_pct = responsiveness::current_capacity_pct().clamp(1, 100) as u64;
    let scaled_secs = base_secs.saturating_mul(capacity_pct) / 100;
    let interval_secs = scaled_secs.max(1);
    Duration::from_secs(interval_secs)
}
/// Reports whether the streaming consumer may combine batches.
///
/// Enabled unless `CASS_STREAMING_CONSUMER_COMBINE`, trimmed and lowercased,
/// is one of `0`, `false`, `no`, or `off`.
fn streaming_combine_enabled() -> bool {
    let Ok(raw) = dotenvy::var("CASS_STREAMING_CONSUMER_COMBINE") else {
        return true;
    };
    let normalized = raw.trim().to_ascii_lowercase();
    !matches!(normalized.as_str(), "0" | "false" | "no" | "off")
}
/// Value of the `CASS_STREAMING_COMBINE_MAX` setting.
///
/// A positive value is clamped to `1..=1024`; anything missing, unparsable,
/// or zero yields 64.
fn streaming_combine_max_messages() -> usize {
    let configured = dotenvy::var("CASS_STREAMING_COMBINE_MAX")
        .ok()
        .and_then(|raw| raw.parse::<usize>().ok());
    match configured {
        Some(limit) if limit > 0 => limit.clamp(1, 1024),
        _ => 64,
    }
}
/// Value of the `CASS_STREAMING_COMBINE_MAX_BYTES` setting.
///
/// A positive value is clamped to between 1 MiB and
/// `STREAMING_MAX_BYTES_IN_FLIGHT`; anything missing, unparsable, or zero
/// yields half of `STREAMING_MAX_BYTES_IN_FLIGHT`.
fn streaming_combine_max_bytes() -> usize {
    let configured = dotenvy::var("CASS_STREAMING_COMBINE_MAX_BYTES")
        .ok()
        .and_then(|raw| raw.parse::<usize>().ok());
    match configured {
        Some(limit) if limit > 0 => limit.clamp(1024 * 1024, STREAMING_MAX_BYTES_IN_FLIGHT),
        _ => STREAMING_MAX_BYTES_IN_FLIGHT / 2,
    }
}
/// Drains `IndexMessage`s from `rx` and ingests every batch until `num_producers` `Done`
/// messages have been received.
///
/// Each batch is ingested through `ingest_non_watch_batch_with_oom_split`; its byte
/// reservation is handed back to `flow_limiter` afterwards. Per-connector statistics,
/// totals and timings are written into `progress.stats` at the end when progress tracking
/// is present.
///
/// Returns the connector names collected through `remember_discovered_connector` together
/// with the accumulated `NonWatchIngestOutcome`.
///
/// # Errors
/// Fails when a batch ingest fails, when the final Tantivy commit fails, or when the
/// channel closes before all producers reported `Done`.
#[allow(clippy::too_many_arguments)]
fn run_streaming_consumer(
rx: Receiver<IndexMessage>,
num_producers: usize,
storage: &FrankenStorage,
data_dir: &Path,
mut t_index: Option<&mut TantivyIndex>,
flow_limiter: Arc<StreamingByteLimiter>,
progress: &Option<Arc<IndexingProgress>>,
lexical_strategy: LexicalPopulationStrategy,
scan_start_ts: Option<i64>,
) -> Result<(Vec<String>, NonWatchIngestOutcome)> {
use std::collections::HashMap;
let mut active_producers = num_producers;
let mut discovered_names: Vec<String> = Vec::new();
let mut total_conversations = 0usize;
let mut total_messages = 0usize;
let mut switched_to_indexing = false;
let mut last_commit = std::time::Instant::now();
let index_start = std::time::Instant::now();
let mut ingest_outcome = NonWatchIngestOutcome::default();
// Passed to every ingest call and to the ephemeral writer below; always true in this path.
let defer_streaming_checkpoints = true;
let mut connector_stats: HashMap<String, ConnectorStats> = HashMap::new();
let combine_enabled = streaming_combine_enabled();
let max_combine_messages = streaming_combine_max_messages();
let max_combine_bytes = streaming_combine_max_bytes();
// Holds a non-batch message that was pulled off the channel while combining batches, so
// it is handled on the next loop iteration instead of being lost.
let mut deferred_non_batch: VecDeque<IndexMessage> = VecDeque::new();
loop {
// A deferred message takes priority over blocking on the channel.
let next_message = if let Some(m) = deferred_non_batch.pop_front() {
Ok(m)
} else {
rx.recv()
};
match next_message {
Ok(IndexMessage::Batch {
connector_name,
conversations,
is_discovered,
message_count,
byte_reservation,
}) => {
// The `combined_*` values start as this batch and grow if more batches are folded in.
let mut combined_conversations: Vec<NormalizedConversation> = conversations;
let mut combined_message_count = message_count;
let mut combined_byte_reservation = byte_reservation;
let mut combined_batch_size = combined_conversations.len();
total_conversations += combined_batch_size;
total_messages += combined_message_count;
{
let stats = connector_stats
.entry(connector_name.to_string())
.or_insert_with(|| ConnectorStats {
name: connector_name.to_string(),
..Default::default()
});
stats.conversations += combined_batch_size;
stats.messages += message_count;
}
// On the first batch, move progress to phase 2 and restart both counters.
if !switched_to_indexing {
if let Some(p) = progress {
p.phase.store(2, Ordering::Relaxed); p.total.store(0, Ordering::Relaxed); p.current.store(0, Ordering::Relaxed);
}
switched_to_indexing = true;
}
if let Some(p) = progress {
p.total.fetch_add(combined_batch_size, Ordering::Relaxed);
}
if is_discovered {
remember_discovered_connector(&mut discovered_names, connector_name);
}
// Opportunistically fold batches that are already queued into this ingest call. Only
// attempted while more than one producer is active, and bounded by the configured
// batch-count and byte limits. `try_recv` never blocks.
if combine_enabled && active_producers > 1 {
let mut combined_messages_so_far = 1usize;
while combined_messages_so_far < max_combine_messages
&& combined_byte_reservation < max_combine_bytes
{
match rx.try_recv() {
Ok(IndexMessage::Batch {
connector_name: cname2,
conversations: extra_convs,
is_discovered: extra_discovered,
message_count: extra_msg_count,
byte_reservation: extra_byte_reservation,
}) => {
// Statistics are attributed to the connector that produced the extra batch.
let extra_size = extra_convs.len();
let stats = connector_stats
.entry(cname2.to_string())
.or_insert_with(|| ConnectorStats {
name: cname2.to_string(),
..Default::default()
});
stats.conversations += extra_size;
stats.messages += extra_msg_count;
if extra_discovered {
remember_discovered_connector(&mut discovered_names, cname2);
}
if let Some(p) = progress {
p.total.fetch_add(extra_size, Ordering::Relaxed);
}
combined_conversations.extend(extra_convs);
combined_message_count += extra_msg_count;
combined_byte_reservation += extra_byte_reservation;
combined_batch_size += extra_size;
total_conversations += extra_size;
total_messages += extra_msg_count;
combined_messages_so_far += 1;
}
// Anything that is not a batch ends combining and is replayed by the outer loop.
Ok(other) => {
deferred_non_batch.push_back(other);
break;
}
// Channel is empty or disconnected; stop combining.
Err(_) => break,
}
}
}
let batch_outcome = ingest_non_watch_batch_with_oom_split(
storage,
t_index.as_deref_mut(),
data_dir,
&combined_conversations,
progress,
lexical_strategy,
defer_streaming_checkpoints,
);
// The reservation is released before `?` so it is returned even when ingest failed.
flow_limiter.release(combined_byte_reservation);
ingest_outcome = ingest_outcome.accumulate(batch_outcome?);
let message_count = combined_message_count;
let batch_size = combined_batch_size;
// Periodic work: commit Tantivy and persist the scan watermark. Failures here are only
// logged and do not abort the run.
if last_commit.elapsed() >= streaming_consumer_commit_interval() {
if let Some(t_index) = t_index.as_deref_mut() {
if let Err(e) = t_index.commit() {
tracing::warn!("incremental commit failed: {}", e);
} else {
tracing::debug!("incremental commit completed");
}
}
let preserve_scan_watermark = scan_watermark_preservation_active();
// The watermark is written only when preservation is inactive and a timestamp was
// supplied; the `else if` branch logs the preservation case.
if !preserve_scan_watermark
&& let Some(ts) = scan_start_ts
&& let Err(e) = persist::with_ephemeral_writer(
storage,
defer_streaming_checkpoints,
"updating streaming incremental last_scan_ts",
|writer| writer.set_last_scan_ts(ts),
)
{
tracing::warn!("incremental last_scan_ts save failed: {}", e);
} else if preserve_scan_watermark {
tracing::debug!(
"preserving streaming incremental last_scan_ts because scan exclusions or active source skips are active"
);
}
last_commit = std::time::Instant::now();
}
// For a combined ingest the log carries the name of the first batch's connector.
tracing::info!(
connector = connector_name,
conversations = batch_size,
messages = message_count,
"streaming_ingest"
);
}
Ok(IndexMessage::ScanError {
connector_name,
error,
}) => {
// A scan error is recorded on the connector's stats and logged; the loop continues.
let stats = connector_stats
.entry(connector_name.to_string())
.or_insert_with(|| ConnectorStats {
name: connector_name.to_string(),
..Default::default()
});
stats.error = Some(error.clone());
tracing::warn!(
connector = connector_name,
error = %error,
"streaming_scan_error"
);
}
Ok(IndexMessage::Done {
connector_name,
scan_ms,
is_discovered,
}) => {
// NOTE(review): plain subtraction; this would underflow if more `Done` messages
// arrived than `num_producers` — confirm producers send exactly one each.
active_producers -= 1;
let stats = connector_stats
.entry(connector_name.to_string())
.or_insert_with(|| ConnectorStats {
name: connector_name.to_string(),
..Default::default()
});
stats.scan_ms = scan_ms;
if is_discovered {
remember_discovered_connector(&mut discovered_names, connector_name);
}
// Before the first batch arrives, each finished producer advances `current`.
if !switched_to_indexing && let Some(p) = progress {
p.current.fetch_add(1, Ordering::Relaxed);
}
tracing::debug!(
connector = connector_name,
scan_ms,
remaining = active_producers,
"streaming_producer_done"
);
if active_producers == 0 {
break;
}
}
// Only `rx.recv()` can produce this error, i.e. the channel closed while producers
// were still expected to report `Done`.
Err(_) => {
let error = format!(
"streaming indexing aborted: channel closed with {active_producers} producers still active"
);
tracing::warn!(remaining = active_producers, error = %error);
set_progress_last_error(progress.as_ref(), Some(error.clone()));
return Err(anyhow::anyhow!(error));
}
}
}
// Final commit, skipped when lexical updates were deferred during ingest.
if let Some(t_index) = t_index
&& !ingest_outcome.lexical_update_deferred
{
t_index.commit()?;
} else if ingest_outcome.lexical_update_deferred {
tracing::warn!(
"skipping final streaming Tantivy commit because lexical updates were deferred; authoritative DB rebuild will replace derived lexical assets"
);
}
let index_ms = index_start.elapsed().as_millis() as u64;
// Overall scan time is reported as the largest per-connector scan time.
let scan_ms = connector_stats
.values()
.map(|s| s.scan_ms)
.max()
.unwrap_or(0);
if let Some(p) = progress
&& let Ok(mut stats) = p.stats.lock()
{
stats.scan_ms = scan_ms;
stats.index_ms = index_ms;
stats.connectors = connector_stats.values().cloned().collect();
stats.agents_discovered = discovered_names.clone();
stats.total_conversations = total_conversations;
stats.total_messages = total_messages;
// Quarantine count and the deferred flag are merged into existing values, not overwritten.
stats.quarantined_conversations = stats
.quarantined_conversations
.saturating_add(ingest_outcome.quarantined_conversations);
stats.lexical_update_deferred |= ingest_outcome.lexical_update_deferred;
}
tracing::info!(
total_conversations,
total_messages,
scan_ms,
index_ms,
discovered = discovered_names.len(),
"streaming_indexing_complete"
);
Ok((discovered_names, ingest_outcome))
}
/// Runs the streaming index pass using the connector factories that remain after the
/// disabled-connector filter has been applied.
///
/// All arguments are forwarded unchanged to
/// `run_streaming_index_with_connector_factories`.
fn run_streaming_index(
    storage: &FrankenStorage,
    t_index: Option<&mut TantivyIndex>,
    opts: &IndexOptions,
    since_ts: Option<i64>,
    lexical_strategy: LexicalPopulationStrategy,
    additional_scan_roots: Vec<ScanRoot>,
    scan_start_ts: i64,
) -> Result<NonWatchIngestOutcome> {
    let enabled_factories = configured_connector_factories();
    run_streaming_index_with_connector_factories(
        storage,
        t_index,
        opts,
        since_ts,
        lexical_strategy,
        additional_scan_roots,
        enabled_factories,
        scan_start_ts,
    )
}
/// Plain function pointer that builds a boxed connector which is `Send`.
type ConnectorFactory = fn() -> Box<dyn Connector + Send>;
/// Returns the connector factories from `get_connector_factories` after passing them
/// through `filter_disabled_connector_factories`.
fn configured_connector_factories() -> Vec<(&'static str, ConnectorFactory)> {
    let all_factories = get_connector_factories();
    filter_disabled_connector_factories(all_factories)
}
/// Drops connector factories whose agent is disabled in the sources configuration.
///
/// The input is returned untouched when `CASS_IGNORE_SOURCES_CONFIG` is set, when the
/// sources config cannot be loaded (logged at debug level), or when the config lists no
/// disabled agents.
fn filter_disabled_connector_factories(
    connector_factories: Vec<(&'static str, ConnectorFactory)>,
) -> Vec<(&'static str, ConnectorFactory)> {
    if dotenvy::var("CASS_IGNORE_SOURCES_CONFIG").is_ok() {
        return connector_factories;
    }

    let config = match SourcesConfig::load() {
        Ok(loaded) => loaded,
        Err(load_error) => {
            tracing::debug!(
                error = %load_error,
                "failed to load sources config while filtering disabled connectors"
            );
            return connector_factories;
        }
    };

    let disabled_agents = config.configured_disabled_agents();
    if disabled_agents.is_empty() {
        return connector_factories;
    }

    // Filter in place, keeping the original relative order of the enabled connectors.
    let mut enabled = connector_factories;
    enabled.retain(|(name, _)| !config.is_agent_disabled(name));

    tracing::info!(
        disabled_agents = ?disabled_agents,
        enabled_connectors = enabled.len(),
        "skipping disabled connectors from indexing configuration"
    );
    enabled
}
#[allow(clippy::too_many_arguments)]
fn run_streaming_index_with_connector_factories(
storage: &FrankenStorage,
t_index: Option<&mut TantivyIndex>,
opts: &IndexOptions,
since_ts: Option<i64>,
lexical_strategy: LexicalPopulationStrategy,
additional_scan_roots: Vec<ScanRoot>,
connector_factories: Vec<(&'static str, ConnectorFactory)>,
scan_start_ts: i64,
) -> Result<NonWatchIngestOutcome> {
if connector_factories.is_empty() {
tracing::warn!("no enabled connectors are configured for indexing; skipping scan");
if let Some(p) = &opts.progress {
p.phase.store(1, Ordering::Relaxed);
p.total.store(0, Ordering::Relaxed);
p.current.store(0, Ordering::Relaxed);
p.discovered_agents.store(0, Ordering::Relaxed);
if let Ok(mut names) = p.discovered_agent_names.lock() {
names.clear();
}
}
return Ok(NonWatchIngestOutcome::default());
}
let buffered_connectors: Vec<&'static str> = connector_factories
.iter()
.filter_map(|(name, factory)| {
let connector = factory();
(!connector.supports_streaming_scan()).then_some(*name)
})
.collect();
let num_connectors = connector_factories.len();
if !buffered_connectors.is_empty() {
tracing::warn!(
connectors = ?buffered_connectors,
"streaming index still has buffered connectors that do not implement callback streaming"
);
}
if let Some(p) = &opts.progress {
p.phase.store(1, Ordering::Relaxed); p.total.store(num_connectors, Ordering::Relaxed);
p.current.store(0, Ordering::Relaxed);
p.discovered_agents.store(0, Ordering::Relaxed);
if let Ok(mut names) = p.discovered_agent_names.lock() {
names.clear();
}
}
let (tx, rx) = bounded::<IndexMessage>(STREAMING_CHANNEL_SIZE);
let producer_config = StreamingProducerConfig {
flow_limiter: Arc::new(StreamingByteLimiter::new(STREAMING_MAX_BYTES_IN_FLIGHT)),
data_dir: opts.data_dir.clone(),
additional_scan_roots: additional_scan_roots.clone(),
since_ts,
progress: opts.progress.clone(),
active_source_filter: Arc::new(ActiveSessionSourceFilter::new(
opts.watch && opts.watch_once_paths.as_ref().is_none_or(Vec::is_empty),
)),
};
let handles: Vec<(&'static str, JoinHandle<()>)> = connector_factories
.into_iter()
.map(|(name, factory)| {
(
name,
spawn_connector_producer(name, factory, tx.clone(), producer_config.clone()),
)
})
.collect();
drop(tx);
let consumer_result = run_streaming_consumer(
rx,
num_connectors,
storage,
&opts.data_dir,
t_index,
producer_config.flow_limiter.clone(),
&opts.progress,
lexical_strategy,
Some(scan_start_ts),
);
if consumer_result.is_err() {
producer_config.flow_limiter.close();
}
let mut join_errors = Vec::new();
for (name, handle) in handles {
if let Err(payload) = handle.join() {
let panic_message = panic_payload_message(payload);
tracing::error!(connector = name, panic = %panic_message, "streaming producer panicked");
join_errors.push(format!("{name}: {panic_message}"));
}
}
if let Err(error) = consumer_result {
if !join_errors.is_empty() {
let combined = format!(
"{error}; streaming producer thread panicked: {}",
join_errors.join("; ")
);
set_progress_last_error(opts.progress.as_ref(), Some(combined.clone()));
return Err(anyhow::anyhow!(combined));
}
set_progress_last_error(opts.progress.as_ref(), Some(error.to_string()));
return Err(error);
}
if !join_errors.is_empty() {
let error = format!(
"streaming producer thread panicked: {}",
join_errors.join("; ")
);
set_progress_last_error(opts.progress.as_ref(), Some(error.clone()));
return Err(anyhow::anyhow!(error));
}
let (discovered_names, ingest_outcome) = match consumer_result {
Ok(result) => result,
Err(_) => unreachable!("handled above"),
};
if let Some(p) = &opts.progress
&& let Ok(mut names) = p.discovered_agent_names.lock()
{
names.extend(discovered_names);
}
Ok(ingest_outcome)
}
/// Runs the batch index pass using the connector factories that remain after the
/// disabled-connector filter has been applied.
///
/// All arguments are forwarded unchanged to `run_batch_index_with_connector_factories`.
fn run_batch_index(
    storage: &FrankenStorage,
    t_index: Option<&mut TantivyIndex>,
    opts: &IndexOptions,
    since_ts: Option<i64>,
    lexical_strategy: LexicalPopulationStrategy,
    additional_scan_roots: Vec<ScanRoot>,
    scan_start_ts: i64,
) -> Result<NonWatchIngestOutcome> {
    let enabled_factories = configured_connector_factories();
    run_batch_index_with_connector_factories(
        storage,
        t_index,
        opts,
        since_ts,
        lexical_strategy,
        additional_scan_roots,
        enabled_factories,
        scan_start_ts,
    )
}
/// Batch index pass: scans all connectors in parallel, keeps every scanned conversation
/// in memory, then ingests the results one connector at a time on the calling thread.
///
/// Progress (when present) goes to phase 1 with `total` set to the number of connector
/// factories during the scan, then to phase 2 with `total` set to the number of scanned
/// conversations before ingest. Scan failures are logged and do not abort the run.
///
/// # Errors
/// Returns the first error produced by `ingest_non_watch_batch_with_oom_split`.
#[allow(clippy::too_many_arguments)]
fn run_batch_index_with_connector_factories(
storage: &FrankenStorage,
mut t_index: Option<&mut TantivyIndex>,
opts: &IndexOptions,
since_ts: Option<i64>,
lexical_strategy: LexicalPopulationStrategy,
additional_scan_roots: Vec<ScanRoot>,
connector_factories: Vec<(&'static str, ConnectorFactory)>,
scan_start_ts: i64,
) -> Result<NonWatchIngestOutcome> {
let scan_start = std::time::Instant::now();
if let Some(p) = &opts.progress {
p.phase.store(1, Ordering::Relaxed); p.total.store(connector_factories.len(), Ordering::Relaxed);
p.current.store(0, Ordering::Relaxed);
p.discovered_agents.store(0, Ordering::Relaxed);
if let Ok(mut names) = p.discovered_agent_names.lock() {
names.clear();
}
}
use rayon::prelude::*;
let progress_ref = opts.progress.as_ref();
let data_dir = opts.data_dir.clone();
// The filter is constructed with `true` only for watch runs without explicit watch-once paths.
let active_source_filter = Arc::new(ActiveSessionSourceFilter::new(
opts.watch && opts.watch_once_paths.as_ref().is_none_or(Vec::is_empty),
));
// One entry per connector that produced conversations or counts as discovered:
// (connector name, scanned conversations, discovered flag).
let pending_batches: Vec<(&'static str, Vec<NormalizedConversation>, bool)> =
connector_factories
.into_par_iter()
.filter_map(|(name, factory)| {
let conn = factory();
let detect = conn.detect();
let was_detected = detect.detected;
let mut convs = Vec::new();
let mut is_discovered = false;
// Local scan: only for connectors whose `detect()` reported a hit.
if detect.detected {
if let Some(p) = progress_ref {
p.discovered_agents.fetch_add(1, Ordering::Relaxed);
}
is_discovered = true;
let ctx =
crate::connectors::ScanContext::local_default(data_dir.clone(), since_ts);
let fallback_roots: Vec<ScanRoot> = detect
.root_paths
.iter()
.cloned()
.map(ScanRoot::local)
.collect();
// Source capture is invoked before `scan`, as the helper's name requires.
capture_connector_sources_before_parse(
conn.as_ref(),
&ctx,
&data_dir,
name,
&fallback_roots,
since_ts,
active_source_filter.as_ref(),
);
match conn.scan(&ctx) {
Ok(mut local_convs) => {
let local_origin = Origin::local();
// Drop conversations whose source the active-session filter says to skip.
local_convs.retain(|conv| {
!should_skip_active_session_source(
active_source_filter.as_ref(),
LOCAL_SOURCE_ID,
&conv.source_path,
)
});
for conv in &mut local_convs {
inject_provenance(conv, &local_origin);
attach_raw_mirror_capture(&data_dir, conv);
}
convs.extend(local_convs);
}
Err(e) => {
tracing::warn!("scan failed for {}: {}", name, e);
}
}
}
// Additional roots are scanned for every connector, detected locally or not.
if !additional_scan_roots.is_empty() {
for root in &additional_scan_roots {
let ctx = crate::connectors::ScanContext::with_roots(
root.path.clone(),
vec![root.clone()],
since_ts,
);
capture_connector_sources_before_parse(
conn.as_ref(),
&ctx,
&data_dir,
name,
std::slice::from_ref(root),
since_ts,
active_source_filter.as_ref(),
);
match conn.scan(&ctx) {
Ok(mut remote_convs) => {
remote_convs.retain(|conv| {
!should_skip_active_session_source(
active_source_filter.as_ref(),
&root.origin.source_id,
&conv.source_path,
)
});
// Unlike the local path, these conversations also get the root's workspace rewrite.
for conv in &mut remote_convs {
inject_provenance(conv, &root.origin);
apply_workspace_rewrite(conv, root);
attach_raw_mirror_capture(&data_dir, conv);
}
convs.extend(remote_convs);
}
Err(e) => {
tracing::warn!(
connector = name,
root = %root.path.display(),
"remote scan failed: {e}"
);
}
}
}
}
// A connector not detected locally still counts as discovered once an additional
// root yielded conversations; the counter is bumped at most once per connector.
if !was_detected && !convs.is_empty() {
if let Some(p) = progress_ref {
p.discovered_agents.fetch_add(1, Ordering::Relaxed);
}
is_discovered = true;
}
// Scan progress advances once per connector, including ones filtered out below.
if let Some(p) = progress_ref {
p.current.fetch_add(1, Ordering::Relaxed);
}
if convs.is_empty() && !is_discovered {
return None;
}
tracing::info!(
connector = name,
conversations = convs.len(),
discovered = is_discovered,
"batch_scan_complete"
);
Some((name, convs, is_discovered))
})
.collect();
// Wall-clock time of the whole parallel scan; reused as every connector's `scan_ms` below.
let scan_ms = scan_start.elapsed().as_millis() as u64;
let discovered_names: Vec<String> = pending_batches
.iter()
.filter(|(_, _, discovered)| *discovered)
.map(|(name, _, _)| (*name).to_string())
.collect();
let total_conversations: usize = pending_batches
.iter()
.map(|(_, convs, _)| convs.len())
.sum();
let total_messages: usize = pending_batches
.iter()
.map(|(_, convs, _)| convs.iter().map(|c| c.messages.len()).sum::<usize>())
.sum();
// Stats are produced only for connectors that yielded at least one conversation.
let connector_stats: Vec<ConnectorStats> = pending_batches
.iter()
.filter(|(_, convs, _)| !convs.is_empty())
.map(|(name, convs, _)| {
let msgs: usize = convs.iter().map(|c| c.messages.len()).sum();
ConnectorStats {
name: (*name).to_string(),
conversations: convs.len(),
messages: msgs,
scan_ms,
error: None,
}
})
.collect();
if let Some(p) = &opts.progress {
if let Ok(mut names) = p.discovered_agent_names.lock() {
names.extend(discovered_names.clone());
}
p.phase.store(2, Ordering::Relaxed); p.total.store(total_conversations, Ordering::Relaxed);
p.current.store(0, Ordering::Relaxed);
}
let index_start = std::time::Instant::now();
let mut last_scan_ts_save = std::time::Instant::now();
let mut ingest_outcome = NonWatchIngestOutcome::default();
let preserve_scan_watermark = scan_watermark_preservation_active();
for (name, convs, _discovered) in pending_batches {
// NOTE(review): the last argument is `!opts.watch`; the streaming consumer passes its
// `defer_streaming_checkpoints` flag in the same position — presumably the same meaning.
let batch_outcome = ingest_non_watch_batch_with_oom_split(
storage,
t_index.as_deref_mut(),
&opts.data_dir,
&convs,
&opts.progress,
lexical_strategy,
!opts.watch,
)?;
ingest_outcome = ingest_outcome.accumulate(batch_outcome);
// At most every 10 seconds: persist the scan watermark, or log that it is being
// preserved. A failed save is only logged.
if !preserve_scan_watermark && last_scan_ts_save.elapsed() >= Duration::from_secs(10) {
if let Err(e) = persist::with_ephemeral_writer(
storage,
false,
"updating batch incremental last_scan_ts",
|writer| writer.set_last_scan_ts(scan_start_ts),
) {
tracing::warn!("batch incremental last_scan_ts save failed: {}", e);
}
last_scan_ts_save = std::time::Instant::now();
} else if preserve_scan_watermark && last_scan_ts_save.elapsed() >= Duration::from_secs(10)
{
tracing::debug!(
"preserving batch incremental last_scan_ts because scan exclusions or active source skips are active"
);
last_scan_ts_save = std::time::Instant::now();
}
tracing::info!(
connector = name,
conversations = convs.len(),
"batch_ingest"
);
}
let index_ms = index_start.elapsed().as_millis() as u64;
if let Some(p) = &opts.progress
&& let Ok(mut stats) = p.stats.lock()
{
stats.scan_ms = scan_ms;
stats.index_ms = index_ms;
stats.connectors = connector_stats;
stats.agents_discovered = discovered_names;
stats.total_conversations = total_conversations;
stats.total_messages = total_messages;
// Quarantine count and the deferred flag are merged into existing values, not overwritten.
stats.quarantined_conversations = stats
.quarantined_conversations
.saturating_add(ingest_outcome.quarantined_conversations);
stats.lexical_update_deferred |= ingest_outcome.lexical_update_deferred;
}
Ok(ingest_outcome)
}
/// Picks the `since` timestamp for a non-watch scan.
///
/// Returns `None` (scan everything) when a full run, a rebuild, or a stale-quarantine
/// retry is requested, or when no previous scan timestamp exists. Otherwise returns the
/// previous timestamp minus one, saturating at `i64::MIN`.
fn non_watch_scan_since_ts(
    full: bool,
    needs_rebuild: bool,
    retry_stale_index_ingest_quarantine: bool,
    last_scan_ts: Option<i64>,
) -> Option<i64> {
    let rescan_everything = full || needs_rebuild || retry_stale_index_ingest_quarantine;
    if rescan_everything {
        return None;
    }
    let previous = last_scan_ts?;
    Some(previous.saturating_sub(1))
}
pub fn run_index(
opts: IndexOptions,
event_channel: Option<(Sender<IndexerEvent>, Receiver<IndexerEvent>)>,
) -> Result<()> {
ACTIVE_SESSION_SOURCE_SKIP_OBSERVED.store(false, Ordering::Relaxed);
let _progress_reset = RunIndexProgressReset::new(opts.progress.clone());
set_progress_last_error(opts.progress.as_ref(), None);
let initial_lock_mode = if opts.watch {
SearchMaintenanceMode::WatchStartup
} else if opts
.watch_once_paths
.as_ref()
.is_some_and(|paths| !paths.is_empty())
{
SearchMaintenanceMode::WatchOnce
} else {
SearchMaintenanceMode::Index
};
let mut index_run_lock =
acquire_index_run_lock(&opts.data_dir, &opts.db_path, initial_lock_mode)?;
let _index_run_lock_heartbeat = IndexRunLockHeartbeat::start(
opts.data_dir.clone(),
index_run_lock_heartbeat_interval(),
Arc::clone(&index_run_lock.metadata_write_lock),
Arc::clone(&index_run_lock.last_progress_at_ms_atomic),
);
let progress_bump = Arc::clone(&index_run_lock.last_progress_at_ms_atomic);
if can_skip_absent_explicit_watch_once_index_run(&opts) {
let path_count = opts
.watch_once_paths
.as_ref()
.map(std::vec::Vec::len)
.unwrap_or_default();
tracing::info!(
db_path = %opts.db_path.display(),
data_dir = %opts.data_dir.display(),
path_count,
"skipping watch-once index because all explicit paths are absent"
);
return Ok(());
}
let index_path = index_dir(&opts.data_dir)?;
ensure_index_storage_headroom(&opts.data_dir, &opts.db_path)?;
if should_try_readonly_nonresumable_lexical_resume(&opts) {
match nonresumable_pending_lexical_rebuild_status_from_readonly_db(
&index_path,
&opts.db_path,
) {
Ok(Some((_status, total_conversations))) => {
tracing::info!(
db_path = %opts.db_path.display(),
total_conversations,
"restarting non-resumable lexical rebuild from a readonly canonical DB before writable storage open"
);
record_lexical_population_strategy(
opts.progress.as_ref(),
LexicalPopulationStrategy::DeferredAuthoritativeDbRebuild,
"readonly_fast_resume_incomplete_nonresumable_lexical_rebuild",
);
tracing::info!(
strategy = LexicalPopulationStrategy::DeferredAuthoritativeDbRebuild.as_str(),
reason = "readonly_fast_resume_incomplete_nonresumable_lexical_rebuild",
"selected_lexical_population_strategy"
);
let rebuild = rebuild_tantivy_from_db_deferred_startup_with_progress_bump(
&opts.db_path,
&opts.data_dir,
total_conversations,
opts.progress.clone(),
Arc::clone(&progress_bump),
)?;
if let Some(p) = &opts.progress
&& let Ok(mut stats) = p.stats.lock()
{
stats.scan_ms = 0;
stats.total_conversations = total_conversations;
}
if let Some(observed_messages) = rebuild.observed_messages {
record_exact_total_counts_in_progress(
opts.progress.as_ref(),
total_conversations,
observed_messages,
);
}
return Ok(());
}
Ok(None) => {}
Err(err) => {
tracing::debug!(
db_path = %opts.db_path.display(),
error = %err,
"readonly lexical resume preflight failed; falling back to writable storage open"
);
}
}
}
if try_readonly_canonical_force_rebuild(&opts, &progress_bump)? {
return Ok(());
}
let (mut storage, canonical_storage_rebuilt, opened_fresh_for_full) =
open_storage_for_index(&opts.db_path, opts.full)?;
let defer_checkpoints = !opts.watch;
let mut reopened_after_writable_preflight = false;
if let Err(err) = storage
.raw()
.execute("UPDATE meta SET value = value WHERE key = 'schema_version'")
{
tracing::warn!(
db_path = %opts.db_path.display(),
error = %err,
"primary storage connection failed writable preflight; \
attempting to close and reopen"
);
storage.close_best_effort_in_place();
storage = crate::storage::sqlite::open_franken_storage_with_timeout(
&opts.db_path,
Duration::from_secs(10),
)
.with_context(|| {
format!(
"reopening storage after writable preflight failure: {}. \
If this persists, check that no other cass process holds \
an exclusive lock on the database.",
opts.db_path.display()
)
})?;
reopened_after_writable_preflight = true;
}
persist::apply_index_writer_busy_timeout(&storage);
persist::apply_index_writer_checkpoint_policy(&storage, defer_checkpoints);
if let Err(err) = storage.validate_fts_messages_integrity() {
tracing::error!(
db_path = %opts.db_path.display(),
error = %err,
"canonical archive has corrupt fts_messages metadata; refusing to enter index pipeline"
);
storage.close_best_effort_in_place();
return Err(canonical_archive_unhealthy_for_index_error(
&opts.db_path,
&err.to_string(),
));
}
if opts.full
&& !opened_fresh_for_full
&& let Some(reason) = full_rebuild_existing_storage_integrity_problem(&storage)?
{
tracing::error!(
db_path = %opts.db_path.display(),
reason = %reason,
"full rebuild detected an unhealthy current-schema canonical db; refusing to replace the canonical archive"
);
storage.close_best_effort_in_place();
return Err(canonical_archive_unhealthy_for_index_error(
&opts.db_path,
&reason,
));
}
if can_skip_unchanged_explicit_watch_once_index_run(&opts, &storage, &index_path)? {
let now_ms = FrankenStorage::now_millis();
persist_final_index_run_metadata(&storage, &opts.db_path, false, now_ms, now_ms)?;
record_lexical_population_strategy_if_unset(
opts.progress.as_ref(),
LexicalPopulationStrategy::IncrementalInline,
"watch_once_targeted_reindex_applies_inline_lexical_updates_for_changed_paths",
);
reset_progress_to_idle(opts.progress.as_ref());
let path_count = opts
.watch_once_paths
.as_ref()
.map(std::vec::Vec::len)
.unwrap_or_default();
tracing::info!(
db_path = %opts.db_path.display(),
path_count,
"skipping unchanged explicit watch-once index run before startup maintenance"
);
return close_storage_after_index(storage, &opts.db_path, "watch-once no-op index run");
}
if let Err(err) = storage.cleanup_orphan_fk_rows() {
tracing::warn!(
target: "cass::fk_repair",
db_path = %opts.db_path.display(),
error = %err,
"cass#202: orphan FK self-heal failed; aborting index run before further writes"
);
storage.close_best_effort_in_place();
return Err(orphan_fk_cleanup_failed_index_error(&opts.db_path, &err));
}
let initial_canonical_sessions_before_salvage = count_total_conversations_exact(&storage)?;
if opts.full
&& !opened_fresh_for_full
&& full_rebuild_requires_historical_restart(
&storage,
&opts.db_path,
initial_canonical_sessions_before_salvage,
)?
{
tracing::error!(
db_path = %opts.db_path.display(),
conversations = initial_canonical_sessions_before_salvage,
"full rebuild detected incomplete historical salvage state; refusing to replace the canonical archive"
);
storage.close_best_effort_in_place();
return Err(canonical_archive_unhealthy_for_index_error(
&opts.db_path,
"historical salvage restart would require replacing canonical SQLite",
));
}
let canonical_only_full_rebuild =
opts.force_rebuild && initial_canonical_sessions_before_salvage > 0;
let has_explicit_watch_once_paths = opts
.watch_once_paths
.as_ref()
.is_some_and(|paths| !paths.is_empty());
let targeted_semantic_watch_once = should_run_targeted_semantic_watch_once(&opts);
let populated_explicit_watch_once_only = has_explicit_watch_once_paths
&& !opts.watch
&& !opts.full
&& !opts.force_rebuild
&& !opts.semantic
&& !opts.build_hnsw
&& initial_canonical_sessions_before_salvage > 0;
let mut initial_matching_lexical_checkpoint = MatchingLexicalRebuildStateStatus::default();
let mut restart_pending_lexical_rebuild_from_zero = false;
let resume_lexical_rebuild = if opts.force_rebuild {
false
} else if initial_canonical_sessions_before_salvage > 0 {
if let Some(status) = nonresumable_pending_lexical_rebuild_status_without_fingerprint(
&index_path,
&opts.db_path,
initial_canonical_sessions_before_salvage,
)? {
initial_matching_lexical_checkpoint = status;
restart_pending_lexical_rebuild_from_zero = true;
} else if populated_explicit_watch_once_only {
if let Some(status) =
matching_completed_lexical_rebuild_state_status_without_fingerprint(
&index_path,
&opts.db_path,
initial_canonical_sessions_before_salvage,
)?
{
initial_matching_lexical_checkpoint = status;
} else {
initial_matching_lexical_checkpoint =
matching_lexical_rebuild_state_status_if_present(&index_path, || {
lexical_rebuild_db_state_with_total_conversations(
&storage,
&opts.db_path,
initial_canonical_sessions_before_salvage,
)
})?;
}
} else {
initial_matching_lexical_checkpoint =
matching_lexical_rebuild_state_status_if_present(&index_path, || {
lexical_rebuild_db_state_with_total_conversations(
&storage,
&opts.db_path,
initial_canonical_sessions_before_salvage,
)
})?;
}
initial_matching_lexical_checkpoint.has_pending_resume
} else {
false
};
let preserve_matching_completed_checkpoint_during_full_scan =
should_preserve_matching_completed_lexical_checkpoint_during_full_scan(
opts.full,
resume_lexical_rebuild,
canonical_only_full_rebuild,
&initial_matching_lexical_checkpoint,
);
if opts.full
&& !resume_lexical_rebuild
&& !preserve_matching_completed_checkpoint_during_full_scan
{
clear_lexical_rebuild_state(&index_path)?;
} else if preserve_matching_completed_checkpoint_during_full_scan {
tracing::info!(
db_path = %opts.db_path.display(),
completed_indexed_docs = initial_matching_lexical_checkpoint.completed_indexed_docs,
"preserving matching completed lexical checkpoint during full scan until canonical mutations require a rebuild"
);
}
let pre_scan_daily_stats_archive_fingerprint =
preserve_matching_completed_checkpoint_during_full_scan
.then_some(
initial_matching_lexical_checkpoint
.completed_storage_fingerprint
.as_deref(),
)
.flatten();
let mut checked_daily_stats_pre_scan = false;
if opts.full && !canonical_only_full_rebuild {
if let DailyStatsRepairOutcome::SkippedKnownHealthyForFingerprint {
archive_fingerprint,
} = repair_daily_stats_if_drifted(
&storage,
&opts.db_path,
pre_scan_daily_stats_archive_fingerprint,
)? {
tracing::info!(
db_path = %opts.db_path.display(),
archive_fingerprint,
"skipping pre-scan daily_stats health probe because this full run preserved an archive fingerprint already known to be healthy"
);
}
checked_daily_stats_pre_scan = true;
} else if canonical_only_full_rebuild {
tracing::info!(
db_path = %opts.db_path.display(),
conversations = initial_canonical_sessions_before_salvage,
"deferring daily_stats repair because full rebuild is reindexing an already-populated canonical database"
);
}
let mut performed_scan = false;
let mut scan_canonical_mutations = CanonicalMutationCounts::default();
let mut scan_lexical_update_deferred = false;
let mut stale_index_ingest_quarantine_retry_attempted = false;
let mut tantivy_requires_rebuild = false;
let mut observed_tantivy_docs = None;
if should_preflight_existing_tantivy_reader(resume_lexical_rebuild, opts.full) {
let schema_hash_path = index_path.join("schema_hash.json");
let schema_matches = schema_hash_path.exists()
&& std::fs::read_to_string(&schema_hash_path)
.ok()
.and_then(|content| serde_json::from_str::<serde_json::Value>(&content).ok())
.and_then(|json| {
json.get("schema_hash")
.and_then(|v| v.as_str())
.map(schema_hash_matches)
})
.unwrap_or(false);
tantivy_requires_rebuild = opts.force_rebuild
|| !crate::search::tantivy::searchable_index_exists(&index_path)
|| !schema_matches;
if !tantivy_requires_rebuild {
match crate::search::tantivy::searchable_index_summary(&index_path) {
Ok(Some(summary)) => {
if let Err(e) =
crate::search::tantivy::validate_searchable_index_contract(&index_path)
{
tracing::warn!(
error = %e,
path = %index_path.display(),
"tantivy contract preflight failed; forcing rebuild"
);
tantivy_requires_rebuild = true;
} else {
observed_tantivy_docs = Some(summary.docs);
}
}
Ok(None) => {
tantivy_requires_rebuild = true;
}
Err(e) => {
tracing::warn!(
error = %e,
path = %index_path.display(),
"tantivy open preflight failed; forcing rebuild"
);
tantivy_requires_rebuild = true;
}
}
}
} else if resume_lexical_rebuild {
tracing::info!(
db_path = %opts.db_path.display(),
"skipping live Tantivy schema/reader preflight because checkpoint resume will rebuild directly from the canonical database"
);
} else if opts.full {
tracing::info!(
db_path = %opts.db_path.display(),
"deferring live Tantivy reader/doc-count preflight until after the full scan proves the canonical archive is unchanged"
);
} else {
tracing::info!(db_path = %opts.db_path.display(), "skipping live Tantivy reader preflight");
}
let mut needs_rebuild =
should_force_authoritative_rebuild(canonical_storage_rebuilt, tantivy_requires_rebuild);
let initial_needs_rebuild = needs_rebuild;
if needs_rebuild && let Some(p) = &opts.progress {
p.is_rebuilding.store(true, Ordering::Relaxed);
}
if needs_rebuild && !resume_lexical_rebuild {
if index_path.exists() {
let mut backup_path = index_path.with_extension("bak");
let mut attempt = 1u32;
while backup_path.exists() {
backup_path = index_path.with_extension(format!("bak.{attempt}"));
attempt += 1;
}
match std::fs::rename(&index_path, &backup_path) {
Ok(()) => {
tracing::warn!(
old_index = %index_path.display(),
backup = %backup_path.display(),
canonical_storage_rebuilt,
tantivy_requires_rebuild,
"backed up existing Tantivy index before rebuild \
(canonical db or index metadata changed); remove the backup \
manually once you have confirmed the new index is healthy"
);
}
Err(err) => {
tracing::warn!(
old_index = %index_path.display(),
backup = %backup_path.display(),
error = %err,
"failed to back up existing Tantivy index; \
falling back to removal without backup"
);
let _ = std::fs::remove_dir_all(&index_path);
}
}
}
}
let scan_start_ts = FrankenStorage::now_millis();
let keep_tantivy_open_after_rebuild = opts.watch
|| opts
.watch_once_paths
.as_ref()
.is_some_and(|paths| !paths.is_empty());
let mut exact_completed_lexical_checkpoint = false;
let mut skipped_noop_full_scan_authoritative_rebuild = false;
let mut targeted_watch_once_only_run = false;
let t_index = if resume_lexical_rebuild {
tracing::info!(
db_path = %opts.db_path.display(),
"resuming incomplete lexical rebuild from canonical database checkpoint"
);
record_lexical_population_strategy(
opts.progress.as_ref(),
LexicalPopulationStrategy::DeferredAuthoritativeDbRebuild,
"resume_incomplete_authoritative_db_rebuild_from_checkpoint",
);
tracing::info!(
strategy = LexicalPopulationStrategy::DeferredAuthoritativeDbRebuild.as_str(),
reason = "resume_incomplete_authoritative_db_rebuild_from_checkpoint",
"selected_lexical_population_strategy"
);
let rebuild = if restart_pending_lexical_rebuild_from_zero {
rebuild_tantivy_from_db_deferred_startup_with_progress_bump(
&opts.db_path,
&opts.data_dir,
initial_canonical_sessions_before_salvage,
opts.progress.clone(),
Arc::clone(&progress_bump),
)?
} else {
rebuild_tantivy_from_db_with_progress_bump(
&opts.db_path,
&opts.data_dir,
initial_canonical_sessions_before_salvage,
opts.progress.clone(),
Arc::clone(&progress_bump),
)?
};
exact_completed_lexical_checkpoint = rebuild.exact_checkpoint_persisted;
if let Some(p) = &opts.progress
&& let Ok(mut stats) = p.stats.lock()
{
stats.total_conversations = initial_canonical_sessions_before_salvage;
}
if let Some(observed_messages) = rebuild.observed_messages {
record_exact_total_counts_in_progress(
opts.progress.as_ref(),
initial_canonical_sessions_before_salvage,
observed_messages,
);
}
if keep_tantivy_open_after_rebuild {
Some(TantivyIndex::open_or_create(&index_path)?)
} else {
None
}
} else {
let mut t_index: Option<TantivyIndex> = None;
if opts.full && !opened_fresh_for_full && initial_canonical_sessions_before_salvage == 0 {
} else if opts.full {
}
let canonical_sessions_before_salvage = initial_canonical_sessions_before_salvage;
let mut has_pending_historical_bundles = if canonical_only_full_rebuild {
false
} else {
storage.has_pending_historical_bundles(&opts.db_path)?
};
let targeted_watch_once_only = should_run_targeted_watch_once_only(
has_explicit_watch_once_paths,
opts.watch,
opts.full,
needs_rebuild,
canonical_sessions_before_salvage,
);
targeted_watch_once_only_run = targeted_watch_once_only;
let should_salvage_historical = !targeted_watch_once_only
&& should_salvage_historical_databases(
canonical_storage_rebuilt,
canonical_sessions_before_salvage,
has_pending_historical_bundles,
canonical_only_full_rebuild,
);
tracing::warn!(
db_path = %opts.db_path.display(),
canonical_storage_rebuilt,
reopened_after_writable_preflight,
opened_fresh_for_full,
canonical_sessions_before_salvage,
has_pending_historical_bundles,
canonical_only_full_rebuild,
targeted_watch_once_only,
should_salvage_historical,
"historical salvage decision"
);
let historical_salvage: HistoricalSalvageOutcome = if targeted_watch_once_only {
tracing::info!(
db_path = %opts.db_path.display(),
"skipping historical salvage because targeted watch-once paths were supplied"
);
HistoricalSalvageOutcome::default()
} else if should_salvage_historical {
let mut outcome = HistoricalSalvageOutcome::default();
if canonical_sessions_before_salvage == 0 {
let (reopened_storage, seed_outcome) =
maybe_seed_empty_canonical_from_historical_bundle(storage, &opts.db_path)?;
storage = reopened_storage;
persist::apply_index_writer_busy_timeout(&storage);
persist::apply_index_writer_checkpoint_policy(&storage, defer_checkpoints);
if let Some(seed_outcome) = seed_outcome {
outcome.accumulate(seed_outcome);
has_pending_historical_bundles =
storage.has_pending_historical_bundles(&opts.db_path)?;
}
}
if has_pending_historical_bundles {
outcome.accumulate(storage.salvage_historical_databases(&opts.db_path)?);
} else {
tracing::info!(
db_path = %opts.db_path.display(),
"skipping incremental historical salvage because all discoverable historical bundles are already recorded in the canonical database"
);
}
outcome
} else {
tracing::info!(
db_path = %opts.db_path.display(),
conversations = canonical_sessions_before_salvage,
pending_historical_bundles = has_pending_historical_bundles,
"skipping historical salvage because canonical database is already populated and no additional historical bundles are pending"
);
HistoricalSalvageOutcome::default()
};
if historical_salvage.messages_imported > 0 {
tracing::info!(
bundles_imported = historical_salvage.bundles_imported,
conversations_imported = historical_salvage.conversations_imported,
messages_imported = historical_salvage.messages_imported,
"historical cass bundles merged into canonical database before scan"
);
}
let rebuild_from_canonical_only =
canonical_only_full_rebuild && historical_salvage.conversations_imported == 0;
let repair_context = IncrementalCanonicalLexicalRepairContext {
full_refresh: opts.full,
force_rebuild: opts.force_rebuild,
resume_lexical_rebuild,
targeted_watch_once_only,
salvage_messages_imported: historical_salvage.messages_imported,
canonical_messages: 0,
tantivy_requires_rebuild,
observed_tantivy_docs,
published_index_validated_for_current_data: false,
};
let incremental_canonical_lexical_repair = if canonical_sessions_before_salvage > 0
&& should_evaluate_incremental_canonical_lexical_repair(&repair_context)
{
let canonical_messages = count_total_messages_exact(&storage)?;
let published_index_validated_for_current_data = !tantivy_requires_rebuild
&& observed_tantivy_docs.is_some_and(|docs| docs < canonical_messages)
&& published_lexical_index_validated_for_current_data(&index_path, &opts.db_path);
choose_incremental_canonical_lexical_repair_plan(
IncrementalCanonicalLexicalRepairContext {
canonical_messages,
published_index_validated_for_current_data,
..repair_context
},
)
} else {
None
};
if should_repair_daily_stats_after_historical_salvage(
checked_daily_stats_pre_scan,
opts.full,
rebuild_from_canonical_only,
historical_salvage.messages_imported,
) {
repair_daily_stats_if_drifted(&storage, &opts.db_path, None)?;
}
if rebuild_from_canonical_only {
tracing::info!(
db_path = %opts.db_path.display(),
conversations = initial_canonical_sessions_before_salvage,
"skipping raw source rescan during full rebuild because the canonical database is already populated"
);
record_lexical_population_strategy(
opts.progress.as_ref(),
LexicalPopulationStrategy::DeferredAuthoritativeDbRebuild,
"full_rebuild_uses_authoritative_canonical_db_rebuild_only",
);
tracing::info!(
strategy = LexicalPopulationStrategy::DeferredAuthoritativeDbRebuild.as_str(),
reason = "full_rebuild_uses_authoritative_canonical_db_rebuild_only",
"selected_lexical_population_strategy"
);
}
if rebuild_from_canonical_only {
drop(t_index.take());
let rebuild_start = std::time::Instant::now();
let rebuild_convs = canonical_sessions_before_salvage;
let rebuild = rebuild_tantivy_from_db_deferred_startup_with_progress_bump(
&opts.db_path,
&opts.data_dir,
rebuild_convs,
opts.progress.clone(),
Arc::clone(&progress_bump),
)?;
exact_completed_lexical_checkpoint = rebuild.exact_checkpoint_persisted;
let rebuild_ms = rebuild_start.elapsed().as_millis() as u64;
if let Some(p) = &opts.progress
&& let Ok(mut stats) = p.stats.lock()
{
stats.scan_ms = 0; stats.index_ms = rebuild_ms;
stats.total_conversations = rebuild_convs;
}
if let Some(observed_messages) = rebuild.observed_messages {
record_exact_total_counts_in_progress(
opts.progress.as_ref(),
rebuild_convs,
observed_messages,
);
}
if keep_tantivy_open_after_rebuild {
t_index = Some(TantivyIndex::open_or_create(&index_path)?);
}
} else {
let followup_scan_after_authoritative_repair =
incremental_canonical_lexical_repair.is_some();
if let Some(repair_plan) = incremental_canonical_lexical_repair {
tracing::info!(
db_path = %opts.db_path.display(),
canonical_conversations = canonical_sessions_before_salvage,
canonical_messages = repair_plan.canonical_messages,
observed_tantivy_docs = repair_plan.observed_tantivy_docs,
reason = repair_plan.reason,
"repairing Tantivy from the authoritative canonical database before incremental source scan"
);
record_lexical_population_strategy(
opts.progress.as_ref(),
LexicalPopulationStrategy::DeferredAuthoritativeDbRebuild,
repair_plan.reason,
);
record_incremental_canonical_lexical_repair(
opts.progress.as_ref(),
&repair_plan,
canonical_sessions_before_salvage,
);
tracing::info!(
strategy = LexicalPopulationStrategy::DeferredAuthoritativeDbRebuild.as_str(),
reason = repair_plan.reason,
"selected_lexical_population_strategy"
);
drop(t_index.take());
let rebuild_convs = count_total_conversations_exact(&storage)?;
let rebuild = rebuild_tantivy_from_db_deferred_startup_with_progress_bump(
&opts.db_path,
&opts.data_dir,
rebuild_convs,
opts.progress.clone(),
Arc::clone(&progress_bump),
)?;
exact_completed_lexical_checkpoint = rebuild.exact_checkpoint_persisted;
if let Some(observed_messages) = rebuild.observed_messages {
record_exact_total_counts_in_progress(
opts.progress.as_ref(),
rebuild_convs,
observed_messages,
);
}
t_index = Some(TantivyIndex::open_or_create(&index_path)?);
needs_rebuild = false;
}
if targeted_watch_once_only {
tracing::info!(
db_path = %opts.db_path.display(),
"skipping broad incremental scan because targeted watch-once paths were supplied"
);
} else if should_skip_broad_scan_after_watch_once_authoritative_repair(
has_explicit_watch_once_paths,
opts.watch,
opts.full,
followup_scan_after_authoritative_repair,
) {
tracing::info!(
db_path = %opts.db_path.display(),
"skipping broad incremental scan because targeted watch-once paths were supplied after authoritative lexical repair"
);
} else {
let (lexical_strategy, lexical_strategy_reason) =
resolve_lexical_population_strategy(
needs_rebuild,
opts.full,
historical_salvage.messages_imported,
);
record_lexical_population_strategy_if_unset(
opts.progress.as_ref(),
lexical_strategy,
lexical_strategy_reason,
);
let stale_index_ingest_quarantine_retry =
if targeted_watch_once_only || canonical_only_full_rebuild {
None
} else {
stale_index_ingest_quarantine_version_retry(&opts.data_dir)?
};
if let Some(retry) = &stale_index_ingest_quarantine_retry {
tracing::warn!(
stale_records = retry.stale_records,
legacy_records = retry.legacy_records,
previous_versions = ?retry.previous_versions,
current_version = current_cass_version(),
"retrying stale index-ingest quarantine records after cass version change"
);
}
if followup_scan_after_authoritative_repair {
tracing::info!(
strategy = lexical_strategy.as_str(),
reason = lexical_strategy_reason,
full = opts.full,
needs_rebuild,
salvage_messages_imported = historical_salvage.messages_imported,
"selected_followup_scan_lexical_strategy_after_authoritative_repair"
);
} else {
tracing::info!(
strategy = lexical_strategy.as_str(),
reason = lexical_strategy_reason,
full = opts.full,
needs_rebuild,
salvage_messages_imported = historical_salvage.messages_imported,
"selected_lexical_population_strategy"
);
}
let since_ts = non_watch_scan_since_ts(
opts.full,
needs_rebuild,
stale_index_ingest_quarantine_retry.is_some(),
storage.get_last_scan_ts().unwrap_or(None),
);
if since_ts.is_some() {
tracing::info!(since_ts = ?since_ts, "incremental_scan: using last_scan_ts");
} else {
tracing::info!("full_scan: no last_scan_ts or rebuild requested");
}
let additional_scan_roots =
additional_scan_roots_for_scan_or_watch(&storage, &opts.data_dir);
let scan_requires_tantivy =
lexical_population_strategy_requires_inline_tantivy(lexical_strategy);
if scan_requires_tantivy && t_index.is_none() {
t_index = Some(TantivyIndex::open_or_create(&index_path)?);
} else if !scan_requires_tantivy {
tracing::info!(
strategy = lexical_strategy.as_str(),
"scan phase is deferring Tantivy writer open/commit until the authoritative rebuild"
);
}
if streaming_index_enabled() {
tracing::info!("using streaming indexing (Opt 8.2)");
let scan_outcome = run_streaming_index(
&storage,
t_index.as_mut(),
&opts,
since_ts,
lexical_strategy,
additional_scan_roots.clone(),
scan_start_ts,
)?;
bump_index_run_lock_progress_atomic(&progress_bump);
scan_canonical_mutations =
scan_canonical_mutations.accumulate(scan_outcome.canonical_mutations);
scan_lexical_update_deferred |= scan_outcome.lexical_update_deferred;
} else {
tracing::info!(
"using batch indexing (streaming disabled via CASS_STREAMING_INDEX=0)"
);
let scan_outcome = run_batch_index(
&storage,
t_index.as_mut(),
&opts,
since_ts,
lexical_strategy,
additional_scan_roots.clone(),
scan_start_ts,
)?;
bump_index_run_lock_progress_atomic(&progress_bump);
scan_canonical_mutations =
scan_canonical_mutations.accumulate(scan_outcome.canonical_mutations);
scan_lexical_update_deferred |= scan_outcome.lexical_update_deferred;
}
performed_scan = true;
stale_index_ingest_quarantine_retry_attempted =
stale_index_ingest_quarantine_retry.is_some();
if scan_lexical_update_deferred {
tracing::warn!(
db_path = %opts.db_path.display(),
inserted_conversations = scan_canonical_mutations.inserted_conversations,
inserted_messages = scan_canonical_mutations.inserted_messages,
"inline lexical updates were deferred during non-watch scan; rebuilding lexical assets from canonical SQLite"
);
drop(t_index.take());
let rebuild_convs = count_total_conversations_exact(&storage)?;
let rebuild = rebuild_tantivy_from_db_deferred_startup_with_progress_bump(
&opts.db_path,
&opts.data_dir,
rebuild_convs,
opts.progress.clone(),
Arc::clone(&progress_bump),
)?;
exact_completed_lexical_checkpoint = rebuild.exact_checkpoint_persisted;
if let Some(observed_messages) = rebuild.observed_messages {
record_exact_total_counts_in_progress(
opts.progress.as_ref(),
rebuild_convs,
observed_messages,
);
}
if keep_tantivy_open_after_rebuild {
t_index = Some(TantivyIndex::open_or_create(&index_path)?);
}
} else if scan_requires_tantivy {
t_index
.as_mut()
.expect("tantivy index must remain open for lexical commit")
.commit()?;
}
if !scan_lexical_update_deferred
&& (opts.full || historical_salvage.messages_imported > 0)
{
let post_scan_observed_tantivy_docs =
observed_tantivy_docs_for_post_full_scan_skip(
&index_path,
opts.full,
initial_needs_rebuild,
historical_salvage.messages_imported,
&initial_matching_lexical_checkpoint,
scan_canonical_mutations,
observed_tantivy_docs,
)?;
if should_skip_post_full_scan_authoritative_rebuild(
opts.full,
initial_needs_rebuild,
historical_salvage.messages_imported,
&initial_matching_lexical_checkpoint,
scan_canonical_mutations,
post_scan_observed_tantivy_docs,
) {
tracing::info!(
db_path = %opts.db_path.display(),
observed_tantivy_docs = post_scan_observed_tantivy_docs,
completed_indexed_docs = initial_matching_lexical_checkpoint
.completed_indexed_docs,
inserted_conversations = scan_canonical_mutations.inserted_conversations,
inserted_messages = scan_canonical_mutations.inserted_messages,
"skipping post-scan authoritative lexical rebuild because the full scan found no canonical changes and the live Tantivy index still matches the completed checkpoint"
);
let (exact_total_conversations, exact_total_messages) =
initial_matching_lexical_checkpoint
.completed_exact_totals
.unwrap_or((
count_total_conversations_exact(&storage)?,
count_total_messages_exact(&storage)?,
));
record_exact_total_counts_in_progress(
opts.progress.as_ref(),
exact_total_conversations,
exact_total_messages,
);
skipped_noop_full_scan_authoritative_rebuild = true;
} else {
drop(t_index.take());
let rebuild_convs = count_total_conversations_exact(&storage)?;
let rebuild = rebuild_tantivy_from_db_deferred_startup_with_progress_bump(
&opts.db_path,
&opts.data_dir,
rebuild_convs,
opts.progress.clone(),
Arc::clone(&progress_bump),
)?;
exact_completed_lexical_checkpoint = rebuild.exact_checkpoint_persisted;
if let Some(observed_messages) = rebuild.observed_messages {
record_exact_total_counts_in_progress(
opts.progress.as_ref(),
rebuild_convs,
observed_messages,
);
}
if keep_tantivy_open_after_rebuild {
t_index = Some(TantivyIndex::open_or_create(&index_path)?);
}
}
}
}
}
t_index
};
if stale_index_ingest_quarantine_retry_attempted && scan_watermark_preservation_active() {
tracing::info!(
data_dir = %opts.data_dir.display(),
"leaving stale index-ingest quarantine retry records unchanged because this scan preserved the source watermark"
);
} else if stale_index_ingest_quarantine_retry_attempted {
match mark_stale_index_ingest_quarantine_retry_attempted(&opts.data_dir) {
Ok(marked) if marked > 0 => tracing::info!(
data_dir = %opts.data_dir.display(),
marked,
current_version = current_cass_version(),
"marked stale index-ingest quarantine records as retried for current cass version"
),
Ok(_) => {}
Err(err) => tracing::warn!(
data_dir = %opts.data_dir.display(),
error = %err,
"failed to mark stale index-ingest quarantine records as retried"
),
}
}
if opts.semantic && targeted_semantic_watch_once {
tracing::info!(
embedder = %opts.embedder,
"deferring broad semantic indexing until targeted watch-once ingest completes"
);
} else if opts.semantic {
let vi_dir = opts
.data_dir
.join(crate::search::vector_index::VECTOR_INDEX_DIR);
let has_existing_index = vi_dir.is_dir()
&& std::fs::read_dir(&vi_dir)
.map(|entries| {
entries
.filter_map(|e| e.ok())
.any(|e| e.path().extension().is_some_and(|ext| ext == "fsvi"))
})
.unwrap_or(false);
let has_watermark = storage.get_last_embedded_message_id()?.is_some();
if opts.watch && has_existing_index && has_watermark {
tracing::info!(
dir = %vi_dir.display(),
"skipping bulk semantic re-embed (existing index + watermark found); \
incremental watch callback will handle new messages"
);
} else {
tracing::info!(embedder = %opts.embedder, "starting semantic indexing");
let semantic_indexer = SemanticIndexer::new(&opts.embedder, Some(&opts.data_dir))?;
let mut semantic_read_storage = FrankenStorage::open_readonly(&opts.db_path)
.with_context(|| {
format!(
"opening fresh readonly canonical storage for semantic indexing: {}",
opts.db_path.display()
)
})?;
let mut embedding_inputs =
packet_embedding_inputs_from_storage(&semantic_read_storage)?;
tracing::info!(
message_count = embedding_inputs.len(),
packet_driven = true,
"built semantic inputs from canonical ConversationPacket replay"
);
embedding_inputs.retain(|message| {
!is_hard_message_noise(semantic_role_name(message.role), &message.content)
});
let embedded_messages = semantic_indexer.embed_messages(&embedding_inputs)?;
tracing::info!(
embedded_count = embedded_messages.len(),
"generated embeddings"
);
if !embedded_messages.is_empty() {
let embedded_doc_count = embedded_messages.len();
let build_started_at_ms = semantic_indexing_now_ms();
let vector_index =
semantic_indexer.build_and_save_index(embedded_messages, &opts.data_dir)?;
let index_path = crate::search::vector_index::vector_index_path(
&opts.data_dir,
semantic_indexer.embedder_id(),
);
tracing::info!(
path = %index_path.display(),
embedder = semantic_indexer.embedder_id(),
"saved semantic vector index"
);
if opts.build_hnsw {
let hnsw_path = semantic_indexer.build_hnsw_index(
&vector_index,
&opts.data_dir,
None, None, )?;
tracing::info!(
path = %hnsw_path.display(),
embedder = semantic_indexer.embedder_id(),
"saved HNSW index for approximate search"
);
}
if let Err(err) = publish_direct_semantic_artifact(
&semantic_read_storage,
&opts.data_dir,
&index_path,
semantic_indexer.embedder_id(),
semantic_indexer.embedder_dimension(),
u64::try_from(embedded_doc_count).unwrap_or(u64::MAX),
build_started_at_ms,
) {
tracing::warn!(
embedder = semantic_indexer.embedder_id(),
error = %err,
"direct semantic artifact published to disk but \
manifest update failed; cass status may report \
stale/unavailable until next backfill cycle"
);
}
}
semantic_read_storage.close_best_effort_in_place();
if let Some(max_id) = embedding_inputs.iter().map(|e| e.message_id).max() {
persist::with_ephemeral_writer(
&storage,
false,
"updating semantic indexing watermark",
|writer| {
writer
.set_last_embedded_message_id(i64::try_from(max_id).unwrap_or(i64::MAX))
},
)?;
}
}
}
if targeted_watch_once_only_run {
tracing::info!(
db_path = %opts.db_path.display(),
"deferring final index-run metadata update until targeted watch-once paths are evaluated"
);
} else {
let now_ms = FrankenStorage::now_millis();
let preserve_scan_watermark = scan_watermark_preservation_active();
let performed_scan_for_watermark = performed_scan && !preserve_scan_watermark;
if performed_scan && !performed_scan_for_watermark {
tracing::info!(
db_path = %opts.db_path.display(),
"preserving final last_scan_ts because scan exclusions or active source skips are active"
);
}
persist_final_index_run_metadata(
&storage,
&opts.db_path,
performed_scan_for_watermark,
scan_start_ts,
now_ms,
)?;
}
let exact_total_counts = exact_total_counts_from_progress(opts.progress.as_ref());
if exact_completed_lexical_checkpoint && exact_total_counts.is_some() {
tracing::info!(
db_path = %opts.db_path.display(),
"skipping final lexical checkpoint refresh because the authoritative rebuild already persisted exact completed state"
);
} else if skipped_noop_full_scan_authoritative_rebuild {
tracing::info!(
db_path = %opts.db_path.display(),
inserted_conversations = scan_canonical_mutations.inserted_conversations,
inserted_messages = scan_canonical_mutations.inserted_messages,
"skipping final lexical checkpoint refresh because the full scan preserved an already-matching completed checkpoint"
);
} else if targeted_watch_once_only_run {
tracing::info!(
db_path = %opts.db_path.display(),
"skipping final lexical checkpoint refresh because targeted watch-once startup does not need broad checkpoint maintenance"
);
} else if should_skip_noop_final_lexical_checkpoint_refresh(
opts.full,
initial_needs_rebuild,
&initial_matching_lexical_checkpoint,
exact_total_counts,
scan_canonical_mutations,
) {
tracing::info!(
db_path = %opts.db_path.display(),
inserted_conversations = scan_canonical_mutations.inserted_conversations,
inserted_messages = scan_canonical_mutations.inserted_messages,
"skipping final lexical checkpoint refresh because this incremental run made no canonical changes and started from a matching completed checkpoint"
);
} else {
refresh_completed_lexical_rebuild_checkpoint_for_final_state(
&mut storage,
&opts.db_path,
&opts.data_dir,
opts.watch || opts.watch_once_paths.is_some(),
exact_total_counts,
)
.with_context(|| {
format!(
"refreshing completed lexical checkpoint after index run for {}",
opts.db_path.display()
)
})?;
}
let fallback_fts_archive_fingerprint = skipped_noop_full_scan_authoritative_rebuild
.then_some(
initial_matching_lexical_checkpoint
.completed_storage_fingerprint
.as_deref(),
)
.flatten();
if let Some(repair) = repair_fallback_fts_after_full_index_run(
&storage,
&opts.db_path,
opts.full,
canonical_only_full_rebuild,
fallback_fts_archive_fingerprint,
)
.with_context(|| {
format!(
"repairing frankensqlite-owned fallback FTS after full index run for {}",
opts.db_path.display()
)
})? {
match repair {
FallbackFtsRepairOutcome::SkippedKnownHealthyForFingerprint {
archive_fingerprint,
} => {
tracing::info!(
db_path = %opts.db_path.display(),
archive_fingerprint,
"skipping fallback FTS consistency repair because this no-op full run preserved an archive fingerprint already known to be healthy"
);
}
FallbackFtsRepairOutcome::Repaired(FtsConsistencyRepair::AlreadyHealthy { rows }) => {
tracing::info!(
db_path = %opts.db_path.display(),
rows,
"fallback FTS was already healthy after full index run; skipped rebuild"
);
}
FallbackFtsRepairOutcome::Repaired(FtsConsistencyRepair::IncrementalCatchUp {
inserted_rows,
total_rows,
}) => {
tracing::info!(
db_path = %opts.db_path.display(),
inserted_rows,
total_rows,
"incrementally repaired fallback FTS after full index run"
);
}
FallbackFtsRepairOutcome::Repaired(FtsConsistencyRepair::Rebuilt { inserted_rows }) => {
tracing::info!(
db_path = %opts.db_path.display(),
inserted_rows,
"rebuilt fallback FTS after full index run"
);
}
}
} else if opts.full {
tracing::info!(
db_path = %opts.db_path.display(),
canonical_only_full_rebuild,
"skipping frankensqlite-owned fallback FTS rebuild because this full run only rebuilt Tantivy from the existing canonical database"
);
}
reset_progress_to_idle(opts.progress.as_ref());
if opts.watch || opts.watch_once_paths.is_some() {
let additional_scan_roots =
additional_scan_roots_for_scan_or_watch(&storage, &opts.data_dir);
let watch_roots = build_watch_roots(additional_scan_roots.clone());
let watch_once_mode = opts
.watch_once_paths
.as_ref()
.is_some_and(|paths| !paths.is_empty());
if targeted_watch_once_only_run
&& t_index.is_none()
&& watch_once_mode
&& should_skip_unchanged_explicit_watch_once_paths(&opts, &storage, &watch_roots)?
{
let path_count = opts
.watch_once_paths
.as_ref()
.map(std::vec::Vec::len)
.unwrap_or_default();
record_lexical_population_strategy_if_unset(
opts.progress.as_ref(),
LexicalPopulationStrategy::IncrementalInline,
"watch_once_targeted_reindex_applies_inline_lexical_updates_for_changed_paths",
);
tracing::info!(
db_path = %opts.db_path.display(),
path_count,
"skipping unchanged explicit watch-once paths before opening Tantivy"
);
return close_storage_after_index(storage, &opts.db_path, "watch-once no-op index run");
}
restore_watch_steady_state_checkpoint_policy(&storage, opts.watch);
if opts.watch {
index_run_lock.set_mode(SearchMaintenanceMode::Watch)?;
}
let opts_clone = opts.clone();
let state = Mutex::new(load_watch_state(&opts.data_dir));
let storage = Rc::new(Mutex::new(storage));
let storage_for_watch = Rc::clone(&storage);
let should_preopen_tantivy_for_watch = opts.watch;
let watch_once_defers_tantivy_open =
watch_once_mode && !should_preopen_tantivy_for_watch && t_index.is_none();
let t_index = Mutex::new(if should_preopen_tantivy_for_watch {
Some(match t_index {
Some(t_index) => t_index,
None => TantivyIndex::open_or_create(&index_path).with_context(|| {
format!(
"opening Tantivy index before entering watch mode for {}",
index_path.display()
)
})?,
})
} else {
t_index
});
if watch_once_defers_tantivy_open {
tracing::info!(
index_path = %index_path.display(),
"deferring Tantivy open until one-shot watch-once ingest has conversations"
);
}
let index_path_for_watch = index_path.clone();
let watch_recycle_counter: std::cell::Cell<u32> = std::cell::Cell::new(0);
let watch_recycle_interval: u32 = dotenvy::var("CASS_WATCH_RECYCLE_INTERVAL")
.ok()
.and_then(|v| v.parse().ok())
.filter(|&v| v > 0) .unwrap_or(50);
let semantic_enabled = opts.semantic;
let targeted_semantic_watch_once = targeted_semantic_watch_once && watch_once_mode;
let embedder_id = opts.embedder.clone();
let data_dir_for_semantic = opts.data_dir.clone();
let pre_watch_semantic_conversations = if targeted_semantic_watch_once {
let storage = storage.lock().map_err(|err| {
anyhow::anyhow!("storage lock poisoned before semantic watch-once: {err}")
})?;
count_total_conversations_exact(&storage)?
} else {
0
};
let semantic_cooldown = Duration::from_secs(
dotenvy::var("CASS_WATCH_SEMANTIC_COOLDOWN_SECS")
.ok()
.and_then(|v| v.parse().ok())
.unwrap_or(60),
);
let last_semantic_embed = Mutex::new(Instant::now());
let stale_detector = Arc::new(StaleDetector::from_env());
let stale_config = StaleConfig::from_env();
if stale_config.is_enabled() {
tracing::info!(
action = ?stale_config.action,
threshold_hours = stale_config.threshold_hours,
check_interval_mins = stale_config.check_interval_mins,
"stale detection enabled"
);
}
let detector_clone = stale_detector.clone();
let watch_result = watch_sources(
opts.watch_once_paths.clone(),
watch_roots.clone(),
event_channel,
stale_detector,
opts.watch_interval_secs,
move |paths, roots, is_rebuild| {
let mut semantic_delta = WatchSemanticDelta::default();
let indexed = if is_rebuild {
if let Ok(mut g) = state.lock() {
g.clear();
if let Err(e) = save_watch_state(&opts_clone.data_dir, &g) {
tracing::warn!("failed to save watch state: {e}");
}
}
detector_clone.reset();
let all_root_paths: Vec<PathBuf> =
roots.iter().map(|(_, root)| root.path.clone()).collect();
let indexed = reindex_paths(
&opts_clone,
all_root_paths,
roots,
&state,
&storage_for_watch,
&t_index,
&index_path_for_watch,
true,
);
finalize_watch_reindex_result(
indexed,
&detector_clone,
opts_clone.progress.as_ref(),
"watch rebuild reindex",
)
} else if watch_once_mode {
let indexed = finalize_watch_once_reindex_result(
reindex_paths_with_semantic_delta(
&opts_clone,
paths,
roots,
&state,
&storage_for_watch,
&t_index,
&index_path_for_watch,
false,
semantic_enabled.then_some(&mut semantic_delta),
),
&detector_clone,
opts_clone.progress.as_ref(),
"watch incremental reindex",
)?;
if indexed > 0
&& let Ok(mut guard) = t_index.lock()
&& let Some(t_index) = guard.as_mut()
&& let Err(e) = t_index.optimize_if_idle()
{
tracing::warn!(error = %e, "segment merge failed during watch");
}
if targeted_semantic_watch_once {
let stats = run_targeted_semantic_watch_once_publish(
&embedder_id,
&data_dir_for_semantic,
&storage_for_watch,
indexed,
pre_watch_semantic_conversations,
)?;
record_semantic_watch_once_stats(opts_clone.progress.as_ref(), stats);
}
indexed
} else {
let indexed = finalize_watch_reindex_result(
reindex_paths_with_semantic_delta(
&opts_clone,
paths,
roots,
&state,
&storage_for_watch,
&t_index,
&index_path_for_watch,
false,
semantic_enabled.then_some(&mut semantic_delta),
),
&detector_clone,
opts_clone.progress.as_ref(),
"watch incremental reindex",
);
if indexed > 0
&& let Ok(mut guard) = t_index.lock()
&& let Some(t_index) = guard.as_mut()
&& let Err(e) = t_index.optimize_if_idle()
{
tracing::warn!(error = %e, "segment merge failed during watch");
}
indexed
};
if semantic_enabled && indexed > 0 && !targeted_semantic_watch_once {
let should_embed = last_semantic_embed
.lock()
.map(|t| t.elapsed() >= semantic_cooldown)
.unwrap_or(false);
if should_embed {
let embed_result = if semantic_delta.max_message_id.is_some() {
incremental_semantic_embed_from_delta(
&embedder_id,
&data_dir_for_semantic,
&storage_for_watch,
semantic_delta,
)
} else {
incremental_semantic_embed(
&embedder_id,
&data_dir_for_semantic,
&storage_for_watch,
)
};
match embed_result {
Ok(0) => {} Ok(n) => {
tracing::info!(
count = n,
"incremental semantic embedding complete"
);
if let Ok(mut t) = last_semantic_embed.lock() {
*t = Instant::now();
}
}
Err(e) => {
tracing::warn!(error = %e, "incremental semantic embedding failed");
if let Ok(mut t) = last_semantic_embed.lock() {
*t = Instant::now();
}
}
}
}
}
let count = watch_recycle_counter.get().wrapping_add(1);
watch_recycle_counter.set(count);
if count.is_multiple_of(watch_recycle_interval)
&& let Ok(mut guard) = storage_for_watch.lock()
{
let db_path = guard.database_path().ok();
guard.close_best_effort_in_place();
if let Some(path) = db_path {
match FrankenStorage::open(&path) {
Ok(new_storage) => {
*guard = new_storage;
tracing::debug!(
cycle = count,
"recycled long-lived storage handle to shed MVCC state"
);
}
Err(err) => {
tracing::warn!(
error = %err,
cycle = count,
"failed to reopen storage handle after recycle; \
next watch cycle will use the closed handle \
and likely fail"
);
}
}
}
}
Ok(())
},
);
let close_result =
release_watch_storage_after_index(storage, &opts.db_path, "watch indexing session");
if let Err(err) = watch_result {
if let Err(close_err) = close_result {
tracing::warn!(
error = %close_err,
db_path = %opts.db_path.display(),
"failed to close canonical db cleanly after watch indexing error"
);
}
return Err(err);
}
close_result?;
return Ok(());
}
close_storage_after_index(storage, &opts.db_path, "index run")
}
/// Shuts down the canonical storage handle at the end of an index run and
/// then runs the final WAL checkpoint against the database file.
///
/// Steps, in order:
/// 1. `prepare_storage_for_final_checkpoint` adjusts the checkpoint policy on
///    the still-open handle.
/// 2. The handle is consumed by `close()`; a failure here aborts the sequence
///    before any checkpoint is attempted.
/// 3. `run_final_wal_checkpoint` is invoked on `db_path`.
///
/// `context` is a short label for the kind of run being finished; it is
/// interpolated into error messages and forwarded to both helpers.
///
/// # Errors
///
/// Returns the close error (annotated with the run label and database path)
/// or whatever `run_final_wal_checkpoint` reports.
fn close_storage_after_index(storage: FrankenStorage, db_path: &Path, context: &str) -> Result<()> {
    prepare_storage_for_final_checkpoint(&storage, db_path, context);

    // Built lazily: the message is only formatted if `close()` actually fails.
    let describe_close_failure = || {
        format!(
            "closing canonical db before final WAL checkpoint after {context}: {}",
            db_path.display()
        )
    };
    storage.close().with_context(describe_close_failure)?;

    run_final_wal_checkpoint(db_path, context)
}
/// Re-applies the index-writer checkpoint policy (passing `false` as the
/// policy flag) on `storage` ahead of the final close, and reports when that
/// changed a previously zeroed checkpoint page setting.
///
/// The checkpoint page value is sampled before and after the policy call. An
/// info event carrying the post-call value is emitted only when the pre-call
/// value was `Some(0)`.
///
/// NOTE(review): `Some(0)` presumably corresponds to auto-checkpointing having
/// been switched off for the bulk index run — confirm against
/// `persist::apply_index_writer_checkpoint_policy`.
///
/// `db_path` and `context` are used purely as log fields.
fn prepare_storage_for_final_checkpoint(storage: &FrankenStorage, db_path: &Path, context: &str) {
    let pages_before = storage.index_writer_checkpoint_pages();
    persist::apply_index_writer_checkpoint_policy(storage, false);
    // Sampled unconditionally to keep the exact call sequence on the handle,
    // even though only the logging path below reads the value.
    let pages_after = storage.index_writer_checkpoint_pages();

    // Nothing to report unless the policy was previously pinned to zero pages.
    if pages_before != Some(0) {
        return;
    }

    tracing::info!(
        db_path = %db_path.display(),
        context,
        restored_wal_autocheckpoint_pages = ?pages_after,
        "restored checkpoint policy before final index close"
    );
}
/// Opens a dedicated connection to `db_path`, runs the final WAL checkpoint
/// on it, and closes the connection again.
///
/// The connection is always closed, even when the checkpoint query fails.
///
/// # Errors
/// Fails if the connection cannot be opened. Otherwise a checkpoint error
/// takes precedence over a close error.
fn run_final_wal_checkpoint(db_path: &Path, context: &str) -> Result<()> {
    let connection_target = db_path.to_string_lossy().into_owned();
    let conn = frankensqlite::Connection::open(connection_target).with_context(|| {
        format!(
            "opening canonical db for final WAL checkpoint after {context}: {}",
            db_path.display()
        )
    })?;

    // Run both steps unconditionally, then report the checkpoint failure first.
    let checkpoint_outcome = query_final_wal_checkpoint(&conn, db_path, context);
    let close_outcome = conn.close().with_context(|| {
        format!(
            "closing final WAL checkpoint handle after {context}: {}",
            db_path.display()
        )
    });

    match (checkpoint_outcome, close_outcome) {
        (Err(checkpoint_err), _) => Err(checkpoint_err),
        (Ok(()), Err(close_err)) => Err(close_err),
        (Ok(()), Ok(_)) => Ok(()),
    }
}
fn query_final_wal_checkpoint(
conn: &frankensqlite::Connection,
db_path: &Path,
context: &str,
) -> Result<()> {
let rows = conn
.query("PRAGMA wal_checkpoint(TRUNCATE);")
.with_context(|| {
format!(
"running final WAL checkpoint after {context}: {}",
db_path.display()
)
})?;
let row = rows.first().ok_or_else(|| {
anyhow::anyhow!(
"final WAL checkpoint returned no status row after {context}: {}",
db_path.display()
)
})?;
let busy: i64 = row
.get_typed(0)
.with_context(|| "reading final WAL checkpoint busy flag")?;
let log_frames: i64 = row
.get_typed(1)
.with_context(|| "reading final WAL checkpoint log frame count")?;
let checkpointed_frames: i64 = row
.get_typed(2)
.with_context(|| "reading final WAL checkpoint backfilled frame count")?;
if log_frames >= 0 {
if busy > 0 {
tracing::warn!(
db_path = %db_path.display(),
context,
busy,
log_frames,
checkpointed_frames,
"final WAL checkpoint was blocked by active readers"
);
} else {
tracing::info!(
db_path = %db_path.display(),
context,
log_frames,
checkpointed_frames,
"final WAL checkpoint completed after index run"
);
}
}
Ok(())
}
/// Re-applies the index-writer checkpoint policy when watch mode is enabled.
///
/// Does nothing when `watch_enabled` is `false`.
fn restore_watch_steady_state_checkpoint_policy(storage: &FrankenStorage, watch_enabled: bool) {
    if !watch_enabled {
        return;
    }
    persist::apply_index_writer_checkpoint_policy(storage, false);
}
fn release_watch_storage_after_index(
storage: Rc<Mutex<FrankenStorage>>,
db_path: &Path,
context: &str,
) -> Result<()> {
let storage = Rc::try_unwrap(storage).map_err(|_| {
anyhow::anyhow!(
"watch indexing retained extra canonical db handles while closing {}",
db_path.display()
)
})?;
match storage.into_inner() {
Ok(storage) => close_storage_after_index(storage, db_path, context),
Err(poisoned) => {
let mut storage = poisoned.into_inner();
storage.close_best_effort_in_place();
Err(anyhow::anyhow!(
"storage mutex poisoned while closing canonical db after {context}: {}",
db_path.display()
))
}
}
}
/// Maps a numeric role code to its lowercase name.
///
/// Returns `None` for any code other than the four `ROLE_*` constants.
fn semantic_role_name(role: u8) -> Option<&'static str> {
    let name = match role {
        ROLE_USER => "user",
        ROLE_ASSISTANT => "assistant",
        ROLE_SYSTEM => "system",
        ROLE_TOOL => "tool",
        _ => return None,
    };
    Some(name)
}
/// Embedding inputs accumulated across batches, plus the largest message id
/// reported alongside them.
#[derive(Debug, Default)]
struct WatchSemanticDelta {
    /// Every input appended so far, in arrival order.
    inputs: Vec<EmbeddingInput>,
    /// Largest message id reported by any batch; `None` until one reports an id.
    max_message_id: Option<i64>,
}

impl WatchSemanticDelta {
    /// Appends `batch_inputs` and raises the tracked maximum message id when
    /// `max_message_id` is larger than the current value.
    ///
    /// A `None` id leaves the tracked maximum untouched.
    fn extend_from_batch(
        &mut self,
        batch_inputs: Vec<EmbeddingInput>,
        max_message_id: Option<i64>,
    ) {
        self.inputs.extend(batch_inputs);
        // `Option` orders `None` below every `Some`, so `max` keeps the
        // largest id seen and never replaces a known id with `None`.
        self.max_message_id = self.max_message_id.max(max_message_id);
    }
}
/// Parsed form of a `content-v1:<total>:<max conversation id>:<max message id>`
/// fingerprint string.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
struct SemanticContentFingerprint {
/// First numeric field of the fingerprint.
total_conversations: usize,
/// Second numeric field of the fingerprint.
max_conversation_id: i64,
/// Third numeric field of the fingerprint.
max_message_id: i64,
}
/// How a targeted semantic watch-once run brings the vector index up to date.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum TargetedSemanticWatchOnceMode {
/// Embed every selected input and build the index from scratch.
RebuildAll,
/// Embed only the newly selected inputs and append them to the existing index.
AppendToExisting,
/// The existing artifact's fingerprint already equals the current DB fingerprint.
AlreadyCovered,
}
/// Result of selecting which inputs a targeted semantic watch-once run must
/// embed, together with the metadata needed to publish the artifact.
#[derive(Debug)]
struct TargetedSemanticWatchOnceSelection {
/// Strategy chosen for updating the vector index.
mode: TargetedSemanticWatchOnceMode,
/// Inputs to embed, already filtered for hard message noise.
inputs: Vec<EmbeddingInput>,
/// Largest message id covered by the selection, taken before noise filtering.
raw_max_message_id: Option<i64>,
/// Semantic tier resolved from the indexer's embedder id.
tier: SemanticTierKind,
/// Location of the vector index that is read or written.
index_path: PathBuf,
/// Conversation count of the current DB, saturated to `u64::MAX` on overflow.
total_conversations: u64,
/// Fingerprint string of the current DB contents.
current_db_fingerprint: String,
/// Fingerprint recorded by the matching manifest artifact before this run, if any.
manifest_before_db_fingerprint: Option<String>,
/// Static label naming why `mode` was chosen.
reason: &'static str,
}
/// Reports whether the options describe a targeted semantic watch-once run.
///
/// That requires `semantic` to be set, `watch`, `full`, `force_rebuild` and
/// `build_hnsw` to be unset, and `watch_once_paths` to hold at least one path.
fn should_run_targeted_semantic_watch_once(opts: &IndexOptions) -> bool {
    let excluded_by_flags =
        !opts.semantic || opts.watch || opts.full || opts.force_rebuild || opts.build_hnsw;
    if excluded_by_flags {
        return false;
    }
    match opts.watch_once_paths.as_ref() {
        Some(paths) => !paths.is_empty(),
        None => false,
    }
}
/// Parses a `content-v1:<total>:<max conversation id>:<max message id>` string.
///
/// Returns `None` when the prefix is missing, when there are not exactly
/// three colon-separated fields after it, or when any field fails to parse.
fn parse_semantic_content_fingerprint(raw: &str) -> Option<SemanticContentFingerprint> {
    let payload = raw.strip_prefix("content-v1:")?;
    let fields: Vec<&str> = payload.split(':').collect();
    let [total_field, conversation_field, message_field] = fields.as_slice() else {
        return None;
    };
    Some(SemanticContentFingerprint {
        total_conversations: total_field.parse::<usize>().ok()?,
        max_conversation_id: conversation_field.parse::<i64>().ok()?,
        max_message_id: message_field.parse::<i64>().ok()?,
    })
}
/// Returns the manifest's artifact record for `tier`, if one is present.
fn semantic_artifact_for_tier(
    manifest: &SemanticManifest,
    tier: SemanticTierKind,
) -> Option<&ArtifactRecord> {
    let slot = match tier {
        SemanticTierKind::Fast => &manifest.fast_tier,
        SemanticTierKind::Quality => &manifest.quality_tier,
    };
    slot.as_ref()
}
/// Resolves an artifact's recorded index path to a concrete filesystem path.
///
/// An absolute recorded path is returned unchanged. A relative one is joined
/// onto `data_dir` one component at a time.
///
/// # Errors
/// Fails when a relative path contains any component that is not a plain
/// name (for example `..` or `.`).
fn semantic_artifact_index_path(data_dir: &Path, artifact: &ArtifactRecord) -> Result<PathBuf> {
    let recorded = Path::new(&artifact.index_path);
    if recorded.is_absolute() {
        return Ok(recorded.to_path_buf());
    }

    let mut resolved = data_dir.to_path_buf();
    for component in recorded.components() {
        match component {
            std::path::Component::Normal(part) => resolved.push(part),
            _ => anyhow::bail!(
                "semantic watch-once cannot use unsafe relative vector artifact path {}",
                artifact.index_path
            ),
        }
    }
    Ok(resolved)
}
/// Verifies that an existing semantic artifact can be reused by watch-once
/// and returns the path of its vector index.
///
/// Checks, in order: the artifact is marked ready; its tier, embedder id,
/// dimension, schema version and chunking version match the indexer and the
/// current constants; its resolved index path equals the canonical vector
/// index path for the embedder; the index opens; and the index's record count
/// equals the manifest's `doc_count`.
///
/// # Errors
/// Fails on the first check that does not hold.
fn validate_semantic_watch_once_artifact(
    data_dir: &Path,
    artifact: &ArtifactRecord,
    indexer: &SemanticIndexer,
    tier: SemanticTierKind,
) -> Result<PathBuf> {
    if !artifact.ready {
        anyhow::bail!("semantic watch-once cannot reuse artifact that is not ready");
    }

    let incompatible_with_indexer = artifact.tier != tier
        || artifact.embedder_id.as_str() != indexer.embedder_id()
        || artifact.dimension != indexer.embedder_dimension()
        || artifact.schema_version != SEMANTIC_SCHEMA_VERSION
        || artifact.chunking_version != CHUNKING_STRATEGY_VERSION;
    if incompatible_with_indexer {
        anyhow::bail!(
            "semantic watch-once cannot prove coverage from incompatible semantic artifact"
        );
    }

    let index_path = semantic_artifact_index_path(data_dir, artifact)?;
    let expected_index_path = vector_index_path(data_dir, indexer.embedder_id());
    if index_path != expected_index_path {
        anyhow::bail!(
            "semantic watch-once cannot append to non-canonical vector path {}; expected {}",
            index_path.display(),
            expected_index_path.display()
        );
    }

    let existing_index = match FsVectorIndex::open(&index_path) {
        Ok(opened) => opened,
        Err(err) => {
            return Err(anyhow::anyhow!(
                "semantic watch-once cannot open existing vector artifact {}: {err}",
                index_path.display()
            ));
        }
    };

    // A record count that does not fit in `u64` saturates instead of failing.
    let observed_docs = u64::try_from(existing_index.record_count()).unwrap_or(u64::MAX);
    if observed_docs != artifact.doc_count {
        anyhow::bail!(
            "semantic watch-once cannot prove existing vector prefix: manifest doc_count={} but index has {} records",
            artifact.doc_count,
            observed_docs
        );
    }
    Ok(index_path)
}
/// Checks whether the DB state described by `artifact_fingerprint` is an
/// append-only prefix of the state described by `current_fingerprint`.
///
/// Returns `Ok(false)` without querying when any artifact counter exceeds the
/// matching current counter. Otherwise counts the conversations whose id is
/// at or below the artifact's max conversation id and returns whether that
/// count equals the artifact's recorded conversation total.
///
/// # Errors
/// Fails if the prefix-count query fails.
fn semantic_artifact_is_append_only_prefix(
storage: &FrankenStorage,
artifact_fingerprint: SemanticContentFingerprint,
current_fingerprint: SemanticContentFingerprint,
) -> Result<bool> {
// The artifact cannot be a prefix if it has seen more than the DB holds now.
if artifact_fingerprint.total_conversations > current_fingerprint.total_conversations
|| artifact_fingerprint.max_conversation_id > current_fingerprint.max_conversation_id
|| artifact_fingerprint.max_message_id > current_fingerprint.max_message_id
{
return Ok(false);
}
let prefix_conversations: i64 = storage
.raw()
.query_row_map(
"SELECT COUNT(*)
FROM conversations
WHERE id <= ?1",
&[ParamValue::from(artifact_fingerprint.max_conversation_id)],
|row| row.get_typed(0),
)
.context("checking semantic watch-once prefix conversation count")?;
// Negative counts clamp to zero; counts too large for `usize` saturate.
let observed_prefix_conversations =
usize::try_from(prefix_conversations.max(0)).unwrap_or(usize::MAX);
Ok(observed_prefix_conversations.eq(&artifact_fingerprint.total_conversations))
}
/// Removes, in place, every input that `is_hard_message_noise` flags for its
/// role name and content.
fn filter_semantic_watch_once_inputs(inputs: &mut Vec<EmbeddingInput>) {
    inputs.retain(|candidate| {
        let role_name = semantic_role_name(candidate.role);
        let is_noise = is_hard_message_noise(role_name, &candidate.content);
        !is_noise
    });
}
/// Decides how a targeted semantic watch-once run should update the vector
/// index and collects the inputs that need embedding.
///
/// The outcome is one of three modes:
/// - `AlreadyCovered`: the matching manifest artifact already carries the
///   current DB fingerprint and passes validation; no inputs are selected.
/// - `RebuildAll`: `pre_watch_conversations` is zero, so every packet input in
///   storage is selected.
/// - `AppendToExisting`: an existing artifact validates and is proven to be an
///   append-only prefix of the DB; only inputs after the artifact's max
///   message id are selected.
///
/// Selected inputs are filtered for hard message noise. `raw_max_message_id`
/// is taken before filtering and falls back to the current fingerprint's max
/// message id when that is positive.
///
/// # Errors
/// Fails when the DB holds zero conversations, a fingerprint cannot be
/// parsed, the embedder has no known tier, the manifest cannot be loaded, a
/// populated DB has no matching artifact, the artifact fails validation, or
/// the artifact is not an append-only prefix of the current DB.
fn select_targeted_semantic_watch_once_inputs(
    storage: &FrankenStorage,
    data_dir: &Path,
    indexer: &SemanticIndexer,
    pre_watch_conversations: usize,
) -> Result<TargetedSemanticWatchOnceSelection> {
    let total_conversations = count_total_conversations_exact(storage)?;
    if total_conversations == 0 {
        anyhow::bail!(
            "semantic watch-once indexed zero conversations; refusing to publish semantic success"
        );
    }
    let current_db_fingerprint = lexical_rebuild_content_fingerprint(storage, total_conversations)?;
    let current_fingerprint = parse_semantic_content_fingerprint(&current_db_fingerprint)
        .ok_or_else(|| {
            anyhow::anyhow!(
                "semantic watch-once could not parse current DB fingerprint {current_db_fingerprint}"
            )
        })?;
    let tier = semantic_tier_for_embedder_id(indexer.embedder_id()).ok_or_else(|| {
        anyhow::anyhow!(
            "semantic watch-once cannot publish unknown embedder tier for {}",
            indexer.embedder_id()
        )
    })?;
    let manifest = SemanticManifest::load_or_default(data_dir).map_err(|err| {
        anyhow::anyhow!("loading semantic manifest for semantic watch-once: {err}")
    })?;
    // Only an artifact produced by the same embedder is considered at all.
    let artifact = semantic_artifact_for_tier(&manifest, tier)
        .filter(|artifact| artifact.embedder_id.as_str().eq(indexer.embedder_id()))
        .cloned();
    let manifest_before_db_fingerprint = artifact
        .as_ref()
        .map(|artifact| artifact.db_fingerprint.clone());

    // Case 1: the artifact already describes exactly the current DB contents.
    if let Some(artifact) = artifact.as_ref()
        && artifact.db_fingerprint.eq(&current_db_fingerprint)
    {
        let index_path = validate_semantic_watch_once_artifact(data_dir, artifact, indexer, tier)?;
        return Ok(TargetedSemanticWatchOnceSelection {
            mode: TargetedSemanticWatchOnceMode::AlreadyCovered,
            inputs: Vec::new(),
            raw_max_message_id: (current_fingerprint.max_message_id > 0)
                .then_some(current_fingerprint.max_message_id),
            tier,
            index_path,
            total_conversations: u64::try_from(total_conversations).unwrap_or(u64::MAX),
            current_db_fingerprint,
            manifest_before_db_fingerprint,
            reason: "semantic_artifact_already_covers_db",
        });
    }

    // Case 2: the DB was empty before this run, so everything is new.
    if pre_watch_conversations == 0 {
        let mut inputs = packet_embedding_inputs_from_storage(storage)?;
        // Take the maximum before filtering so dropped noise still counts.
        let raw_max_message_id = inputs
            .iter()
            .filter_map(|input| i64::try_from(input.message_id).ok())
            .max()
            .or_else(|| {
                (current_fingerprint.max_message_id > 0)
                    .then_some(current_fingerprint.max_message_id)
            });
        filter_semantic_watch_once_inputs(&mut inputs);
        return Ok(TargetedSemanticWatchOnceSelection {
            mode: TargetedSemanticWatchOnceMode::RebuildAll,
            inputs,
            raw_max_message_id,
            tier,
            index_path: vector_index_path(data_dir, indexer.embedder_id()),
            total_conversations: u64::try_from(total_conversations).unwrap_or(u64::MAX),
            current_db_fingerprint,
            manifest_before_db_fingerprint,
            reason: "fresh_watch_once_db",
        });
    }

    // Case 3: append to an existing artifact, but only if it is provably a
    // prefix of what the DB holds now.
    let artifact = artifact.ok_or_else(|| {
        anyhow::anyhow!(
            "semantic watch-once cannot prove bounded coverage: no existing {} artifact for populated DB",
            tier.as_str()
        )
    })?;
    let artifact_fingerprint = parse_semantic_content_fingerprint(&artifact.db_fingerprint)
        .ok_or_else(|| {
            anyhow::anyhow!(
                "semantic watch-once cannot parse existing artifact fingerprint {}",
                artifact.db_fingerprint
            )
        })?;
    let artifact_fingerprint_conversations =
        u64::try_from(artifact_fingerprint.total_conversations).unwrap_or(u64::MAX);
    if !artifact
        .conversation_count
        .eq(&artifact_fingerprint_conversations)
    {
        anyhow::bail!(
            "semantic watch-once cannot prove existing vector prefix: manifest conversation_count={} but fingerprint has {} conversations",
            artifact.conversation_count,
            artifact_fingerprint.total_conversations
        );
    }
    let index_path = validate_semantic_watch_once_artifact(data_dir, &artifact, indexer, tier)?;
    if !semantic_artifact_is_append_only_prefix(storage, artifact_fingerprint, current_fingerprint)?
    {
        anyhow::bail!(
            "semantic watch-once cannot prove bounded coverage: existing semantic artifact is not an append-only prefix of the current DB"
        );
    }
    let mut batch =
        packet_embedding_inputs_from_storage_since(storage, artifact_fingerprint.max_message_id)?;
    let raw_max_message_id = batch.raw_max_message_id.or_else(|| {
        (current_fingerprint.max_message_id > 0).then_some(current_fingerprint.max_message_id)
    });
    filter_semantic_watch_once_inputs(&mut batch.inputs);
    Ok(TargetedSemanticWatchOnceSelection {
        mode: TargetedSemanticWatchOnceMode::AppendToExisting,
        inputs: batch.inputs,
        raw_max_message_id,
        tier,
        index_path,
        total_conversations: u64::try_from(total_conversations).unwrap_or(u64::MAX),
        current_db_fingerprint,
        manifest_before_db_fingerprint,
        reason: "semantic_artifact_is_append_only_prefix",
    })
}
fn publish_semantic_watch_once_artifact(
data_dir: &Path,
indexer: &SemanticIndexer,
selection: &TargetedSemanticWatchOnceSelection,
doc_count: u64,
build_started_at_ms: i64,
) -> Result<()> {
let size_bytes = fs::metadata(&selection.index_path)
.with_context(|| {
format!(
"stat semantic watch-once index {}",
selection.index_path.display()
)
})?
.len();
let relative_index_path = selection
.index_path
.strip_prefix(data_dir)
.unwrap_or(selection.index_path.as_path())
.to_string_lossy()
.to_string();
let mut manifest = SemanticManifest::load_or_default(data_dir).map_err(|err| {
anyhow::anyhow!("loading semantic manifest for semantic watch-once publish: {err}")
})?;
manifest.publish_artifact(ArtifactRecord {
tier: selection.tier,
embedder_id: indexer.embedder_id().to_string(),
model_revision: semantic_model_revision_for_embedder_id(indexer.embedder_id()),
schema_version: SEMANTIC_SCHEMA_VERSION,
chunking_version: CHUNKING_STRATEGY_VERSION,
dimension: indexer.embedder_dimension(),
doc_count,
conversation_count: selection.total_conversations,
db_fingerprint: selection.current_db_fingerprint.clone(),
index_path: relative_index_path,
size_bytes,
started_at_ms: build_started_at_ms,
completed_at_ms: semantic_indexing_now_ms(),
ready: true,
});
manifest.refresh_backlog(
selection.total_conversations,
&selection.current_db_fingerprint,
);
manifest
.save(data_dir)
.map_err(|err| anyhow::anyhow!("saving semantic watch-once manifest: {err}"))
}
/// Runs the targeted semantic watch-once pipeline: select inputs, embed them,
/// update the vector index, publish the manifest artifact and advance the
/// embedding watermark.
///
/// # Errors
/// Fails when `indexed_conversations` is zero, when the storage lock cannot
/// be taken, or when selection, embedding, index build/append/open, manifest
/// publish or the watermark update fails. An append that reports a different
/// count than the number of embedded documents is also an error.
fn run_targeted_semantic_watch_once_publish(
embedder: &str,
data_dir: &Path,
storage: &Mutex<FrankenStorage>,
indexed_conversations: usize,
pre_watch_conversations: usize,
) -> Result<SemanticWatchOnceStats> {
if indexed_conversations == 0 {
anyhow::bail!(
"semantic watch-once indexed zero conversations; refusing to publish semantic success"
);
}
let indexer = SemanticIndexer::new(embedder, Some(data_dir))?;
// The storage guard lives only inside this block, so it is released before embedding.
let selection = {
let storage = storage.lock().map_err(|err| {
anyhow::anyhow!("lock storage for semantic watch-once selection: {err}")
})?;
select_targeted_semantic_watch_once_inputs(
&storage,
data_dir,
&indexer,
pre_watch_conversations,
)?
};
let selected_docs = selection.inputs.len();
let build_started_at_ms = semantic_indexing_now_ms();
// Skip the embedder entirely when nothing was selected.
let embedded = if selection.inputs.is_empty() {
Vec::new()
} else {
indexer.embed_messages(&selection.inputs)?
};
let embedded_docs = embedded.len();
// In every mode the published doc count is the index's own record count.
let doc_count = match selection.mode {
TargetedSemanticWatchOnceMode::AlreadyCovered => {
let index = FsVectorIndex::open(&selection.index_path).map_err(|err| {
anyhow::anyhow!(
"open already-covered semantic watch-once index {}: {err}",
selection.index_path.display()
)
})?;
u64::try_from(index.record_count()).unwrap_or(u64::MAX)
}
TargetedSemanticWatchOnceMode::RebuildAll => {
let index = indexer.build_and_save_index(embedded, data_dir)?;
u64::try_from(index.record_count()).unwrap_or(u64::MAX)
}
TargetedSemanticWatchOnceMode::AppendToExisting => {
if embedded_docs > 0 {
let appended = indexer.append_to_index(embedded, data_dir)?;
if !appended.eq(&embedded_docs) {
anyhow::bail!(
"semantic watch-once append count mismatch: appended {appended}, embedded {embedded_docs}"
);
}
}
// Reopen the index so the count reflects what is on disk after the append.
let index = FsVectorIndex::open(&selection.index_path).map_err(|err| {
anyhow::anyhow!(
"open appended semantic watch-once index {}: {err}",
selection.index_path.display()
)
})?;
u64::try_from(index.record_count()).unwrap_or(u64::MAX)
}
};
publish_semantic_watch_once_artifact(
data_dir,
&indexer,
&selection,
doc_count,
build_started_at_ms,
)?;
// The watermark is advanced only after the manifest has been published.
if let Some(raw_max_message_id) = selection.raw_max_message_id {
update_incremental_semantic_watermark(
storage,
raw_max_message_id,
"updating semantic watch-once watermark",
)?;
}
Ok(SemanticWatchOnceStats {
published: true,
selected_docs,
embedded_docs,
tier: selection.tier.as_str().to_string(),
vector_index_path: selection.index_path.display().to_string(),
manifest_before_db_fingerprint: selection.manifest_before_db_fingerprint,
manifest_after_db_fingerprint: Some(selection.current_db_fingerprint),
reason: selection.reason.to_string(),
})
}
/// Persists `raw_max_id` as the last embedded message id.
///
/// The write goes through an ephemeral writer while the storage lock is held.
/// `context` is forwarded to that writer helper.
///
/// # Errors
/// Fails if the storage mutex is poisoned or the write fails.
fn update_incremental_semantic_watermark(
    storage: &Mutex<FrankenStorage>,
    raw_max_id: i64,
    context: &str,
) -> Result<()> {
    let guard = match storage.lock() {
        Ok(locked) => locked,
        Err(e) => {
            return Err(anyhow::anyhow!("lock storage for watermark write: {e}"));
        }
    };
    let record_watermark = |writer| writer.set_last_embedded_message_id(raw_max_id);
    persist::with_ephemeral_writer(&guard, false, context, record_watermark)?;
    Ok(())
}
/// Embeds `embedding_inputs`, appends them to the vector index and advances
/// the watermark to `raw_max_id`.
///
/// With no inputs, only the watermark is advanced (using
/// `filtered_watermark_context`) and `0` is returned. Otherwise returns the
/// count reported by the index append.
///
/// # Errors
/// Fails if the indexer cannot be created, or if embedding, the append or
/// the watermark write fails.
fn embed_incremental_semantic_inputs(
    embedder: &str,
    data_dir: &Path,
    storage: &Mutex<FrankenStorage>,
    embedding_inputs: Vec<EmbeddingInput>,
    raw_max_id: i64,
    filtered_watermark_context: &str,
    success_watermark_context: &str,
) -> Result<usize> {
    let appended = if embedding_inputs.is_empty() {
        // Nothing to embed; the watermark still moves past the skipped ids.
        update_incremental_semantic_watermark(storage, raw_max_id, filtered_watermark_context)?;
        0
    } else {
        let indexer = SemanticIndexer::new(embedder, Some(data_dir))?;
        let vectors = indexer.embed_messages(&embedding_inputs)?;
        let appended_count = indexer.append_to_index(vectors, data_dir)?;
        update_incremental_semantic_watermark(storage, raw_max_id, success_watermark_context)?;
        appended_count
    };
    Ok(appended)
}
/// Embeds the inputs carried by a watch delta and advances the watermark to
/// the delta's maximum message id.
///
/// Returns `Ok(0)` without touching storage when the delta has no maximum
/// message id. Hard-noise messages are dropped before embedding.
fn incremental_semantic_embed_from_delta(
    embedder: &str,
    data_dir: &Path,
    storage: &Mutex<FrankenStorage>,
    semantic_delta: WatchSemanticDelta,
) -> Result<usize> {
    let raw_max_id = match semantic_delta.max_message_id {
        Some(id) => id,
        None => return Ok(0),
    };

    let mut embedding_inputs: Vec<EmbeddingInput> = semantic_delta.inputs;
    embedding_inputs
        .retain(|msg| !is_hard_message_noise(semantic_role_name(msg.role), &msg.content));

    embed_incremental_semantic_inputs(
        embedder,
        data_dir,
        storage,
        embedding_inputs,
        raw_max_id,
        "advancing incremental semantic watermark for filtered packet delta",
        "updating incremental semantic watermark from packet delta",
    )
}
/// Catches the semantic index up with storage from the persisted watermark.
///
/// Reads the last embedded message id (defaulting to `0`), fetches the packet
/// inputs after it, drops hard-noise messages, then embeds the rest and
/// advances the watermark. Returns `Ok(0)` when the fetched batch reports no
/// maximum message id.
///
/// The storage lock is taken separately for the watermark read and for the
/// fetch, and is not held while embedding.
fn incremental_semantic_embed(
    embedder: &str,
    data_dir: &Path,
    storage: &Mutex<FrankenStorage>,
) -> Result<usize> {
    let watermark = {
        let guard = storage
            .lock()
            .map_err(|e| anyhow::anyhow!("lock storage for watermark read: {e}"))?;
        let stored = guard.get_last_embedded_message_id()?;
        stored.unwrap_or(0)
    };

    let batch = {
        let guard = storage
            .lock()
            .map_err(|e| anyhow::anyhow!("lock storage for message fetch: {e}"))?;
        packet_embedding_inputs_from_storage_since(&guard, watermark)?
    };

    let raw_max_id = match batch.raw_max_message_id {
        Some(id) => id,
        None => return Ok(0),
    };

    tracing::info!(
        since_id = watermark,
        conversations = batch.conversations_in_batch,
        count = batch.inputs.len(),
        packet_driven = true,
        "incremental semantic: fetched canonical packet catch-up batch"
    );

    let mut embedding_inputs: Vec<EmbeddingInput> = batch.inputs;
    embedding_inputs
        .retain(|msg| !is_hard_message_noise(semantic_role_name(msg.role), &msg.content));

    embed_incremental_semantic_inputs(
        embedder,
        data_dir,
        storage,
        embedding_inputs,
        raw_max_id,
        "advancing incremental semantic watermark for filtered batch",
        "updating incremental semantic watermark",
    )
}
/// Opens the canonical storage for an index run without ever replacing an
/// existing database file.
///
/// For an existing file the steps are:
/// 1. Read `schema_version` through a read-only connection and refuse to
///    proceed when it is newer than `CURRENT_SCHEMA_VERSION`, when the probe
///    hits storage contention, or when the probe fails for any other reason.
/// 2. Try the current-schema open with a 10 second timeout. A failure here is
///    only logged.
/// 3. Fall back to `FrankenStorage::open`.
///
/// When the file does not exist, a new storage is created.
///
/// Returns `(storage, flag, flag)`. The second element is always `false`
/// here; the third is `full_index` only when the database had to be created
/// and `false` otherwise.
/// NOTE(review): the meaning callers attach to the two flags is not visible
/// in this block; confirm against the call site.
///
/// # Errors
/// Contention errors are reported as "busy/locked" with the full cause chain
/// (`{err:#}`) on both paths, so the underlying retryable cause stays visible
/// in the message. Other failures on an existing file are reported through
/// `canonical_archive_unhealthy_for_index_error`.
fn open_storage_for_index(
    db_path: &Path,
    full_index: bool,
) -> Result<(FrankenStorage, bool, bool)> {
    if db_path.exists() {
        match non_destructive_meta_schema_version(db_path) {
            Ok(Some(version)) if version > crate::storage::sqlite::CURRENT_SCHEMA_VERSION => {
                return Err(canonical_archive_unhealthy_for_index_error(
                    db_path,
                    &format!(
                        "schema_version {version} is newer than supported version {}",
                        crate::storage::sqlite::CURRENT_SCHEMA_VERSION
                    ),
                ));
            }
            Ok(_) => {}
            Err(err) if anyhow_chain_indicates_retryable_storage_contention(&err) => {
                return Err(anyhow::anyhow!(
                    "canonical db is busy/locked during index open; refusing to replace it: {err:#}"
                ));
            }
            Err(err) => {
                return Err(canonical_archive_unhealthy_for_index_error(
                    db_path,
                    &index_storage_open_error_reason(&err),
                ));
            }
        }
    }
    if db_path.exists() {
        match crate::storage::sqlite::open_current_schema_storage_with_timeout(
            db_path,
            Duration::from_secs(10),
        ) {
            Ok(Some(storage)) => return Ok((storage, false, false)),
            Ok(None) => {}
            // Not fatal: the compatibility open below gets a chance next.
            Err(err) => tracing::warn!(
                db_path = %db_path.display(),
                error = ?err,
                "single-open current-schema storage path failed; attempting non-destructive compatibility open"
            ),
        }
    }
    if db_path.exists() {
        match FrankenStorage::open(db_path) {
            Ok(storage) => Ok((storage, false, false)),
            Err(err) if anyhow_chain_indicates_retryable_storage_contention(&err) => {
                // Contention is detected anywhere in the chain, so print the
                // whole chain; the outermost message alone may omit the cause.
                Err(anyhow::anyhow!(
                    "canonical db is busy/locked during index open; refusing to replace it: {err:#}"
                ))
            }
            Err(err) => Err(canonical_archive_unhealthy_for_index_error(
                db_path,
                &index_storage_open_error_reason(&err),
            )),
        }
    } else {
        FrankenStorage::open(db_path)
            .map(|storage| (storage, false, full_index))
            .with_context(|| format!("creating frankensqlite storage at {}", db_path.display()))
    }
}
/// Renders an open failure as a human-readable reason string.
///
/// The full error chain is formatted first. When the FTS integrity helper
/// recognises that text, its rendering is returned; otherwise the chain text
/// itself is.
fn index_storage_open_error_reason(err: &anyhow::Error) -> String {
    let message = format!("{err:#}");
    let fts_reason = crate::storage::sqlite::fts_messages_integrity_error_from_message(&message)
        .map(|fts_err| fts_err.to_string());
    if let Some(reason) = fts_reason {
        return reason;
    }
    message
}
/// Reads `schema_version` from the `meta` table through a read-only
/// connection opened with a 10 second timeout.
///
/// Returns `Ok(None)` when the query fails with a missing table/column
/// message, when no row exists, or when the stored value cannot be read as a
/// string or parsed as an `i64`.
///
/// # Errors
/// Fails if the read-only connection cannot be opened or the query fails for
/// any other reason. A failure while closing the connection is only logged.
fn non_destructive_meta_schema_version(db_path: &Path) -> Result<Option<i64>> {
let mut conn = crate::storage::sqlite::open_franken_raw_readonly_connection_with_timeout(
db_path,
Duration::from_secs(10),
)
.with_context(|| {
format!(
"opening canonical archive read-only before index: {}",
db_path.display()
)
})?;
// Hold the outcome so the connection is closed before returning on every path.
let result = match conn.query("SELECT value FROM meta WHERE key = 'schema_version';") {
Ok(rows) => Ok(rows
.first()
.and_then(|row| row.get_typed::<String>(0).ok())
.and_then(|raw| raw.parse::<i64>().ok())),
Err(err) if storage_error_mentions_missing_table_or_column(&err) => Ok(None),
Err(err) => Err(anyhow::anyhow!(
"reading canonical archive schema_version before index: {err}"
)),
};
// Prefer closing without a checkpoint; fall back to best-effort if that fails.
if let Err(close_err) = conn.close_without_checkpoint_in_place() {
tracing::debug!(
error = %close_err,
db_path = %db_path.display(),
"non_destructive_meta_schema_version: close_without_checkpoint_in_place failed"
);
conn.close_best_effort_in_place();
}
result
}
/// Reports whether the error text contains "no such table" or
/// "no such column", ignoring ASCII case.
fn storage_error_mentions_missing_table_or_column(err: &impl std::fmt::Display) -> bool {
    const MISSING_OBJECT_MARKERS: [&str; 2] = ["no such table", "no such column"];
    let lowered = err.to_string().to_ascii_lowercase();
    MISSING_OBJECT_MARKERS
        .iter()
        .any(|marker| lowered.contains(*marker))
}
/// Reports whether any cause in the error chain has a message that the
/// storage layer classifies as retryable.
fn anyhow_chain_indicates_retryable_storage_contention(err: &anyhow::Error) -> bool {
    for cause in err.chain() {
        let cause_text = cause.to_string();
        if crate::storage::sqlite::retryable_storage_error_message(&cause_text) {
            return true;
        }
    }
    false
}
/// Runs integrity checks on an existing archive before a full rebuild.
///
/// Executes `PRAGMA quick_check(1)` and then a `COUNT(*)` query against each
/// of the `conversations`, `messages` and `sources` tables.
///
/// Returns `Ok(None)` when every check passes, and `Ok(Some(description))`
/// for the first check that fails for a non-retryable reason.
///
/// # Errors
/// Fails when any check hits an error the storage layer classifies as
/// retryable.
fn full_rebuild_existing_storage_integrity_problem(
    storage: &FrankenStorage,
) -> Result<Option<String>> {
    const TABLE_CANARIES: [(&str, &str); 3] = [
        ("conversations", "SELECT COUNT(*) FROM conversations"),
        ("messages", "SELECT COUNT(*) FROM messages"),
        ("sources", "SELECT COUNT(*) FROM sources"),
    ];

    let quick_check_outcome = storage.raw().query_row_map(
        "PRAGMA quick_check(1)",
        &[] as &[ParamValue],
        |row| row.get_typed::<String>(0),
    );
    let quick_check = match quick_check_outcome {
        Ok(status) => status,
        Err(err) => {
            if crate::storage::sqlite::retryable_franken_error(&err) {
                return Err(anyhow::anyhow!(
                    "full rebuild archive integrity preflight hit transient storage contention: {err}"
                ));
            }
            return Ok(Some(format!("quick_check failed: {err}")));
        }
    };
    let reported_ok = quick_check.trim().eq_ignore_ascii_case("ok");
    if !reported_ok {
        return Ok(Some(format!("quick_check reported {quick_check:?}")));
    }

    for (table, sql) in TABLE_CANARIES {
        let canary_outcome = storage
            .raw()
            .query_row_map(sql, &[] as &[ParamValue], |row| row.get_typed::<i64>(0));
        let Err(err) = canary_outcome else {
            continue;
        };
        if crate::storage::sqlite::retryable_franken_error(&err) {
            return Err(anyhow::anyhow!(
                "full rebuild archive integrity canary for {table} hit transient storage contention: {err}"
            ));
        }
        return Ok(Some(format!(
            "canonical table canary failed for {table}: {err}"
        )));
    }
    Ok(None)
}
/// Builds the error returned when the canonical archive must not be indexed.
///
/// The message names the archive path and `reason`, and points the user at
/// `cass doctor` for inspection and repair.
fn canonical_archive_unhealthy_for_index_error(db_path: &Path, reason: &str) -> anyhow::Error {
    let archive_location = db_path.display();
    anyhow::anyhow!(
        "canonical cass archive at {} is not safe for indexing: {reason}. \
         cass index will not replace or truncate the SQLite source of truth. \
         Run 'cass doctor check --json' to inspect the archive, then use the \
         doctor repair plan or recover from an explicit backup before indexing again.",
        archive_location
    )
}
/// Builds the error returned when the orphan foreign-key self-heal fails.
///
/// The message names the archive path, includes the full cause chain of
/// `err`, and points the user at `cass doctor`.
fn orphan_fk_cleanup_failed_index_error(db_path: &Path, err: &anyhow::Error) -> anyhow::Error {
    let archive_location = db_path.display();
    anyhow::anyhow!(
        "orphan FK self-heal failed for canonical cass archive at {}: {err:#}. \
         cass stopped before counting or writing because the archive connection may be OOM-poisoned or corrupt. \
         Run 'cass doctor check --json' and free disk/memory pressure before retrying.",
        archive_location
    )
}
/// Lower bound on free disk space required before indexing: 512 MiB.
const INDEX_MIN_FREE_SPACE_BYTES: u64 = 512 * 1024 * 1024;
/// Verifies that every probe location has enough free disk space to index.
///
/// The check is skipped when disabled through the environment. The required
/// amount comes from `required_index_headroom_bytes`, and each path from
/// `existing_headroom_probe_paths` is checked in turn.
///
/// # Errors
/// Fails if free space cannot be queried for a probe path, or if the first
/// path found with less than the required space is encountered.
fn ensure_index_storage_headroom(data_dir: &Path, db_path: &Path) -> Result<()> {
    if index_disk_headroom_check_disabled() {
        return Ok(());
    }

    let required = required_index_headroom_bytes(db_path);
    for probe_path in existing_headroom_probe_paths(data_dir, db_path) {
        let available = fs2::available_space(&probe_path).with_context(|| {
            format!(
                "checking free disk space for cass index at {}",
                probe_path.display()
            )
        })?;
        if available < required {
            anyhow::bail!(
                "canonical archive disk headroom check failed for {}: available={} bytes, required={} bytes. \
                 Free space before running cass index so SQLite, WAL, and lexical scratch writes cannot fail mid-commit. \
                 Run 'cass doctor check --json' for a read-only health report.",
                probe_path.display(),
                available,
                required
            );
        }
    }
    Ok(())
}
/// Reports whether `CASS_INDEX_SKIP_DISK_HEADROOM_CHECK` is set to a truthy
/// value. An unset or unreadable variable counts as not disabled.
fn index_disk_headroom_check_disabled() -> bool {
    let configured = dotenvy::var("CASS_INDEX_SKIP_DISK_HEADROOM_CHECK");
    configured.is_ok_and(|value| env_value_truthy(&value))
}
/// Reports whether an environment value means "enabled".
///
/// After trimming surrounding whitespace, the value must equal `1`, `true`,
/// `yes` or `on`, compared ASCII case-insensitively.
fn env_value_truthy(value: &str) -> bool {
    const TRUTHY_WORDS: [&str; 4] = ["1", "true", "yes", "on"];
    let trimmed = value.trim();
    TRUTHY_WORDS
        .iter()
        .any(|word| trimmed.eq_ignore_ascii_case(word))
}
fn existing_headroom_probe_paths(data_dir: &Path, db_path: &Path) -> Vec<PathBuf> {
let mut paths = Vec::with_capacity(2);
for candidate in [data_dir, db_path.parent().unwrap_or(db_path)] {
if let Some(existing) = nearest_existing_path(candidate) {
push_unique_headroom_probe_path(&mut paths, existing);
}
}
if paths.is_empty() {
paths.push(std::env::current_dir().unwrap_or_else(|_| PathBuf::from("/")));
}
paths
}
/// Appends `candidate` to `paths` unless an entry already refers to the same
/// location according to `same_headroom_probe_path`.
fn push_unique_headroom_probe_path(paths: &mut Vec<PathBuf>, candidate: PathBuf) {
    let already_listed = paths
        .iter()
        .any(|existing| same_headroom_probe_path(existing, &candidate));
    if !already_listed {
        paths.push(candidate);
    }
}
/// Reports whether two paths refer to the same location.
///
/// Paths that compare equal as written are the same. Otherwise both are
/// canonicalized and compared; if either cannot be canonicalized they are
/// treated as different.
fn same_headroom_probe_path(left: &Path, right: &Path) -> bool {
    if left == right {
        return true;
    }
    let (Ok(canonical_left), Ok(canonical_right)) =
        (fs::canonicalize(left), fs::canonicalize(right))
    else {
        return false;
    };
    canonical_left == canonical_right
}
/// Returns the closest ancestor of `path` (including `path` itself) that
/// exists on disk.
///
/// The empty ancestor of a relative path is probed as `.`. Returns `None`
/// when no ancestor exists.
fn nearest_existing_path(path: &Path) -> Option<PathBuf> {
    path.ancestors().find_map(|ancestor| {
        // The empty path ends a relative path's ancestor chain; treat it as
        // the current directory.
        let probe = if ancestor.as_os_str().is_empty() {
            Path::new(".")
        } else {
            ancestor
        };
        probe.exists().then(|| probe.to_path_buf())
    })
}
/// Computes the free space required before indexing: twice the database
/// bundle size (saturating), but never less than
/// `INDEX_MIN_FREE_SPACE_BYTES`.
fn required_index_headroom_bytes(db_path: &Path) -> u64 {
    let doubled_bundle_bytes = database_bundle_size_bytes(db_path).saturating_mul(2);
    std::cmp::max(INDEX_MIN_FREE_SPACE_BYTES, doubled_bundle_bytes)
}
/// Sums the on-disk size of the database file and its sidecar files,
/// saturating at `u64::MAX`.
fn database_bundle_size_bytes(db_path: &Path) -> u64 {
    database_sidecar_paths(db_path)
        .iter()
        .map(|sidecar| file_size_bytes(sidecar))
        .fold(file_size_bytes(db_path), u64::saturating_add)
}
fn database_sidecar_paths(db_path: &Path) -> [PathBuf; 2] {
[
database_path_with_suffix(db_path, "-wal"),
database_path_with_suffix(db_path, "-shm"),
]
}
/// Returns `db_path` with `suffix` appended to its final component.
///
/// When `db_path` has no file name (for example it is empty or ends in
/// `..`), the suffix is appended to the whole path instead. Both branches
/// work on the raw OS string, so non-UTF-8 path bytes are preserved rather
/// than being replaced during a lossy string conversion.
fn database_path_with_suffix(db_path: &Path, suffix: &str) -> PathBuf {
    let Some(file_name) = db_path.file_name() else {
        // No final component to extend: append to the path as a whole.
        let mut whole_path = db_path.as_os_str().to_os_string();
        whole_path.push(suffix);
        return PathBuf::from(whole_path);
    };
    let mut sidecar_name = file_name.to_os_string();
    sidecar_name.push(suffix);
    db_path.with_file_name(sidecar_name)
}
/// Returns the size of `path` in bytes, or `0` when its metadata cannot be
/// read (for example because it does not exist).
fn file_size_bytes(path: &Path) -> u64 {
    match fs::metadata(path) {
        Ok(metadata) => metadata.len(),
        Err(_) => 0,
    }
}
/// Test-only probe: opens the database read-only and reports whether its
/// `meta.schema_version` equals `CURRENT_SCHEMA_VERSION`.
///
/// Any failure to query, read or parse the version yields `Ok(false)`.
///
/// # Errors
/// Fails only if the read-only open fails. A close failure is logged.
#[cfg(test)]
fn current_schema_fast_probe(db_path: &Path) -> Result<bool> {
let mut storage = FrankenStorage::open_readonly(db_path)
.with_context(|| format!("opening frankensqlite db readonly at {}", db_path.display()))?;
// Every failure in this chain collapses to `None`.
let version = storage
.raw()
.query("SELECT value FROM meta WHERE key = 'schema_version';")
.ok()
.and_then(|rows| rows.first().cloned())
.and_then(|row| row.get_typed::<String>(0).ok())
.and_then(|raw| raw.parse::<i64>().ok());
// Prefer closing without a checkpoint; fall back to best-effort if that fails.
if let Err(close_err) = storage.close_without_checkpoint_in_place() {
tracing::warn!(
error = %close_err,
db_path = %db_path.display(),
"current_schema_fast_probe: close_without_checkpoint_in_place failed; falling back to best-effort close"
);
storage.close_best_effort_in_place();
}
Ok(version == Some(crate::storage::sqlite::CURRENT_SCHEMA_VERSION))
}
/// Moves a failed baseline seed database and its `-wal`/`-shm` sidecars into
/// a `backups` directory next to the database.
///
/// Returns `Ok(None)` when the database file does not exist or has no parent
/// directory; otherwise returns the backup path of the main database file.
///
/// # Errors
/// Fails if the backups directory cannot be created, a directory sync fails,
/// or any existing bundle component cannot be renamed.
fn quarantine_failed_seed_bundle(db_path: &Path) -> Result<Option<PathBuf>> {
if !db_path.exists() {
return Ok(None);
}
let Some(parent) = db_path.parent() else {
return Ok(None);
};
// A file name that is missing or not valid UTF-8 falls back to the default name.
let db_name = db_path
.file_name()
.and_then(|name| name.to_str())
.unwrap_or("agent_search.db");
let backups_dir = parent.join("backups");
fs::create_dir_all(&backups_dir).with_context(|| {
format!(
"creating backups directory for failed baseline seed bundle: {}",
backups_dir.display()
)
})?;
sync_parent_directory(&backups_dir)?;
let backup_root = unique_failed_seed_backup_root(&backups_dir, db_name);
// The empty suffix stands for the main database file itself.
for suffix in ["", "-wal", "-shm"] {
let src = if suffix.is_empty() {
db_path.to_path_buf()
} else {
db_path.with_file_name(format!("{db_name}{suffix}"))
};
// Bundle components that are absent are skipped.
if !src.exists() {
continue;
}
let dest = if suffix.is_empty() {
backup_root.clone()
} else {
backup_root.with_file_name(format!(
"{}{}",
backup_root
.file_name()
.and_then(|name| name.to_str())
.unwrap_or("agent_search.db.failed-baseline-seed.bak"),
suffix
))
};
fs::rename(&src, &dest).with_context(|| {
format!(
"moving failed baseline seed bundle component {} -> {}",
src.display(),
dest.display()
)
})?;
}
// Sync both the source and the destination side after the renames.
sync_parent_directory(db_path)?;
sync_parent_directory(&backup_root)?;
Ok(Some(backup_root))
}
/// Seeds an empty canonical database from the best historical bundle, when
/// there is one, and returns a reopened storage handle.
///
/// If the database already holds conversations, `storage` is returned
/// unchanged with `None`. Otherwise the handle is closed, the seed is
/// attempted, and the database is reopened:
/// - seed returned `Some(outcome)`: reopened with `open_writer`, outcome returned;
/// - seed returned `None`: reopened with `open`;
/// - seed failed: reopened with `open`; if that also fails, the bundle on
///   disk is quarantined and a fresh database is opened in its place.
///
/// # Errors
/// Fails if the conversation count, the initial close, a required reopen or
/// the quarantine step fails. A failed seed import itself is only logged.
fn maybe_seed_empty_canonical_from_historical_bundle(
storage: FrankenStorage,
db_path: &Path,
) -> Result<(FrankenStorage, Option<HistoricalSalvageOutcome>)> {
let conversation_count = count_total_conversations_exact(&storage)?;
if conversation_count > 0 {
return Ok((storage, None));
}
// The seed helper takes only the path, so our own handle is closed first.
storage.close().with_context(|| {
format!(
"closing canonical db before baseline historical seed attempt: {}",
db_path.display()
)
})?;
match seed_canonical_from_best_historical_bundle(db_path) {
Ok(result) => {
// The reopen call depends on whether the seed produced an outcome.
let reopened = if result.is_some() {
FrankenStorage::open_writer(db_path).with_context(|| {
format!(
"reopening canonical database after baseline historical seed attempt without rerunning migrations: {}",
db_path.display()
)
})?
} else {
FrankenStorage::open(db_path).with_context(|| {
format!(
"reopening canonical database after baseline historical seed attempt: {}",
db_path.display()
)
})?
};
Ok((reopened, result))
}
Err(err) => {
tracing::warn!(
db_path = %db_path.display(),
error = %err,
"baseline historical seed import failed; falling back to incremental salvage"
);
match FrankenStorage::open(db_path) {
Ok(reopened) => Ok((reopened, None)),
Err(reopen_err) => {
tracing::warn!(
db_path = %db_path.display(),
error = %reopen_err,
"canonical database could not be reopened after failed baseline seed; quarantining partial bundle"
);
// Move the unusable bundle aside, then open a database at the same path.
let failed_seed_backup =
quarantine_failed_seed_bundle(db_path).with_context(|| {
format!(
"quarantining failed baseline seed bundle before incremental salvage: {}",
db_path.display()
)
})?;
if let Some(path) = failed_seed_backup {
tracing::info!(
db_path = %db_path.display(),
backup_path = %path.display(),
"moved failed baseline seed bundle aside before incremental salvage fallback"
);
}
let reopened = FrankenStorage::open(db_path).with_context(|| {
format!(
"recreating fresh canonical database after failed baseline seed import: {}",
db_path.display()
)
})?;
Ok((reopened, None))
}
}
}
}
}
/// Rebuilds the lexical index from the database at `db_path`.
///
/// Convenience entry point: delegates to
/// `rebuild_tantivy_from_db_with_options` with default startup options and no
/// progress-bump counter.
pub(crate) fn rebuild_tantivy_from_db(
    db_path: &Path,
    data_dir: &Path,
    total_conversations: usize,
    progress: Option<Arc<IndexingProgress>>,
) -> Result<LexicalRebuildOutcome> {
    let startup_options = LexicalRebuildStartupOptions::default();
    let no_progress_bump = None;
    rebuild_tantivy_from_db_with_options(
        db_path,
        data_dir,
        total_conversations,
        progress,
        startup_options,
        no_progress_bump,
    )
}
/// Rebuilds the lexical index with default startup options, forwarding
/// `progress_bump` to `rebuild_tantivy_from_db_with_options`.
fn rebuild_tantivy_from_db_with_progress_bump(
    db_path: &Path,
    data_dir: &Path,
    total_conversations: usize,
    progress: Option<Arc<IndexingProgress>>,
    progress_bump: Arc<AtomicI64>,
) -> Result<LexicalRebuildOutcome> {
    let startup_options = LexicalRebuildStartupOptions::default();
    let bump_counter = Some(progress_bump);
    rebuild_tantivy_from_db_with_options(
        db_path,
        data_dir,
        total_conversations,
        progress,
        startup_options,
        bump_counter,
    )
}
/// Result of a search-triggered lexical index repair.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(crate) struct SearchLexicalRepairOutcome {
/// Number of documents indexed by the repair; `0` when the canonical
/// database held no conversations and an empty index was published.
pub indexed_docs: usize,
}
/// Refreshes the completed lexical rebuild checkpoint using a short-lived
/// read-only database handle.
///
/// Opens `db_path` read-only, runs
/// `refresh_completed_lexical_rebuild_checkpoint`, then closes the handle
/// without checkpointing.
///
/// # Errors
/// Fails if the database cannot be opened, the refresh fails, or the handle
/// cannot be closed.
pub(crate) fn refresh_completed_lexical_rebuild_checkpoint_from_live_index(
    db_path: &Path,
    data_dir: &Path,
) -> Result<()> {
    let readonly_storage = FrankenStorage::open_readonly(db_path).with_context(|| {
        format!(
            "opening database to refresh lexical checkpoint: {}",
            db_path.display()
        )
    })?;

    refresh_completed_lexical_rebuild_checkpoint(&readonly_storage, db_path, data_dir)?;

    readonly_storage
        .close_without_checkpoint()
        .with_context(|| {
            format!(
                "closing readonly database after lexical checkpoint refresh: {}",
                db_path.display()
            )
        })?;
    Ok(())
}
/// Repairs the lexical index from the canonical database on behalf of search.
///
/// Holds the index run lock (with its heartbeat) for the whole call. When the
/// database contains conversations, the read-only handle is closed and a
/// deferred-startup rebuild runs. When it contains none, an empty index is
/// built in a staging directory next to the live index, published over it, and
/// the completed-rebuild checkpoint is refreshed.
///
/// # Errors
/// Fails if the run lock cannot be acquired, the database cannot be opened,
/// counted, or closed, or if staging, publishing, checkpoint refresh, or the
/// rebuild fails.
pub(crate) fn repair_lexical_index_from_canonical_db_for_search(
    db_path: &Path,
    data_dir: &Path,
    progress: Option<Arc<IndexingProgress>>,
) -> Result<SearchLexicalRepairOutcome> {
    let run_lock = acquire_index_run_lock(data_dir, db_path, SearchMaintenanceMode::Index)?;
    // Bound to a named variable (not `_`) so the heartbeat value stays alive
    // until this function returns.
    let _run_lock_heartbeat = IndexRunLockHeartbeat::start(
        data_dir.to_path_buf(),
        index_run_lock_heartbeat_interval(),
        Arc::clone(&run_lock.metadata_write_lock),
        Arc::clone(&run_lock.last_progress_at_ms_atomic),
    );

    let readonly_storage = FrankenStorage::open_readonly(db_path).with_context(|| {
        format!(
            "opening database to repair lexical index for search: {}",
            db_path.display()
        )
    })?;
    let total_conversations = count_total_conversations_exact(&readonly_storage)?;

    // Non-empty database: release the read-only handle and run a full rebuild.
    if total_conversations != 0 {
        readonly_storage
            .close_without_checkpoint()
            .with_context(|| {
                format!(
                    "closing readonly database before search-triggered lexical repair: {}",
                    db_path.display()
                )
            })?;
        let rebuilt = rebuild_tantivy_from_db_deferred_startup_with_progress_bump(
            db_path,
            data_dir,
            total_conversations,
            progress,
            Arc::clone(&run_lock.last_progress_at_ms_atomic),
        )?;
        return Ok(SearchLexicalRepairOutcome {
            indexed_docs: rebuilt.indexed_docs,
        });
    }

    // Empty database: stage an empty index beside the live one and publish it.
    let live_index_path = index_dir(data_dir)?;
    let staging_parent = live_index_path.parent().unwrap_or(data_dir);
    let staging_root = TempDirBuilder::new()
        .prefix("cass-empty-lexical-repair-")
        .tempdir_in(staging_parent)
        .with_context(|| {
            format!(
                "creating staging directory for empty lexical repair beside {}",
                live_index_path.display()
            )
        })?;
    let staged_index_path = staging_root.path().join("index");

    let staged_index = TantivyIndex::open_or_create(&staged_index_path).with_context(|| {
        format!(
            "creating empty lexical index for empty canonical database: {}",
            staged_index_path.display()
        )
    })?;
    // Drop the index handle before the staged directory is published.
    drop(staged_index);

    publish_staged_lexical_index(&staged_index_path, &live_index_path).with_context(|| {
        format!(
            "publishing empty lexical index repair {} -> {}",
            staged_index_path.display(),
            live_index_path.display()
        )
    })?;
    refresh_completed_lexical_rebuild_checkpoint(&readonly_storage, db_path, data_dir)?;
    readonly_storage
        .close_without_checkpoint()
        .with_context(|| {
            format!(
                "closing readonly database after empty search-triggered lexical repair: {}",
                db_path.display()
            )
        })?;
    Ok(SearchLexicalRepairOutcome { indexed_docs: 0 })
}
/// Test-only wrapper: runs the deferred-startup lexical rebuild without a
/// progress-bump counter.
#[cfg(test)]
fn rebuild_tantivy_from_db_deferred_startup(
    db_path: &Path,
    data_dir: &Path,
    total_conversations: usize,
    progress: Option<Arc<IndexingProgress>>,
) -> Result<LexicalRebuildOutcome> {
    let no_progress_bump = None;
    rebuild_tantivy_from_db_deferred_startup_with_options(
        db_path,
        data_dir,
        total_conversations,
        progress,
        no_progress_bump,
    )
}
/// Runs the deferred-startup lexical rebuild, forwarding `progress_bump` to
/// `rebuild_tantivy_from_db_deferred_startup_with_options`.
fn rebuild_tantivy_from_db_deferred_startup_with_progress_bump(
    db_path: &Path,
    data_dir: &Path,
    total_conversations: usize,
    progress: Option<Arc<IndexingProgress>>,
    progress_bump: Arc<AtomicI64>,
) -> Result<LexicalRebuildOutcome> {
    let bump_counter = Some(progress_bump);
    rebuild_tantivy_from_db_deferred_startup_with_options(
        db_path,
        data_dir,
        total_conversations,
        progress,
        bump_counter,
    )
}
/// Runs the lexical rebuild with `defer_initial_content_fingerprint` enabled,
/// passing the optional progress-bump counter through unchanged.
fn rebuild_tantivy_from_db_deferred_startup_with_options(
    db_path: &Path,
    data_dir: &Path,
    total_conversations: usize,
    progress: Option<Arc<IndexingProgress>>,
    progress_bump: Option<Arc<AtomicI64>>,
) -> Result<LexicalRebuildOutcome> {
    let deferred_startup = LexicalRebuildStartupOptions {
        defer_initial_content_fingerprint: true,
    };
    rebuild_tantivy_from_db_with_options(
        db_path,
        data_dir,
        total_conversations,
        progress,
        deferred_startup,
        progress_bump,
    )
}
/// One fully prepared page of lexical rebuild conversation packets, as built
/// by a page-prep worker and handed to the rebuild pipeline.
#[derive(Debug)]
struct LexicalRebuildPreparedPage {
/// Prepared packets for the page; each carries its own
/// `flow_reservation_bytes` share of the page's byte reservation.
packets: Vec<LexicalRebuildConversationPacket>,
/// Id of the last conversation listed for this page.
page_last_conversation_id: i64,
/// Index of the planned shard this page was listed under, if a shard plan
/// was in use.
planned_shard_index: Option<usize>,
/// Whether this page reaches the planned shard's last conversation id.
finishes_planned_shard: bool,
/// Time spent listing the page's conversations.
conversation_list_duration: Duration,
/// Time spent fetching the page's messages.
message_fetch_duration: Duration,
/// Time spent turning conversations and messages into packets.
packet_prepare_duration: Duration,
}
/// Work item sent from the packet producer to a page-prep worker: one listed
/// conversation page plus the budget snapshot to prepare it under.
#[derive(Debug)]
struct LexicalRebuildPagePrepWork {
/// Monotonic enqueue sequence; used to acquire the byte budget in order
/// and to re-order prepared pages before handoff.
sequence: u64,
/// Conversation rows listed for this page.
conversation_page: Vec<crate::storage::sqlite::LexicalRebuildConversationRow>,
/// Id of the last conversation in `conversation_page`.
page_last_conversation_id: i64,
/// Page size the producer was configured with (logged alongside the
/// budget-bounded limit).
configured_page_size: i64,
/// Index of the planned shard being enumerated, if any.
planned_shard_index: Option<usize>,
/// Whether this page reaches the planned shard's last conversation id.
finishes_planned_shard: bool,
/// Time the producer spent listing this page.
conversation_list_duration: Duration,
/// Pipeline budget snapshot taken when the page was enqueued.
pipeline_budget: LexicalRebuildPipelineBudgetSnapshot,
/// Budget controller generation observed when the page was enqueued.
budget_generation: usize,
}
/// A prepared page tagged with the sequence number of the work item that
/// produced it, so the producer can emit pages in enqueue order.
#[derive(Debug)]
struct LexicalRebuildSequencedPreparedPage {
/// Sequence copied from the originating `LexicalRebuildPagePrepWork`.
sequence: u64,
/// The prepared page itself.
page: LexicalRebuildPreparedPage,
}
/// Message sent from a page-prep worker back to the packet producer.
#[derive(Debug)]
enum LexicalRebuildPagePrepResult {
/// The page was prepared successfully.
Prepared(LexicalRebuildSequencedPreparedPage),
/// Preparation failed. `sequence` is the failing work item's sequence
/// (workers report `0` when they fail before receiving any work, e.g. on
/// storage open); `error` is the formatted error chain.
Error { sequence: u64, error: String },
}
/// Message sent from the packet producer to the rebuild pipeline consumer.
#[derive(Debug)]
enum LexicalRebuildPipelineMessage {
/// The next prepared page, delivered in sequence order.
Batch(LexicalRebuildPreparedPage),
/// The producer failed; carries the formatted error chain.
Error(String),
/// The producer finished successfully; no further batches follow.
Done,
}
/// Returns the total flow-limiter bytes reserved by `page`, i.e. the sum of
/// `flow_reservation_bytes` over all of its packets.
fn lexical_rebuild_prepared_page_reserved_bytes(page: &LexicalRebuildPreparedPage) -> usize {
    let mut total_reserved = 0usize;
    for packet in &page.packets {
        total_reserved += packet.flow_reservation_bytes;
    }
    total_reserved
}
/// Returns every byte reserved by `page` to `flow_limiter`.
fn release_lexical_rebuild_prepared_page_reservation(
    page: &LexicalRebuildPreparedPage,
    flow_limiter: &StreamingByteLimiter,
) {
    let reserved_bytes = lexical_rebuild_prepared_page_reserved_bytes(page);
    flow_limiter.release(reserved_bytes);
}
/// Empties `completed_pages` and releases each buffered page's byte
/// reservation back to `flow_limiter`, in ascending sequence order.
fn release_completed_lexical_rebuild_pages(
    completed_pages: &mut BTreeMap<u64, LexicalRebuildPreparedPage>,
    flow_limiter: &StreamingByteLimiter,
) {
    let drained_pages = std::mem::take(completed_pages);
    drained_pages.into_values().for_each(|page| {
        release_lexical_rebuild_prepared_page_reservation(&page, flow_limiter);
    });
}
/// Scope guard for bytes reserved on a `StreamingByteLimiter`.
///
/// Any bytes still held when the guard is dropped are released back to the
/// limiter. Call [`disarm`](Self::disarm) once responsibility for the
/// reservation has been handed elsewhere.
struct StreamingByteReservation<'a> {
    flow_limiter: &'a StreamingByteLimiter,
    reserved_bytes: usize,
}

impl<'a> StreamingByteReservation<'a> {
    /// Wraps an existing reservation of `reserved_bytes` on `flow_limiter`.
    fn new(flow_limiter: &'a StreamingByteLimiter, reserved_bytes: usize) -> Self {
        StreamingByteReservation {
            flow_limiter,
            reserved_bytes,
        }
    }

    /// Forgets the reservation so neither `release_now` nor `Drop` releases it.
    fn disarm(&mut self) {
        self.reserved_bytes = 0;
    }

    /// Releases any bytes still held and disarms the guard; a no-op when
    /// nothing is held.
    fn release_now(&mut self) {
        let pending_bytes = self.reserved_bytes;
        if pending_bytes == 0 {
            return;
        }
        self.flow_limiter.release(pending_bytes);
        self.disarm();
    }
}

impl Drop for StreamingByteReservation<'_> {
    fn drop(&mut self) {
        self.release_now();
    }
}
#[allow(clippy::too_many_arguments)]
fn prepare_lexical_rebuild_page_work(
storage: &mut FrankenStorage,
source_map: &HashMap<String, (SourceKind, Option<String>)>,
flow_limiter: &StreamingByteLimiter,
reservation_order: &LexicalRebuildReservationOrder,
producer_telemetry: &LexicalRebuildProducerTelemetry,
lexical_rebuild_worker_pool: Option<&ThreadPool>,
work: LexicalRebuildPagePrepWork,
) -> Result<LexicalRebuildSequencedPreparedPage> {
let sequence = work.sequence;
let conversation_ids = work
.conversation_page
.iter()
.filter_map(|conv| conv.id)
.collect::<Vec<_>>();
let (reserved_bytes, budget_wait_duration, waited_for_budget) =
acquire_ordered_lexical_rebuild_page_budget(
reservation_order,
flow_limiter,
sequence,
work.pipeline_budget.batch_fetch_message_bytes_limit,
)
.with_context(|| {
format!(
"acquiring lexical rebuild pipeline byte budget for ordered page sequence {}",
sequence
)
})?;
if waited_for_budget {
producer_telemetry.record_budget_wait(budget_wait_duration);
}
let mut reservation = StreamingByteReservation::new(flow_limiter, reserved_bytes);
let message_fetch_started = Instant::now();
let grouped_messages = match storage.fetch_messages_for_lexical_rebuild_batch(
&conversation_ids,
Some(work.pipeline_budget.batch_fetch_message_limit),
Some(work.pipeline_budget.batch_fetch_message_bytes_limit),
) {
Ok(grouped) => grouped,
Err(err) if format!("{err:#}").contains("guardrail") => {
tracing::warn!(
sequence,
conversations = conversation_ids.len(),
max_messages = work.pipeline_budget.batch_fetch_message_limit,
max_content_bytes = work.pipeline_budget.batch_fetch_message_bytes_limit,
error = %err,
"lexical rebuild page exceeded batch-fetch guardrail inside page-prep worker; falling back to per-conversation fetches"
);
let mut grouped = HashMap::with_capacity(conversation_ids.len());
for conversation_id in &conversation_ids {
let messages = storage
.fetch_messages_for_lexical_rebuild(*conversation_id)
.with_context(|| {
format!(
"fetching lexical rebuild messages for conversation {}",
conversation_id
)
})?;
grouped.insert(*conversation_id, messages);
}
grouped
}
Err(err) => {
return Err(err).context(format!(
"fetching lexical rebuild messages for {} conversations",
conversation_ids.len()
));
}
};
let message_fetch_duration = message_fetch_started.elapsed();
let packet_prepare_started = Instant::now();
let mut prepared_packets = prepare_lexical_rebuild_packet_batch(
work.conversation_page,
grouped_messages,
source_map,
lexical_rebuild_worker_pool,
)
.with_context(|| {
format!(
"preparing lexical rebuild packets for ordered page sequence {}",
sequence
)
})?;
let packet_prepare_duration = packet_prepare_started.elapsed();
let page_message_bytes = prepared_packets
.iter()
.map(|packet| packet.message_bytes)
.sum::<usize>();
let page_message_count = prepared_packets
.iter()
.map(|packet| packet.message_count)
.sum::<usize>();
if prepared_packets.is_empty() {
reservation.release_now();
} else {
assign_lexical_rebuild_flow_reservation_bytes(&mut prepared_packets, reserved_bytes);
reservation.disarm();
}
let configured_page_size = usize::try_from(work.configured_page_size.max(1))
.unwrap_or(usize::MAX)
.max(1);
let budget_shrink_decision =
if work.pipeline_budget.page_conversation_limit < configured_page_size {
"bounded_below_configured_page"
} else {
"using_configured_page"
};
tracing::info!(
sequence,
configured_page_size = work.configured_page_size,
page_conversation_limit = work.pipeline_budget.page_conversation_limit,
budget_generation = work.budget_generation,
budget_shrink_decision,
page_conversations = prepared_packets.len(),
page_last_conversation_id = work.page_last_conversation_id,
page_messages = page_message_count,
page_message_bytes,
reserved_bytes,
budget_wait_ms = budget_wait_duration.as_millis() as u64,
waited_for_budget,
batch_fetch_message_limit = work.pipeline_budget.batch_fetch_message_limit,
batch_fetch_message_bytes_limit = work.pipeline_budget.batch_fetch_message_bytes_limit,
max_message_bytes_in_flight = work.pipeline_budget.max_message_bytes_in_flight,
conversation_list_ms = work.conversation_list_duration.as_millis() as u64,
message_fetch_ms = message_fetch_duration.as_millis() as u64,
packet_prepare_ms = packet_prepare_duration.as_millis() as u64,
"lexical rebuild prepared bounded page"
);
Ok(LexicalRebuildSequencedPreparedPage {
sequence,
page: LexicalRebuildPreparedPage {
packets: prepared_packets,
page_last_conversation_id: work.page_last_conversation_id,
planned_shard_index: work.planned_shard_index,
finishes_planned_shard: work.finishes_planned_shard,
conversation_list_duration: work.conversation_list_duration,
message_fetch_duration,
packet_prepare_duration,
},
})
}
#[allow(clippy::too_many_arguments)]
fn spawn_lexical_rebuild_page_prep_workers(
worker_count: usize,
db_path: PathBuf,
source_map: Arc<HashMap<String, (SourceKind, Option<String>)>>,
work_rx: Receiver<LexicalRebuildPagePrepWork>,
result_tx: Sender<LexicalRebuildPagePrepResult>,
flow_limiter: Arc<StreamingByteLimiter>,
reservation_order: Arc<LexicalRebuildReservationOrder>,
producer_telemetry: Arc<LexicalRebuildProducerTelemetry>,
lexical_rebuild_worker_pool: Option<Arc<ThreadPool>>,
) -> Result<Vec<JoinHandle<()>>> {
let tracing_dispatch = tracing::dispatcher::get_default(|dispatch| dispatch.clone());
(0..worker_count.max(1))
.map(|worker_idx| {
let worker_db_path = db_path.clone();
let worker_source_map = Arc::clone(&source_map);
let worker_rx = work_rx.clone();
let worker_result_tx = result_tx.clone();
let worker_flow_limiter = Arc::clone(&flow_limiter);
let worker_reservation_order = Arc::clone(&reservation_order);
let worker_producer_telemetry = Arc::clone(&producer_telemetry);
let worker_pool = lexical_rebuild_worker_pool.clone();
let worker_dispatch = tracing_dispatch.clone();
thread::Builder::new()
.name(format!("cass-lexical-page-prep-{worker_idx}"))
.spawn(move || {
tracing::dispatcher::with_default(&worker_dispatch, || {
let mut storage = match FrankenStorage::open_readonly(&worker_db_path) {
Ok(storage) => storage,
Err(err) => {
let _ = worker_result_tx.send(LexicalRebuildPagePrepResult::Error {
sequence: 0,
error: format!(
"{:#}",
err.context(format!(
"opening readonly storage for lexical rebuild page-prep worker {}: {}",
worker_idx,
worker_db_path.display()
))
),
});
return;
}
};
while let Ok(work) = worker_rx.recv() {
let sequence = work.sequence;
match prepare_lexical_rebuild_page_work(
&mut storage,
worker_source_map.as_ref(),
worker_flow_limiter.as_ref(),
worker_reservation_order.as_ref(),
worker_producer_telemetry.as_ref(),
worker_pool.as_deref(),
work,
) {
Ok(prepared) => {
let reserved_bytes =
lexical_rebuild_prepared_page_reserved_bytes(&prepared.page);
if worker_result_tx
.send(LexicalRebuildPagePrepResult::Prepared(prepared))
.is_err()
{
worker_reservation_order.close();
worker_flow_limiter.release(reserved_bytes);
break;
}
}
Err(err) => {
worker_reservation_order.close();
let _ = worker_result_tx.send(LexicalRebuildPagePrepResult::Error {
sequence,
error: format!("{err:#}"),
});
break;
}
}
}
storage.close_best_effort_in_place();
});
})
.with_context(|| {
format!("spawning lexical rebuild page-prep worker {worker_idx}")
})
})
.collect()
}
/// Spawns the lexical rebuild packet producer thread.
///
/// The producer opens its own read-only storage handle, lists conversation
/// pages (optionally bounded by a planned shard plan, and resuming after
/// `resume_after_conversation_id`), fans the pages out to page-prep workers,
/// and forwards the prepared pages to `tx` strictly in enqueue order.
///
/// Protocol on `tx`: zero or more `Batch` messages followed by exactly one of
/// `Done` (success) or `Error` (failure), unless the consumer has already
/// disconnected.
///
/// Byte reservations held by prepared pages that can no longer be delivered
/// are released back to `flow_limiter` before the producer gives up.
///
/// A failure to list sources is logged and treated as an empty source map
/// instead of aborting the rebuild.
#[allow(clippy::too_many_arguments)]
fn spawn_lexical_rebuild_packet_producer(
    db_path: PathBuf,
    resume_after_conversation_id: Option<i64>,
    planned_shard_plan: Option<LexicalShardPlan>,
    page_size: i64,
    pipeline_channel_size: usize,
    first_budget_promotion_commit_thresholds: Option<(usize, usize, usize)>,
    pipeline_budget_controller: Arc<LexicalRebuildPipelineBudgetController>,
    tx: Sender<LexicalRebuildPipelineMessage>,
    flow_limiter: Arc<StreamingByteLimiter>,
    lexical_rebuild_worker_pool: Option<Arc<ThreadPool>>,
    producer_telemetry: Arc<LexicalRebuildProducerTelemetry>,
) -> JoinHandle<()> {
    // Captured so the producer thread logs through the caller's dispatcher.
    let tracing_dispatch = tracing::dispatcher::get_default(|dispatch| dispatch.clone());
    thread::spawn(move || {
        tracing::dispatcher::with_default(&tracing_dispatch, || {
            // Optional startup profiling, enabled by the CASS_PREP_PROFILE env var.
            let prep_profile = std::env::var_os("CASS_PREP_PROFILE").is_some();
            let prep_started = Instant::now();
            let mut prep_step_started = Instant::now();
            let log_prep_step = |step: &str, step_started: &mut Instant| {
                let step_ms = step_started.elapsed().as_millis() as u64;
                let total_ms = prep_started.elapsed().as_millis() as u64;
                if prep_profile {
                    eprintln!(
                        "CASS_PREP_PROFILE component=producer step={step} step_ms={} total_ms={}",
                        step_ms, total_ms
                    );
                    tracing::info!(
                        component = "producer",
                        step,
                        step_ms,
                        total_ms,
                        "lexical rebuild prep profile"
                    );
                }
                *step_started = Instant::now();
            };
            // Best-effort: the consumer may already be gone.
            let send_error = |error: anyhow::Error| {
                let _ = tx.send(LexicalRebuildPipelineMessage::Error(format!("{error:#}")));
            };
            let mut storage = match FrankenStorage::open_readonly(&db_path) {
                Ok(storage) => storage,
                Err(err) => {
                    send_error(err.context(format!(
                        "opening readonly storage for lexical rebuild packet producer: {}",
                        db_path.display()
                    )));
                    return;
                }
            };
            log_prep_step("open_readonly", &mut prep_step_started);
            // Source metadata is best-effort: a failure degrades to an empty
            // map, but is logged so the degradation is visible.
            let source_map = storage
                .list_sources()
                .unwrap_or_else(|err| {
                    tracing::warn!(
                        db_path = %db_path.display(),
                        error = %err,
                        "failed to list sources for lexical rebuild packet producer; continuing with an empty source map"
                    );
                    Default::default()
                })
                .into_iter()
                .map(|source| (source.id, (source.kind, source.host_label)))
                .collect::<HashMap<_, _>>();
            log_prep_step("load_sources", &mut prep_step_started);
            let (agent_slugs, workspace_paths) = match storage.build_lexical_rebuild_lookups() {
                Ok(lookups) => lookups,
                Err(err) => {
                    send_error(err.context(format!(
                        "building lexical rebuild lookup tables inside packet producer: {}",
                        db_path.display()
                    )));
                    return;
                }
            };
            log_prep_step("build_lookups", &mut prep_step_started);
            let mut last_conversation_id = resume_after_conversation_id.unwrap_or(0);
            let planned_shards = if let Some(shard_plan) = planned_shard_plan {
                let largest_shard_conversations = shard_plan
                    .shards
                    .iter()
                    .map(|shard| shard.conversation_count)
                    .max()
                    .unwrap_or(0);
                let largest_shard_messages = shard_plan
                    .shards
                    .iter()
                    .map(|shard| shard.message_count)
                    .max()
                    .unwrap_or(0);
                let largest_shard_message_bytes = shard_plan
                    .shards
                    .iter()
                    .map(|shard| shard.message_bytes)
                    .max()
                    .unwrap_or(0);
                tracing::info!(
                    plan_id = %shard_plan.plan_id,
                    shard_count = shard_plan.shards.len(),
                    oversized_conversations = shard_plan.oversized_conversation_ids.len(),
                    max_conversations_per_shard = shard_plan.budgets.max_conversations_per_shard,
                    max_messages_per_shard = shard_plan.budgets.max_messages_per_shard,
                    max_message_bytes_per_shard = shard_plan.budgets.max_message_bytes_per_shard,
                    largest_shard_conversations,
                    largest_shard_messages,
                    largest_shard_message_bytes,
                    total_conversations = shard_plan.total_conversations,
                    total_messages = shard_plan.total_messages,
                    total_message_bytes = shard_plan.total_message_bytes,
                    "reused deterministic lexical rebuild shard plan inside packet producer"
                );
                Some(shard_plan.shards)
            } else {
                None
            };
            let mut planned_shard_cursor = planned_shards.and_then(|planned_shards| {
                LexicalRebuildPlannedShardCursor::for_resume(
                    planned_shards,
                    resume_after_conversation_id,
                )
            });
            log_prep_step("load_resume_cursor", &mut prep_step_started);
            let page_prep_worker_count = lexical_rebuild_page_prep_worker_parallelism();
            let reservation_order = Arc::new(LexicalRebuildReservationOrder::new());
            // Both queues are bounded by the worker count.
            let (work_tx, work_rx) = bounded::<LexicalRebuildPagePrepWork>(page_prep_worker_count);
            let (result_tx, result_rx) =
                bounded::<LexicalRebuildPagePrepResult>(page_prep_worker_count);
            let source_map = Arc::new(source_map);
            let worker_handles = match spawn_lexical_rebuild_page_prep_workers(
                page_prep_worker_count,
                db_path.clone(),
                Arc::clone(&source_map),
                work_rx,
                result_tx,
                Arc::clone(&flow_limiter),
                Arc::clone(&reservation_order),
                Arc::clone(&producer_telemetry),
                lexical_rebuild_worker_pool.clone(),
            ) {
                Ok(handles) => handles,
                Err(err) => {
                    send_error(err);
                    producer_telemetry.record(page_prep_worker_count, 0, 0);
                    storage.close_best_effort_in_place();
                    return;
                }
            };
            producer_telemetry.record(page_prep_worker_count, 0, 0);
            tracing::info!(
                page_prep_workers = page_prep_worker_count,
                work_queue_capacity = page_prep_worker_count,
                result_queue_capacity = page_prep_worker_count,
                pipeline_channel_size,
                "lexical rebuild producer started ordered page-prep workers"
            );
            log_prep_step("start_page_prep_workers", &mut prep_step_started);
            // The main loop lives in a closure so `?` and early returns funnel
            // into one result that the shutdown code below can inspect.
            let producer_result: Result<()> = (|| {
                let mut logged_first_batch_handoff = false;
                let mut logged_first_page_enqueued = false;
                let mut last_logged_budget = None;
                let mut next_sequence_to_enqueue = 0u64;
                let mut next_sequence_to_emit = 0u64;
                let mut active_work = 0usize;
                let mut enumeration_done = false;
                let mut first_budget_promotion_observed =
                    first_budget_promotion_commit_thresholds.is_none();
                let mut handoff_conversations_since_budget = 0usize;
                let mut handoff_messages_since_budget = 0usize;
                let mut handoff_message_bytes_since_budget = 0usize;
                // Pages that finished out of order, keyed by sequence.
                let mut completed_pages = BTreeMap::<u64, LexicalRebuildPreparedPage>::new();
                let mut logged_current_shard_index = None;
                loop {
                    // Phase 1: list pages and enqueue work while there is capacity.
                    while !enumeration_done && active_work < page_prep_worker_count {
                        if let Some(shard_cursor) = planned_shard_cursor.as_mut() {
                            shard_cursor.skip_completed(last_conversation_id);
                        }
                        if let Some(shard) = planned_shard_cursor
                            .as_ref()
                            .and_then(LexicalRebuildPlannedShardCursor::current)
                        {
                            if logged_current_shard_index != Some(shard.shard_index) {
                                tracing::info!(
                                    shard_index = shard.shard_index,
                                    first_conversation_id = shard.first_conversation_id,
                                    last_conversation_id = shard.last_conversation_id,
                                    conversation_count = shard.conversation_count,
                                    message_count = shard.message_count,
                                    message_bytes = shard.message_bytes,
                                    conversation_id_fingerprint =
                                        %shard.conversation_id_fingerprint,
                                    oversized_single_conversation =
                                        shard.oversized_single_conversation,
                                    "lexical rebuild producer opened planned shard boundary"
                                );
                                logged_current_shard_index = Some(shard.shard_index);
                            }
                        } else if planned_shard_cursor.is_some() {
                            // A shard plan is in use but has no current shard left.
                            enumeration_done = true;
                            break;
                        }
                        let pipeline_budget = pipeline_budget_controller.snapshot();
                        let max_active_page_work = page_prep_worker_count
                            .min(pipeline_budget.page_conversation_limit.max(1));
                        if active_work >= max_active_page_work {
                            break;
                        }
                        if last_logged_budget != Some(pipeline_budget) {
                            tracing::info!(
                                page_conversation_limit = pipeline_budget.page_conversation_limit,
                                batch_fetch_message_limit =
                                    pipeline_budget.batch_fetch_message_limit,
                                batch_fetch_message_bytes_limit =
                                    pipeline_budget.batch_fetch_message_bytes_limit,
                                max_message_bytes_in_flight =
                                    pipeline_budget.max_message_bytes_in_flight,
                                commit_interval_conversations =
                                    pipeline_budget.commit_interval_conversations,
                                commit_interval_messages = pipeline_budget.commit_interval_messages,
                                commit_interval_message_bytes =
                                    pipeline_budget.commit_interval_message_bytes,
                                "lexical rebuild producer adopted pipeline budgets"
                            );
                            last_logged_budget = Some(pipeline_budget);
                        }
                        // The budget can shrink a page below the configured size,
                        // never grow it beyond.
                        let conversation_page_limit =
                            i64::try_from(pipeline_budget.page_conversation_limit.max(1))
                                .unwrap_or(i64::MAX)
                                .min(page_size.max(1));
                        let current_planned_shard = planned_shard_cursor
                            .as_ref()
                            .and_then(LexicalRebuildPlannedShardCursor::current)
                            .cloned();
                        let conversation_list_started = Instant::now();
                        let conversation_page = if let Some(shard) = &current_planned_shard {
                            storage
                                .list_conversations_for_lexical_rebuild_after_id_through_id(
                                    conversation_page_limit,
                                    last_conversation_id,
                                    shard.last_conversation_id,
                                    &agent_slugs,
                                    &workspace_paths,
                                )
                                .with_context(|| {
                                    format!(
                                        "listing lexical rebuild conversations after id {} through planned shard {} upper bound {}",
                                        last_conversation_id,
                                        shard.shard_index,
                                        shard.last_conversation_id
                                    )
                                })?
                        } else {
                            storage
                                .list_conversations_for_lexical_rebuild_after_id(
                                    conversation_page_limit,
                                    last_conversation_id,
                                    &agent_slugs,
                                    &workspace_paths,
                                )
                                .with_context(|| {
                                    format!(
                                        "listing lexical rebuild conversations after id {}",
                                        last_conversation_id
                                    )
                                })?
                        };
                        let conversation_list_duration = conversation_list_started.elapsed();
                        if conversation_page.is_empty() {
                            // With a shard plan, an empty page means this shard is
                            // exhausted; without one, enumeration is complete.
                            if let Some(shard_cursor) = planned_shard_cursor.as_mut() {
                                shard_cursor.advance();
                                continue;
                            }
                            enumeration_done = true;
                            break;
                        }
                        let page_last_conversation_id = conversation_page
                            .last()
                            .and_then(|conv| conv.id)
                            .ok_or_else(|| {
                                anyhow::anyhow!(
                                    "lexical rebuild page missing terminal conversation id after {}",
                                    last_conversation_id
                                )
                            })?;
                        let page_conversation_count =
                            conversation_page.iter().filter_map(|conv| conv.id).count();
                        let finishes_planned_shard =
                            current_planned_shard.as_ref().is_some_and(|shard| {
                                page_last_conversation_id >= shard.last_conversation_id
                            });
                        let work = LexicalRebuildPagePrepWork {
                            sequence: next_sequence_to_enqueue,
                            conversation_page,
                            page_last_conversation_id,
                            configured_page_size: page_size,
                            planned_shard_index: current_planned_shard
                                .as_ref()
                                .map(|shard| shard.shard_index),
                            finishes_planned_shard,
                            conversation_list_duration,
                            pipeline_budget,
                            budget_generation: pipeline_budget_controller.generation(),
                        };
                        work_tx.send(work).map_err(|_| {
                            anyhow::anyhow!(
                                "lexical rebuild page-prep work queue closed before producer finished"
                            )
                        })?;
                        if !logged_first_page_enqueued {
                            log_prep_step("first_page_enqueued", &mut prep_step_started);
                            logged_first_page_enqueued = true;
                        }
                        tracing::debug!(
                            sequence = next_sequence_to_enqueue,
                            active_work = active_work + 1,
                            shard_index = current_planned_shard
                                .as_ref()
                                .map(|shard| shard.shard_index),
                            page_last_conversation_id,
                            page_conversations = page_conversation_count,
                            "lexical rebuild producer enqueued page-prep work"
                        );
                        next_sequence_to_enqueue = next_sequence_to_enqueue.saturating_add(1);
                        active_work = active_work.saturating_add(1);
                        producer_telemetry.record(
                            page_prep_worker_count,
                            active_work,
                            completed_pages.len(),
                        );
                        last_conversation_id = page_last_conversation_id;
                        if finishes_planned_shard {
                            tracing::info!(
                                shard_index = current_planned_shard
                                    .as_ref()
                                    .map(|shard| shard.shard_index),
                                shard_last_conversation_id = current_planned_shard
                                    .as_ref()
                                    .map(|shard| shard.last_conversation_id),
                                "lexical rebuild producer completed planned shard boundary"
                            );
                            if let Some(shard_cursor) = planned_shard_cursor.as_mut() {
                                shard_cursor.advance();
                            }
                        }
                    }
                    // Phase 2: hand off every buffered page that is next in sequence.
                    while let Some(prepared_page) = completed_pages.remove(&next_sequence_to_emit) {
                        producer_telemetry.record(
                            page_prep_worker_count,
                            active_work,
                            completed_pages.len(),
                        );
                        // Captured before the page is moved into the channel.
                        let reserved_bytes =
                            lexical_rebuild_prepared_page_reserved_bytes(&prepared_page);
                        let page_handoff_conversations = prepared_page.packets.len();
                        let page_handoff_messages = prepared_page
                            .packets
                            .iter()
                            .map(|packet| packet.message_count)
                            .sum::<usize>();
                        let page_handoff_message_bytes = prepared_page
                            .packets
                            .iter()
                            .map(|packet| packet.message_bytes)
                            .sum::<usize>();
                        let page_planned_shard_index = prepared_page.planned_shard_index;
                        let page_finishes_planned_shard = prepared_page.finishes_planned_shard;
                        // Try a non-blocking send first so blocking handoffs can be timed.
                        match tx.try_send(LexicalRebuildPipelineMessage::Batch(prepared_page)) {
                            Ok(()) => {}
                            Err(TrySendError::Full(message)) => {
                                let handoff_wait_started = Instant::now();
                                if tx.send(message).is_err() {
                                    reservation_order.close();
                                    flow_limiter.release(reserved_bytes);
                                    release_completed_lexical_rebuild_pages(
                                        &mut completed_pages,
                                        flow_limiter.as_ref(),
                                    );
                                    return Err(anyhow::anyhow!(
                                        "lexical rebuild consumer disconnected before ordered page handoff"
                                    ));
                                }
                                producer_telemetry
                                    .record_handoff_wait(handoff_wait_started.elapsed());
                            }
                            Err(TrySendError::Disconnected(_message)) => {
                                reservation_order.close();
                                flow_limiter.release(reserved_bytes);
                                release_completed_lexical_rebuild_pages(
                                    &mut completed_pages,
                                    flow_limiter.as_ref(),
                                );
                                return Err(anyhow::anyhow!(
                                    "lexical rebuild consumer disconnected before ordered page handoff"
                                ));
                            }
                        }
                        handoff_conversations_since_budget = handoff_conversations_since_budget
                            .saturating_add(page_handoff_conversations);
                        handoff_messages_since_budget =
                            handoff_messages_since_budget.saturating_add(page_handoff_messages);
                        handoff_message_bytes_since_budget = handoff_message_bytes_since_budget
                            .saturating_add(page_handoff_message_bytes);
                        if !logged_first_batch_handoff {
                            log_prep_step("first_batch_handoff", &mut prep_step_started);
                            let total_startup_ms = prep_started.elapsed().as_millis() as u64;
                            tracing::info!(
                                component = "producer",
                                total_startup_ms,
                                conversations_in_first_batch = page_handoff_conversations,
                                "lexical rebuild producer delivered first batch — serial startup complete"
                            );
                            logged_first_batch_handoff = true;
                        }
                        tracing::debug!(
                            sequence = next_sequence_to_emit,
                            active_work,
                            ordered_buffered_pages = completed_pages.len(),
                            planned_shard_index = page_planned_shard_index,
                            finishes_planned_shard = page_finishes_planned_shard,
                            "lexical rebuild producer handed off ordered prepared page"
                        );
                        next_sequence_to_emit = next_sequence_to_emit.saturating_add(1);
                        // Once enough has been handed off to cross the first commit
                        // thresholds, wait (bounded) for the budget controller to
                        // publish a new generation. Attempted at most once.
                        if !first_budget_promotion_observed
                            && let Some((
                                commit_interval_conversations,
                                commit_interval_messages,
                                commit_interval_message_bytes,
                            )) = first_budget_promotion_commit_thresholds
                            && should_commit_lexical_rebuild(
                                handoff_conversations_since_budget,
                                handoff_messages_since_budget,
                                handoff_message_bytes_since_budget,
                                commit_interval_conversations,
                                commit_interval_messages,
                                commit_interval_message_bytes,
                            )
                        {
                            let observed_generation = pipeline_budget_controller.generation();
                            tracing::info!(
                                observed_generation,
                                handoff_conversations_since_budget,
                                handoff_messages_since_budget,
                                handoff_message_bytes_since_budget,
                                "lexical rebuild producer waiting for first durable budget promotion"
                            );
                            if pipeline_budget_controller.wait_for_update_after(
                                observed_generation,
                                lexical_rebuild_first_budget_promotion_wait(),
                            ) {
                                first_budget_promotion_observed = true;
                                handoff_conversations_since_budget = 0;
                                handoff_messages_since_budget = 0;
                                handoff_message_bytes_since_budget = 0;
                                tracing::info!(
                                    new_generation = pipeline_budget_controller.generation(),
                                    "lexical rebuild producer observed first durable budget promotion"
                                );
                            } else {
                                first_budget_promotion_observed = true;
                                tracing::warn!(
                                    observed_generation,
                                    wait_ms = lexical_rebuild_first_budget_promotion_wait()
                                        .as_millis()
                                        as u64,
                                    "lexical rebuild producer timed out waiting for first durable budget promotion; continuing with current budgets"
                                );
                            }
                        }
                    }
                    if enumeration_done && active_work == 0 {
                        break;
                    }
                    // Phase 3: block until a worker reports a result.
                    match result_rx.recv() {
                        Ok(LexicalRebuildPagePrepResult::Prepared(prepared)) => {
                            active_work = active_work.saturating_sub(1);
                            producer_telemetry.record(
                                page_prep_worker_count,
                                active_work,
                                completed_pages.len(),
                            );
                            // A sequence below the emit cursor was already handed off.
                            if prepared.sequence < next_sequence_to_emit {
                                release_lexical_rebuild_prepared_page_reservation(
                                    &prepared.page,
                                    flow_limiter.as_ref(),
                                );
                                reservation_order.close();
                                release_completed_lexical_rebuild_pages(
                                    &mut completed_pages,
                                    flow_limiter.as_ref(),
                                );
                                return Err(anyhow::anyhow!(
                                    "lexical rebuild page-prep worker returned stale sequence {} after ordered barrier {}",
                                    prepared.sequence,
                                    next_sequence_to_emit
                                ));
                            }
                            // `insert` returning a previous page means a duplicate
                            // sequence; the displaced page's bytes are released here,
                            // the newly inserted one by the bulk release below.
                            if let Some(previous) =
                                completed_pages.insert(prepared.sequence, prepared.page)
                            {
                                release_lexical_rebuild_prepared_page_reservation(
                                    &previous,
                                    flow_limiter.as_ref(),
                                );
                                reservation_order.close();
                                release_completed_lexical_rebuild_pages(
                                    &mut completed_pages,
                                    flow_limiter.as_ref(),
                                );
                                return Err(anyhow::anyhow!(
                                    "lexical rebuild page-prep worker returned duplicate sequence {}",
                                    prepared.sequence
                                ));
                            }
                            producer_telemetry.record(
                                page_prep_worker_count,
                                active_work,
                                completed_pages.len(),
                            );
                            tracing::debug!(
                                sequence = prepared.sequence,
                                active_work,
                                ordered_buffered_pages = completed_pages.len(),
                                "lexical rebuild producer received prepared page from worker"
                            );
                        }
                        Ok(LexicalRebuildPagePrepResult::Error { sequence, error }) => {
                            reservation_order.close();
                            release_completed_lexical_rebuild_pages(
                                &mut completed_pages,
                                flow_limiter.as_ref(),
                            );
                            return Err(anyhow::anyhow!(
                                "lexical rebuild page-prep worker failed at sequence {}: {}",
                                sequence,
                                error
                            ));
                        }
                        Err(_) => {
                            reservation_order.close();
                            release_completed_lexical_rebuild_pages(
                                &mut completed_pages,
                                flow_limiter.as_ref(),
                            );
                            return Err(anyhow::anyhow!(
                                "lexical rebuild page-prep result queue closed before producer completion"
                            ));
                        }
                    }
                }
                Ok(())
            })();
            if producer_result.is_err() {
                reservation_order.close();
            }
            // Dropping both channel ends lets the workers' receive/send calls
            // fail so their loops exit before they are joined.
            drop(work_tx);
            drop(result_rx);
            for worker_handle in worker_handles {
                if let Err(payload) = worker_handle.join() {
                    let panic_message = panic_payload_message(payload);
                    tracing::warn!(
                        error = %panic_message,
                        "lexical rebuild page-prep worker panicked while producer was shutting down"
                    );
                    // Only surface the panic when no earlier error will be reported.
                    if producer_result.is_ok() {
                        send_error(anyhow::anyhow!(
                            "lexical rebuild page-prep worker panicked: {}",
                            panic_message
                        ));
                        producer_telemetry.record(page_prep_worker_count, 0, 0);
                        storage.close_best_effort_in_place();
                        return;
                    }
                }
            }
            producer_telemetry.record(page_prep_worker_count, 0, 0);
            storage.close_best_effort_in_place();
            match producer_result {
                Ok(()) => {
                    let _ = tx.send(LexicalRebuildPipelineMessage::Done);
                }
                Err(err) => send_error(err),
            }
        })
    })
}
/// Identifies a specific rename call site in the lexical index publish and
/// recovery code. The test-only fault-injection hooks use it to pick which
/// rename should fail.
// NOTE(review): `dead_code` is presumably allowed because some variants are
// only constructed on particular platforms or in test builds — confirm.
#[allow(dead_code)]
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
enum LexicalPublishRenameSite {
FirstPublish,
LinuxParkPriorLiveToCanonicalSidecar,
LinuxRetainCanonicalSidecar,
NonLinuxParkPriorLive,
NonLinuxPublishStagedLive,
NonLinuxRetainPriorLive,
RecoverRetainStaleInProgress,
RecoverRestorePriorLive,
}
/// Test-only description of a rename failure to inject into the lexical publish flow.
#[cfg(test)]
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
struct LexicalPublishInjectedRenameFailure {
/// Rename step the failure applies to; other steps are left untouched.
site: LexicalPublishRenameSite,
/// How many more matching renames should fail before the injection disarms itself.
remaining_hits: usize,
/// Raw OS error code used to build the injected `std::io::Error`.
raw_os_error: i32,
}
/// Test-only global slot holding the currently armed injected rename failure, if any.
/// `None` means renames proceed without injected errors.
#[cfg(test)]
static LEXICAL_PUBLISH_INJECTED_RENAME_FAILURE: std::sync::Mutex<
Option<LexicalPublishInjectedRenameFailure>,
> = std::sync::Mutex::new(None);
/// Test-only guard returned when a rename failure is armed; dropping it writes
/// `previous` back into the global injection slot.
#[cfg(test)]
struct LexicalPublishInjectedRenameFailureGuard {
/// Contents of the injection slot at the moment this guard's failure was armed.
previous: Option<LexicalPublishInjectedRenameFailure>,
}
#[cfg(test)]
impl Drop for LexicalPublishInjectedRenameFailureGuard {
    /// Writes the remembered previous injection state back into the global slot.
    fn drop(&mut self) {
        // A poisoned mutex is still recovered and overwritten so that a panicking test
        // does not leave its injected failure armed for whoever locks the slot next.
        let mut slot = LEXICAL_PUBLISH_INJECTED_RENAME_FAILURE
            .lock()
            .unwrap_or_else(std::sync::PoisonError::into_inner);
        *slot = self.previous;
    }
}
/// Arms a single injected failure for the rename step identified by `site`.
///
/// The next rename at that site fails with an `std::io::Error` built from `raw_os_error`.
/// The returned guard restores whatever was armed before once it is dropped.
///
/// # Panics
/// Panics if the injection mutex is poisoned.
#[cfg(test)]
fn inject_lexical_publish_rename_failure_once(
    site: LexicalPublishRenameSite,
    raw_os_error: i32,
) -> LexicalPublishInjectedRenameFailureGuard {
    let armed = LexicalPublishInjectedRenameFailure {
        site,
        remaining_hits: 1,
        raw_os_error,
    };
    let mut slot = LEXICAL_PUBLISH_INJECTED_RENAME_FAILURE
        .lock()
        .expect("lexical publish rename fault injection lock");
    // `Option::replace` installs the new failure and hands back the prior contents in one step.
    let previous = slot.replace(armed);
    LexicalPublishInjectedRenameFailureGuard { previous }
}
/// Consumes one hit of the armed injected failure if it targets `site`.
///
/// Returns `Ok(())` when nothing is armed, when the armed failure targets another site,
/// or when it has no hits left. Otherwise decrements the hit counter (disarming on the
/// last hit) and returns the injected OS error.
///
/// # Panics
/// Panics if the injection mutex is poisoned.
#[cfg(test)]
fn maybe_inject_lexical_publish_rename_failure(
    site: LexicalPublishRenameSite,
) -> std::io::Result<()> {
    let mut slot = LEXICAL_PUBLISH_INJECTED_RENAME_FAILURE
        .lock()
        .expect("lexical publish rename fault injection lock");
    // Work from a copy so the slot can be rewritten while the old values are still in hand.
    let snapshot = *slot;
    match snapshot {
        Some(armed) if armed.site == site && armed.remaining_hits > 0 => {
            *slot = match armed.remaining_hits - 1 {
                0 => None,
                remaining_hits => Some(LexicalPublishInjectedRenameFailure {
                    remaining_hits,
                    ..armed
                }),
            };
            Err(std::io::Error::from_raw_os_error(armed.raw_os_error))
        }
        _ => Ok(()),
    }
}
/// Renames `src` to `dst` on behalf of the publish step identified by `site`.
///
/// In test builds an armed injected failure for `site` is returned instead of performing
/// the rename; in other builds `site` is ignored.
fn rename_lexical_publish_path(
    src: &Path,
    dst: &Path,
    site: LexicalPublishRenameSite,
) -> std::io::Result<()> {
    #[cfg(test)]
    {
        maybe_inject_lexical_publish_rename_failure(site)?;
    }
    #[cfg(not(test))]
    {
        // Only the test-build fault injection reads `site`.
        let _ = site;
    }
    fs::rename(src, dst)
}
/// Optional pause hook driven by environment variables (Linux only).
///
/// When `CASS_TEST_LEXICAL_PUBLISH_KILL_RELAUNCH_SENTINEL` is set to a non-blank value,
/// a JSON sentinel describing the current stage, this process id, and the two paths is
/// written to that location, and the thread then sleeps for
/// `CASS_TEST_LEXICAL_PUBLISH_KILL_RELAUNCH_SLEEP_MS` milliseconds (30 000 when the
/// variable is missing, unparsable, or zero). Without the sentinel variable this is a no-op.
///
/// # Errors
/// Returns an error if the sentinel file cannot be written.
#[cfg(target_os = "linux")]
fn maybe_pause_lexical_publish_for_kill_relaunch(
    index_path: &Path,
    canonical_sidecar: &Path,
) -> Result<()> {
    const DEFAULT_SLEEP_MS: u64 = 30_000;

    let Some(sentinel_path) = dotenvy::var("CASS_TEST_LEXICAL_PUBLISH_KILL_RELAUNCH_SENTINEL")
        .ok()
        .filter(|raw| !raw.trim().is_empty())
        .map(PathBuf::from)
    else {
        return Ok(());
    };

    let sleep_ms = match dotenvy::var("CASS_TEST_LEXICAL_PUBLISH_KILL_RELAUNCH_SLEEP_MS") {
        Ok(raw) => match raw.trim().parse::<u64>() {
            Ok(parsed) if parsed > 0 => parsed,
            _ => DEFAULT_SLEEP_MS,
        },
        Err(_) => DEFAULT_SLEEP_MS,
    };

    let live_index_path = index_path.display().to_string();
    let canonical_sidecar_path = canonical_sidecar.display().to_string();
    let payload = serde_json::json!({
        "stage": "linux_swap_committed_prior_live_parked",
        "pid": std::process::id(),
        "live_index_path": live_index_path,
        "canonical_sidecar_path": canonical_sidecar_path,
    });

    write_json_pretty_atomically(&sentinel_path, &payload).with_context(|| {
        format!(
            "writing lexical publish kill-relaunch sentinel {}",
            sentinel_path.display()
        )
    })?;

    // The sentinel is on disk before the sleep starts, so an observer can act during the pause.
    thread::sleep(Duration::from_millis(sleep_ms));
    Ok(())
}
fn publish_staged_lexical_index(staged_index_path: &Path, index_path: &Path) -> Result<()> {
if let Some(parent) = index_path.parent() {
fs::create_dir_all(parent).with_context(|| {
format!(
"creating parent directory for staged lexical publish {}",
parent.display()
)
})?;
}
recover_or_finalize_interrupted_lexical_publish_backup(index_path)?;
ensure_lexical_publish_backups_dir(index_path)?;
if !index_path.exists() {
rename_lexical_publish_path(
staged_index_path,
index_path,
LexicalPublishRenameSite::FirstPublish,
)
.with_context(|| {
format!(
"publishing first staged lexical index {} -> {}",
staged_index_path.display(),
index_path.display()
)
})?;
sync_parent_directory(index_path)?;
return Ok(());
}
let retained_backup_path = unique_lexical_publish_backup_path(index_path);
#[cfg(target_os = "linux")]
{
if let Err(err) = atomic_exchange_paths(index_path, staged_index_path) {
let einval = err.chain().any(|cause| {
cause
.downcast_ref::<std::io::Error>()
.and_then(std::io::Error::raw_os_error)
== Some(linux_publish_swap::EINVAL)
});
if einval {
tracing::info!(
index_path = %index_path.display(),
staged_index_path = %staged_index_path.display(),
"renameat2(RENAME_EXCHANGE) returned EINVAL; falling through to rename-pair publish (likely NFSv3 or other filesystem without RENAME_EXCHANGE support)"
);
publish_via_rename_pair(staged_index_path, index_path, &retained_backup_path)?;
sync_parent_directory(index_path)?;
sync_parent_directory(&retained_backup_path)?;
tracing::info!(
retained_backup_path = %retained_backup_path.display(),
"staged lexical index published via rename-pair fallback"
);
return Ok(());
}
return Err(err);
}
let canonical_sidecar = lexical_publish_in_progress_backup_path(index_path);
if let Err(park_err) = rename_lexical_publish_path(
staged_index_path,
&canonical_sidecar,
LexicalPublishRenameSite::LinuxParkPriorLiveToCanonicalSidecar,
) {
match atomic_exchange_paths(index_path, staged_index_path) {
Ok(()) => {
return Err(park_err).with_context(|| {
format!(
"parking prior lexical index at canonical sidecar {} after atomic swap publish failed; rolled back to keep previous live index at {}",
canonical_sidecar.display(),
index_path.display()
)
});
}
Err(rollback_err) => {
return Err(anyhow::anyhow!(
"parking prior lexical index at canonical sidecar {} after atomic swap publish failed: {park_err:#}; rollback also failed: {rollback_err:#}",
canonical_sidecar.display()
));
}
}
}
maybe_pause_lexical_publish_for_kill_relaunch(index_path, &canonical_sidecar)
.with_context(|| {
format!(
"pausing lexical publish after parking prior live generation at {}",
canonical_sidecar.display()
)
})?;
if let Err(retain_err) = rename_lexical_publish_path(
&canonical_sidecar,
&retained_backup_path,
LexicalPublishRenameSite::LinuxRetainCanonicalSidecar,
) {
tracing::warn!(
error = %retain_err,
canonical_sidecar = %canonical_sidecar.display(),
retained_backup_path = %retained_backup_path.display(),
"published staged lexical index but could not move the prior live artifact into retained-backup storage; canonical sidecar preserved — startup recovery will finish the retain step"
);
}
}
#[cfg(not(target_os = "linux"))]
{
publish_via_rename_pair(staged_index_path, index_path, &retained_backup_path)?;
}
sync_parent_directory(index_path)?;
sync_parent_directory(&retained_backup_path)?;
tracing::info!(
retained_backup_path = %retained_backup_path.display(),
live_index_path = %index_path.display(),
"published staged lexical index and retained the prior live artifact as a backup"
);
if let Err(prune_err) = prune_lexical_publish_backups(index_path) {
tracing::warn!(
error = %prune_err,
live_index_path = %index_path.display(),
"failed to prune old retained lexical-publish backups; disk may not be reclaimed until the next successful prune attempt"
);
}
Ok(())
}
/// Publishes a staged lexical index with two plain renames instead of an atomic exchange.
///
/// The live index is first parked at the in-progress sidecar path, then the staged index
/// is renamed onto the live path. If that second rename fails, the parked index is renamed
/// back. Finally the parked index is moved to `retained_backup_path`; a failure of this
/// last step is only logged.
///
/// # Errors
/// Returns an error if parking the live index fails, or if publishing the staged index
/// fails (the message states whether the rollback succeeded).
fn publish_via_rename_pair(
    staged_index_path: &Path,
    index_path: &Path,
    retained_backup_path: &Path,
) -> Result<()> {
    let in_progress_backup_path = lexical_publish_in_progress_backup_path(index_path);

    // Step 1: move the live index out of the way.
    rename_lexical_publish_path(
        index_path,
        &in_progress_backup_path,
        LexicalPublishRenameSite::NonLinuxParkPriorLive,
    )
    .with_context(|| {
        format!(
            "parking prior lexical index at {} before staged publish",
            in_progress_backup_path.display()
        )
    })?;

    // Step 2: move the staged index into the live slot, rolling back on failure.
    let publish_outcome = rename_lexical_publish_path(
        staged_index_path,
        index_path,
        LexicalPublishRenameSite::NonLinuxPublishStagedLive,
    );
    if let Err(publish_err) = publish_outcome {
        let rollback_outcome = rename_lexical_publish_path(
            &in_progress_backup_path,
            index_path,
            LexicalPublishRenameSite::RecoverRestorePriorLive,
        );
        return match rollback_outcome {
            Ok(()) => Err(publish_err).with_context(|| {
                format!(
                    "publishing staged lexical index {} -> {} failed after parking the prior live index; restored the previous live index",
                    staged_index_path.display(),
                    index_path.display()
                )
            }),
            Err(rollback_err) => Err(anyhow::anyhow!(
                "publishing staged lexical index {} -> {} failed after parking the prior live index at {}: {publish_err:#}; rollback also failed: {rollback_err:#}",
                staged_index_path.display(),
                index_path.display(),
                in_progress_backup_path.display()
            )),
        };
    }

    // Step 3: retain the parked index. The publish already succeeded, so only warn on failure.
    match rename_lexical_publish_path(
        &in_progress_backup_path,
        retained_backup_path,
        LexicalPublishRenameSite::NonLinuxRetainPriorLive,
    ) {
        Ok(()) => {}
        Err(retain_err) => {
            tracing::warn!(
                error = %retain_err,
                backup_path = %in_progress_backup_path.display(),
                retained_backup_path = %retained_backup_path.display(),
                "published staged lexical index but could not move the prior live artifact into retained backup storage"
            );
        }
    }
    Ok(())
}
/// Number of retained lexical-publish backups to keep.
///
/// Read from `CASS_LEXICAL_PUBLISH_BACKUP_RETENTION`; defaults to 1 when the variable is
/// unset or does not parse as a `usize` after trimming whitespace.
fn lexical_publish_backup_retention_limit() -> usize {
    const DEFAULT_RETENTION: usize = 1;
    match dotenvy::var("CASS_LEXICAL_PUBLISH_BACKUP_RETENTION") {
        Ok(raw) => raw.trim().parse::<usize>().unwrap_or(DEFAULT_RETENTION),
        Err(_) => DEFAULT_RETENTION,
    }
}
/// Removes retained lexical-publish backups beyond the configured retention limit.
///
/// Entries of the backups directory are ordered newest-first by modification time (ties
/// broken by path), the first `retention_limit` are kept, and the rest are removed with
/// `fs::remove_dir_all`. Entries that cannot be read, or whose metadata is unavailable,
/// are skipped. Individual removal failures are logged and do not abort the prune.
///
/// # Errors
/// Returns an error only if the backups directory exists but cannot be enumerated.
fn prune_lexical_publish_backups(index_path: &Path) -> Result<()> {
    let retention_limit = lexical_publish_backup_retention_limit();
    let backups_dir = lexical_publish_backups_dir(index_path);
    if !backups_dir.exists() {
        return Ok(());
    }
    let mut entries: Vec<(PathBuf, SystemTime)> = Vec::new();
    for entry in fs::read_dir(&backups_dir).with_context(|| {
        format!(
            "reading retained lexical-publish backups dir for pruning {}",
            backups_dir.display()
        )
    })? {
        let entry = match entry {
            Ok(e) => e,
            Err(err) => {
                tracing::debug!(
                    error = %err,
                    backups_dir = %backups_dir.display(),
                    "skipping unreadable entry while enumerating retained lexical-publish backups"
                );
                continue;
            }
        };
        let path = entry.path();
        let metadata = match entry.metadata() {
            Ok(m) => m,
            Err(err) => {
                tracing::debug!(
                    error = %err,
                    path = %path.display(),
                    "skipping retained lexical-publish backup without readable metadata"
                );
                continue;
            }
        };
        // Entries without a usable mtime sort as the oldest and are therefore pruned first.
        let modified = metadata.modified().unwrap_or(SystemTime::UNIX_EPOCH);
        entries.push((path, modified));
    }
    if entries.len() <= retention_limit {
        return Ok(());
    }
    // Newest first; the path tie-break keeps the order deterministic for equal mtimes.
    entries.sort_by(|a, b| b.1.cmp(&a.1).then_with(|| a.0.cmp(&b.0)));
    let mut pruned_count = 0usize;
    let mut freed_bytes: u64 = 0;
    for (path, _modified) in entries.into_iter().skip(retention_limit) {
        // Size is only reported for pruned backups, so measure it here rather than walking
        // the directory tree of every backup (including the ones that are kept).
        let size = directory_size_bytes_best_effort(&path);
        match fs::remove_dir_all(&path) {
            Ok(()) => {
                pruned_count = pruned_count.saturating_add(1);
                freed_bytes = freed_bytes.saturating_add(size);
                tracing::info!(
                    pruned_backup_path = %path.display(),
                    freed_bytes = size,
                    retention_limit,
                    "pruned a retained lexical-publish backup beyond retention cap"
                );
            }
            Err(err) => {
                tracing::warn!(
                    error = %err,
                    pruned_backup_path = %path.display(),
                    "failed to prune retained lexical-publish backup; leaving on disk"
                );
            }
        }
    }
    if pruned_count > 0 {
        tracing::info!(
            pruned_count,
            freed_bytes,
            retention_limit,
            backups_dir = %backups_dir.display(),
            "completed lexical-publish retention prune"
        );
    }
    Ok(())
}
/// Sums the sizes of all non-directory entries found beneath `path`.
///
/// The walk is iterative (explicit stack) and best effort: directories that cannot be
/// read and entries whose metadata cannot be fetched are silently skipped, and the total
/// saturates instead of overflowing. Returns 0 when `path` itself cannot be read as a
/// directory.
fn directory_size_bytes_best_effort(path: &Path) -> u64 {
    let mut total: u64 = 0;
    let mut stack: Vec<PathBuf> = vec![path.to_path_buf()];
    while let Some(current) = stack.pop() {
        let Ok(iter) = fs::read_dir(&current) else {
            continue;
        };
        for entry in iter.flatten() {
            let entry_path = entry.path();
            match entry.metadata() {
                // Descend into subdirectories later; only non-directories add to the total.
                Ok(m) if m.is_dir() => stack.push(entry_path),
                Ok(m) => total = total.saturating_add(m.len()),
                Err(_) => continue,
            }
        }
    }
    total
}
/// Location of the retained-backups directory for the index at `index_path`:
/// `.lexical-publish-backups` inside the index's parent directory, or inside `index_path`
/// itself when it has no parent.
fn lexical_publish_backups_dir(index_path: &Path) -> PathBuf {
    let base_dir = match index_path.parent() {
        Some(parent) => parent,
        None => index_path,
    };
    base_dir.join(".lexical-publish-backups")
}
/// Sidecar path next to `index_path` where the prior live index is parked while a publish
/// is in flight: `.<file name>.publish-in-progress.bak`.
///
/// Falls back to the name `index` when `index_path` has no final component or that
/// component is not valid UTF-8.
fn lexical_publish_in_progress_backup_path(index_path: &Path) -> PathBuf {
    let file_name = match index_path.file_name().and_then(std::ffi::OsStr::to_str) {
        Some(name) => name,
        None => "index",
    };
    let sidecar_name = format!(".{file_name}.publish-in-progress.bak");
    index_path.with_file_name(sidecar_name)
}
/// Builds a retained-backup path for the index at `index_path`.
///
/// The seed is `<backups dir>/<index file name>` (file name defaults to `index` when it is
/// missing or not UTF-8); `unique_atomic_sidecar_path` derives the final path from it.
fn unique_lexical_publish_backup_path(index_path: &Path) -> PathBuf {
    let file_name = match index_path.file_name().and_then(|name| name.to_str()) {
        Some(name) => name,
        None => "index",
    };
    let backups_dir = lexical_publish_backups_dir(index_path);
    let backup_seed = backups_dir.join(file_name);
    unique_atomic_sidecar_path(&backup_seed, "published-bak", "index")
}
fn ensure_lexical_publish_backups_dir(index_path: &Path) -> Result<()> {
let backups_dir = lexical_publish_backups_dir(index_path);
fs::create_dir_all(&backups_dir).with_context(|| {
format!(
"creating retained lexical publish backup directory {}",
backups_dir.display()
)
})?;
sync_parent_directory(&backups_dir)?;
Ok(())
}
/// Cleans up after a publish that was interrupted while the prior live index was parked
/// at the in-progress sidecar path.
///
/// * No sidecar present: nothing to do.
/// * Sidecar and live index both present: the sidecar is moved into retained-backup storage.
/// * Sidecar present but no live index: the sidecar is renamed back onto the live path.
///
/// # Errors
/// Returns an error if a required directory cannot be created, a rename fails, or a
/// parent-directory sync fails.
fn recover_or_finalize_interrupted_lexical_publish_backup(index_path: &Path) -> Result<()> {
    let parked_path = lexical_publish_in_progress_backup_path(index_path);
    if !parked_path.exists() {
        return Ok(());
    }

    if index_path.exists() {
        // A live index already exists, so the parked copy is only kept as a backup.
        ensure_lexical_publish_backups_dir(index_path)?;
        let retained_path = unique_lexical_publish_backup_path(index_path);
        rename_lexical_publish_path(
            &parked_path,
            &retained_path,
            LexicalPublishRenameSite::RecoverRetainStaleInProgress,
        )
        .with_context(|| {
            format!(
                "moving stale in-progress lexical publish backup {} into retained backup storage {}",
                parked_path.display(),
                retained_path.display()
            )
        })?;
        sync_parent_directory(&retained_path)?;
        tracing::info!(
            backup_path = %retained_path.display(),
            live_index_path = %index_path.display(),
            "retained a stale in-progress lexical publish backup because the live index was already present"
        );
    } else {
        // No live index: put the parked copy back onto the live path.
        if let Some(parent) = index_path.parent() {
            fs::create_dir_all(parent).with_context(|| {
                format!(
                    "creating parent directory for interrupted lexical publish recovery {}",
                    parent.display()
                )
            })?;
        }
        rename_lexical_publish_path(
            &parked_path,
            index_path,
            LexicalPublishRenameSite::RecoverRestorePriorLive,
        )
        .with_context(|| {
            format!(
                "restoring prior published lexical index from interrupted publish backup {} -> {}",
                parked_path.display(),
                index_path.display()
            )
        })?;
        sync_parent_directory(index_path)?;
        tracing::warn!(
            recovered_backup_path = %index_path.display(),
            "restored the prior published lexical index from an interrupted staged publish backup"
        );
    }
    Ok(())
}
/// Converts `path` into a NUL-terminated C string for the Linux exchange syscall.
///
/// # Errors
/// Returns an error if the path contains an interior NUL byte.
#[cfg(target_os = "linux")]
fn path_to_cstring(path: &Path) -> Result<CString> {
    let raw_bytes = path.as_os_str().as_bytes();
    match CString::new(raw_bytes) {
        Ok(encoded) => Ok(encoded),
        Err(nul_err) => Err(nul_err).with_context(|| {
            format!(
                "encoding filesystem path for Linux atomic lexical publish swap: {}",
                path.display()
            )
        }),
    }
}
#[cfg(target_os = "linux")]
fn atomic_exchange_paths(left: &Path, right: &Path) -> Result<()> {
let left_c = path_to_cstring(left)?;
let right_c = path_to_cstring(right)?;
let result = unsafe {
linux_publish_swap::renameat2(
linux_publish_swap::AT_FDCWD,
left_c.as_ptr(),
linux_publish_swap::AT_FDCWD,
right_c.as_ptr(),
linux_publish_swap::RENAME_EXCHANGE,
)
};
if result == 0 {
return Ok(());
}
Err(std::io::Error::last_os_error()).with_context(|| {
format!(
"atomically exchanging lexical publish paths {} <-> {}",
left.display(),
right.display()
)
})
}
/// Outcome of finalizing a staged lexical rebuild: which path should be published and the
/// document / segment counts reported for it.
struct LexicalRebuildFinalMergeArtifact {
/// Path of the artifact to publish (either the single input artifact or the output path).
publish_path: PathBuf,
/// Document count taken from the single input artifact or from the publish summary.
docs: usize,
/// Segment count taken from the same source as `docs`.
#[allow(dead_code)]
segments: usize,
}
#[cfg_attr(not(test), allow(dead_code))]
fn finalize_staged_lexical_rebuild_publish_artifact(
output_path: &Path,
input_paths: &[PathBuf],
stage_root: &Path,
max_parallel_jobs: usize,
) -> Result<LexicalRebuildFinalMergeArtifact> {
let mut input_artifacts = Vec::with_capacity(input_paths.len());
for (idx, input_path) in input_paths.iter().enumerate() {
let summary =
crate::search::tantivy::searchable_index_summary(input_path)?.ok_or_else(|| {
anyhow::anyhow!(
"staged lexical rebuild artifact is not searchable: {}",
input_path.display()
)
})?;
input_artifacts.push(LexicalRebuildShardMergeArtifact {
first_shard_index: idx,
last_shard_index: idx,
index_path: input_path.clone(),
docs: summary.docs,
segments: summary.segments,
});
}
finalize_staged_lexical_rebuild_publish_artifact_from_artifacts(
output_path,
&input_artifacts,
stage_root,
max_parallel_jobs,
)
}
/// Chooses the artifact to publish for a staged lexical rebuild.
///
/// * No artifacts: error.
/// * Exactly one artifact: it is validated as searchable and reused as-is.
/// * Several artifacts: they are published together at `output_path` as a federated shard
///   set, and the summary returned by that publish supplies the counts.
///
/// `stage_root` and `max_parallel_jobs` are accepted but not used by this function.
///
/// # Errors
/// Returns an error when there are no inputs, when the single input fails validation, or
/// when the federated publish fails.
fn finalize_staged_lexical_rebuild_publish_artifact_from_artifacts(
    output_path: &Path,
    input_artifacts: &[LexicalRebuildShardMergeArtifact],
    stage_root: &Path,
    max_parallel_jobs: usize,
) -> Result<LexicalRebuildFinalMergeArtifact> {
    let _ = stage_root;
    let _ = max_parallel_jobs;

    match input_artifacts {
        [] => Err(anyhow::anyhow!(
            "cannot finalize staged lexical rebuild without at least one merged artifact"
        )),
        [sole_artifact] => {
            let publish_path = sole_artifact.index_path.clone();
            tracing::info!(
                publish_path = %publish_path.display(),
                "reusing already-final staged lexical rebuild artifact without redundant final merge"
            );
            crate::search::tantivy::validate_searchable_index_contract(&publish_path)
                .with_context(|| {
                    format!(
                        "single-input staged lexical rebuild artifact is not searchable: {}",
                        publish_path.display()
                    )
                })?;
            Ok(LexicalRebuildFinalMergeArtifact {
                publish_path,
                docs: sole_artifact.docs,
                segments: sole_artifact.segments,
            })
        }
        shard_set => {
            tracing::info!(
                publish_path = %output_path.display(),
                staged_artifacts = shard_set.len(),
                "publishing staged lexical rebuild as federated lexical shard set without final assembly collapse"
            );
            let mut publish_inputs = Vec::with_capacity(shard_set.len());
            for shard in shard_set {
                publish_inputs.push((
                    shard.index_path.clone(),
                    SearchableIndexSummary {
                        docs: shard.docs,
                        segments: shard.segments,
                    },
                ));
            }
            let summary =
                crate::search::tantivy::publish_federated_searchable_index_directories_with_summaries(
                    output_path,
                    &publish_inputs,
                )?;
            Ok(LexicalRebuildFinalMergeArtifact {
                publish_path: output_path.to_path_buf(),
                docs: summary.docs,
                segments: summary.segments,
            })
        }
    }
}
/// Merges staged lexical shard indices into a single index at `output_path`.
///
/// With at most `MERGE_FAN_IN` (4) inputs a single direct merge is performed. Otherwise
/// inputs are merged in rounds of up to 4 per job: intermediate rounds write to
/// `stage_root/round-NNNNN/merge-NNNNN`, and the round whose input count is at most 4
/// writes to `output_path`. Jobs within a round run on a dedicated rayon pool when
/// `max_parallel_jobs > 1`, sequentially otherwise. After the last round the index at
/// `output_path` is opened and returned.
///
/// # Errors
/// Returns an error for empty input, or when building the thread pool, creating a round
/// directory, running a merge job, or opening the final index fails.
#[cfg_attr(not(test), allow(dead_code))]
fn merge_lexical_rebuild_shard_index_tree(
    output_path: &Path,
    input_paths: &[PathBuf],
    stage_root: &Path,
    max_parallel_jobs: usize,
) -> Result<crate::search::tantivy::TantivyIndex> {
    const MERGE_FAN_IN: usize = 4;

    /// Runs one merge job and yields the path it wrote to.
    fn run_merge_job(job: LexicalRebuildShardMergeWork) -> Result<PathBuf> {
        crate::search::tantivy::TantivyIndex::merge_compatible_index_directories(
            &job.output_path,
            &job.input_paths,
        )
        .with_context(|| {
            format!(
                "merging {} staged lexical shard indices into {}",
                job.input_paths.len(),
                job.output_path.display()
            )
        })?;
        Ok(job.output_path)
    }

    match input_paths.len() {
        0 => {
            return Err(anyhow::anyhow!(
                "cannot merge staged lexical rebuild shards without at least one shard index"
            ));
        }
        shard_count if shard_count <= MERGE_FAN_IN => {
            return crate::search::tantivy::TantivyIndex::merge_compatible_index_directories(
                output_path,
                input_paths,
            );
        }
        _ => {}
    }

    // Built once, before the first round, and reused for every round.
    let merge_pool = (max_parallel_jobs > 1)
        .then(|| {
            ThreadPoolBuilder::new()
                .num_threads(max_parallel_jobs)
                .thread_name(|idx| format!("cass-lexical-merge-{idx}"))
                .build()
                .context("building staged lexical merge thread pool")
        })
        .transpose()?;

    let mut pending_paths = input_paths.to_vec();
    let mut round = 0usize;
    while pending_paths.len() > 1 {
        let round_dir = stage_root.join(format!("round-{round:05}"));
        fs::create_dir_all(&round_dir).with_context(|| {
            format!(
                "creating staged lexical merge round directory {}",
                round_dir.display()
            )
        })?;

        // Once everything fits into a single job, that job writes the final output.
        let final_round = pending_paths.len() <= MERGE_FAN_IN;
        let mut round_jobs = Vec::new();
        for (group_idx, group_inputs) in pending_paths.chunks(MERGE_FAN_IN).enumerate() {
            let job_output = if final_round {
                output_path.to_path_buf()
            } else {
                round_dir.join(format!("merge-{group_idx:05}"))
            };
            round_jobs.push(LexicalRebuildShardMergeWork {
                output_path: job_output,
                input_paths: group_inputs.to_vec(),
            });
        }

        tracing::info!(
            merge_round = round,
            merge_jobs = round_jobs.len(),
            merge_inputs = pending_paths.len(),
            merge_fan_in = MERGE_FAN_IN,
            merge_workers = max_parallel_jobs.max(1),
            "running staged lexical rebuild merge round"
        );

        pending_paths = match &merge_pool {
            Some(pool) => pool.install(|| {
                round_jobs
                    .into_par_iter()
                    .map(run_merge_job)
                    .collect::<Result<Vec<_>>>()
            })?,
            None => round_jobs
                .into_iter()
                .map(run_merge_job)
                .collect::<Result<Vec<_>>>()?,
        };
        round = round.saturating_add(1);
    }

    crate::search::tantivy::TantivyIndex::open_or_create(output_path)
}
#[allow(clippy::too_many_arguments)]
fn rebuild_tantivy_from_db_via_staged_shards(
db_path: &Path,
index_path: &Path,
total_conversations: usize,
progress: Option<Arc<IndexingProgress>>,
options: LexicalRebuildStartupOptions,
prep_profile_started: Option<Instant>,
storage: FrankenStorage,
mut rebuild_state: LexicalRebuildState,
shard_plan: LexicalShardPlan,
pipeline_settings: LexicalRebuildPipelineSettingsSnapshot,
progress_heartbeat_interval_conversations: usize,
progress_heartbeat_interval: Duration,
mut latest_pipeline_runtime: LexicalRebuildPipelineRuntimeSnapshot,
mut responsiveness_controller: LexicalRebuildResponsivenessController,
pipeline_budget_controller: Arc<LexicalRebuildPipelineBudgetController>,
mut current_batch_conversation_limit: usize,
lexical_rebuild_flow_limiter: Arc<StreamingByteLimiter>,
producer_telemetry: Arc<LexicalRebuildProducerTelemetry>,
lexical_rebuild_worker_pool: Option<Arc<ThreadPool>>,
pipeline_rx: Receiver<LexicalRebuildPipelineMessage>,
producer_handle: JoinHandle<()>,
mut perf_profile: Option<LexicalRebuildPerfProfile>,
rebuild_profile_started: Option<Instant>,
progress_bump: Option<Arc<AtomicI64>>,
) -> Result<LexicalRebuildOutcome> {
let persist_initial_checkpoint_started = Instant::now();
persist_lexical_rebuild_state_for_active_run_start(index_path, &rebuild_state)?;
bump_index_run_lock_progress_if_present(progress_bump.as_ref());
log_lexical_rebuild_prep_profile_step(
prep_profile_started,
persist_initial_checkpoint_started,
"persist_initial_checkpoint",
);
if let Some(p) = &progress {
p.phase.store(2, Ordering::Relaxed);
p.is_rebuilding.store(true, Ordering::Relaxed);
p.total.store(total_conversations, Ordering::Relaxed);
p.current
.store(rebuild_state.processed_conversations, Ordering::Relaxed);
p.discovered_agents.store(0, Ordering::Relaxed);
}
let lexical_rebuild_started = Instant::now();
let stage_parent = index_path.parent().unwrap_or(index_path);
let shard_stage_root = TempDirBuilder::new()
.prefix("cass-lexical-shards.")
.tempdir_in(stage_parent)
.with_context(|| {
format!(
"creating staged lexical shard-build root under {}",
stage_parent.display()
)
})?;
let merge_stage_root = TempDirBuilder::new()
.prefix("cass-lexical-merge.")
.tempdir_in(stage_parent)
.with_context(|| {
format!(
"creating staged lexical merge root under {}",
stage_parent.display()
)
})?;
let eager_merge_stage_root = merge_stage_root.path().join("eager");
let final_merge_stage_root = merge_stage_root.path().join("final");
let staged_merged_index_path = merge_stage_root.path().join("index");
let staged_publish_base_meta_fingerprint = index_meta_fingerprint(index_path)?;
let shard_builder_settings =
lexical_rebuild_staged_shard_builder_settings(&pipeline_settings, shard_plan.shards.len());
let shard_merge_settings =
lexical_rebuild_staged_shard_merge_settings(&pipeline_settings, shard_plan.shards.len());
tracing::info!(
db_path = %db_path.display(),
planned_shards = shard_plan.shards.len(),
plan_id = %shard_plan.plan_id,
shard_builders_max = shard_builder_settings.max_builders,
shard_builder_writer_parallelism_budget =
shard_builder_settings.writer_parallelism_budget,
eager_merge_workers = shard_merge_settings.workers,
"running fresh authoritative lexical rebuild via staged shard-build path"
);
let (shard_work_tx, shard_work_rx) =
bounded::<LexicalRebuildShardBuildWork>(shard_builder_settings.max_builders.max(1));
let (shard_result_tx, shard_result_rx) = bounded::<LexicalRebuildShardBuildMessage>(
shard_plan
.shards
.len()
.max(shard_builder_settings.max_builders)
.max(1),
);
let shard_builder_handles = spawn_lexical_rebuild_shard_builder_workers(
shard_builder_settings.max_builders,
shard_work_rx,
shard_result_tx,
Arc::clone(&lexical_rebuild_flow_limiter),
lexical_rebuild_worker_pool.clone(),
);
let shard_work_dispatch_tx = shard_work_tx.clone();
let (merge_work_tx, merge_work_rx) =
bounded::<LexicalRebuildShardMergeJob>(shard_merge_settings.workers.max(1));
let (merge_result_tx, merge_result_rx) =
bounded::<LexicalRebuildShardMergeMessage>(shard_plan.shards.len().max(1));
let merge_worker_handles = spawn_lexical_rebuild_shard_merge_workers(
shard_merge_settings.workers,
merge_work_rx,
merge_result_tx,
);
let mut indexed_docs = rebuild_state.indexed_docs;
let mut observed_messages = rebuild_state.indexed_docs;
let mut processed_conversations = rebuild_state.processed_conversations;
let mut last_processed_conversation_id = rebuild_state.committed_conversation_id;
let mut conversations_since_progress_persist = 0usize;
let mut last_progress_persist = Instant::now();
let mut current_shard_packets = Vec::new();
let mut current_shard_index = None;
let mut current_shard_message_bytes = 0usize;
let mut enqueued_shards = 0usize;
let mut received_shard_results = 0usize;
let mut active_shard_build_jobs = 0usize;
let mut next_shard_to_commit = 0usize;
let mut pending_completed_shards = BTreeMap::new();
let mut completed_shard_artifacts = Vec::with_capacity(shard_plan.shards.len());
let mut scheduled_completed_shard_artifacts = 0usize;
let mut pending_shard_build_jobs: VecDeque<LexicalRebuildShardBuildWork> = VecDeque::new();
let mut pending_shard_build_message_bytes = 0usize;
let pending_shard_build_max_message_bytes =
lexical_rebuild_pending_shard_build_max_message_bytes(&pipeline_settings);
let pending_shard_build_max_jobs =
lexical_rebuild_pending_shard_build_max_jobs(&pipeline_settings);
let mut merge_coordinator = LexicalRebuildShardMergeCoordinator::new(eager_merge_stage_root);
let staged_merge_controller = LexicalRebuildStagedMergeController::new(
shard_merge_settings.workers,
pipeline_settings.controller_loadavg_high_watermark_1m_milli,
);
let staged_shard_build_controller = LexicalRebuildStagedShardBuildController::new(
shard_builder_settings.max_builders,
pipeline_settings.controller_loadavg_high_watermark_1m_milli,
);
let shard_build_telemetry = LexicalRebuildShardBuildTelemetry::default();
let mut max_conversation_id = 0i64;
let mut max_message_id = 0i64;
let mut final_merge_input_artifacts: Option<Vec<LexicalRebuildShardMergeArtifact>> = None;
let mut equivalence_accumulator = LexicalRebuildEquivalenceAccumulator::new();
let refresh_runtime =
|latest_pipeline_runtime: &mut LexicalRebuildPipelineRuntimeSnapshot,
responsiveness_controller: &mut LexicalRebuildResponsivenessController,
current_batch_conversation_limit: &mut usize,
pending_batch_conversations: usize,
pending_batch_message_bytes: usize| {
refresh_and_maybe_apply_lexical_rebuild_pipeline_runtime(
latest_pipeline_runtime,
progress.as_ref(),
lexical_rebuild_flow_limiter.as_ref(),
Some(producer_telemetry.as_ref()),
responsiveness_controller,
pipeline_budget_controller.as_ref(),
current_batch_conversation_limit,
None,
LexicalRebuildPipelineSinkRuntimeSnapshot::new(
pipeline_rx.len(),
pending_batch_conversations,
pending_batch_message_bytes,
),
);
};
let refresh_staged_parallelism_and_dispatch =
|latest_pipeline_runtime: &mut LexicalRebuildPipelineRuntimeSnapshot,
merge_coordinator: &mut LexicalRebuildShardMergeCoordinator,
pending_shard_build_jobs: &mut VecDeque<LexicalRebuildShardBuildWork>,
pending_shard_build_message_bytes: &mut usize,
active_shard_build_jobs: &mut usize,
enqueued_shards: &mut usize,
producer_finished: bool|
-> Result<()> {
let staged_merge_runtime = staged_merge_controller.decide(
producer_finished,
latest_pipeline_runtime,
merge_coordinator,
);
merge_coordinator.set_allowed_pending_merge_jobs(
staged_merge_runtime.allowed_jobs,
&merge_work_tx,
)?;
let applied_runtime = staged_merge_controller.decide(
producer_finished,
latest_pipeline_runtime,
merge_coordinator,
);
apply_staged_merge_runtime_snapshot(
latest_pipeline_runtime,
progress.as_ref(),
&applied_runtime,
);
apply_staged_shard_build_telemetry_snapshot(
latest_pipeline_runtime,
progress.as_ref(),
staged_shard_build_controller.memory_reserve_bytes,
staged_shard_build_controller.emergency_memory_reserve_bytes,
&shard_build_telemetry,
);
let staged_shard_build_runtime = staged_shard_build_controller.decide(
latest_pipeline_runtime,
&applied_runtime,
*active_shard_build_jobs,
pending_shard_build_jobs.len(),
pending_shard_build_jobs
.front()
.map(|work| work.shard.message_bytes),
);
while *active_shard_build_jobs < staged_shard_build_runtime.allowed_jobs {
let Some(mut work) = pending_shard_build_jobs.pop_front() else {
break;
};
*pending_shard_build_message_bytes =
pending_shard_build_message_bytes.saturating_sub(work.message_bytes);
let dispatch_slot_index = *active_shard_build_jobs;
work.writer_parallelism =
lexical_rebuild_staged_shard_builder_writer_parallelism_for_dispatch(
shard_builder_settings.writer_parallelism_budget,
staged_shard_build_runtime.allowed_jobs,
dispatch_slot_index,
);
shard_work_dispatch_tx.send(work).map_err(|_| {
anyhow::anyhow!(
"staged lexical rebuild shard builder queue disconnected before shard enqueue"
)
})?;
*active_shard_build_jobs = active_shard_build_jobs.saturating_add(1);
*enqueued_shards = enqueued_shards.saturating_add(1);
}
let applied_shard_build_runtime = staged_shard_build_controller.decide(
latest_pipeline_runtime,
&applied_runtime,
*active_shard_build_jobs,
pending_shard_build_jobs.len(),
pending_shard_build_jobs
.front()
.map(|work| work.shard.message_bytes),
);
apply_staged_shard_build_runtime_snapshot(
latest_pipeline_runtime,
progress.as_ref(),
&applied_shard_build_runtime,
);
apply_staged_shard_build_telemetry_snapshot(
latest_pipeline_runtime,
progress.as_ref(),
staged_shard_build_controller.memory_reserve_bytes,
staged_shard_build_controller.emergency_memory_reserve_bytes,
&shard_build_telemetry,
);
Ok(())
};
let initial_staged_merge_runtime =
staged_merge_controller.decide(false, &latest_pipeline_runtime, &merge_coordinator);
apply_staged_merge_runtime_snapshot(
&mut latest_pipeline_runtime,
progress.as_ref(),
&initial_staged_merge_runtime,
);
let initial_staged_shard_build_runtime = staged_shard_build_controller.decide(
&latest_pipeline_runtime,
&initial_staged_merge_runtime,
active_shard_build_jobs,
pending_shard_build_jobs.len(),
pending_shard_build_jobs
.front()
.map(|work| work.shard.message_bytes),
);
apply_staged_shard_build_runtime_snapshot(
&mut latest_pipeline_runtime,
progress.as_ref(),
&initial_staged_shard_build_runtime,
);
apply_staged_shard_build_telemetry_snapshot(
&mut latest_pipeline_runtime,
progress.as_ref(),
staged_shard_build_controller.memory_reserve_bytes,
staged_shard_build_controller.emergency_memory_reserve_bytes,
&shard_build_telemetry,
);
let advance_completed_shards =
|pending_completed_shards: &mut BTreeMap<usize, LexicalRebuildShardBuildResult>,
next_shard_to_commit: &mut usize,
completed_shard_artifacts: &mut Vec<LexicalRebuildShardMergeArtifact>,
processed_conversations: &mut usize,
indexed_docs: &mut usize,
observed_messages: &mut usize,
last_processed_conversation_id: &mut Option<i64>,
conversations_since_progress_persist: &mut usize,
responsiveness_controller: &mut LexicalRebuildResponsivenessController,
current_batch_conversation_limit: &mut usize|
-> Result<bool> {
let mut force_progress_persist = false;
while let Some(result) = pending_completed_shards.remove(next_shard_to_commit) {
let validated_artifact = validate_lexical_rebuild_shard_build_result(&result)?;
*processed_conversations =
processed_conversations.saturating_add(result.shard.conversation_count);
*indexed_docs = indexed_docs.saturating_add(result.indexed_docs);
let shard_observed_messages =
if lexical_shard_message_count_is_known(result.shard.message_count) {
result.shard.message_count
} else {
result.indexed_docs
};
*observed_messages = observed_messages.saturating_add(shard_observed_messages);
*last_processed_conversation_id = Some(result.shard.last_conversation_id);
completed_shard_artifacts.push(validated_artifact);
*conversations_since_progress_persist = conversations_since_progress_persist
.saturating_add(result.shard.conversation_count);
if let Some(p) = &progress {
p.current.store(*processed_conversations, Ordering::Relaxed);
}
bump_index_run_lock_progress_if_present(progress_bump.as_ref());
if let Some(transition) = responsiveness_controller.record_first_durable_commit() {
apply_lexical_rebuild_budget_transition(
transition,
lexical_rebuild_flow_limiter.as_ref(),
pipeline_budget_controller.as_ref(),
current_batch_conversation_limit,
None,
);
force_progress_persist = true;
}
*next_shard_to_commit = next_shard_to_commit.saturating_add(1);
}
Ok(force_progress_persist)
};
// Hands every not-yet-scheduled completed shard artifact to the merge
// coordinator.
//
// `scheduled_completed_shard_artifacts` is the count of leading entries of
// `completed_shard_artifacts` that were already queued. It is advanced by one
// after each successful `queue_base_artifact` call, so if queuing fails the
// counter still points at the artifact that was not accepted.
let queue_newly_completed_shard_artifacts =
    |merge_coordinator: &mut LexicalRebuildShardMergeCoordinator,
     completed_shard_artifacts: &[LexicalRebuildShardMergeArtifact],
     scheduled_completed_shard_artifacts: &mut usize|
     -> Result<()> {
        let already_scheduled = *scheduled_completed_shard_artifacts;
        for artifact in completed_shard_artifacts.iter().skip(already_scheduled) {
            merge_coordinator.queue_base_artifact(artifact.clone(), &merge_work_tx)?;
            *scheduled_completed_shard_artifacts =
                (*scheduled_completed_shard_artifacts).saturating_add(1);
        }
        Ok(())
    };
let main_result: Result<()> = (|| {
let never_shard_result_rx = never::<LexicalRebuildShardBuildMessage>();
let never_merge_result_rx = never::<LexicalRebuildShardMergeMessage>();
let never_pipeline_rx = never::<LexicalRebuildPipelineMessage>();
let mut shard_result_channel_open = true;
let mut merge_result_channel_open = true;
let mut producer_finished = false;
let mut pipeline_backlog_pause_logged = false;
while !producer_finished {
let active_shard_result_rx = if shard_result_channel_open {
&shard_result_rx
} else {
&never_shard_result_rx
};
let active_merge_result_rx = if merge_result_channel_open {
&merge_result_rx
} else {
&never_merge_result_rx
};
let pipeline_backlog_paused = pending_shard_build_message_bytes
>= pending_shard_build_max_message_bytes
|| pending_shard_build_jobs.len() >= pending_shard_build_max_jobs;
if pipeline_backlog_paused && !pipeline_backlog_pause_logged {
tracing::info!(
pending_shard_build_jobs = pending_shard_build_jobs.len(),
pending_shard_build_max_jobs,
pending_shard_build_message_bytes,
pending_shard_build_max_message_bytes,
active_shard_build_jobs,
"pausing lexical rebuild producer handoff while staged shard-build backlog drains"
);
pipeline_backlog_pause_logged = true;
} else if !pipeline_backlog_paused && pipeline_backlog_pause_logged {
tracing::info!(
pending_shard_build_jobs = pending_shard_build_jobs.len(),
pending_shard_build_max_jobs,
pending_shard_build_message_bytes,
pending_shard_build_max_message_bytes,
active_shard_build_jobs,
"resuming lexical rebuild producer handoff after staged shard-build backlog drained"
);
pipeline_backlog_pause_logged = false;
}
let active_pipeline_rx = if pipeline_backlog_paused {
&never_pipeline_rx
} else {
&pipeline_rx
};
select! {
recv(active_shard_result_rx) -> message => {
match message {
Ok(LexicalRebuildShardBuildMessage::Built(result)) => {
active_shard_build_jobs = active_shard_build_jobs.saturating_sub(1);
received_shard_results = received_shard_results.saturating_add(1);
shard_build_telemetry.record(&result);
pending_completed_shards.insert(result.shard.shard_index, result);
}
Ok(LexicalRebuildShardBuildMessage::Error { shard_index, error }) => {
active_shard_build_jobs = active_shard_build_jobs.saturating_sub(1);
return Err(anyhow::anyhow!(
"building lexical rebuild shard {shard_index} failed: {error}"
));
}
Err(_) => {
if received_shard_results < enqueued_shards {
return Err(anyhow::anyhow!(
"staged lexical rebuild shard builder channel closed before all enqueued shards completed"
));
}
shard_result_channel_open = false;
continue;
}
}
let force_progress_persist = advance_completed_shards(
&mut pending_completed_shards,
&mut next_shard_to_commit,
&mut completed_shard_artifacts,
&mut processed_conversations,
&mut indexed_docs,
&mut observed_messages,
&mut last_processed_conversation_id,
&mut conversations_since_progress_persist,
&mut responsiveness_controller,
&mut current_batch_conversation_limit,
)?;
queue_newly_completed_shard_artifacts(
&mut merge_coordinator,
&completed_shard_artifacts,
&mut scheduled_completed_shard_artifacts,
)?;
refresh_runtime(
&mut latest_pipeline_runtime,
&mut responsiveness_controller,
&mut current_batch_conversation_limit,
current_shard_packets.len(),
current_shard_message_bytes,
);
refresh_staged_parallelism_and_dispatch(
&mut latest_pipeline_runtime,
&mut merge_coordinator,
&mut pending_shard_build_jobs,
&mut pending_shard_build_message_bytes,
&mut active_shard_build_jobs,
&mut enqueued_shards,
producer_finished,
)?;
maybe_persist_staged_lexical_rebuild_progress(
index_path,
&mut rebuild_state,
last_processed_conversation_id,
processed_conversations,
indexed_docs,
&latest_pipeline_runtime,
staged_publish_base_meta_fingerprint.as_deref(),
force_progress_persist,
&mut conversations_since_progress_persist,
progress_heartbeat_interval_conversations,
&mut last_progress_persist,
progress_heartbeat_interval,
progress_bump.as_ref(),
perf_profile.as_mut(),
)?;
}
recv(active_merge_result_rx) -> message => {
match message {
Ok(LexicalRebuildShardMergeMessage::Built(result)) => {
merge_coordinator.complete_merge(result, &merge_work_tx)?;
bump_index_run_lock_progress_if_present(progress_bump.as_ref());
}
Ok(LexicalRebuildShardMergeMessage::Error {
output_level,
first_shard_index,
last_shard_index,
error,
}) => {
return Err(anyhow::anyhow!(
"eager staged lexical merge at level {output_level} for shard range {first_shard_index}..={last_shard_index} failed: {error}"
));
}
Err(_) => {
if merge_coordinator.pending_merge_jobs() > 0 {
return Err(anyhow::anyhow!(
"staged lexical rebuild eager merge channel closed before all queued merges completed"
));
}
merge_result_channel_open = false;
continue;
}
}
refresh_runtime(
&mut latest_pipeline_runtime,
&mut responsiveness_controller,
&mut current_batch_conversation_limit,
current_shard_packets.len(),
current_shard_message_bytes,
);
refresh_staged_parallelism_and_dispatch(
&mut latest_pipeline_runtime,
&mut merge_coordinator,
&mut pending_shard_build_jobs,
&mut pending_shard_build_message_bytes,
&mut active_shard_build_jobs,
&mut enqueued_shards,
producer_finished,
)?;
maybe_persist_staged_lexical_rebuild_progress(
index_path,
&mut rebuild_state,
last_processed_conversation_id,
processed_conversations,
indexed_docs,
&latest_pipeline_runtime,
staged_publish_base_meta_fingerprint.as_deref(),
false,
&mut conversations_since_progress_persist,
progress_heartbeat_interval_conversations,
&mut last_progress_persist,
progress_heartbeat_interval,
progress_bump.as_ref(),
perf_profile.as_mut(),
)?;
}
recv(active_pipeline_rx) -> message => {
match message {
Ok(LexicalRebuildPipelineMessage::Batch(prepared_page)) => {
let mut page_reservation = StreamingByteReservation::new(
lexical_rebuild_flow_limiter.as_ref(),
lexical_rebuild_prepared_page_reserved_bytes(&prepared_page),
);
if let Some(profile) = perf_profile.as_mut() {
profile.conversation_list_duration +=
prepared_page.conversation_list_duration;
profile.message_stream_duration += prepared_page.message_fetch_duration;
profile.finish_conversation_duration +=
prepared_page.packet_prepare_duration;
}
if options.defer_initial_content_fingerprint {
max_conversation_id =
max_conversation_id.max(prepared_page.page_last_conversation_id);
}
let LexicalRebuildPreparedPage {
packets,
page_last_conversation_id: _page_last_conversation_id,
planned_shard_index,
finishes_planned_shard,
conversation_list_duration: _conversation_list_duration,
message_fetch_duration: _message_fetch_duration,
packet_prepare_duration: _packet_prepare_duration,
} = prepared_page;
let planned_shard_index = planned_shard_index.ok_or_else(|| {
anyhow::anyhow!(
"staged lexical rebuild received a prepared page without planned shard metadata"
)
})?;
if current_shard_index.is_none() {
current_shard_index = Some(planned_shard_index);
}
if current_shard_index != Some(planned_shard_index) {
return Err(anyhow::anyhow!(
"staged lexical rebuild crossed from shard {} into shard {} before closing the prior shard",
current_shard_index.unwrap_or(usize::MAX),
planned_shard_index
));
}
for mut packet in packets {
if options.defer_initial_content_fingerprint
&& let Some(last_message_id) = packet.last_message_id
{
max_message_id = max_message_id.max(last_message_id);
}
equivalence_accumulator.absorb_packet(&packet);
current_shard_message_bytes =
current_shard_message_bytes.saturating_add(packet.message_bytes);
packet.flow_reservation_bytes = 0;
current_shard_packets.push(packet);
}
bump_index_run_lock_progress_if_present(progress_bump.as_ref());
page_reservation.release_now();
refresh_runtime(
&mut latest_pipeline_runtime,
&mut responsiveness_controller,
&mut current_batch_conversation_limit,
current_shard_packets.len(),
current_shard_message_bytes,
);
refresh_staged_parallelism_and_dispatch(
&mut latest_pipeline_runtime,
&mut merge_coordinator,
&mut pending_shard_build_jobs,
&mut pending_shard_build_message_bytes,
&mut active_shard_build_jobs,
&mut enqueued_shards,
producer_finished,
)?;
if finishes_planned_shard {
let shard = shard_plan
.shards
.get(planned_shard_index)
.cloned()
.ok_or_else(|| {
anyhow::anyhow!(
"missing staged lexical rebuild shard metadata for shard {}",
planned_shard_index
)
})?;
let shard_message_bytes = current_shard_message_bytes;
let shard_packets = std::mem::take(&mut current_shard_packets);
current_shard_message_bytes = 0;
current_shard_index = None;
pending_shard_build_message_bytes =
pending_shard_build_message_bytes
.saturating_add(shard_message_bytes);
pending_shard_build_jobs.push_back(LexicalRebuildShardBuildWork {
shard,
packets: shard_packets,
message_bytes: shard_message_bytes,
shard_index_path: shard_stage_root
.path()
.join(format!("shard-{planned_shard_index:05}")),
writer_parallelism: 1,
});
refresh_staged_parallelism_and_dispatch(
&mut latest_pipeline_runtime,
&mut merge_coordinator,
&mut pending_shard_build_jobs,
&mut pending_shard_build_message_bytes,
&mut active_shard_build_jobs,
&mut enqueued_shards,
producer_finished,
)?;
}
maybe_persist_staged_lexical_rebuild_progress(
index_path,
&mut rebuild_state,
last_processed_conversation_id,
processed_conversations,
indexed_docs,
&latest_pipeline_runtime,
staged_publish_base_meta_fingerprint.as_deref(),
false,
&mut conversations_since_progress_persist,
progress_heartbeat_interval_conversations,
&mut last_progress_persist,
progress_heartbeat_interval,
progress_bump.as_ref(),
perf_profile.as_mut(),
)?;
}
Ok(LexicalRebuildPipelineMessage::Error(error)) => {
return Err(anyhow::anyhow!(error));
}
Ok(LexicalRebuildPipelineMessage::Done) => {
producer_finished = true;
refresh_runtime(
&mut latest_pipeline_runtime,
&mut responsiveness_controller,
&mut current_batch_conversation_limit,
current_shard_packets.len(),
current_shard_message_bytes,
);
refresh_staged_parallelism_and_dispatch(
&mut latest_pipeline_runtime,
&mut merge_coordinator,
&mut pending_shard_build_jobs,
&mut pending_shard_build_message_bytes,
&mut active_shard_build_jobs,
&mut enqueued_shards,
producer_finished,
)?;
maybe_persist_staged_lexical_rebuild_progress(
index_path,
&mut rebuild_state,
last_processed_conversation_id,
processed_conversations,
indexed_docs,
&latest_pipeline_runtime,
staged_publish_base_meta_fingerprint.as_deref(),
false,
&mut conversations_since_progress_persist,
progress_heartbeat_interval_conversations,
&mut last_progress_persist,
progress_heartbeat_interval,
progress_bump.as_ref(),
perf_profile.as_mut(),
)?;
}
Err(_) => {
return Err(anyhow::anyhow!(
"lexical rebuild pipeline channel closed before producer completion"
));
}
}
}
default(Duration::from_millis(250)) => {
refresh_runtime(
&mut latest_pipeline_runtime,
&mut responsiveness_controller,
&mut current_batch_conversation_limit,
current_shard_packets.len(),
current_shard_message_bytes,
);
refresh_staged_parallelism_and_dispatch(
&mut latest_pipeline_runtime,
&mut merge_coordinator,
&mut pending_shard_build_jobs,
&mut pending_shard_build_message_bytes,
&mut active_shard_build_jobs,
&mut enqueued_shards,
producer_finished,
)?;
}
}
}
if !current_shard_packets.is_empty() || current_shard_index.is_some() {
return Err(anyhow::anyhow!(
"staged lexical rebuild finished with an unterminated shard packet buffer"
));
}
drop(shard_work_tx);
while !pending_shard_build_jobs.is_empty()
|| received_shard_results < enqueued_shards
|| merge_coordinator.pending_merge_jobs() > 0
{
let active_shard_result_rx = if shard_result_channel_open {
&shard_result_rx
} else {
&never_shard_result_rx
};
let active_merge_result_rx = if merge_result_channel_open {
&merge_result_rx
} else {
&never_merge_result_rx
};
select! {
recv(active_shard_result_rx) -> message => {
match message {
Ok(LexicalRebuildShardBuildMessage::Built(result)) => {
active_shard_build_jobs = active_shard_build_jobs.saturating_sub(1);
received_shard_results = received_shard_results.saturating_add(1);
shard_build_telemetry.record(&result);
pending_completed_shards.insert(result.shard.shard_index, result);
}
Ok(LexicalRebuildShardBuildMessage::Error { shard_index, error }) => {
active_shard_build_jobs = active_shard_build_jobs.saturating_sub(1);
return Err(anyhow::anyhow!(
"building lexical rebuild shard {shard_index} failed: {error}"
));
}
Err(_) => {
if received_shard_results < enqueued_shards {
return Err(anyhow::anyhow!(
"staged lexical rebuild shard builder channel closed before all shards completed"
));
}
shard_result_channel_open = false;
continue;
}
}
let force_progress_persist = advance_completed_shards(
&mut pending_completed_shards,
&mut next_shard_to_commit,
&mut completed_shard_artifacts,
&mut processed_conversations,
&mut indexed_docs,
&mut observed_messages,
&mut last_processed_conversation_id,
&mut conversations_since_progress_persist,
&mut responsiveness_controller,
&mut current_batch_conversation_limit,
)?;
queue_newly_completed_shard_artifacts(
&mut merge_coordinator,
&completed_shard_artifacts,
&mut scheduled_completed_shard_artifacts,
)?;
refresh_runtime(
&mut latest_pipeline_runtime,
&mut responsiveness_controller,
&mut current_batch_conversation_limit,
current_shard_packets.len(),
current_shard_message_bytes,
);
refresh_staged_parallelism_and_dispatch(
&mut latest_pipeline_runtime,
&mut merge_coordinator,
&mut pending_shard_build_jobs,
&mut pending_shard_build_message_bytes,
&mut active_shard_build_jobs,
&mut enqueued_shards,
true,
)?;
maybe_persist_staged_lexical_rebuild_progress(
index_path,
&mut rebuild_state,
last_processed_conversation_id,
processed_conversations,
indexed_docs,
&latest_pipeline_runtime,
staged_publish_base_meta_fingerprint.as_deref(),
force_progress_persist,
&mut conversations_since_progress_persist,
progress_heartbeat_interval_conversations,
&mut last_progress_persist,
progress_heartbeat_interval,
progress_bump.as_ref(),
perf_profile.as_mut(),
)?;
}
recv(active_merge_result_rx) -> message => {
match message {
Ok(LexicalRebuildShardMergeMessage::Built(result)) => {
merge_coordinator.complete_merge(result, &merge_work_tx)?;
bump_index_run_lock_progress_if_present(progress_bump.as_ref());
}
Ok(LexicalRebuildShardMergeMessage::Error {
output_level,
first_shard_index,
last_shard_index,
error,
}) => {
return Err(anyhow::anyhow!(
"eager staged lexical merge at level {output_level} for shard range {first_shard_index}..={last_shard_index} failed: {error}"
));
}
Err(_) => {
if merge_coordinator.pending_merge_jobs() > 0 {
return Err(anyhow::anyhow!(
"staged lexical rebuild eager merge channel closed before all queued merges completed"
));
}
merge_result_channel_open = false;
continue;
}
}
refresh_runtime(
&mut latest_pipeline_runtime,
&mut responsiveness_controller,
&mut current_batch_conversation_limit,
current_shard_packets.len(),
current_shard_message_bytes,
);
refresh_staged_parallelism_and_dispatch(
&mut latest_pipeline_runtime,
&mut merge_coordinator,
&mut pending_shard_build_jobs,
&mut pending_shard_build_message_bytes,
&mut active_shard_build_jobs,
&mut enqueued_shards,
true,
)?;
maybe_persist_staged_lexical_rebuild_progress(
index_path,
&mut rebuild_state,
last_processed_conversation_id,
processed_conversations,
indexed_docs,
&latest_pipeline_runtime,
staged_publish_base_meta_fingerprint.as_deref(),
false,
&mut conversations_since_progress_persist,
progress_heartbeat_interval_conversations,
&mut last_progress_persist,
progress_heartbeat_interval,
progress_bump.as_ref(),
perf_profile.as_mut(),
)?;
}
default(Duration::from_millis(250)) => {
refresh_runtime(
&mut latest_pipeline_runtime,
&mut responsiveness_controller,
&mut current_batch_conversation_limit,
current_shard_packets.len(),
current_shard_message_bytes,
);
refresh_staged_parallelism_and_dispatch(
&mut latest_pipeline_runtime,
&mut merge_coordinator,
&mut pending_shard_build_jobs,
&mut pending_shard_build_message_bytes,
&mut active_shard_build_jobs,
&mut enqueued_shards,
true,
)?;
}
}
}
if next_shard_to_commit != shard_plan.shards.len() {
return Err(anyhow::anyhow!(
"staged lexical rebuild only committed {} of {} planned shards",
next_shard_to_commit,
shard_plan.shards.len()
));
}
validate_complete_lexical_rebuild_shard_artifacts(&shard_plan, &completed_shard_artifacts)?;
let mut reduced_final_merge_artifacts = merge_coordinator.final_merge_input_artifacts();
if should_reduce_staged_lexical_final_frontier(reduced_final_merge_artifacts.len()) {
reduced_final_merge_artifacts = reduce_staged_lexical_final_merge_frontier_via_workers(
reduced_final_merge_artifacts,
final_merge_stage_root.as_path(),
shard_merge_settings.workers,
&merge_work_tx,
&merge_result_rx,
)?;
}
final_merge_input_artifacts = Some(reduced_final_merge_artifacts);
Ok(())
})();
if main_result.is_err() {
lexical_rebuild_flow_limiter.close();
}
drop(pipeline_rx);
drop(shard_work_dispatch_tx);
drop(merge_work_tx);
match producer_handle.join() {
Ok(()) => {}
Err(payload) => {
let panic_message = panic_payload_message(payload);
if main_result.is_ok() {
return Err(anyhow::anyhow!(
"lexical rebuild packet producer panicked: {}",
panic_message
));
}
tracing::warn!(
error = %panic_message,
"lexical rebuild packet producer panicked while the staged shard consumer was already failing"
);
}
}
for handle in shard_builder_handles {
if let Err(payload) = handle.join() {
let panic_message = panic_payload_message(payload);
if main_result.is_ok() {
return Err(anyhow::anyhow!(
"staged lexical rebuild shard builder panicked: {}",
panic_message
));
}
tracing::warn!(
error = %panic_message,
"staged lexical rebuild shard builder panicked while the consumer was already failing"
);
}
}
for handle in merge_worker_handles {
if let Err(payload) = handle.join() {
let panic_message = panic_payload_message(payload);
if main_result.is_ok() {
return Err(anyhow::anyhow!(
"staged lexical rebuild eager merge worker panicked: {}",
panic_message
));
}
tracing::warn!(
error = %panic_message,
"staged lexical rebuild eager merge worker panicked while the consumer was already failing"
);
}
}
main_result?;
refresh_and_maybe_apply_lexical_rebuild_pipeline_runtime(
&mut latest_pipeline_runtime,
progress.as_ref(),
lexical_rebuild_flow_limiter.as_ref(),
Some(producer_telemetry.as_ref()),
&mut responsiveness_controller,
pipeline_budget_controller.as_ref(),
&mut current_batch_conversation_limit,
None,
LexicalRebuildPipelineSinkRuntimeSnapshot::new(0, 0, 0),
);
maybe_persist_staged_lexical_rebuild_progress(
index_path,
&mut rebuild_state,
last_processed_conversation_id,
processed_conversations,
indexed_docs,
&latest_pipeline_runtime,
staged_publish_base_meta_fingerprint.as_deref(),
true,
&mut conversations_since_progress_persist,
progress_heartbeat_interval_conversations,
&mut last_progress_persist,
progress_heartbeat_interval,
progress_bump.as_ref(),
perf_profile.as_mut(),
)?;
let final_merge_inputs = final_merge_input_artifacts
.unwrap_or_else(|| merge_coordinator.final_merge_input_artifacts());
let final_merge_artifact = finalize_staged_lexical_rebuild_publish_artifact_from_artifacts(
&staged_merged_index_path,
&final_merge_inputs,
&final_merge_stage_root,
shard_merge_settings.workers,
)?;
let merged_docs = final_merge_artifact.docs;
if merged_docs != indexed_docs {
return Err(anyhow::anyhow!(
"staged lexical rebuild merged {} docs but durable shard builds produced {} docs",
merged_docs,
indexed_docs
));
}
let final_storage_fingerprint = if options.defer_initial_content_fingerprint {
lexical_rebuild_content_fingerprint_value(
total_conversations,
max_conversation_id,
max_message_id,
)
} else {
rebuild_state.db.storage_fingerprint.clone()
};
let final_observed_messages = observed_messages.max(indexed_docs);
let lexical_rebuild_duration = lexical_rebuild_started.elapsed();
let publish_started = Instant::now();
let equivalence_evidence = equivalence_accumulator.finalize();
let generation_manifest = persist_lexical_rebuild_generation_artifacts(
&final_merge_artifact.publish_path,
&final_storage_fingerprint,
processed_conversations,
total_conversations,
final_observed_messages,
indexed_docs,
&equivalence_evidence,
)?;
let staged_published_meta_fingerprint =
index_meta_fingerprint(&final_merge_artifact.publish_path)?;
publish_staged_lexical_index(&final_merge_artifact.publish_path, index_path)?;
log_lexical_generation_manifest_published(&generation_manifest, &equivalence_evidence);
crate::search::tantivy::validate_searchable_index_contract(index_path).with_context(|| {
format!(
"validating staged lexical rebuild after publish: {}",
index_path.display()
)
})?;
if let Some(observed_tantivy_docs) = live_tantivy_doc_count(index_path)?
&& observed_tantivy_docs != indexed_docs
{
return Err(anyhow::anyhow!(
"staged lexical rebuild published {} docs but a fresh Tantivy reader only sees {}",
indexed_docs,
observed_tantivy_docs
));
}
let refresh_ledger =
build_authoritative_lexical_refresh_ledger(AuthoritativeLexicalRefreshLedgerInput {
publish_mode: "atomic_staged_swap",
lexical_duration: lexical_rebuild_duration,
publish_duration: publish_started.elapsed(),
processed_conversations,
total_conversations,
final_observed_messages,
indexed_docs,
equivalence_evidence: &equivalence_evidence,
});
persist_lexical_refresh_ledger(index_path, &refresh_ledger)?;
log_lexical_refresh_ledger_published(&refresh_ledger);
storage.close_without_checkpoint().with_context(|| {
format!(
"closing readonly database after staged Tantivy rebuild without checkpoint: {}",
db_path.display()
)
})?;
rebuild_state.db.storage_fingerprint = final_storage_fingerprint;
rebuild_state.db.total_messages = final_observed_messages;
rebuild_state.committed_offset = i64::try_from(total_conversations).unwrap_or(i64::MAX);
rebuild_state.committed_conversation_id = last_processed_conversation_id;
rebuild_state.processed_conversations = processed_conversations;
rebuild_state.indexed_docs = indexed_docs;
rebuild_state.mark_completed(staged_published_meta_fingerprint);
persist_lexical_rebuild_state(index_path, &rebuild_state)?;
if let Some(p) = &progress {
p.phase.store(0, Ordering::Relaxed);
p.is_rebuilding.store(false, Ordering::Relaxed);
}
if let Some(profile) = perf_profile.as_mut() {
if let Some(started) = rebuild_profile_started {
profile.total_duration = started.elapsed();
}
profile.log_summary();
}
Ok(LexicalRebuildOutcome {
indexed_docs,
observed_messages: Some(final_observed_messages),
exact_checkpoint_persisted: true,
equivalence: Some(equivalence_evidence),
})
}
fn rebuild_tantivy_from_db_with_options(
db_path: &Path,
data_dir: &Path,
total_conversations: usize,
progress: Option<Arc<IndexingProgress>>,
options: LexicalRebuildStartupOptions,
progress_bump: Option<Arc<AtomicI64>>,
) -> Result<LexicalRebuildOutcome> {
let prep_profile = std::env::var_os("CASS_PREP_PROFILE").is_some();
let prep_started = Instant::now();
let mut prep_step_started = Instant::now();
// Reports how long the named preparation step took and restarts the step
// timer.
//
// When `prep_profile` is set, the step and cumulative durations (in
// milliseconds) are written to stderr and emitted as a tracing event.
// `step_started` is reset to "now" on every call, whether or not profiling
// is enabled, so the next step is always measured from this point.
let log_prep_step = |step: &str, step_started: &mut Instant| {
    if prep_profile {
        // Durations are only read when they are actually going to be reported.
        let step_ms = step_started.elapsed().as_millis() as u64;
        let total_ms = prep_started.elapsed().as_millis() as u64;
        eprintln!(
            "CASS_PREP_PROFILE step={step} step_ms={} total_ms={}",
            step_ms, total_ms
        );
        tracing::info!(
            component = "main",
            step,
            step_ms,
            total_ms,
            "lexical rebuild prep profile"
        );
    }
    *step_started = Instant::now();
};
let storage = FrankenStorage::open_readonly(db_path).with_context(|| {
format!(
"opening database for Tantivy rebuild: {}",
db_path.display()
)
})?;
log_prep_step("open_readonly", &mut prep_step_started);
let index_path = index_dir(data_dir)?;
let db_state = if options.defer_initial_content_fingerprint {
deferred_lexical_rebuild_db_state(db_path, total_conversations)
} else {
lexical_rebuild_db_state_with_total_conversations(&storage, db_path, total_conversations)?
};
log_prep_step(
if options.defer_initial_content_fingerprint {
"prepare_db_state_deferred_fingerprint"
} else {
"compute_db_state_fingerprint"
},
&mut prep_step_started,
);
let mut rebuild_state = if options.defer_initial_content_fingerprint {
LexicalRebuildState::new(db_state.clone(), LEXICAL_REBUILD_PAGE_SIZE)
} else {
match load_lexical_rebuild_state(&index_path)? {
Some(state) if state.matches_run(&db_state, LEXICAL_REBUILD_PAGE_SIZE) => {
if state.is_incomplete()
&& state.execution_mode.requires_restart_from_zero_on_resume()
{
tracing::info!(
db_path = %db_path.display(),
execution_mode = state.execution_mode.as_str(),
processed_conversations = state.reported_processed_conversations(),
"discarding non-resumable staged lexical rebuild checkpoint and restarting from zero"
);
LexicalRebuildState::new(db_state.clone(), LEXICAL_REBUILD_PAGE_SIZE)
} else {
let mut state = reconcile_pending_lexical_commit(&index_path, state)?;
normalize_lexical_rebuild_state_for_current_run(&index_path, &mut state)?;
upgrade_lexical_rebuild_state_resume_cursor_if_needed(
&storage,
&index_path,
&mut state,
)?;
prepare_lexical_rebuild_state_for_active_run(&index_path, &mut state)?;
state
}
}
Some(state) => {
tracing::info!(
db_path = %db_path.display(),
existing_db_path = %state.db.db_path,
existing_total_conversations = state.db.total_conversations,
existing_storage_fingerprint = %state.db.storage_fingerprint,
"discarding incompatible lexical rebuild checkpoint and restarting from zero"
);
LexicalRebuildState::new(db_state.clone(), LEXICAL_REBUILD_PAGE_SIZE)
}
None => LexicalRebuildState::new(db_state.clone(), LEXICAL_REBUILD_PAGE_SIZE),
}
};
log_prep_step("load_checkpoint_state", &mut prep_step_started);
if rebuild_state.completed || rebuild_state.processed_conversations >= total_conversations {
storage.close_without_checkpoint().with_context(|| {
format!(
"closing readonly database after confirming completed Tantivy rebuild without checkpoint: {}",
db_path.display()
)
})?;
if let Some(p) = &progress {
p.phase.store(0, Ordering::Relaxed);
p.is_rebuilding.store(false, Ordering::Relaxed);
}
return Ok(LexicalRebuildOutcome {
indexed_docs: rebuild_state.indexed_docs,
observed_messages: Some(
rebuild_state
.db
.total_messages
.max(rebuild_state.indexed_docs),
),
exact_checkpoint_persisted: false,
equivalence: None,
});
}
let restart_from_zero =
rebuild_state.processed_conversations == 0 && rebuild_state.pending.is_none();
let page_size = LEXICAL_REBUILD_PAGE_SIZE;
let pipeline_settings = lexical_rebuild_pipeline_settings_snapshot();
let staged_shard_plan = if restart_from_zero && total_conversations > 0 {
Some(plan_lexical_rebuild_shards_from_storage_with_settings(
&storage,
&pipeline_settings,
total_conversations,
)?)
} else {
None
};
if staged_shard_plan.is_some() {
log_prep_step("plan_lexical_shards", &mut prep_step_started);
}
let will_use_atomic_staged_publish = staged_shard_plan.is_some();
if restart_from_zero && !will_use_atomic_staged_publish {
if let Err(err) = fs::remove_dir_all(&index_path)
&& err.kind() != std::io::ErrorKind::NotFound
{
return Err(err)
.with_context(|| format!("removing stale index {}", index_path.display()));
}
fs::create_dir_all(&index_path).with_context(|| {
format!("creating rebuilt index directory {}", index_path.display())
})?;
rebuild_state = LexicalRebuildState::new(db_state.clone(), LEXICAL_REBUILD_PAGE_SIZE);
} else if restart_from_zero && will_use_atomic_staged_publish {
rebuild_state = LexicalRebuildState::new(db_state.clone(), LEXICAL_REBUILD_PAGE_SIZE);
}
log_prep_step("restart_from_zero_reset", &mut prep_step_started);
let batch_conversation_limit = lexical_rebuild_batch_fetch_conversation_limit(page_size);
let initial_batch_conversation_limit =
lexical_rebuild_initial_batch_fetch_conversation_limit(batch_conversation_limit);
let lexical_rebuild_worker_pool = build_lexical_rebuild_worker_pool()?;
let mut commit_interval_conversations;
let mut commit_interval_messages;
let mut commit_interval_message_bytes;
let progress_heartbeat_interval_conversations =
lexical_rebuild_progress_heartbeat_interval_conversations();
let progress_heartbeat_interval = lexical_rebuild_progress_heartbeat_interval();
let mut indexed_docs = rebuild_state.indexed_docs;
let mut observed_messages = rebuild_state.indexed_docs;
let mut processed_conversations = rebuild_state.processed_conversations;
let mut last_processed_conversation_id = rebuild_state.committed_conversation_id;
let mut conversations_since_commit = 0usize;
let mut messages_since_commit = 0usize;
let mut message_bytes_since_commit = 0usize;
let mut conversations_since_progress_persist = 0usize;
let mut last_progress_persist = Instant::now();
let mut pending_batch: Vec<LexicalRebuildConversationPacket> = Vec::with_capacity(
batch_conversation_limit
.max(initial_batch_conversation_limit)
.max(1),
);
let mut pending_batch_message_count = 0usize;
let mut pending_batch_message_bytes = 0usize;
let mut latest_pipeline_runtime = LexicalRebuildPipelineRuntimeSnapshot::default();
let mut perf_profile = LexicalRebuildPerfProfile::from_env();
let rebuild_profile_started = perf_profile.as_ref().map(|_| Instant::now());
let pipeline_channel_size = lexical_rebuild_pipeline_channel_size();
let startup_commit_interval_conversations =
lexical_rebuild_initial_commit_interval_conversations()
.min(lexical_rebuild_commit_interval_conversations());
let startup_commit_interval_messages = lexical_rebuild_initial_commit_interval_messages()
.min(lexical_rebuild_commit_interval_messages());
let startup_commit_interval_message_bytes =
lexical_rebuild_initial_commit_interval_message_bytes()
.min(lexical_rebuild_commit_interval_message_bytes());
let steady_commit_interval_conversations = lexical_rebuild_commit_interval_conversations();
let steady_commit_interval_messages = lexical_rebuild_commit_interval_messages();
let steady_commit_interval_message_bytes = lexical_rebuild_commit_interval_message_bytes();
let startup_pipeline_budget = lexical_rebuild_runtime_pipeline_budget_snapshot(
initial_batch_conversation_limit,
startup_commit_interval_messages,
startup_commit_interval_message_bytes,
pipeline_channel_size,
startup_commit_interval_conversations,
startup_commit_interval_messages,
startup_commit_interval_message_bytes,
);
let steady_pipeline_budget = lexical_rebuild_runtime_pipeline_budget_snapshot(
batch_conversation_limit,
steady_commit_interval_messages,
steady_commit_interval_message_bytes,
pipeline_channel_size,
steady_commit_interval_conversations,
steady_commit_interval_messages,
steady_commit_interval_message_bytes,
);
let controller_policy = lexical_rebuild_responsiveness_policy();
let available_parallelism = pipeline_settings.available_parallelism;
let reserved_cores = pipeline_settings.reserved_cores;
let controller_loadavg_high_watermark_1m_milli =
lexical_rebuild_controller_loadavg_high_watermark_1m_milli_for_available_and_reserved(
available_parallelism,
reserved_cores,
);
let controller_loadavg_low_watermark_1m_milli =
lexical_rebuild_controller_loadavg_low_watermark_1m_milli_from_high(
controller_loadavg_high_watermark_1m_milli,
);
let start_conservative = match controller_policy {
LexicalRebuildResponsivenessPolicy::Auto => rebuild_state.processed_conversations == 0,
LexicalRebuildResponsivenessPolicy::Steady => false,
LexicalRebuildResponsivenessPolicy::Conservative => true,
};
let mut responsiveness_controller = LexicalRebuildResponsivenessController::new(
controller_policy,
startup_pipeline_budget,
steady_pipeline_budget,
pipeline_channel_size,
start_conservative,
controller_loadavg_high_watermark_1m_milli,
controller_loadavg_low_watermark_1m_milli,
);
let mut current_batch_conversation_limit = responsiveness_controller
.current_budget()
.page_conversation_limit;
let current_budget = responsiveness_controller.current_budget();
commit_interval_conversations = current_budget.commit_interval_conversations;
commit_interval_messages = current_budget.commit_interval_messages;
commit_interval_message_bytes = current_budget.commit_interval_message_bytes;
let first_budget_promotion_commit_thresholds = responsiveness_controller
.waits_for_first_durable_commit()
.then_some((
commit_interval_conversations,
commit_interval_messages,
commit_interval_message_bytes,
));
let lexical_rebuild_flow_limiter = Arc::new(StreamingByteLimiter::new(
responsiveness_controller
.current_budget()
.max_message_bytes_in_flight,
));
let pipeline_budget_controller = Arc::new(LexicalRebuildPipelineBudgetController::new(
responsiveness_controller.current_budget(),
));
let producer_telemetry = Arc::new(LexicalRebuildProducerTelemetry::default());
refresh_and_maybe_apply_lexical_rebuild_pipeline_runtime(
&mut latest_pipeline_runtime,
progress.as_ref(),
lexical_rebuild_flow_limiter.as_ref(),
Some(producer_telemetry.as_ref()),
&mut responsiveness_controller,
pipeline_budget_controller.as_ref(),
&mut current_batch_conversation_limit,
Some((
&mut commit_interval_conversations,
&mut commit_interval_messages,
&mut commit_interval_message_bytes,
)),
LexicalRebuildPipelineSinkRuntimeSnapshot::new(0, 0, 0),
);
let lexical_rebuild_worker_pool = lexical_rebuild_worker_pool.map(Arc::new);
let (pipeline_tx, pipeline_rx) =
bounded::<LexicalRebuildPipelineMessage>(pipeline_channel_size);
let mut pipeline_tx = Some(pipeline_tx);
let mut pipeline_rx = Some(pipeline_rx);
let mut producer_handle = None;
if restart_from_zero {
producer_handle = Some(spawn_lexical_rebuild_packet_producer(
db_path.to_path_buf(),
rebuild_state.committed_conversation_id,
staged_shard_plan.clone(),
page_size,
pipeline_channel_size,
first_budget_promotion_commit_thresholds,
pipeline_budget_controller.clone(),
pipeline_tx
.take()
.expect("packet producer sender missing before startup overlap"),
lexical_rebuild_flow_limiter.clone(),
lexical_rebuild_worker_pool.clone(),
producer_telemetry.clone(),
));
log_prep_step("start_packet_producer", &mut prep_step_started);
}
if let Some(ref shard_plan) = staged_shard_plan {
rebuild_state.set_execution_mode(LexicalRebuildExecutionMode::StagedShardBuild);
let pipeline_rx = pipeline_rx
.take()
.expect("staged lexical rebuild pipeline receiver missing before consume loop");
let producer_handle = producer_handle
.take()
.expect("staged lexical rebuild packet producer handle missing before consume loop");
return rebuild_tantivy_from_db_via_staged_shards(
db_path,
&index_path,
total_conversations,
progress,
options,
prep_profile.then_some(prep_started),
storage,
rebuild_state,
shard_plan.clone(),
pipeline_settings,
progress_heartbeat_interval_conversations,
progress_heartbeat_interval,
latest_pipeline_runtime,
responsiveness_controller,
pipeline_budget_controller,
current_batch_conversation_limit,
lexical_rebuild_flow_limiter,
producer_telemetry,
lexical_rebuild_worker_pool,
pipeline_rx,
producer_handle,
perf_profile,
rebuild_profile_started,
progress_bump,
);
}
let mut t_index = match (|| -> Result<TantivyIndex> {
let mut t_index = match TantivyIndex::open_or_create(&index_path) {
Ok(index) => index,
Err(err)
if rebuild_state.processed_conversations > 0 || rebuild_state.pending.is_some() =>
{
tracing::warn!(
path = %index_path.display(),
error = %err,
"partial lexical index could not be reopened; restarting lexical rebuild from zero"
);
if let Err(remove_err) = fs::remove_dir_all(&index_path)
&& remove_err.kind() != std::io::ErrorKind::NotFound
{
return Err(remove_err).with_context(|| {
format!("removing unreadable index {}", index_path.display())
});
}
fs::create_dir_all(&index_path).with_context(|| {
format!(
"recreating lexical index directory after open failure {}",
index_path.display()
)
})?;
rebuild_state =
LexicalRebuildState::new(db_state.clone(), LEXICAL_REBUILD_PAGE_SIZE);
TantivyIndex::open_or_create(&index_path)?
}
Err(err) => return Err(err),
};
log_prep_step("open_tantivy", &mut prep_step_started);
t_index.configure_bulk_load_merge_policy();
persist_lexical_rebuild_state_for_active_run_start(&index_path, &rebuild_state)?;
log_prep_step("persist_initial_checkpoint", &mut prep_step_started);
Ok(t_index)
})() {
Ok(index) => index,
Err(err) => {
if let Some(handle) = producer_handle.take() {
lexical_rebuild_flow_limiter.close();
drop(pipeline_rx.take());
if let Err(payload) = handle.join() {
let panic_message = panic_payload_message(payload);
tracing::warn!(
panic_message = %panic_message,
"lexical rebuild packet producer panicked during startup cleanup"
);
}
}
return Err(err);
}
};
if let Some(p) = &progress {
p.phase.store(2, Ordering::Relaxed);
p.is_rebuilding.store(true, Ordering::Relaxed);
p.total.store(total_conversations, Ordering::Relaxed);
p.current
.store(rebuild_state.processed_conversations, Ordering::Relaxed);
p.discovered_agents.store(0, Ordering::Relaxed);
}
if producer_handle.is_none() {
producer_handle = Some(spawn_lexical_rebuild_packet_producer(
db_path.to_path_buf(),
rebuild_state.committed_conversation_id,
staged_shard_plan.clone(),
page_size,
pipeline_channel_size,
first_budget_promotion_commit_thresholds,
pipeline_budget_controller.clone(),
pipeline_tx
.take()
.expect("packet producer sender missing before normal startup"),
lexical_rebuild_flow_limiter.clone(),
lexical_rebuild_worker_pool.clone(),
producer_telemetry.clone(),
));
log_prep_step("start_packet_producer", &mut prep_step_started);
}
let pipeline_rx = pipeline_rx
.take()
.expect("lexical rebuild pipeline receiver missing before consume loop");
let producer_handle = producer_handle
.take()
.expect("lexical rebuild packet producer handle missing before consume loop");
{
let step_ms = prep_step_started.elapsed().as_millis() as u64;
let total_ms = prep_started.elapsed().as_millis() as u64;
if prep_profile {
eprintln!(
"CASS_PREP_PROFILE step=ready_to_index step_ms={} total_ms={}",
step_ms, total_ms
);
}
tracing::info!(
component = "main",
step = "ready_to_index",
total_prep_ms = total_ms,
total_conversations,
restart_from_zero,
"lexical rebuild startup complete, entering consumer loop"
);
}
let lexical_rebuild_started = Instant::now();
let mut max_conversation_id = 0i64;
let mut max_message_id = 0i64;
let mut equivalence_accumulator = LexicalRebuildEquivalenceAccumulator::new();
{
macro_rules! finish_conversation {
($packet:expr) => {{
let packet = $packet;
let conversation_id = packet.identity.conversation_id.ok_or_else(|| {
anyhow::anyhow!(
"authoritative lexical rebuild packet missing stable conversation id at processed_conversations={}",
processed_conversations
)
})?;
let message_count = packet.message_count;
let message_bytes = packet.message_bytes;
observed_messages = observed_messages.saturating_add(message_count);
pending_batch_message_count =
pending_batch_message_count.saturating_add(message_count);
pending_batch_message_bytes =
pending_batch_message_bytes.saturating_add(message_bytes);
pending_batch.push(packet);
last_processed_conversation_id = Some(conversation_id);
processed_conversations = processed_conversations.saturating_add(1);
conversations_since_commit = conversations_since_commit.saturating_add(1);
conversations_since_progress_persist =
conversations_since_progress_persist.saturating_add(1);
if let Some(p) = &progress {
p.current.fetch_add(1, Ordering::Relaxed);
}
bump_index_run_lock_progress_if_present(progress_bump.as_ref());
refresh_and_maybe_apply_lexical_rebuild_pipeline_runtime(
&mut latest_pipeline_runtime,
progress.as_ref(),
lexical_rebuild_flow_limiter.as_ref(),
Some(producer_telemetry.as_ref()),
&mut responsiveness_controller,
pipeline_budget_controller.as_ref(),
&mut current_batch_conversation_limit,
Some((
&mut commit_interval_conversations,
&mut commit_interval_messages,
&mut commit_interval_message_bytes,
)),
LexicalRebuildPipelineSinkRuntimeSnapshot::new(
pipeline_rx.len(),
pending_batch.len(),
pending_batch_message_bytes,
),
);
if pending_batch.len() >= current_batch_conversation_limit {
flush_streamed_lexical_rebuild_batch(
&mut pending_batch,
&mut pending_batch_message_count,
&mut pending_batch_message_bytes,
Some(lexical_rebuild_flow_limiter.as_ref()),
lexical_rebuild_worker_pool.as_deref(),
&mut t_index,
&mut indexed_docs,
&mut messages_since_commit,
&mut message_bytes_since_commit,
&mut current_batch_conversation_limit,
batch_conversation_limit,
page_size,
perf_profile.as_mut(),
)?;
refresh_and_maybe_apply_lexical_rebuild_pipeline_runtime(
&mut latest_pipeline_runtime,
progress.as_ref(),
lexical_rebuild_flow_limiter.as_ref(),
Some(producer_telemetry.as_ref()),
&mut responsiveness_controller,
pipeline_budget_controller.as_ref(),
&mut current_batch_conversation_limit,
Some((
&mut commit_interval_conversations,
&mut commit_interval_messages,
&mut commit_interval_message_bytes,
)),
LexicalRebuildPipelineSinkRuntimeSnapshot::new(
pipeline_rx.len(),
pending_batch.len(),
pending_batch_message_bytes,
),
);
}
if should_commit_lexical_rebuild(
conversations_since_commit,
messages_since_commit.saturating_add(pending_batch_message_count),
message_bytes_since_commit.saturating_add(pending_batch_message_bytes),
commit_interval_conversations,
commit_interval_messages,
commit_interval_message_bytes,
) {
flush_streamed_lexical_rebuild_batch(
&mut pending_batch,
&mut pending_batch_message_count,
&mut pending_batch_message_bytes,
Some(lexical_rebuild_flow_limiter.as_ref()),
lexical_rebuild_worker_pool.as_deref(),
&mut t_index,
&mut indexed_docs,
&mut messages_since_commit,
&mut message_bytes_since_commit,
&mut current_batch_conversation_limit,
batch_conversation_limit,
page_size,
perf_profile.as_mut(),
)?;
refresh_and_maybe_apply_lexical_rebuild_pipeline_runtime(
&mut latest_pipeline_runtime,
progress.as_ref(),
lexical_rebuild_flow_limiter.as_ref(),
Some(producer_telemetry.as_ref()),
&mut responsiveness_controller,
pipeline_budget_controller.as_ref(),
&mut current_batch_conversation_limit,
Some((
&mut commit_interval_conversations,
&mut commit_interval_messages,
&mut commit_interval_message_bytes,
)),
LexicalRebuildPipelineSinkRuntimeSnapshot::new(
pipeline_rx.len(),
pending_batch.len(),
pending_batch_message_bytes,
),
);
commit_lexical_rebuild_progress(
&index_path,
&mut rebuild_state,
last_processed_conversation_id,
processed_conversations,
indexed_docs,
&latest_pipeline_runtime,
&mut t_index,
true,
perf_profile.as_mut(),
)?;
conversations_since_commit = 0;
messages_since_commit = 0;
message_bytes_since_commit = 0;
conversations_since_progress_persist = 0;
last_progress_persist = Instant::now();
(
commit_interval_conversations,
commit_interval_messages,
commit_interval_message_bytes,
) = lexical_rebuild_commit_intervals_for_state(&rebuild_state);
if let Some(transition) =
responsiveness_controller.record_first_durable_commit()
{
apply_lexical_rebuild_budget_transition(
transition,
lexical_rebuild_flow_limiter.as_ref(),
pipeline_budget_controller.as_ref(),
&mut current_batch_conversation_limit,
Some((
&mut commit_interval_conversations,
&mut commit_interval_messages,
&mut commit_interval_message_bytes,
)),
);
refresh_and_maybe_apply_lexical_rebuild_pipeline_runtime(
&mut latest_pipeline_runtime,
progress.as_ref(),
lexical_rebuild_flow_limiter.as_ref(),
Some(producer_telemetry.as_ref()),
&mut responsiveness_controller,
pipeline_budget_controller.as_ref(),
&mut current_batch_conversation_limit,
Some((
&mut commit_interval_conversations,
&mut commit_interval_messages,
&mut commit_interval_message_bytes,
)),
LexicalRebuildPipelineSinkRuntimeSnapshot::new(
pipeline_rx.len(),
pending_batch.len(),
pending_batch_message_bytes,
),
);
}
} else if should_persist_lexical_rebuild_progress(
conversations_since_progress_persist,
progress_heartbeat_interval_conversations,
last_progress_persist.elapsed(),
progress_heartbeat_interval,
) {
let heartbeat_progress_started = perf_profile.as_ref().map(|_| Instant::now());
refresh_and_maybe_apply_lexical_rebuild_pipeline_runtime(
&mut latest_pipeline_runtime,
progress.as_ref(),
lexical_rebuild_flow_limiter.as_ref(),
Some(producer_telemetry.as_ref()),
&mut responsiveness_controller,
pipeline_budget_controller.as_ref(),
&mut current_batch_conversation_limit,
Some((
&mut commit_interval_conversations,
&mut commit_interval_messages,
&mut commit_interval_message_bytes,
)),
LexicalRebuildPipelineSinkRuntimeSnapshot::new(
pipeline_rx.len(),
pending_batch.len(),
pending_batch_message_bytes,
),
);
persist_pending_lexical_rebuild_progress(
&index_path,
&mut rebuild_state,
last_processed_conversation_id,
processed_conversations,
indexed_docs,
&latest_pipeline_runtime,
)?;
if let (Some(profile), Some(started)) =
(perf_profile.as_mut(), heartbeat_progress_started)
{
profile.heartbeat_persist_count =
profile.heartbeat_persist_count.saturating_add(1);
profile.heartbeat_progress_duration += started.elapsed();
}
conversations_since_progress_persist = 0;
last_progress_persist = Instant::now();
}
Ok::<(), anyhow::Error>(())
}};
}
let pipeline_result: Result<()> = (|| {
loop {
match pipeline_rx.recv() {
Ok(LexicalRebuildPipelineMessage::Batch(prepared_page)) => {
if let Some(profile) = perf_profile.as_mut() {
profile.conversation_list_duration +=
prepared_page.conversation_list_duration;
profile.message_stream_duration += prepared_page.message_fetch_duration;
profile.finish_conversation_duration +=
prepared_page.packet_prepare_duration;
}
let first_packet_fingerprint = prepared_page
.packets
.first()
.map(LexicalRebuildConversationPacket::fingerprint_input);
tracing::debug!(
queue_depth = pipeline_rx.len(),
inflight_message_bytes = lexical_rebuild_flow_limiter.bytes_in_flight(),
page_conversations = prepared_page.packets.len(),
page_message_bytes = prepared_page
.packets
.iter()
.map(|packet| packet.message_bytes)
.sum::<usize>(),
page_last_conversation_id = prepared_page.page_last_conversation_id,
planned_shard_index = prepared_page.planned_shard_index,
finishes_planned_shard = prepared_page.finishes_planned_shard,
first_packet_source_id = first_packet_fingerprint
.as_ref()
.map(|fingerprint| fingerprint.source_id)
.unwrap_or(""),
first_packet_origin_kind = first_packet_fingerprint
.as_ref()
.map(|fingerprint| fingerprint.origin_kind)
.unwrap_or(""),
first_packet_message_count = first_packet_fingerprint
.as_ref()
.map(|fingerprint| fingerprint.message_count)
.unwrap_or(0),
first_packet_message_bytes = first_packet_fingerprint
.as_ref()
.map(|fingerprint| fingerprint.message_bytes)
.unwrap_or(0),
"lexical rebuild pipeline received prepared page"
);
if options.defer_initial_content_fingerprint {
max_conversation_id =
max_conversation_id.max(prepared_page.page_last_conversation_id);
}
let LexicalRebuildPreparedPage {
packets,
page_last_conversation_id: _page_last_conversation_id,
planned_shard_index,
finishes_planned_shard,
conversation_list_duration: _conversation_list_duration,
message_fetch_duration: _message_fetch_duration,
packet_prepare_duration: _packet_prepare_duration,
} = prepared_page;
for packet in packets {
if options.defer_initial_content_fingerprint
&& let Some(last_message_id) = packet.last_message_id
{
max_message_id = max_message_id.max(last_message_id);
}
equivalence_accumulator.absorb_packet(&packet);
finish_conversation!(packet)?;
}
if flush_streamed_lexical_rebuild_batch_for_planned_shard_boundary(
planned_shard_index,
finishes_planned_shard,
&mut pending_batch,
&mut pending_batch_message_count,
&mut pending_batch_message_bytes,
Some(lexical_rebuild_flow_limiter.as_ref()),
lexical_rebuild_worker_pool.as_deref(),
&mut t_index,
&mut indexed_docs,
&mut messages_since_commit,
&mut message_bytes_since_commit,
&mut current_batch_conversation_limit,
batch_conversation_limit,
page_size,
perf_profile.as_mut(),
)? {
refresh_and_maybe_apply_lexical_rebuild_pipeline_runtime(
&mut latest_pipeline_runtime,
progress.as_ref(),
lexical_rebuild_flow_limiter.as_ref(),
Some(producer_telemetry.as_ref()),
&mut responsiveness_controller,
pipeline_budget_controller.as_ref(),
&mut current_batch_conversation_limit,
Some((
&mut commit_interval_conversations,
&mut commit_interval_messages,
&mut commit_interval_message_bytes,
)),
LexicalRebuildPipelineSinkRuntimeSnapshot::new(
pipeline_rx.len(),
pending_batch.len(),
pending_batch_message_bytes,
),
);
}
}
Ok(LexicalRebuildPipelineMessage::Error(error)) => {
return Err(anyhow::anyhow!(error));
}
Ok(LexicalRebuildPipelineMessage::Done) => break,
Err(_) => {
return Err(anyhow::anyhow!(
"lexical rebuild pipeline channel closed before producer completion"
));
}
}
}
Ok(())
})();
if pipeline_result.is_err() {
lexical_rebuild_flow_limiter.close();
}
drop(pipeline_rx);
match producer_handle.join() {
Ok(()) => {}
Err(payload) => {
let panic_message = panic_payload_message(payload);
if pipeline_result.is_ok() {
return Err(anyhow::anyhow!(
"lexical rebuild packet producer panicked: {}",
panic_message
));
}
tracing::warn!(
error = %panic_message,
"lexical rebuild packet producer panicked while the consumer was already failing"
);
}
}
pipeline_result?;
}
flush_streamed_lexical_rebuild_batch(
&mut pending_batch,
&mut pending_batch_message_count,
&mut pending_batch_message_bytes,
Some(lexical_rebuild_flow_limiter.as_ref()),
lexical_rebuild_worker_pool.as_deref(),
&mut t_index,
&mut indexed_docs,
&mut messages_since_commit,
&mut message_bytes_since_commit,
&mut current_batch_conversation_limit,
batch_conversation_limit,
page_size,
perf_profile.as_mut(),
)?;
refresh_and_maybe_apply_lexical_rebuild_pipeline_runtime(
&mut latest_pipeline_runtime,
progress.as_ref(),
lexical_rebuild_flow_limiter.as_ref(),
Some(producer_telemetry.as_ref()),
&mut responsiveness_controller,
pipeline_budget_controller.as_ref(),
&mut current_batch_conversation_limit,
Some((
&mut commit_interval_conversations,
&mut commit_interval_messages,
&mut commit_interval_message_bytes,
)),
LexicalRebuildPipelineSinkRuntimeSnapshot::new(
0,
pending_batch.len(),
pending_batch_message_bytes,
),
);
if conversations_since_commit > 0
|| messages_since_commit > 0
|| message_bytes_since_commit > 0
|| rebuild_state.pending.is_some()
{
commit_lexical_rebuild_progress(
&index_path,
&mut rebuild_state,
last_processed_conversation_id,
processed_conversations,
indexed_docs,
&latest_pipeline_runtime,
&mut t_index,
false,
perf_profile.as_mut(),
)?;
}
drop(t_index);
crate::search::tantivy::validate_searchable_index_contract(&index_path).with_context(|| {
format!(
"validating lexical rebuild after commit: {}",
index_path.display()
)
})?;
if let Some(observed_tantivy_docs) = live_tantivy_doc_count(&index_path)?
&& observed_tantivy_docs != indexed_docs
{
return Err(anyhow::anyhow!(
"lexical rebuild committed {} docs but a fresh Tantivy reader only sees {}",
indexed_docs,
observed_tantivy_docs
));
}
storage.close_without_checkpoint().with_context(|| {
format!(
"closing readonly database after Tantivy rebuild without checkpoint: {}",
db_path.display()
)
})?;
if options.defer_initial_content_fingerprint {
rebuild_state.db.storage_fingerprint = lexical_rebuild_content_fingerprint_value(
total_conversations,
max_conversation_id,
max_message_id,
);
}
let final_observed_messages = observed_messages.max(indexed_docs);
rebuild_state.db.total_messages = final_observed_messages;
rebuild_state.committed_offset = i64::try_from(total_conversations).unwrap_or(i64::MAX);
rebuild_state.committed_conversation_id = last_processed_conversation_id;
rebuild_state.processed_conversations = processed_conversations;
rebuild_state.indexed_docs = indexed_docs;
rebuild_state.mark_completed(completed_lexical_rebuild_meta_fingerprint(
&rebuild_state,
&index_path,
)?);
persist_lexical_rebuild_state(&index_path, &rebuild_state)?;
if let Some(p) = &progress {
p.phase.store(0, Ordering::Relaxed);
p.is_rebuilding.store(false, Ordering::Relaxed);
}
if let Some(profile) = perf_profile.as_mut() {
if let Some(started) = rebuild_profile_started {
profile.total_duration = started.elapsed();
}
profile.log_summary();
}
let lexical_rebuild_duration = lexical_rebuild_started.elapsed();
let publish_started = Instant::now();
let equivalence_evidence = equivalence_accumulator.finalize();
let generation_manifest = persist_lexical_rebuild_generation_artifacts(
&index_path,
&rebuild_state.db.storage_fingerprint,
rebuild_state.processed_conversations,
total_conversations,
final_observed_messages,
indexed_docs,
&equivalence_evidence,
)?;
log_lexical_generation_manifest_published(&generation_manifest, &equivalence_evidence);
let refresh_ledger =
build_authoritative_lexical_refresh_ledger(AuthoritativeLexicalRefreshLedgerInput {
publish_mode: "direct_live_commit",
lexical_duration: lexical_rebuild_duration,
publish_duration: publish_started.elapsed(),
processed_conversations: rebuild_state.processed_conversations,
total_conversations,
final_observed_messages,
indexed_docs,
equivalence_evidence: &equivalence_evidence,
});
persist_lexical_refresh_ledger(&index_path, &refresh_ledger)?;
log_lexical_refresh_ledger_published(&refresh_ledger);
Ok(LexicalRebuildOutcome {
indexed_docs,
observed_messages: Some(final_observed_messages),
exact_checkpoint_persisted: true,
equivalence: Some(equivalence_evidence),
})
}
/// Test-only convenience wrapper around [`ingest_batch_detailed`].
///
/// Forwards every argument unchanged and returns just the canonical mutation
/// counts of the batch.
///
/// # Errors
///
/// Propagates any error from [`ingest_batch_detailed`]. Additionally fails
/// when the detailed outcome reports that the lexical update was deferred, so
/// tests calling this helper never silently accept a deferred lexical update.
#[cfg(test)]
fn ingest_batch(
    storage: &FrankenStorage,
    t_index: Option<&mut TantivyIndex>,
    data_dir: &Path,
    convs: &[NormalizedConversation],
    progress: &Option<Arc<IndexingProgress>>,
    lexical_strategy: LexicalPopulationStrategy,
    defer_checkpoints: bool,
) -> Result<CanonicalMutationCounts> {
    let NonWatchIngestOutcome {
        canonical_mutations,
        lexical_update_deferred,
        ..
    } = ingest_batch_detailed(
        storage,
        t_index,
        data_dir,
        convs,
        progress,
        lexical_strategy,
        defer_checkpoints,
    )?;
    // Happy path first: the lexical update ran inline, so the counts are final.
    if !lexical_update_deferred {
        return Ok(canonical_mutations);
    }
    anyhow::bail!(
        "incremental lexical update ran out of memory after SQLite ingest; rerun with CASS_DEFER_LEXICAL_UPDATES=1 or rebuild derived lexical assets"
    )
}
/// Persists `convs` through the batched raw-mirror-link path and reports the result.
///
/// The work is bracketed by a robot trace span named `"ingest_batch"`:
/// * on a persistence failure the span is finished with `"error"`, zero counts
///   and the error, and the error is returned unchanged;
/// * on success the span is finished with `"ok"` and the inserted
///   conversation/message counts.
///
/// On success the optional `progress` counter is advanced by `convs.len()`.
/// A deferred inline lexical update is only logged as a warning here; it is
/// surfaced to the caller through `lexical_update_deferred` on the outcome.
/// This function never reports quarantined conversations.
fn ingest_batch_detailed(
    storage: &FrankenStorage,
    t_index: Option<&mut TantivyIndex>,
    data_dir: &Path,
    convs: &[NormalizedConversation],
    progress: &Option<Arc<IndexingProgress>>,
    lexical_strategy: LexicalPopulationStrategy,
    defer_checkpoints: bool,
) -> Result<NonWatchIngestOutcome> {
    let trace_span =
        robot_trace_ingest_start("ingest_batch", convs, lexical_strategy, defer_checkpoints);
    let persisted = match persist::persist_conversations_batched_with_raw_mirror_links(
        storage,
        t_index,
        data_dir,
        convs,
        lexical_strategy,
        defer_checkpoints,
    ) {
        Err(error) => {
            // Close the trace span before bubbling the failure up.
            robot_trace_ingest_finish(trace_span, "error", 0, 0, Some(&error));
            return Err(error);
        }
        Ok(persisted) => persisted,
    };

    let lexical_update_deferred = persisted.lexical_update_deferred;
    if lexical_update_deferred {
        tracing::warn!(
            error = ?persisted.lexical_update_error,
            "SQLite ingest succeeded but inline lexical update was deferred; scheduling authoritative lexical rebuild"
        );
    }

    if let Some(tracker) = progress.as_ref() {
        tracker.current.fetch_add(convs.len(), Ordering::Relaxed);
    }

    let inserted_conversations = persisted.inserted_conversations;
    let inserted_messages = persisted.inserted_messages;
    robot_trace_ingest_finish(
        trace_span,
        "ok",
        inserted_conversations,
        inserted_messages,
        None,
    );

    Ok(NonWatchIngestOutcome {
        canonical_mutations: CanonicalMutationCounts {
            inserted_conversations,
            inserted_messages,
        },
        quarantined_conversations: 0,
        lexical_update_deferred,
    })
}
#[allow(clippy::too_many_arguments)]
fn ingest_non_watch_batch_with_oom_split(
storage: &FrankenStorage,
t_index: Option<&mut TantivyIndex>,
data_dir: &Path,
convs: &[NormalizedConversation],
progress: &Option<Arc<IndexingProgress>>,
lexical_strategy: LexicalPopulationStrategy,
defer_checkpoints: bool,
) -> Result<NonWatchIngestOutcome> {
if convs.is_empty() {
return Ok(NonWatchIngestOutcome::default());
}
let first_attempt = ingest_non_watch_batch_once(
storage,
t_index,
data_dir,
convs,
progress,
lexical_strategy,
defer_checkpoints,
);
match first_attempt {
Ok(outcome) => Ok(outcome),
Err(error) if error_is_out_of_memory(&error) => ingest_non_watch_oom_retry_or_quarantine(
storage,
data_dir,
convs,
progress,
lexical_strategy,
defer_checkpoints,
error,
),
Err(error) => Err(error),
}
}
#[allow(clippy::too_many_arguments)]
fn ingest_non_watch_batch_once(
storage: &FrankenStorage,
t_index: Option<&mut TantivyIndex>,
data_dir: &Path,
convs: &[NormalizedConversation],
progress: &Option<Arc<IndexingProgress>>,
lexical_strategy: LexicalPopulationStrategy,
defer_checkpoints: bool,
) -> Result<NonWatchIngestOutcome> {
if should_inject_non_watch_ingest_test_oom(convs) {
return Err(anyhow::Error::new(frankensqlite::FrankenError::OutOfMemory));
}
let outcome = ingest_batch_detailed(
storage,
t_index,
data_dir,
convs,
progress,
lexical_strategy,
defer_checkpoints,
)?;
clear_poison_conversations_after_successful_ingest(
data_dir,
INDEX_INGEST_POISON_FILE,
"index-ingest-out-of-memory",
convs,
);
Ok(outcome)
}
#[allow(clippy::too_many_arguments)]
fn ingest_non_watch_oom_retry_or_quarantine(
storage: &FrankenStorage,
data_dir: &Path,
convs: &[NormalizedConversation],
progress: &Option<Arc<IndexingProgress>>,
lexical_strategy: LexicalPopulationStrategy,
defer_checkpoints: bool,
error: anyhow::Error,
) -> Result<NonWatchIngestOutcome> {
if lexical_population_strategy_requires_inline_tantivy(lexical_strategy) {
tracing::warn!(
conversations = convs.len(),
error = %error,
"non-watch ingest ran out of memory; retrying batch with deferred lexical updates before quarantine"
);
return match ingest_non_watch_batch_once(
storage,
None,
data_dir,
convs,
progress,
LexicalPopulationStrategy::DeferredAuthoritativeDbRebuild,
defer_checkpoints,
) {
Ok(mut outcome) => {
outcome.lexical_update_deferred = true;
Ok(outcome)
}
Err(retry_error) if error_is_out_of_memory(&retry_error) => {
ingest_non_watch_oom_retry_or_quarantine(
storage,
data_dir,
convs,
progress,
LexicalPopulationStrategy::DeferredAuthoritativeDbRebuild,
defer_checkpoints,
retry_error,
)
}
Err(retry_error) => Err(retry_error),
};
}
if convs.len() > 1 {
let split_at = convs.len() / 2;
tracing::warn!(
conversations = convs.len(),
left = split_at,
right = convs.len().saturating_sub(split_at),
error = %error,
"non-watch deferred ingest batch ran out of memory; retrying as smaller batches"
);
let mut left = ingest_non_watch_batch_with_oom_split(
storage,
None,
data_dir,
&convs[..split_at],
progress,
LexicalPopulationStrategy::DeferredAuthoritativeDbRebuild,
defer_checkpoints,
)?;
let right = ingest_non_watch_batch_with_oom_split(
storage,
None,
data_dir,
&convs[split_at..],
progress,
LexicalPopulationStrategy::DeferredAuthoritativeDbRebuild,
defer_checkpoints,
)?;
left = left.accumulate(right);
left.lexical_update_deferred = true;
return Ok(left);
}
let conv = &convs[0];
record_index_poison_conversation(data_dir, conv, &error)?;
if let Some(progress) = progress {
progress.current.fetch_add(1, Ordering::Relaxed);
}
tracing::warn!(
agent = %conv.agent_slug,
external_id = conv.external_id.as_deref().unwrap_or(""),
source_path = %conv.source_path.display(),
error = %error,
"single non-watch conversation ran out of memory after deferred lexical retry; quarantined and continuing index refresh"
);
Ok(NonWatchIngestOutcome {
canonical_mutations: CanonicalMutationCounts::default(),
quarantined_conversations: 1,
lexical_update_deferred: true,
})
}
/// Persists a batch for the watch path, optionally using the semantic-delta persistence routine.
///
/// The presence of `semantic_delta` selects which persistence function runs:
/// `Some(_)` uses the semantic-delta variant, `None` the plain raw-mirror-link
/// variant. Only the presence is inspected here; the referenced delta itself
/// is not read or written by this function.
///
/// The call is bracketed by a robot trace span named
/// `"ingest_batch_with_semantic_delta"`, finished with `"error"` and zero
/// counts on failure or `"ok"` and the inserted counts on success. After a
/// successful persist the watch-ingest poison records for `convs` are cleared
/// and the optional progress counter is advanced by `convs.len()`.
#[allow(clippy::too_many_arguments)]
fn ingest_batch_with_semantic_delta(
    storage: &FrankenStorage,
    t_index: Option<&mut TantivyIndex>,
    data_dir: &Path,
    convs: &[NormalizedConversation],
    progress: &Option<Arc<IndexingProgress>>,
    lexical_strategy: LexicalPopulationStrategy,
    defer_checkpoints: bool,
    semantic_delta: Option<&mut WatchSemanticDelta>,
) -> Result<persist::PersistBatchOutcome> {
    let trace_span = robot_trace_ingest_start(
        "ingest_batch_with_semantic_delta",
        convs,
        lexical_strategy,
        defer_checkpoints,
    );

    let persist_result = match semantic_delta {
        None => persist::persist_conversations_batched_with_raw_mirror_links(
            storage,
            t_index,
            data_dir,
            convs,
            lexical_strategy,
            defer_checkpoints,
        ),
        Some(_) => persist::persist_conversations_batched_with_semantic_delta_and_raw_mirror_links(
            storage,
            t_index,
            data_dir,
            convs,
            lexical_strategy,
            defer_checkpoints,
        ),
    };
    let persisted = match persist_result {
        Err(error) => {
            // Close the trace span before bubbling the failure up.
            robot_trace_ingest_finish(trace_span, "error", 0, 0, Some(&error));
            return Err(error);
        }
        Ok(persisted) => persisted,
    };

    clear_poison_conversations_after_successful_ingest(
        data_dir,
        WATCH_INGEST_POISON_FILE,
        "watch-ingest-out-of-memory",
        convs,
    );
    if let Some(tracker) = progress.as_ref() {
        tracker.current.fetch_add(convs.len(), Ordering::Relaxed);
    }
    robot_trace_ingest_finish(
        trace_span,
        "ok",
        persisted.inserted_conversations,
        persisted.inserted_messages,
        None,
    );
    Ok(persisted)
}
/// Aggregated result of ingesting one watch batch, possibly after OOM splitting.
#[derive(Debug, Default)]
struct WatchIngestBatchOutcome {
    /// Combined persistence outcome of every sub-batch that was ingested.
    batch_outcome: persist::PersistBatchOutcome,
    /// Number of conversations accounted for (ingested or quarantined).
    processed_conversations: usize,
    /// Number of conversations recorded as poison instead of being ingested.
    quarantined_conversations: usize,
    /// Largest payload timestamp seen among ingested conversations, if any.
    max_payload_watermark_ms: Option<i64>,
}

impl WatchIngestBatchOutcome {
    /// Folds `other` into `self`: persistence outcomes are merged, counters are
    /// added with saturation, and the larger watermark wins.
    fn merge(&mut self, other: Self) {
        let Self {
            batch_outcome,
            processed_conversations,
            quarantined_conversations,
            max_payload_watermark_ms,
        } = other;

        self.batch_outcome.merge(batch_outcome);
        self.processed_conversations = self
            .processed_conversations
            .saturating_add(processed_conversations);
        self.quarantined_conversations = self
            .quarantined_conversations
            .saturating_add(quarantined_conversations);
        // `Option` orders `None` below every `Some`, so taking the maximum keeps
        // an existing watermark when `other` has none and otherwise picks the
        // larger timestamp.
        self.max_payload_watermark_ms = self.max_payload_watermark_ms.max(max_payload_watermark_ms);
    }
}
#[allow(clippy::too_many_arguments)]
fn ingest_watch_batch_with_oom_split(
storage: &FrankenStorage,
t_index: &mut TantivyIndex,
data_dir: &Path,
convs: &[NormalizedConversation],
progress: &Option<Arc<IndexingProgress>>,
defer_checkpoints: bool,
capture_semantic_delta: bool,
) -> Result<WatchIngestBatchOutcome> {
debug_assert!(!convs.is_empty());
let batch_result = if should_inject_watch_ingest_test_oom(convs) {
Err(anyhow::Error::new(frankensqlite::FrankenError::OutOfMemory))
} else {
let mut semantic_delta = WatchSemanticDelta::default();
ingest_batch_with_semantic_delta(
storage,
Some(t_index),
data_dir,
convs,
progress,
LexicalPopulationStrategy::IncrementalInline,
defer_checkpoints,
capture_semantic_delta.then_some(&mut semantic_delta),
)
};
match batch_result {
Ok(batch_outcome) => Ok(WatchIngestBatchOutcome {
batch_outcome,
processed_conversations: convs.len(),
quarantined_conversations: 0,
max_payload_watermark_ms: conversations_payload_watermark_ms(convs),
}),
Err(error) if error_is_out_of_memory(&error) && convs.len() > 1 => {
let split_at = convs.len() / 2;
tracing::warn!(
conversations = convs.len(),
left = split_at,
right = convs.len().saturating_sub(split_at),
error = %error,
"watch ingest batch ran out of memory; retrying as smaller batches"
);
let mut merged = ingest_watch_batch_with_oom_split(
storage,
t_index,
data_dir,
&convs[..split_at],
progress,
defer_checkpoints,
capture_semantic_delta,
)?;
let right = ingest_watch_batch_with_oom_split(
storage,
t_index,
data_dir,
&convs[split_at..],
progress,
defer_checkpoints,
capture_semantic_delta,
)?;
merged.merge(right);
Ok(merged)
}
Err(error) if error_is_out_of_memory(&error) => {
let conv = &convs[0];
record_watch_poison_conversation(data_dir, conv, &error)?;
if let Some(progress) = progress {
progress.current.fetch_add(1, Ordering::Relaxed);
}
tracing::warn!(
agent = %conv.agent_slug,
external_id = conv.external_id.as_deref().unwrap_or(""),
source_path = %conv.source_path.display(),
error = %error,
"single watch conversation ran out of memory; quarantined and advancing watch progress"
);
Ok(WatchIngestBatchOutcome {
batch_outcome: persist::PersistBatchOutcome::default(),
processed_conversations: 1,
quarantined_conversations: 1,
max_payload_watermark_ms: None,
})
}
Err(error) => Err(error),
}
}
/// Returns the largest per-conversation payload watermark across `convs`.
///
/// Conversations without any timestamp are ignored; the result is `None` when
/// the slice is empty or no conversation carries a timestamp.
fn conversations_payload_watermark_ms(convs: &[NormalizedConversation]) -> Option<i64> {
    let watermarks = convs.iter().flat_map(conversation_payload_watermark_ms);
    watermarks.max()
}
/// Sorts `convs` in place by payload watermark, then source path, then external id.
///
/// The sort is stable. Conversations without a watermark (`None`) order before
/// those with one, following the standard `Option` ordering.
fn sort_watch_conversations_for_watermark(convs: &mut [NormalizedConversation]) {
    convs.sort_by(|a, b| {
        // Tuples compare lexicographically, which yields exactly the
        // watermark -> source path -> external id precedence.
        let key_a = (
            conversation_payload_watermark_ms(a),
            &a.source_path,
            &a.external_id,
        );
        let key_b = (
            conversation_payload_watermark_ms(b),
            &b.source_path,
            &b.external_id,
        );
        key_a.cmp(&key_b)
    });
}
/// Returns the latest timestamp carried by a conversation.
///
/// Considers `started_at`, `ended_at`, and every message's `created_at`;
/// absent values are skipped. Yields `None` when none of them is set.
fn conversation_payload_watermark_ms(conv: &NormalizedConversation) -> Option<i64> {
    let message_timestamps = conv
        .messages
        .iter()
        .filter_map(|message| message.created_at);
    [conv.started_at, conv.ended_at]
        .into_iter()
        .flatten()
        .chain(message_timestamps)
        .max()
}
/// Raises the stored watermark for `kind` to at least `ts_val` and persists
/// the whole watermark map via `save_watch_state`.
///
/// The stored value never moves backwards: an existing entry is replaced only
/// when `ts_val` is larger.
///
/// # Errors
/// Fails when the state mutex is poisoned or when `save_watch_state` fails.
fn save_watch_state_watermark(
    data_dir: &Path,
    state: &Mutex<HashMap<ConnectorKind, i64>>,
    kind: ConnectorKind,
    ts_val: i64,
) -> Result<()> {
    let mut watermarks = match state.lock() {
        Ok(guard) => guard,
        Err(_) => return Err(anyhow::anyhow!("state lock poisoned")),
    };
    watermarks
        .entry(kind)
        .and_modify(|current| *current = (*current).max(ts_val))
        .or_insert(ts_val);
    // Persist while the lock is still held so the file reflects this update.
    save_watch_state(data_dir, &watermarks)?;
    Ok(())
}
/// Reports whether any cause in `error`'s chain is an out-of-memory failure.
///
/// A cause counts when it is a `frankensqlite::FrankenError::OutOfMemory`, or
/// when its rendered message is exactly one of the phrases accepted by
/// `error_message_is_exact_out_of_memory`.
fn error_is_out_of_memory(error: &anyhow::Error) -> bool {
    error.chain().any(|cause| {
        let typed_oom = matches!(
            cause.downcast_ref::<frankensqlite::FrankenError>(),
            Some(frankensqlite::FrankenError::OutOfMemory)
        );
        // Only render the message when the typed check did not already match.
        typed_oom || error_message_is_exact_out_of_memory(&cause.to_string())
    })
}
/// Reports whether `message`, ignoring surrounding whitespace and ASCII case,
/// is exactly `"out of memory"` or `"not enough memory"`.
///
/// Messages that merely contain one of those phrases do not match.
fn error_message_is_exact_out_of_memory(message: &str) -> bool {
    const EXACT_PHRASES: [&str; 2] = ["out of memory", "not enough memory"];
    let normalized = message.trim();
    EXACT_PHRASES
        .iter()
        .any(|phrase| normalized.eq_ignore_ascii_case(phrase))
}
/// Renders every cause of `error` on one line, joined by `" | "`.
///
/// Each cause's message is trimmed; blank messages are dropped and a message
/// identical to the one kept just before it is collapsed. When nothing
/// survives, the top-level error's own rendering is returned.
fn format_error_chain(error: &anyhow::Error) -> String {
    let mut segments: Vec<String> = error
        .chain()
        .map(|cause| cause.to_string().trim().to_owned())
        .filter(|segment| !segment.is_empty())
        .collect();
    // Removes consecutive repeats only; non-adjacent duplicates are kept.
    segments.dedup();
    if segments.is_empty() {
        error.to_string()
    } else {
        segments.join(" | ")
    }
}
fn record_watch_poison_conversation(
data_dir: &Path,
conv: &NormalizedConversation,
error: &anyhow::Error,
) -> Result<()> {
record_poison_conversation(
data_dir,
WATCH_INGEST_POISON_FILE,
"watch-ingest-out-of-memory",
conv,
error,
)
}
fn record_index_poison_conversation(
data_dir: &Path,
conv: &NormalizedConversation,
error: &anyhow::Error,
) -> Result<()> {
record_poison_conversation(
data_dir,
INDEX_INGEST_POISON_FILE,
"index-ingest-out-of-memory",
conv,
error,
)
}
/// File name (inside `<data_dir>/quarantine/`) of the JSONL log that receives
/// conversations quarantined by watch ingest.
const WATCH_INGEST_POISON_FILE: &str = "watch_ingest_poison.jsonl";
/// File name (inside `<data_dir>/quarantine/`) of the JSONL log that receives
/// conversations quarantined by index ingest.
const INDEX_INGEST_POISON_FILE: &str = "index_ingest_poison.jsonl";
/// Value written to the `schema_version` field of each JSONL quarantine record
/// and reported by the quarantine summary.
const POISON_CONVERSATION_QUARANTINE_SCHEMA_VERSION: i64 = 1;
/// Fallback length, in seconds, of the "recent" window used by the quarantine
/// circuit breaker when no valid override is configured.
const INGEST_QUARANTINE_CIRCUIT_DEFAULT_WINDOW_SECONDS: i64 = 3_600;
/// Fallback number of recently quarantined conversations at which the circuit
/// breaker is reported active when no valid override is configured.
const INGEST_QUARANTINE_CIRCUIT_DEFAULT_LIMIT: usize = 25;
fn record_poison_conversation(
data_dir: &Path,
file_name: &str,
reason: &str,
conv: &NormalizedConversation,
error: &anyhow::Error,
) -> Result<()> {
let quarantine_dir = data_dir.join("quarantine");
fs::create_dir_all(&quarantine_dir).with_context(|| {
format!(
"creating ingest quarantine directory {}",
quarantine_dir.display()
)
})?;
let now_ms = FrankenStorage::now_millis();
let conversation_id = poison_conversation_id(conv);
let schema_version_at_quarantine = crate::storage::sqlite::CURRENT_SCHEMA_VERSION;
let key = (conversation_id.clone(), schema_version_at_quarantine);
let path = quarantine_dir.join(file_name);
let mut raw_preserved_lines = Vec::new();
let mut records = BTreeMap::<(String, i64), serde_json::Value>::new();
if path.exists() {
let contents = fs::read_to_string(&path)
.with_context(|| format!("reading ingest quarantine file {}", path.display()))?;
for line in contents.lines() {
let trimmed = line.trim();
if trimmed.is_empty() {
continue;
}
match serde_json::from_str::<serde_json::Value>(trimmed) {
Ok(value) => {
if let Some(existing_key) = poison_record_key_from_value(&value) {
records.insert(existing_key, value);
} else {
raw_preserved_lines.push(trimmed.to_string());
}
}
Err(_) => raw_preserved_lines.push(trimmed.to_string()),
}
}
}
let previous = records.remove(&key);
let first_quarantined_at_ms = previous
.as_ref()
.and_then(poison_record_first_quarantined_at_ms)
.unwrap_or(now_ms);
let attempt_count = previous
.as_ref()
.and_then(|value| value.get("attempt_count"))
.and_then(serde_json::Value::as_u64)
.unwrap_or(0)
.saturating_add(1);
let full_error_chain = format_error_chain(error);
let record = serde_json::json!({
"schema_version": POISON_CONVERSATION_QUARANTINE_SCHEMA_VERSION,
"conversation_id": conversation_id,
"schema_version_at_quarantine": schema_version_at_quarantine,
"first_quarantined_at_ms": first_quarantined_at_ms,
"last_attempt_at_ms": now_ms,
"attempt_count": attempt_count,
"cass_version_at_quarantine": current_cass_version(),
"reason": reason,
"error_kind": "out-of-memory",
"last_error": full_error_chain,
"error": error.to_string(),
"agent_slug": conv.agent_slug,
"external_id": conv.external_id.as_deref(),
"source_path": conv.source_path.display().to_string(),
"workspace": conv.workspace.as_ref().map(|path| path.display().to_string()),
"started_at": conv.started_at,
"ended_at": conv.ended_at,
"message_count": conv.messages.len(),
});
records.insert(key, record);
let mut file = OpenOptions::new()
.create(true)
.write(true)
.truncate(true)
.open(&path)
.with_context(|| format!("opening ingest quarantine file {}", path.display()))?;
for line in raw_preserved_lines {
writeln!(file, "{line}")
.with_context(|| format!("preserving ingest quarantine record {}", path.display()))?;
}
for record in records.values() {
writeln!(file, "{record}")
.with_context(|| format!("writing ingest quarantine record {}", path.display()))?;
}
file.sync_all()
.with_context(|| format!("syncing ingest quarantine record {}", path.display()))?;
record_structured_poison_quarantine_state(
data_dir,
&conversation_id,
schema_version_at_quarantine,
reason,
error,
now_ms,
);
Ok(())
}
/// Returns the crate version captured at compile time from
/// `CARGO_PKG_VERSION`.
fn current_cass_version() -> &'static str {
    const VERSION: &str = env!("CARGO_PKG_VERSION");
    VERSION
}
/// Summary of index-ingest quarantine records that were written by a version
/// other than the one currently running.
#[derive(Debug, Clone, PartialEq, Eq)]
struct StaleIndexIngestQuarantineRetry {
    /// Number of distinct record keys with a different or missing version.
    stale_records: usize,
    /// Number of distinct record keys that carried no version at all.
    legacy_records: usize,
    /// Sorted, de-duplicated versions seen on stale records; records without a
    /// version contribute the literal `"unknown"`.
    previous_versions: Vec<String>,
}

/// Collects stale quarantine record keys while scanning quarantine sources.
#[derive(Default)]
struct StalePoisonVersionAccumulator {
    stale_keys: BTreeSet<(String, i64)>,
    legacy_keys: BTreeSet<(String, i64)>,
    previous_versions: BTreeSet<String>,
}

impl StalePoisonVersionAccumulator {
    /// Registers one quarantine record.
    ///
    /// Records whose `cass_version` equals `current_version` are ignored. Any
    /// other record is counted as stale, and one without a version is counted
    /// as legacy too.
    fn observe(&mut self, key: (String, i64), cass_version: Option<&str>, current_version: &str) {
        if cass_version == Some(current_version) {
            return;
        }
        let previous_version = match cass_version {
            Some(version) => version,
            None => {
                self.legacy_keys.insert(key.clone());
                "unknown"
            }
        };
        self.stale_keys.insert(key);
        self.previous_versions.insert(previous_version.to_string());
    }

    /// Consumes the accumulator, returning `None` when no stale record was
    /// observed and the collected summary otherwise.
    fn finish(self) -> Option<StaleIndexIngestQuarantineRetry> {
        if self.stale_keys.is_empty() {
            return None;
        }
        Some(StaleIndexIngestQuarantineRetry {
            stale_records: self.stale_keys.len(),
            legacy_records: self.legacy_keys.len(),
            previous_versions: self.previous_versions.into_iter().collect(),
        })
    }
}
fn stale_index_ingest_quarantine_version_retry(
data_dir: &Path,
) -> Result<Option<StaleIndexIngestQuarantineRetry>> {
let current_version = current_cass_version();
let mut accumulator = StalePoisonVersionAccumulator::default();
let path = data_dir.join("quarantine").join(INDEX_INGEST_POISON_FILE);
if path.exists() {
let contents = fs::read_to_string(&path)
.with_context(|| format!("reading index ingest quarantine file {}", path.display()))?;
for line in contents.lines() {
let trimmed = line.trim();
if trimmed.is_empty() {
continue;
}
let Ok(value) = serde_json::from_str::<serde_json::Value>(trimmed) else {
continue;
};
if let Some(key) = poison_record_key_from_value(&value) {
accumulator.observe(key, poison_record_cass_version(&value), current_version);
}
}
}
let state = QuarantineState::load(data_dir);
for (key, record) in state.iter() {
if record.last_reason.starts_with("index-ingest-out-of-memory") {
accumulator.observe(
(key.conversation_id, i64::from(key.schema_version)),
record.cass_version_at_quarantine.as_deref(),
current_version,
);
}
}
Ok(accumulator.finish())
}
/// Stamps stale index-ingest quarantine records, in both the JSONL log and the
/// structured state, with the current version.
///
/// Returns the combined number of records marked (saturating).
///
/// # Errors
/// Propagates a failure from the JSONL pass; in that case the structured pass
/// is not attempted.
fn mark_stale_index_ingest_quarantine_retry_attempted(data_dir: &Path) -> Result<usize> {
    let from_jsonl = mark_stale_index_ingest_jsonl_retry_attempted(data_dir)?;
    let from_structured_state = mark_stale_index_ingest_structured_retry_attempted(data_dir);
    Ok(from_jsonl.saturating_add(from_structured_state))
}
fn mark_stale_index_ingest_jsonl_retry_attempted(data_dir: &Path) -> Result<usize> {
let path = data_dir.join("quarantine").join(INDEX_INGEST_POISON_FILE);
if !path.exists() {
return Ok(0);
}
let contents = fs::read_to_string(&path)
.with_context(|| format!("reading index ingest quarantine file {}", path.display()))?;
let current_version = current_cass_version();
let mut retained_lines = Vec::new();
let mut marked = 0usize;
for line in contents.lines() {
let trimmed = line.trim();
if trimmed.is_empty() {
continue;
}
let Ok(mut value) = serde_json::from_str::<serde_json::Value>(trimmed) else {
retained_lines.push(trimmed.to_string());
continue;
};
let should_mark = poison_record_key_from_value(&value).is_some()
&& poison_record_version_needs_retry(
poison_record_cass_version(&value),
current_version,
);
if should_mark && let Some(object) = value.as_object_mut() {
object.insert(
"cass_version_at_quarantine".to_string(),
serde_json::json!(current_version),
);
marked = marked.saturating_add(1);
retained_lines.push(value.to_string());
} else {
retained_lines.push(trimmed.to_string());
}
}
if marked == 0 {
return Ok(0);
}
let mut file = OpenOptions::new()
.create(true)
.write(true)
.truncate(true)
.open(&path)
.with_context(|| format!("opening ingest quarantine file {}", path.display()))?;
for line in retained_lines {
writeln!(file, "{line}")
.with_context(|| format!("rewriting ingest quarantine file {}", path.display()))?;
}
file.sync_all()
.with_context(|| format!("syncing ingest quarantine file {}", path.display()))?;
Ok(marked)
}
/// Stamps stale index-ingest entries of the structured quarantine state with
/// the current version and saves the state.
///
/// Only entries whose last reason starts with `"index-ingest-out-of-memory"`
/// and whose stored version is missing or different are touched.
///
/// Returns the number of entries marked, or `0` when there was nothing to mark
/// or when saving failed (the failure is logged, not returned).
fn mark_stale_index_ingest_structured_retry_attempted(data_dir: &Path) -> usize {
    let mut state = QuarantineState::load(data_dir);
    if state.is_empty() {
        return 0;
    }
    let current_version = current_cass_version();
    let stale_records = state.entries.values_mut().filter(|record| {
        record.last_reason.starts_with("index-ingest-out-of-memory")
            && poison_record_version_needs_retry(
                record.cass_version_at_quarantine.as_deref(),
                current_version,
            )
    });
    let mut marked = 0usize;
    for record in stale_records {
        record.cass_version_at_quarantine = Some(current_version.to_string());
        marked = marked.saturating_add(1);
    }
    if marked == 0 {
        return 0;
    }
    match state.save(data_dir) {
        Ok(_) => marked,
        Err(err) => {
            tracing::warn!(
                data_dir = %data_dir.display(),
                error = %err,
                "failed to persist structured ingest quarantine retry marker"
            );
            0
        }
    }
}
/// Removes quarantine entries for conversations that have now been ingested
/// successfully.
///
/// Entries are cleared from the JSONL log `file_name` and from the structured
/// quarantine state. This is best-effort: a JSONL failure is logged and
/// counted as zero cleared records. An info event is emitted when at least one
/// record was removed.
fn clear_poison_conversations_after_successful_ingest(
    data_dir: &Path,
    file_name: &str,
    reason: &str,
    convs: &[NormalizedConversation],
) {
    if convs.is_empty() {
        return;
    }
    let conversation_ids: BTreeSet<String> = convs.iter().map(poison_conversation_id).collect();
    let jsonl_cleared =
        clear_poison_jsonl_records(data_dir, file_name, reason, &conversation_ids)
            .unwrap_or_else(|err| {
                tracing::warn!(
                    data_dir = %data_dir.display(),
                    file_name,
                    error = %err,
                    "failed to clear successful ingest records from poison quarantine JSONL"
                );
                0
            });
    let structured_cleared =
        clear_structured_poison_quarantine_records(data_dir, reason, &conversation_ids);
    let cleared = jsonl_cleared.saturating_add(structured_cleared);
    if cleared == 0 {
        return;
    }
    tracing::info!(
        data_dir = %data_dir.display(),
        file_name,
        reason,
        cleared,
        "cleared poison quarantine records after successful ingest retry"
    );
}
fn clear_poison_jsonl_records(
data_dir: &Path,
file_name: &str,
reason: &str,
conversation_ids: &BTreeSet<String>,
) -> Result<usize> {
let path = data_dir.join("quarantine").join(file_name);
if !path.exists() {
return Ok(0);
}
let contents = fs::read_to_string(&path)
.with_context(|| format!("reading ingest quarantine file {}", path.display()))?;
let mut retained_lines = Vec::new();
let mut cleared = 0usize;
for line in contents.lines() {
let trimmed = line.trim();
if trimmed.is_empty() {
continue;
}
let should_clear = serde_json::from_str::<serde_json::Value>(trimmed)
.ok()
.and_then(|value| {
let record_reason_matches = value
.get("reason")
.and_then(serde_json::Value::as_str)
.is_none_or(|record_reason| record_reason == reason);
record_reason_matches
.then(|| poison_record_key_from_value(&value))
.flatten()
})
.is_some_and(|(conversation_id, _)| conversation_ids.contains(&conversation_id));
if should_clear {
cleared = cleared.saturating_add(1);
} else {
retained_lines.push(trimmed.to_string());
}
}
if cleared == 0 {
return Ok(0);
}
let mut file = OpenOptions::new()
.create(true)
.write(true)
.truncate(true)
.open(&path)
.with_context(|| format!("opening ingest quarantine file {}", path.display()))?;
for line in retained_lines {
writeln!(file, "{line}")
.with_context(|| format!("rewriting ingest quarantine file {}", path.display()))?;
}
file.sync_all()
.with_context(|| format!("syncing ingest quarantine file {}", path.display()))?;
Ok(cleared)
}
/// Removes entries for `conversation_ids` from the structured quarantine
/// state and saves it.
///
/// An entry is removed when its conversation id is in `conversation_ids` and
/// its last reason starts with `reason`.
///
/// Returns the number of entries removed, or `0` when nothing matched or when
/// saving failed (the failure is logged, not returned).
fn clear_structured_poison_quarantine_records(
    data_dir: &Path,
    reason: &str,
    conversation_ids: &BTreeSet<String>,
) -> usize {
    let mut state = QuarantineState::load(data_dir);
    if state.is_empty() {
        return 0;
    }
    // Collect the keys first so the state can be mutated afterwards.
    let matching_keys: Vec<_> = state
        .iter()
        .filter_map(|(key, record)| {
            let is_match = conversation_ids.contains(&key.conversation_id)
                && record.last_reason.starts_with(reason);
            is_match.then_some(key)
        })
        .collect();
    if matching_keys.is_empty() {
        return 0;
    }
    let cleared = matching_keys
        .iter()
        .filter(|key| state.clear(key))
        .count();
    match state.save(data_dir) {
        Ok(_) => cleared,
        Err(err) => {
            tracing::warn!(
                data_dir = %data_dir.display(),
                error = %err,
                "failed to persist structured ingest quarantine cleanup"
            );
            0
        }
    }
}
/// Records one quarantine attempt in the structured quarantine state.
///
/// This is best-effort and never fails the caller. The update is skipped with
/// a warning when the schema version does not fit in `u32` or when `now_ms`
/// cannot be converted to a UTC timestamp; a save failure is logged too.
///
/// The stored reason text is `"{reason}: {error}"`.
fn record_structured_poison_quarantine_state(
    data_dir: &Path,
    conversation_id: &str,
    schema_version_at_quarantine: i64,
    reason: &str,
    error: &anyhow::Error,
    now_ms: i64,
) {
    let schema_version = match u32::try_from(schema_version_at_quarantine) {
        Ok(version) => version,
        Err(_) => {
            tracing::warn!(
                schema_version_at_quarantine,
                "skipping structured ingest quarantine state update because schema version is out of range"
            );
            return;
        }
    };
    let now = match chrono::DateTime::<chrono::Utc>::from_timestamp_millis(now_ms) {
        Some(timestamp) => timestamp,
        None => {
            tracing::warn!(
                now_ms,
                "skipping structured ingest quarantine state update because timestamp is invalid"
            );
            return;
        }
    };
    let mut state = QuarantineState::load(data_dir);
    let key = QuarantineKey::new(conversation_id, schema_version);
    let attempt_reason = format!("{reason}: {error}");
    state.record_attempt(&key, attempt_reason, now);
    if let Err(err) = state.save(data_dir) {
        tracing::warn!(
            data_dir = %data_dir.display(),
            error = %err,
            "failed to persist structured ingest quarantine state"
        );
    }
}
/// Builds the quarantine identifier of a conversation.
///
/// The identifier is the `|`-joined sequence: agent slug, source path,
/// workspace path (empty when absent), external id (empty when absent),
/// `started_at` and `ended_at` (each `0` when absent), and the message count.
fn poison_conversation_id(conv: &NormalizedConversation) -> String {
    let workspace = match conv.workspace.as_ref() {
        Some(path) => path.display().to_string(),
        None => String::new(),
    };
    let parts = [
        conv.agent_slug.to_string(),
        conv.source_path.display().to_string(),
        workspace,
        conv.external_id.as_deref().unwrap_or("").to_string(),
        conv.started_at.unwrap_or_default().to_string(),
        conv.ended_at.unwrap_or_default().to_string(),
        conv.messages.len().to_string(),
    ];
    parts.join("|")
}
/// Derives the `(conversation id, schema version)` key of a JSONL quarantine
/// record.
///
/// The schema version comes from `schema_version_at_quarantine`, falling back
/// to the current storage schema version. The conversation id is the record's
/// non-blank `conversation_id` when present. Otherwise it is rebuilt from the
/// individual fields in the same `|`-joined layout used by
/// `poison_conversation_id`, which requires string `agent_slug` and
/// `source_path` fields.
///
/// Returns `None` when neither form of identifier is available.
fn poison_record_key_from_value(value: &serde_json::Value) -> Option<(String, i64)> {
    let text_field = |field: &str| value.get(field).and_then(serde_json::Value::as_str);
    let int_field = |field: &str| value.get(field).and_then(serde_json::Value::as_i64);

    let schema_version_at_quarantine = int_field("schema_version_at_quarantine")
        .unwrap_or(crate::storage::sqlite::CURRENT_SCHEMA_VERSION);

    // Preferred form: an explicit, non-blank conversation id (stored untrimmed).
    if let Some(conversation_id) =
        text_field("conversation_id").filter(|id| !id.trim().is_empty())
    {
        return Some((conversation_id.to_string(), schema_version_at_quarantine));
    }

    // Fallback: reconstruct the identifier from the individual fields.
    let agent_slug = text_field("agent_slug")?;
    let source_path = text_field("source_path")?;
    let workspace = text_field("workspace").unwrap_or("");
    let external_id = text_field("external_id").unwrap_or("");
    let started_at = int_field("started_at").unwrap_or_default();
    let ended_at = int_field("ended_at").unwrap_or_default();
    let message_count = value
        .get("message_count")
        .and_then(serde_json::Value::as_u64)
        .unwrap_or_default();
    let conversation_id = format!(
        "{agent_slug}|{source_path}|{workspace}|{external_id}|{started_at}|{ended_at}|{message_count}"
    );
    Some((conversation_id, schema_version_at_quarantine))
}
fn poison_record_cass_version(value: &serde_json::Value) -> Option<&str> {
value
.get("cass_version_at_quarantine")
.and_then(serde_json::Value::as_str)
.map(str::trim)
.filter(|version| !version.is_empty())
}
/// Reports whether a quarantine record should be retried: true unless it was
/// stamped with exactly `current_version`. A missing version needs a retry.
fn poison_record_version_needs_retry(cass_version: Option<&str>, current_version: &str) -> bool {
    cass_version != Some(current_version)
}
/// Returns when a quarantine record was first written.
///
/// Reads the integer `first_quarantined_at_ms` field, falling back to the
/// integer `recorded_at_ms` field; `None` when neither is present as an
/// integer.
fn poison_record_first_quarantined_at_ms(value: &serde_json::Value) -> Option<i64> {
    ["first_quarantined_at_ms", "recorded_at_ms"]
        .into_iter()
        .find_map(|field| value.get(field).and_then(serde_json::Value::as_i64))
}
/// Serializable health summary of the conversation ingest quarantine, as
/// produced by `conversation_ingest_quarantine_summary`.
#[derive(Debug, Clone, serde::Serialize)]
pub struct ConversationIngestQuarantineSummary {
/// Quarantine record schema version reported with the summary.
pub schema_version: i64,
/// `"critical"` when the circuit breaker is active, `"degraded"` when any
/// conversation is quarantined, `"ok"` otherwise.
pub status: String,
/// Number of distinct (conversation id, schema version) keys found across
/// the structured state and the JSONL quarantine files.
pub quarantined_conversations: usize,
/// Number of distinct keys whose last attempt falls inside the recent window.
pub recent_quarantined_conversations: usize,
/// Length of the recent window, in seconds.
pub recent_window_seconds: i64,
/// Recent-quarantine count at which the circuit breaker is reported active;
/// `0` disables the breaker.
pub circuit_breaker_limit: usize,
/// Whether the recent-quarantine count reached `circuit_breaker_limit`.
pub circuit_breaker_active: bool,
/// Display paths of the quarantine files that exist on disk.
pub quarantine_files: Vec<String>,
/// Most recent last-attempt timestamp (ms) seen in any record, if any.
pub newest_last_attempt_at_ms: Option<i64>,
/// Operator guidance; `None` when nothing is quarantined.
pub recommended_action: Option<String>,
}
/// Builds a health summary of the ingest quarantine stored under `data_dir`.
///
/// Records are gathered from the structured quarantine state (when its file
/// exists) and from the watch- and index-ingest JSONL logs, and are counted by
/// distinct `(conversation id, schema version)` key. A record is "recent" when
/// its last attempt is at or after `now - recent window`. The circuit breaker
/// is reported active when the limit is non-zero and the recent count reaches
/// it.
///
/// This function never fails: unreadable JSONL files and unparseable lines are
/// skipped.
pub fn conversation_ingest_quarantine_summary(
    data_dir: &Path,
) -> ConversationIngestQuarantineSummary {
    /// Folds `candidate` into the running maximum timestamp.
    fn newest(current: Option<i64>, candidate: i64) -> Option<i64> {
        Some(current.map_or(candidate, |seen| seen.max(candidate)))
    }

    let quarantine_dir = data_dir.join("quarantine");
    let mut all_keys = BTreeSet::<(String, i64)>::new();
    let mut recent_keys = BTreeSet::<(String, i64)>::new();
    let mut quarantine_files = Vec::new();
    let mut newest_last_attempt_at_ms: Option<i64> = None;
    let recent_window_seconds = ingest_quarantine_circuit_window_seconds();
    let recent_cutoff_ms =
        FrankenStorage::now_millis().saturating_sub(recent_window_seconds.saturating_mul(1_000));
    let circuit_breaker_limit = ingest_quarantine_circuit_limit();

    // Source 1: the structured quarantine state.
    let state_path = QuarantineState::path(data_dir);
    if state_path.exists() {
        quarantine_files.push(state_path.display().to_string());
        let state = QuarantineState::load(data_dir);
        for (key, record) in state.iter() {
            let attempted_at_ms = record.last_attempt_at.timestamp_millis();
            let summary_key = (key.conversation_id, i64::from(key.schema_version));
            if attempted_at_ms >= recent_cutoff_ms {
                recent_keys.insert(summary_key.clone());
            }
            all_keys.insert(summary_key);
            newest_last_attempt_at_ms = newest(newest_last_attempt_at_ms, attempted_at_ms);
        }
    }

    // Source 2: the JSONL quarantine logs.
    for file_name in [WATCH_INGEST_POISON_FILE, INDEX_INGEST_POISON_FILE] {
        let path = quarantine_dir.join(file_name);
        if !path.exists() {
            continue;
        }
        quarantine_files.push(path.display().to_string());
        let Ok(contents) = fs::read_to_string(&path) else {
            continue;
        };
        let parsed_records = contents
            .lines()
            .map(str::trim)
            .filter(|line| !line.is_empty())
            .filter_map(|line| serde_json::from_str::<serde_json::Value>(line).ok());
        for value in parsed_records {
            let key = poison_record_key_from_value(&value);
            // Older records may only carry `recorded_at_ms`.
            let attempted_at_ms = ["last_attempt_at_ms", "recorded_at_ms"]
                .into_iter()
                .find_map(|field| value.get(field).and_then(serde_json::Value::as_i64));
            if let Some(attempted_at_ms) = attempted_at_ms {
                if attempted_at_ms >= recent_cutoff_ms {
                    if let Some(recent_key) = key.as_ref() {
                        recent_keys.insert(recent_key.clone());
                    }
                }
                newest_last_attempt_at_ms = newest(newest_last_attempt_at_ms, attempted_at_ms);
            }
            if let Some(key) = key {
                all_keys.insert(key);
            }
        }
    }

    let quarantined_conversations = all_keys.len();
    let recent_quarantined_conversations = recent_keys.len();
    let circuit_breaker_active =
        circuit_breaker_limit > 0 && recent_quarantined_conversations >= circuit_breaker_limit;
    let has_quarantined = quarantined_conversations > 0;

    let status = match (circuit_breaker_active, has_quarantined) {
        (true, _) => "critical",
        (false, true) => "degraded",
        (false, false) => "ok",
    }
    .to_string();
    let recommended_action = match (circuit_breaker_active, has_quarantined) {
        (true, _) => Some(
            "Quarantine volume exceeded the recent circuit-breaker threshold; pause the watcher, inspect the listed quarantine file(s), then retry repaired source paths with `cass index --watch-once <path> --json --no-progress-events` before resuming watch."
                .to_string(),
        ),
        (false, true) => Some(
            "Inspect the listed quarantine file(s), then retry repaired source paths with `cass index --watch-once <path> --json --no-progress-events` or run a bounded full refresh."
                .to_string(),
        ),
        (false, false) => None,
    };

    ConversationIngestQuarantineSummary {
        schema_version: POISON_CONVERSATION_QUARANTINE_SCHEMA_VERSION,
        status,
        quarantined_conversations,
        recent_quarantined_conversations,
        recent_window_seconds,
        circuit_breaker_limit,
        circuit_breaker_active,
        quarantine_files,
        newest_last_attempt_at_ms,
        recommended_action,
    }
}
/// Returns the circuit-breaker "recent" window in seconds.
///
/// Reads `CASS_INGEST_QUARANTINE_CIRCUIT_WINDOW_SECS`; values that are
/// missing, unparseable, zero, or negative fall back to the default window.
fn ingest_quarantine_circuit_window_seconds() -> i64 {
    let configured = dotenvy::var("CASS_INGEST_QUARANTINE_CIRCUIT_WINDOW_SECS")
        .ok()
        .and_then(|raw| raw.trim().parse::<i64>().ok());
    match configured {
        Some(seconds) if seconds > 0 => seconds,
        _ => INGEST_QUARANTINE_CIRCUIT_DEFAULT_WINDOW_SECONDS,
    }
}
/// Returns the circuit-breaker limit on recently quarantined conversations.
///
/// Reads `CASS_INGEST_QUARANTINE_CIRCUIT_LIMIT`; a missing or unparseable
/// value falls back to the default. Zero is accepted as-is.
fn ingest_quarantine_circuit_limit() -> usize {
    match dotenvy::var("CASS_INGEST_QUARANTINE_CIRCUIT_LIMIT") {
        Ok(raw) => raw
            .trim()
            .parse::<usize>()
            .unwrap_or(INGEST_QUARANTINE_CIRCUIT_DEFAULT_LIMIT),
        Err(_) => INGEST_QUARANTINE_CIRCUIT_DEFAULT_LIMIT,
    }
}
/// Test builds: reports whether a synthetic watch-ingest OOM should be
/// injected for this batch.
///
/// True when `CASS_TEST_WATCH_INGEST_OOM_MIN_CONVS` parses to a positive
/// number and the batch holds at least that many conversations.
#[cfg(test)]
fn should_inject_watch_ingest_test_oom(convs: &[NormalizedConversation]) -> bool {
    let Ok(raw) = dotenvy::var("CASS_TEST_WATCH_INGEST_OOM_MIN_CONVS") else {
        return false;
    };
    match raw.parse::<usize>() {
        Ok(min) => min > 0 && convs.len() >= min,
        Err(_) => false,
    }
}

/// Non-test builds: OOM injection is compiled out and never triggers.
#[cfg(not(test))]
fn should_inject_watch_ingest_test_oom(_convs: &[NormalizedConversation]) -> bool {
    false
}
/// Test builds: reports whether a synthetic non-watch ingest OOM should be
/// injected for this batch.
///
/// True when `CASS_TEST_NON_WATCH_INGEST_OOM_MIN_CONVS` parses to a positive
/// number and the batch holds at least that many conversations.
#[cfg(test)]
fn should_inject_non_watch_ingest_test_oom(convs: &[NormalizedConversation]) -> bool {
    let Ok(raw) = dotenvy::var("CASS_TEST_NON_WATCH_INGEST_OOM_MIN_CONVS") else {
        return false;
    };
    match raw.parse::<usize>() {
        Ok(min) => min > 0 && convs.len() >= min,
        Err(_) => false,
    }
}

/// Non-test builds: OOM injection is compiled out and never triggers.
#[cfg(not(test))]
fn should_inject_non_watch_ingest_test_oom(_convs: &[NormalizedConversation]) -> bool {
    false
}
pub use crate::connectors::get_connector_factories;
/// Builds the list of `(connector kind, scan root)` pairs to watch.
///
/// For every configured connector factory whose slug maps to a
/// `ConnectorKind`, the connector is instantiated and, when it reports being
/// detected, each detected root path is added as a local scan root. Then every
/// additional scan root is paired with every recognised connector kind,
/// whether or not that connector was detected.
fn build_watch_roots(additional_scan_roots: Vec<ScanRoot>) -> Vec<(ConnectorKind, ScanRoot)> {
    let mut roots = Vec::new();
    let mut known_kinds = Vec::new();

    for (name, factory) in configured_connector_factories() {
        // Factories with an unrecognised slug are skipped without being run.
        let Some(kind) = ConnectorKind::from_slug(name) else {
            continue;
        };
        known_kinds.push(kind);
        let detection = factory().detect();
        if detection.detected {
            roots.extend(
                detection
                    .root_paths
                    .into_iter()
                    .map(|root_path| (kind, ScanRoot::local(root_path))),
            );
        }
    }

    for configured_root in additional_scan_roots {
        roots.extend(
            known_kinds
                .iter()
                .map(|kind| (*kind, configured_root.clone())),
        );
    }

    roots
}
impl ConnectorKind {
    /// Maps a connector slug to its kind; returns `None` for unknown slugs.
    ///
    /// This is the inverse of [`ConnectorKind::slug`].
    fn from_slug(slug: &str) -> Option<Self> {
        let kind = match slug {
            "codex" => Self::Codex,
            "cline" => Self::Cline,
            "gemini" => Self::Gemini,
            "claude" => Self::Claude,
            "clawdbot" => Self::Clawdbot,
            "vibe" => Self::Vibe,
            "amp" => Self::Amp,
            "opencode" => Self::OpenCode,
            "aider" => Self::Aider,
            "cursor" => Self::Cursor,
            "chatgpt" => Self::ChatGpt,
            "pi_agent" => Self::PiAgent,
            "factory" => Self::Factory,
            "openclaw" => Self::OpenClaw,
            "copilot" => Self::Copilot,
            "kimi" => Self::Kimi,
            "copilot_cli" => Self::CopilotCli,
            "qwen" => Self::Qwen,
            _ => return None,
        };
        Some(kind)
    }

    /// Returns the slug for this connector kind.
    ///
    /// The match is exhaustive on purpose, so adding a variant forces a slug
    /// to be chosen here.
    fn slug(self) -> &'static str {
        let slug = match self {
            Self::Codex => "codex",
            Self::Cline => "cline",
            Self::Gemini => "gemini",
            Self::Claude => "claude",
            Self::Clawdbot => "clawdbot",
            Self::Vibe => "vibe",
            Self::Amp => "amp",
            Self::OpenCode => "opencode",
            Self::Aider => "aider",
            Self::Cursor => "cursor",
            Self::ChatGpt => "chatgpt",
            Self::PiAgent => "pi_agent",
            Self::Factory => "factory",
            Self::OpenClaw => "openclaw",
            Self::Copilot => "copilot",
            Self::Kimi => "kimi",
            Self::CopilotCli => "copilot_cli",
            Self::Qwen => "qwen",
        };
        slug
    }

    /// Instantiates a fresh connector for this kind via its `new()`
    /// constructor.
    fn create_connector(&self) -> Box<dyn Connector + Send> {
        let connector: Box<dyn Connector + Send> = match *self {
            Self::Codex => Box::new(CodexConnector::new()),
            Self::Cline => Box::new(ClineConnector::new()),
            Self::Gemini => Box::new(GeminiConnector::new()),
            Self::Claude => Box::new(ClaudeCodeConnector::new()),
            Self::Clawdbot => Box::new(ClawdbotConnector::new()),
            Self::Vibe => Box::new(VibeConnector::new()),
            Self::Amp => Box::new(AmpConnector::new()),
            Self::OpenCode => Box::new(OpenCodeConnector::new()),
            Self::Aider => Box::new(AiderConnector::new()),
            Self::Cursor => Box::new(CursorConnector::new()),
            Self::ChatGpt => Box::new(ChatGptConnector::new()),
            Self::PiAgent => Box::new(PiAgentConnector::new()),
            Self::Factory => Box::new(FactoryConnector::new()),
            Self::OpenClaw => Box::new(OpenClawConnector::new()),
            Self::Copilot => Box::new(CopilotConnector::new()),
            Self::Kimi => Box::new(KimiConnector::new()),
            Self::CopilotCli => Box::new(CopilotCliConnector::new()),
            Self::Qwen => Box::new(QwenConnector::new()),
        };
        connector
    }
}
/// Returns the number of conversations ingested per watch batch.
///
/// Reads `CASS_WATCH_INGEST_CHUNK_SIZE`. Surrounding whitespace is ignored,
/// matching the other environment parsers in this module; integer parsing
/// itself rejects whitespace, so an untrimmed value would silently fall back
/// to the default. Missing, unparseable, or zero values yield the default
/// chunk size; values above the safe cap are clamped to it with a warning.
fn watch_ingest_chunk_size() -> usize {
    let requested = dotenvy::var("CASS_WATCH_INGEST_CHUNK_SIZE")
        .ok()
        .and_then(|value| value.trim().parse::<usize>().ok());
    match requested {
        Some(0) | None => WATCH_INGEST_DEFAULT_CHUNK_SIZE,
        Some(value) if value > WATCH_INGEST_CHUNK_SIZE_MAX => {
            tracing::warn!(
                env_var = "CASS_WATCH_INGEST_CHUNK_SIZE",
                requested = value,
                cap = WATCH_INGEST_CHUNK_SIZE_MAX,
                "watch ingest chunk size exceeds safe cap; clamping"
            );
            WATCH_INGEST_CHUNK_SIZE_MAX
        }
        Some(value) => value,
    }
}
/// Drives watch mode: either a one-shot reindex of explicit paths, or a
/// long-running filesystem watch loop over `roots`.
///
/// `callback(paths, roots, force_full)` performs the actual reindex. In
/// one-shot mode (`watch_once_paths` is `Some`) it is called at most once and
/// its error is returned. In the watch loop its errors are only logged, so the
/// loop keeps running.
///
/// The loop batches filesystem notifications, flushing them after a quiet
/// period (`debounce`) or once the oldest pending event is `max_wait` old,
/// but never more often than every `watch_interval_secs` seconds (minimum 1).
/// While waiting it also asks `stale_detector` every 300 seconds whether the
/// watcher looks stale.
///
/// Returns `Ok(())` after the one-shot run, or when the event channel reports
/// disconnection. Returns an error if the filesystem watcher cannot be
/// created.
fn watch_sources<F: Fn(Vec<PathBuf>, &[(ConnectorKind, ScanRoot)], bool) -> Result<()>>(
watch_once_paths: Option<Vec<PathBuf>>,
roots: Vec<(ConnectorKind, ScanRoot)>,
event_channel: Option<(Sender<IndexerEvent>, Receiver<IndexerEvent>)>,
stale_detector: Arc<StaleDetector>,
watch_interval_secs: u64,
callback: F,
) -> Result<()> {
// One-shot mode: reindex the given paths (if any) and return without
// starting a watcher.
if let Some(paths) = watch_once_paths {
if !paths.is_empty() {
callback(paths, &roots, false)?;
}
return Ok(());
}
// Use the caller-supplied channel when given, otherwise a private unbounded one.
let (tx, rx) = event_channel.unwrap_or_else(crossbeam_channel::unbounded);
let tx_clone = tx.clone();
// The watcher callback only forwards work onto the channel; send failures
// are ignored.
let mut watcher = recommended_watcher(move |res: notify::Result<notify::Event>| match res {
Ok(event) => {
// The backend asked for a rescan: request a full reindex instead of
// forwarding individual paths.
if event.need_rescan() {
let _ = tx_clone.send(IndexerEvent::Command(ReindexCommand::Full));
return;
}
if !watch_event_should_trigger_reindex(&event) || event.paths.is_empty() {
return;
}
let _ = tx_clone.send(IndexerEvent::Notify(event.paths));
}
Err(e) => {
tracing::warn!("filesystem watcher error: {}", e);
}
})?;
// A root that cannot be watched is logged and skipped; it does not abort
// watch mode.
for (_, root) in &roots {
if let Err(e) = watcher.watch(&root.path, RecursiveMode::Recursive) {
tracing::warn!("failed to watch {}: {}", root.path.display(), e);
} else {
tracing::info!("watching {}", root.path.display());
}
}
// Timing knobs: quiet period before flushing, upper bound on how long a
// pending batch may wait, minimum gap between scans, and stale-check period.
let debounce = Duration::from_secs(2);
let max_wait = Duration::from_secs(5);
let min_scan_interval = Duration::from_secs(watch_interval_secs.max(1));
let stale_check_interval = Duration::from_secs(300);
let mut pending: Vec<PathBuf> = Vec::new();
// When the oldest event of the current pending batch arrived.
let mut first_event: Option<Instant> = None;
let mut last_stale_check = Instant::now();
// Start `last_scan` in the past so the first batch is not held back by the
// cooldown. `checked_sub` can fail for a large offset, so progressively
// smaller offsets are tried before falling back to "now".
let mut last_scan = [
min_scan_interval,
Duration::from_secs(60),
Duration::from_secs(1),
]
.iter()
.find_map(|d| Instant::now().checked_sub(*d))
.unwrap_or_else(Instant::now);
tracing::info!(
watch_interval_secs,
"watch mode: minimum interval between scan cycles"
);
loop {
// Time still to wait before another scan is allowed.
let cooldown_remaining = min_scan_interval.saturating_sub(last_scan.elapsed());
// With nothing pending, sleep until the next stale check is due.
let timeout = if pending.is_empty() {
stale_check_interval
} else {
let now = Instant::now();
let elapsed = now.duration_since(first_event.unwrap_or(now));
if elapsed >= max_wait {
// The batch has waited long enough: flush now if the cooldown
// allows, otherwise sleep exactly until the cooldown ends.
if cooldown_remaining.is_zero() {
if let Err(error) = callback(std::mem::take(&mut pending), &roots, false) {
tracing::warn!(error = %error, "watch incremental callback failed");
}
last_scan = Instant::now();
first_event = None;
continue;
}
cooldown_remaining
} else {
// Wait for the debounce period (capped by what is left of
// `max_wait`), but never less than the remaining cooldown.
let remaining = max_wait.saturating_sub(elapsed);
debounce.min(remaining).max(cooldown_remaining)
}
};
match rx.recv_timeout(timeout) {
Ok(IndexerEvent::Notify(paths)) => {
// The first event of a new batch starts the `max_wait` clock.
if pending.is_empty() {
first_event = Some(Instant::now());
}
pending.extend(paths);
}
Ok(IndexerEvent::Command(cmd)) => match cmd {
ReindexCommand::Full => {
// Flush pending incremental work first, then run the full
// rebuild; both failures are logged only.
if !pending.is_empty()
&& let Err(error) = callback(std::mem::take(&mut pending), &roots, false)
{
tracing::warn!(error = %error, "watch incremental callback failed");
}
if let Err(error) = callback(vec![], &roots, true) {
tracing::warn!(error = %error, "watch rebuild callback failed");
}
last_scan = Instant::now();
first_event = None;
}
},
Err(crossbeam_channel::RecvTimeoutError::Timeout) => {
// No new events during the wait: flush the pending batch if the
// cooldown has elapsed.
if !pending.is_empty() && last_scan.elapsed() >= min_scan_interval {
if let Err(error) = callback(std::mem::take(&mut pending), &roots, false) {
tracing::warn!(error = %error, "watch incremental callback failed");
}
last_scan = Instant::now();
first_event = None;
}
// Periodic stale check, run only on timeouts.
let now = Instant::now();
if now.duration_since(last_stale_check) >= stale_check_interval {
last_stale_check = now;
if let Some(action) = stale_detector.check_stale() {
let stats = stale_detector.stats();
match action {
StaleAction::Warn => {
tracing::warn!(
consecutive_zero_scans = stats.consecutive_zero_scans,
seconds_since_last_ingest = ?stats.seconds_since_last_ingest,
total_ingests = stats.total_ingests,
"watch daemon appears stale: no conversations indexed recently"
);
tracing::info!(
"hint: run 'cass index --full' to rebuild, or set \
CASS_WATCH_STALE_ACTION=rebuild for auto-recovery"
);
}
StaleAction::Rebuild => {
tracing::warn!(
consecutive_zero_scans = stats.consecutive_zero_scans,
seconds_since_last_ingest = ?stats.seconds_since_last_ingest,
"stale state detected, triggering automatic full rebuild"
);
// Forced full rebuild with no explicit paths.
if let Err(error) = callback(vec![], &roots, true) {
tracing::warn!(
error = %error,
"watch stale-rebuild callback failed"
);
}
last_scan = Instant::now();
}
StaleAction::None => {
}
}
}
}
}
// The channel reported disconnection: leave the loop and return.
Err(crossbeam_channel::RecvTimeoutError::Disconnected) => break,
}
}
Ok(())
}
/// Test-only helper that empties the indexed data in `storage`.
///
/// Runs one transaction that deletes all rows from the usage/statistics,
/// message, conversation, agent, workspace, and tag tables and removes the
/// `last_scan_ts` entry from `meta`, then rebuilds the FTS index.
///
/// # Errors
/// Fails when the SQL batch or the FTS rebuild fails.
#[cfg(test)]
fn reset_storage(storage: &FrankenStorage) -> Result<()> {
// All deletes run inside a single BEGIN/COMMIT batch.
storage.raw().execute_batch(
"BEGIN TRANSACTION;
DELETE FROM usage_models_daily;
DELETE FROM usage_daily;
DELETE FROM usage_hourly;
DELETE FROM token_daily_stats;
DELETE FROM daily_stats;
DELETE FROM message_metrics;
DELETE FROM token_usage;
DELETE FROM snippets;
DELETE FROM messages;
DELETE FROM conversations;
DELETE FROM agents;
DELETE FROM workspaces;
DELETE FROM tags;
DELETE FROM conversation_tags;
DELETE FROM meta WHERE key = 'last_scan_ts';
COMMIT;",
)?;
// Rebuild the FTS index after the tables have been emptied.
storage.rebuild_fts()?;
Ok(())
}
#[allow(clippy::too_many_arguments)]
fn reindex_paths(
opts: &IndexOptions,
paths: Vec<PathBuf>,
roots: &[(ConnectorKind, ScanRoot)],
state: &Mutex<HashMap<ConnectorKind, i64>>,
storage: &Mutex<FrankenStorage>,
t_index: &Mutex<Option<TantivyIndex>>,
index_path: &Path,
force_full: bool,
) -> Result<usize> {
reindex_paths_with_semantic_delta(
opts, paths, roots, state, storage, t_index, index_path, force_full, None,
)
}
/// Reindexes the conversations behind a batch of changed paths and optionally
/// records semantic-delta inputs for the caller.
///
/// The changed `paths` are grouped by `classify_paths` into
/// `(connector kind, scan root, min mtime, max mtime)` triggers. For each
/// trigger the connector is scanned, the resulting conversations are ingested
/// chunk by chunk into `storage` and the lazily opened Tantivy index, and the
/// per-connector watermark in `state` is advanced when it is safe to do so.
///
/// * `opts` - indexing options; `watch`, `watch_once_paths`, `progress` and
///   `data_dir` are read here.
/// * `paths` - changed filesystem paths to classify.
/// * `roots` - known `(connector, scan root)` pairs used for classification.
/// * `state` - per-connector watermark map (millisecond timestamps).
/// * `storage` / `t_index` - locked for the duration of each trigger's ingest.
/// * `index_path` - where the Tantivy index is opened or created on demand.
/// * `force_full` - disables the `since_ts` lower bound and the unchanged-root
///   shortcut.
/// * `semantic_delta` - when `Some`, every ingested chunk's semantic inputs are
///   appended to it.
///
/// Returns the number of conversations processed across all triggers.
///
/// # Errors
/// Fails when a lock is poisoned, when opening/committing the Tantivy index
/// fails, when a chunk ingest fails, or when persisting `last_indexed_at` or a
/// watermark fails. Connector scan errors are logged and treated as an empty
/// scan instead of being returned.
#[allow(clippy::too_many_arguments)]
fn reindex_paths_with_semantic_delta(
opts: &IndexOptions,
paths: Vec<PathBuf>,
roots: &[(ConnectorKind, ScanRoot)],
state: &Mutex<HashMap<ConnectorKind, i64>>,
storage: &Mutex<FrankenStorage>,
t_index: &Mutex<Option<TantivyIndex>>,
index_path: &Path,
force_full: bool,
semantic_delta: Option<&mut WatchSemanticDelta>,
) -> Result<usize> {
// The third argument asks classify_paths to key triggers by the explicit
// path itself when a non-empty watch-once path list was supplied.
let triggers = classify_paths(
paths,
roots,
opts.watch_once_paths
.as_ref()
.is_some_and(|paths| !paths.is_empty()),
);
if triggers.is_empty() {
return Ok(0);
}
let mut total_indexed = 0usize;
// Rebound as mutable so `as_deref_mut` can reborrow it once per chunk.
let mut semantic_delta = semantic_delta;
// NOTE(review): presumably true when path exclusions could hide sources from
// a scan, making a watermark advance unsafe - confirm against the helper.
let preserve_watch_watermark = scan_path_exclusions_active();
// The filter is enabled only for continuous watch mode (watch set and no
// explicit watch-once paths).
let active_source_filter = ActiveSessionSourceFilter::new(
opts.watch && opts.watch_once_paths.as_ref().is_none_or(Vec::is_empty),
);
for (kind, root, min_ts, max_ts) in triggers {
let conn = kind.create_connector();
let detect = conn.detect();
// Skip local roots that are neither detected by the connector nor present on disk.
if !detect.detected && root.origin.source_id == "local" && !root.path.exists() {
continue;
}
if let Some(p) = &opts.progress {
// NOTE(review): phase 1 looks like the "scanning" phase - confirm.
p.phase.store(1, Ordering::Relaxed);
}
let explicit_watch_once = opts
.watch_once_paths
.as_ref()
.is_some_and(|paths| !paths.is_empty());
let lexical_strategy_reason = if explicit_watch_once {
"watch_once_targeted_reindex_applies_inline_lexical_updates_for_changed_paths"
} else {
"watch_reindex_applies_inline_lexical_updates_for_changed_paths"
};
record_lexical_population_strategy_if_unset(
opts.progress.as_ref(),
LexicalPopulationStrategy::IncrementalInline,
lexical_strategy_reason,
);
// Shortcut for explicit watch-once runs: a file root that has not been
// modified since the last index and is already present in the database is
// skipped. Not taken when a semantic delta was requested or a full reindex
// is forced.
if explicit_watch_once && !force_full && semantic_delta.is_none() {
let unchanged = {
let storage = storage
.lock()
.map_err(|_| anyhow::anyhow!("storage lock poisoned"))?;
explicit_watch_once_root_unchanged_after_last_index(&storage, &root)?
};
if unchanged {
tracing::info!(
?kind,
scan_root = %root.path.display(),
"skipping unchanged explicit watch-once root already covered by last_indexed_at"
);
continue;
}
}
// Lower bound handed to the connector scan. Full and explicit runs scan
// everything; otherwise start one millisecond before the earlier of the
// stored watermark and the oldest mtime in this batch.
let since_ts = if force_full || explicit_watch_once {
None
} else {
let guard = state
.lock()
.map_err(|_| anyhow::anyhow!("state lock poisoned"))?;
let previous_ts = guard.get(&kind).copied();
match (previous_ts, min_ts) {
(None, None) => None,
(Some(ts), None) | (None, Some(ts)) => Some(ts.saturating_sub(1)),
(Some(prev), Some(batch_min)) => Some(prev.min(batch_min).saturating_sub(1)),
}
};
// A single-file root that the active-session filter rejects is skipped
// entirely; no watermark is touched so it can be retried later.
if root.path.is_file()
&& should_skip_active_session_source(
&active_source_filter,
&root.origin.source_id,
&root.path,
)
{
tracing::debug!(
?kind,
scan_root = %root.path.display(),
"skipping active explicit watch source without advancing watermarks"
);
continue;
}
let ctx = crate::connectors::ScanContext::with_roots(
root.path.clone(),
vec![root.clone()],
since_ts,
);
// Raw-mirror the source files before the connector parses them.
capture_connector_sources_before_parse(
conn.as_ref(),
&ctx,
&opts.data_dir,
kind.slug(),
std::slice::from_ref(&root),
since_ts,
&active_source_filter,
);
let scan_start = Instant::now();
// A failed scan is logged at debug level and handled as "no conversations".
let mut convs = match conn.scan(&ctx) {
Ok(c) => c,
Err(e) => {
tracing::debug!(
"watch scan failed for {:?} at {}: {}",
kind,
root.path.display(),
e
);
Vec::new()
}
};
let scan_ms = scan_start.elapsed().as_millis() as u64;
let pre_active_filter_count = convs.len();
// Drop conversations whose source file the active-session filter rejects.
convs.retain(|conv| {
!should_skip_active_session_source(
&active_source_filter,
&root.origin.source_id,
&conv.source_path,
)
});
let active_sources_skipped = pre_active_filter_count.saturating_sub(convs.len());
if active_sources_skipped > 0 {
tracing::debug!(
?kind,
scan_root = %root.path.display(),
active_sources_skipped,
"skipped active watch sources and preserved watermarks for retry"
);
}
// Watermarks stay put for this trigger if exclusions are active or any
// source was skipped above.
let preserve_this_watch_watermark = preserve_watch_watermark || active_sources_skipped > 0;
for conv in &mut convs {
inject_provenance(conv, &root.origin);
apply_workspace_rewrite(conv, &root);
compact_large_connector_extras("", conv);
attach_raw_mirror_capture(&opts.data_dir, conv);
}
if !explicit_watch_once {
sort_watch_conversations_for_watermark(&mut convs);
}
if let Some(p) = &opts.progress {
p.total.fetch_add(convs.len(), Ordering::Relaxed);
// NOTE(review): phase 2 looks like the "indexing" phase - confirm.
p.phase.store(2, Ordering::Relaxed);
}
let conv_count = convs.len();
if explicit_watch_once {
// NOTE(review): this is logged at warn level while the continuous-watch
// equivalent below uses info - confirm the level is intentional.
tracing::warn!(
?kind,
scan_root = %root.path.display(),
conversations = conv_count,
since_ts,
"watch_once_scan"
);
} else {
tracing::info!(?kind, conversations = conv_count, since_ts, "watch_scan");
}
if conv_count == 0 {
continue;
}
let index_start = Instant::now();
let mut inserted_messages = 0usize;
let mut processed_conversations = 0usize;
let mut quarantined_conversations = 0usize;
{
// Both locks are held for the whole ingest of this trigger and released
// at the end of this block.
let storage = storage
.lock()
.map_err(|_| anyhow::anyhow!("storage lock poisoned"))?;
let mut t_index_guard = t_index
.lock()
.map_err(|_| anyhow::anyhow!("index lock poisoned"))?;
// Explicit watch-once runs ingest everything as one chunk.
let ingest_chunk_size = if explicit_watch_once {
conv_count.max(1)
} else {
watch_ingest_chunk_size()
};
let capture_semantic_delta = semantic_delta.is_some();
for chunk in convs.chunks(ingest_chunk_size) {
// The index may be `None` on first use or after a deferred lexical update
// dropped the writer below; (re)open it on demand.
if t_index_guard.is_none() {
tracing::info!(
index_path = %index_path.display(),
"opening Tantivy lazily for watch ingest"
);
*t_index_guard = Some(TantivyIndex::open_or_create(index_path)?);
}
let chunk_outcome = {
let t_index = t_index_guard
.as_mut()
.expect("lazy watch index must be open before ingest");
ingest_watch_batch_with_oom_split(
&storage,
t_index,
&opts.data_dir,
chunk,
&opts.progress,
!opts.watch,
capture_semantic_delta,
)?
};
inserted_messages =
inserted_messages.saturating_add(chunk_outcome.batch_outcome.inserted_messages);
processed_conversations =
processed_conversations.saturating_add(chunk_outcome.processed_conversations);
quarantined_conversations = quarantined_conversations
.saturating_add(chunk_outcome.quarantined_conversations);
if let Some(delta) = semantic_delta.as_deref_mut() {
delta.extend_from_batch(
chunk_outcome.batch_outcome.semantic_delta_inputs,
chunk_outcome.batch_outcome.semantic_delta_max_message_id,
);
}
let lexical_update_deferred = chunk_outcome.batch_outcome.lexical_update_deferred;
if lexical_update_deferred {
// Drop the writer without committing; the next chunk reopens the index.
tracing::warn!(
error = ?chunk_outcome.batch_outcome.lexical_update_error,
"dropping uncommitted watch Tantivy writer after deferred lexical update"
);
*t_index_guard = None;
} else {
t_index_guard
.as_mut()
.expect("watch Tantivy writer must still be open before commit")
.commit()?;
}
if lexical_update_deferred {
tracing::warn!(
"skipping watch last_indexed_at update after deferred lexical update so health/status report stale lexical assets"
);
} else {
persist::with_ephemeral_writer(
&storage,
false,
"updating watch last_indexed_at",
|writer| writer.set_last_indexed_at(FrankenStorage::now_millis()),
)?;
}
// Per-chunk (partial) watermark: advanced only in continuous watch mode,
// when nothing forces preservation, the chunk had no quarantined
// conversations, and the chunk reported a payload watermark.
// NOTE(review): unlike the final watermark block after the loop, the two
// logging branches below are not guarded by `!explicit_watch_once`, so they
// can also log during explicit watch-once runs - confirm this is intended.
if !explicit_watch_once
&& !preserve_this_watch_watermark
&& chunk_outcome.quarantined_conversations == 0
&& let Some(ts_val) = chunk_outcome.max_payload_watermark_ms
{
save_watch_state_watermark(&opts.data_dir, state, kind, ts_val)?;
} else if chunk_outcome.quarantined_conversations > 0 {
tracing::info!(
?kind,
quarantined_conversations = chunk_outcome.quarantined_conversations,
"preserving partial watch watermark so quarantined source can be retried"
);
} else if preserve_this_watch_watermark {
tracing::debug!(
?kind,
active_sources_skipped,
"preserving partial watch watermark because scan exclusions or active source skips are active"
);
}
}
}
let index_ms = index_start.elapsed().as_millis() as u64;
// Fold this trigger's timings and counts into the shared progress stats.
// A poisoned stats mutex is silently skipped.
if let Some(p) = &opts.progress
&& let Ok(mut stats) = p.stats.lock()
{
// Prefer the agent slug reported by the first conversation; fall back to
// the lower-cased Debug name of the connector kind.
let connector_name = convs
.first()
.map(|conv| conv.agent_slug.clone())
.unwrap_or_else(|| format!("{kind:?}").to_ascii_lowercase());
stats.scan_ms = stats.scan_ms.saturating_add(scan_ms);
stats.index_ms = stats.index_ms.saturating_add(index_ms);
stats.total_conversations = stats.total_conversations.saturating_add(conv_count);
stats.total_messages = stats.total_messages.saturating_add(inserted_messages);
stats.connectors.push(ConnectorStats {
name: connector_name.clone(),
conversations: conv_count,
messages: inserted_messages,
scan_ms,
error: None,
});
if !stats
.agents_discovered
.iter()
.any(|name| name == &connector_name)
{
stats.agents_discovered.push(connector_name);
}
}
if quarantined_conversations > 0 {
tracing::warn!(
?kind,
quarantined_conversations,
"watch ingest skipped poison conversations after OOM bisection"
);
}
total_indexed = total_indexed.saturating_add(processed_conversations);
// Final watermark for this trigger: the newest mtime seen in the batch,
// saved only in continuous watch mode when nothing was quarantined and
// no preservation condition is active.
if !explicit_watch_once
&& conv_count > 0
&& !preserve_this_watch_watermark
&& quarantined_conversations == 0
&& let Some(ts_val) = max_ts
{
save_watch_state_watermark(&opts.data_dir, state, kind, ts_val)?;
} else if !explicit_watch_once && quarantined_conversations > 0 {
tracing::info!(
?kind,
quarantined_conversations,
"preserving final watch watermark so quarantined source can be retried"
);
} else if !explicit_watch_once && conv_count > 0 && preserve_this_watch_watermark {
tracing::info!(
?kind,
active_sources_skipped,
"preserving final watch watermark because scan exclusions or active source skips are active"
);
}
}
// Reached only on success; early `?` returns leave the progress state as is.
reset_progress_to_idle(opts.progress.as_ref());
Ok(total_indexed)
}
fn explicit_watch_once_root_unchanged_after_last_index(
storage: &FrankenStorage,
root: &ScanRoot,
) -> Result<bool> {
let metadata = match fs::metadata(&root.path) {
Ok(metadata) if metadata.is_file() => metadata,
_ => return Ok(false),
};
let Some(modified_at_ms) = metadata
.modified()
.ok()
.and_then(system_time_to_epoch_millis)
else {
return Ok(false);
};
let Some(last_indexed_at) = storage.get_last_indexed_at()? else {
return Ok(false);
};
if modified_at_ms > last_indexed_at {
return Ok(false);
}
let source_path = root.path.to_string_lossy();
let matches: Vec<i64> = storage
.raw()
.query_map_collect(
"SELECT id
FROM conversations
WHERE source_id = ?1 AND source_path = ?2
LIMIT 1",
&[
ParamValue::from(root.origin.source_id.as_str()),
ParamValue::from(source_path.as_ref()),
],
|row| row.get_typed(0),
)
.with_context(|| {
format!(
"checking indexed source path freshness for {}",
root.path.display()
)
})?;
Ok(!matches.is_empty())
}
/// Identifies one supported connector.
///
/// Used as the key of the persisted watch-state map (see `WatchState::map`).
/// Each variant serializes as a two-letter code (`rename`); the full variant
/// name is additionally accepted when deserializing (`alias`). The two-letter
/// codes are part of the on-disk format, so they must not be changed or reused.
#[derive(serde::Serialize, serde::Deserialize, Debug, Clone, Copy, PartialEq, Eq, Hash)]
enum ConnectorKind {
#[serde(rename = "cx", alias = "Codex")]
Codex,
#[serde(rename = "cl", alias = "Cline")]
Cline,
#[serde(rename = "gm", alias = "Gemini")]
Gemini,
#[serde(rename = "cd", alias = "Claude")]
Claude,
#[serde(rename = "cb", alias = "Clawdbot")]
Clawdbot,
#[serde(rename = "vb", alias = "Vibe")]
Vibe,
#[serde(rename = "am", alias = "Amp")]
Amp,
#[serde(rename = "oc", alias = "OpenCode")]
OpenCode,
#[serde(rename = "ai", alias = "Aider")]
Aider,
#[serde(rename = "cu", alias = "Cursor")]
Cursor,
#[serde(rename = "cg", alias = "ChatGpt")]
ChatGpt,
#[serde(rename = "pi", alias = "PiAgent")]
PiAgent,
#[serde(rename = "fa", alias = "Factory")]
Factory,
#[serde(rename = "ow", alias = "OpenClaw")]
OpenClaw,
#[serde(rename = "cp", alias = "Copilot")]
Copilot,
#[serde(rename = "ki", alias = "Kimi")]
Kimi,
#[serde(rename = "cc", alias = "CopilotCli")]
CopilotCli,
#[serde(rename = "qw", alias = "Qwen")]
Qwen,
}
/// On-disk representation of the watch state file.
///
/// Field names are shortened in the serialized form (`v`, `m`). Unknown fields
/// are rejected on deserialization (`deny_unknown_fields`); both known fields
/// fall back to their defaults when absent.
#[derive(serde::Serialize, serde::Deserialize, Debug, Default)]
#[serde(deny_unknown_fields)]
struct WatchState {
// Format version; omitted from the output when it is zero.
#[serde(rename = "v", default, skip_serializing_if = "is_zero_u8")]
version: u8,
// Per-connector watermark values; omitted from the output when empty.
#[serde(rename = "m", default, skip_serializing_if = "HashMap::is_empty")]
map: HashMap<ConnectorKind, i64>,
}
/// Predicate for serde's `skip_serializing_if`: true when the byte is zero.
///
/// Takes a reference because serde passes the field by reference.
fn is_zero_u8(value: &u8) -> bool {
    matches!(*value, 0)
}
/// Returns the location of the watch state file inside `data_dir`.
fn state_path(data_dir: &Path) -> PathBuf {
    let mut location = data_dir.to_path_buf();
    location.push("watch_state.json");
    location
}
/// Loads the per-connector watermark map from the watch state file.
///
/// The file is first parsed as a [`WatchState`] envelope; if that fails it is
/// parsed as a bare `connector -> timestamp` map. A missing, unreadable or
/// unparseable file yields an empty map; no error is reported.
fn load_watch_state(data_dir: &Path) -> HashMap<ConnectorKind, i64> {
    let bytes = match fs::read(state_path(data_dir)) {
        Ok(bytes) => bytes,
        Err(_) => return HashMap::new(),
    };
    serde_json::from_slice::<WatchState>(&bytes)
        .map(|envelope| envelope.map)
        .or_else(|_| serde_json::from_slice::<HashMap<ConnectorKind, i64>>(&bytes))
        .unwrap_or_default()
}
/// Moves `temp_path` over `final_path` and then syncs the parent directory of
/// `final_path`.
///
/// On non-Windows targets this is a single `fs::rename` followed by
/// `sync_parent_directory`.
///
/// On Windows, when the first rename fails with `AlreadyExists` or
/// `PermissionDenied` while `final_path` exists, the existing file is first
/// moved aside to a unique backup path and the rename is retried. If the retry
/// fails too, the backup is moved back into place.
///
/// # Errors
/// Returns the rename or directory-sync error. In the Windows fallback the
/// message lists every failed step. The temp file is removed when preparing
/// the backup fails or when the original file was restored; it is kept on disk
/// when the restore itself failed (the message then names its location).
/// NOTE(review): in every other failure path the temp file is left behind for
/// the caller to deal with - confirm callers expect that.
fn replace_file_from_temp(temp_path: &Path, final_path: &Path) -> Result<()> {
#[cfg(windows)]
{
match fs::rename(temp_path, final_path) {
Ok(()) => {
sync_parent_directory(final_path)?;
Ok(())
}
// Fallback only for "destination is in the way" style failures.
Err(first_err)
if final_path.exists()
&& matches!(
first_err.kind(),
std::io::ErrorKind::AlreadyExists | std::io::ErrorKind::PermissionDenied
) =>
{
// Step 1: move the current file aside so the destination name is free.
let backup_path = unique_replace_backup_path(final_path);
fs::rename(final_path, &backup_path).map_err(|backup_err| {
let _ = fs::remove_file(temp_path);
anyhow::anyhow!(
"failed preparing backup {} before replacing {}: first error: {}; backup error: {}",
backup_path.display(),
final_path.display(),
first_err,
backup_err
)
})?;
// Step 2: retry the rename now that the destination is vacant.
match fs::rename(temp_path, final_path) {
Ok(()) => {
sync_parent_directory(final_path)?;
// Best effort: a leftover backup does not fail the replace.
let _ = fs::remove_file(&backup_path);
Ok(())
}
Err(second_err) => {
// Step 3: the retry failed, try to put the original file back.
let restore_result = fs::rename(&backup_path, final_path);
match restore_result {
Ok(()) => {
let _ = fs::remove_file(temp_path);
// NOTE(review): if this sync fails, its error is returned instead of
// the combined rename error below.
sync_parent_directory(final_path)?;
Err(anyhow::anyhow!(
"failed replacing {} with {}: first error: {}; second error: {}; restored original file",
final_path.display(),
temp_path.display(),
first_err,
second_err
))
}
// Restore failed as well: keep the temp file so the new content is not lost.
Err(restore_err) => Err(anyhow::anyhow!(
"failed replacing {} with {}: first error: {}; second error: {}; restore error: {}; temp file retained at {}",
final_path.display(),
temp_path.display(),
first_err,
second_err,
restore_err,
temp_path.display()
)),
}
}
}
}
Err(rename_err) => Err(rename_err.into()),
}
}
#[cfg(not(windows))]
{
fs::rename(temp_path, final_path)?;
sync_parent_directory(final_path)?;
Ok(())
}
}
/// Builds a unique hidden `tmp` sidecar path next to `path`.
///
/// Falls back to the name `watch_state.json` when `path` has no usable file
/// name.
fn unique_atomic_temp_path(path: &Path) -> PathBuf {
    const SIDECAR_SUFFIX: &str = "tmp";
    const FALLBACK_FILE_NAME: &str = "watch_state.json";
    unique_atomic_sidecar_path(path, SIDECAR_SUFFIX, FALLBACK_FILE_NAME)
}
/// Builds a unique hidden `bak` sidecar path next to `path` (Windows only).
///
/// Falls back to the name `watch_state.json` when `path` has no usable file
/// name.
#[cfg(windows)]
fn unique_replace_backup_path(path: &Path) -> PathBuf {
    const SIDECAR_SUFFIX: &str = "bak";
    const FALLBACK_FILE_NAME: &str = "watch_state.json";
    unique_atomic_sidecar_path(path, SIDECAR_SUFFIX, FALLBACK_FILE_NAME)
}
/// Derives a unique hidden sibling path for `path`.
///
/// The resulting file name has the form
/// `.<name>.<suffix>.<pid>.<nanos since epoch>.<nonce>` and replaces the last
/// component of `path`. `<name>` is the UTF-8 file name of `path`, or
/// `fallback_name` when there is none. The nonce is a process-wide counter, so
/// two calls within the same process never produce the same path.
fn unique_atomic_sidecar_path(path: &Path, suffix: &str, fallback_name: &str) -> PathBuf {
    static NEXT_NONCE: std::sync::atomic::AtomicU64 = std::sync::atomic::AtomicU64::new(0);

    // A clock set before the epoch degrades to a zero timestamp.
    let nanos_since_epoch =
        match std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH) {
            Ok(elapsed) => elapsed.as_nanos(),
            Err(_) => 0,
        };
    let sequence = NEXT_NONCE.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
    let base_name = match path.file_name().and_then(|name| name.to_str()) {
        Some(name) => name,
        None => fallback_name,
    };
    let pid = std::process::id();
    let sidecar_name = format!(".{base_name}.{suffix}.{pid}.{nanos_since_epoch}.{sequence}");
    path.with_file_name(sidecar_name)
}
/// Creates `path` for writing, failing if it already exists.
///
/// # Errors
/// Returns the underlying I/O error, including `AlreadyExists` when something
/// is already present at `path`.
fn create_new_atomic_sidecar_file(path: &Path) -> std::io::Result<File> {
    let mut options = OpenOptions::new();
    options.write(true);
    options.create_new(true);
    options.open(path)
}
/// Builds a unique path inside `backups_dir` for a failed baseline-seed backup.
///
/// The last component has the form
/// `<db_name>.<nanos since epoch>.<pid>.<nonce>.failed-baseline-seed.bak`,
/// where the nonce is a process-wide counter private to this function.
fn unique_failed_seed_backup_root(backups_dir: &Path, db_name: &str) -> PathBuf {
    static NEXT_NONCE: std::sync::atomic::AtomicU64 = std::sync::atomic::AtomicU64::new(0);

    // A clock set before the epoch degrades to a zero timestamp.
    let nanos_since_epoch = SystemTime::now()
        .duration_since(UNIX_EPOCH)
        .map_or(0, |elapsed| elapsed.as_nanos());
    let sequence = NEXT_NONCE.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
    let pid = std::process::id();

    let mut backup_root = backups_dir.to_path_buf();
    backup_root.push(format!(
        "{db_name}.{nanos_since_epoch}.{pid}.{sequence}.failed-baseline-seed.bak"
    ));
    backup_root
}
/// Persists the per-connector watermark map to the watch state file.
///
/// The state is serialized as a version-1 [`WatchState`], written to a unique
/// temp sidecar next to the target, flushed and fsynced, and then moved over
/// the target with `replace_file_from_temp`. The parent directory is created
/// when missing.
///
/// # Errors
/// Returns an error when the directory cannot be created, serialization
/// fails, the temp file cannot be created or written, or the final replace
/// fails. When writing the temp file fails, the partially written sidecar is
/// removed before returning so failed saves do not accumulate orphaned temp
/// files. Cleanup after a failed replace is left to `replace_file_from_temp`,
/// which decides for itself whether the temp file must be kept.
fn save_watch_state(data_dir: &Path, state: &HashMap<ConnectorKind, i64>) -> Result<()> {
    let path = state_path(data_dir);
    if let Some(parent) = path.parent() {
        fs::create_dir_all(parent)?;
    }
    let watch_state = WatchState {
        version: 1,
        map: state.clone(),
    };
    let json = serde_json::to_vec(&watch_state)?;
    let tmp_path = unique_atomic_temp_path(&path);

    // Creation is outside the cleanup scope: if it fails (for example with
    // `AlreadyExists`) the path is not ours to delete.
    let file = create_new_atomic_sidecar_file(&tmp_path)?;
    // The closure owns the file handle, so it is closed before any cleanup.
    let write_result = (|| -> std::io::Result<()> {
        let mut writer = BufWriter::new(file);
        writer.write_all(&json)?;
        writer.flush()?;
        writer.get_ref().sync_all()
    })();
    if let Err(write_err) = write_result {
        // Best effort: the write error is the one worth reporting.
        let _ = fs::remove_file(&tmp_path);
        return Err(write_err.into());
    }

    replace_file_from_temp(&tmp_path, &path)?;
    Ok(())
}
/// Stores `error` as the last error on the progress tracker, if one is given.
///
/// Passing `None` as `error` clears the stored value. A poisoned mutex is
/// recovered and written to anyway, so the update is never lost.
fn set_progress_last_error(progress: Option<&Arc<IndexingProgress>>, error: Option<String>) {
    if let Some(tracker) = progress {
        let mut slot = tracker
            .last_error
            .lock()
            .unwrap_or_else(|poisoned| poisoned.into_inner());
        *slot = error;
    }
}
/// Records the outcome of a watch reindex cycle and returns the indexed count.
///
/// On success the stored last error is cleared and the count is reported to
/// the stale detector. On failure the error is logged, progress is reset to
/// idle, the error text (prefixed with `context`) becomes the last error, a
/// zero-count scan is reported, and `0` is returned; the error itself is not
/// propagated.
fn finalize_watch_reindex_result(
    result: Result<usize>,
    detector: &StaleDetector,
    progress: Option<&Arc<IndexingProgress>>,
    context: &str,
) -> usize {
    let error = match result {
        Ok(indexed) => {
            set_progress_last_error(progress, None);
            detector.record_scan(indexed);
            return indexed;
        }
        Err(error) => error,
    };

    tracing::error!(
        context,
        "watch reindex failed; since_ts not advanced this cycle: {error:#}"
    );
    reset_progress_to_idle(progress);
    let message = format!("{context}: {error}");
    set_progress_last_error(progress, Some(message));
    detector.record_scan(0);
    0
}
/// Records the outcome of a watch-once reindex and passes the result through.
///
/// On success the stored last error is cleared and the count is reported to
/// the stale detector. On failure the error is logged, progress is reset to
/// idle, the error text (prefixed with `context`) becomes the last error, and
/// a zero-count scan is reported.
///
/// # Errors
/// Returns the original error unchanged when `result` is an `Err`.
fn finalize_watch_once_reindex_result(
    result: Result<usize>,
    detector: &StaleDetector,
    progress: Option<&Arc<IndexingProgress>>,
    context: &str,
) -> Result<usize> {
    let error = match result {
        Ok(indexed) => {
            set_progress_last_error(progress, None);
            detector.record_scan(indexed);
            return Ok(indexed);
        }
        Err(error) => error,
    };

    tracing::error!(
        context,
        "watch reindex failed; since_ts not advanced this cycle: {error:#}"
    );
    reset_progress_to_idle(progress);
    let message = format!("{context}: {error}");
    set_progress_last_error(progress, Some(message));
    detector.record_scan(0);
    Err(error)
}
/// Guesses the connector an explicit watch-once path belongs to from
/// well-known directory pairs in the path.
///
/// Components are compared case-insensitively (ASCII). The first matching pair
/// in the table below wins; `None` is returned when no pair occurs as two
/// adjacent components.
fn explicit_watch_once_connector_hint(path: &Path) -> Option<ConnectorKind> {
    // Order matters: earlier rows take precedence when several pairs match.
    let markers = [
        (".codex", "sessions", ConnectorKind::Codex),
        (".claude", "projects", ConnectorKind::Claude),
        (".gemini", "tmp", ConnectorKind::Gemini),
    ];
    let lowered: Vec<String> = path
        .components()
        .map(|component| component.as_os_str().to_string_lossy().to_ascii_lowercase())
        .collect();

    for (parent, child, kind) in markers {
        let adjacent = lowered
            .windows(2)
            .any(|pair| pair[0] == parent && pair[1] == child);
        if adjacent {
            return Some(kind);
        }
    }
    None
}
/// Groups changed paths into reindex triggers.
///
/// Each returned tuple is `(connector, scan root, min mtime, max mtime)`, with
/// mtimes in milliseconds since the Unix epoch. Paths whose metadata or
/// modification time cannot be read are ignored.
///
/// * Without `prefer_explicit_paths`, a path is bucketed under every root in
///   `roots` that it starts with, keyed by that root's path.
/// * With `prefer_explicit_paths`, the bucket key and the scan root's path are
///   the changed path itself, and only roots of the connector hinted by
///   `explicit_watch_once_connector_hint` (if any) are considered. A hinted
///   path that matches no root gets a bucket with a fresh local scan root.
///
/// The order of the returned triggers follows hash-map iteration order.
fn classify_paths(
    paths: Vec<PathBuf>,
    roots: &[(ConnectorKind, ScanRoot)],
    prefer_explicit_paths: bool,
) -> Vec<(ConnectorKind, ScanRoot, Option<i64>, Option<i64>)> {
    /// Widens a bucket's `(min, max)` mtime window so that it includes `ts`.
    fn widen_window(min_ts: &mut Option<i64>, max_ts: &mut Option<i64>, ts: i64) {
        *min_ts = Some(min_ts.map_or(ts, |prev| prev.min(ts)));
        *max_ts = Some(max_ts.map_or(ts, |prev| prev.max(ts)));
    }

    let mut batch_map: BatchClassificationMap = HashMap::new();
    for p in paths {
        let hinted_kind = if prefer_explicit_paths {
            explicit_watch_once_connector_hint(&p)
        } else {
            None
        };

        // Paths without a readable mtime cannot contribute to any bucket.
        let Ok(meta) = std::fs::metadata(&p) else {
            continue;
        };
        let Ok(time) = meta.modified() else {
            continue;
        };
        let Ok(dur) = time.duration_since(std::time::UNIX_EPOCH) else {
            continue;
        };
        let ts = i64::try_from(dur.as_millis()).unwrap_or(i64::MAX);

        let mut matched_root = false;
        for (kind, root) in roots {
            if hinted_kind.is_some_and(|hint| hint != *kind) {
                continue;
            }
            if !p.starts_with(&root.path) {
                continue;
            }
            matched_root = true;
            let scan_path = if prefer_explicit_paths {
                p.clone()
            } else {
                root.path.clone()
            };
            let mut scan_root = root.clone();
            scan_root.path = scan_path.clone();
            let entry = batch_map
                .entry((*kind, scan_path))
                .or_insert((scan_root, None, None));
            widen_window(&mut entry.1, &mut entry.2, ts);
        }

        // Explicit path with a connector hint but no configured root: scan the
        // path on its own as a local root.
        if let (true, false, Some(hinted_kind)) = (prefer_explicit_paths, matched_root, hinted_kind)
        {
            let mut scan_root = ScanRoot::local(p.clone());
            scan_root.path = p.clone();
            let entry = batch_map
                .entry((hinted_kind, p.clone()))
                .or_insert((scan_root, None, None));
            widen_window(&mut entry.1, &mut entry.2, ts);
        }
    }

    let mut triggers = Vec::with_capacity(batch_map.len());
    for ((kind, _), (root, min_ts, max_ts)) in batch_map {
        triggers.push((kind, root, min_ts, max_ts));
    }
    triggers
}
/// Decides whether a filesystem notification should start a reindex.
///
/// Returns `true` for file creation, for any modification except a pure
/// access-time metadata change, for a "closed after writing" access event, and
/// for the unspecific `Any`/`Other` kinds. Returns `false` for every other
/// access event and for removals.
fn watch_event_should_trigger_reindex(event: &notify::Event) -> bool {
    match event.kind {
        // A writer finished with the file: its content is now complete.
        notify::event::EventKind::Access(AccessKind::Close(AccessMode::Write)) => true,
        // Reads, opens and non-write closes do not change content.
        notify::event::EventKind::Access(_) => false,
        notify::event::EventKind::Create(_)
        | notify::event::EventKind::Any
        | notify::event::EventKind::Other => true,
        notify::event::EventKind::Remove(_) => false,
        // Access-time updates are caused by reads, including our own scans.
        notify::event::EventKind::Modify(ModifyKind::Metadata(MetadataKind::AccessTime)) => false,
        notify::event::EventKind::Modify(_) => true,
    }
}
/// Mirrors the sources from the sources configuration into the database.
///
/// Does nothing when the `CASS_IGNORE_SOURCES_CONFIG` variable is set or the
/// configuration cannot be loaded. Every configured source is converted to a
/// `Source` record (paths, path mappings and sync schedule go into
/// `config_json`) and upserted through a short-lived writer. Failures are
/// logged and never returned: a failed upsert of one record does not stop the
/// remaining ones.
fn sync_sources_config_to_db(storage: &FrankenStorage) {
    if dotenvy::var("CASS_IGNORE_SOURCES_CONFIG").is_ok() {
        return;
    }
    let config = match SourcesConfig::load() {
        Ok(cfg) => cfg,
        Err(e) => {
            tracing::debug!("sources config load failed: {e}");
            return;
        }
    };

    let mut records: Vec<Source> = Vec::with_capacity(config.sources.len());
    for source in &config.sources {
        let platform = match source.platform {
            Some(Platform::Macos) => Some("macos".to_string()),
            Some(Platform::Linux) => Some("linux".to_string()),
            Some(Platform::Windows) => Some("windows".to_string()),
            None => None,
        };
        let config_json = serde_json::json!({
            "paths": source.paths.clone(),
            "path_mappings": source.path_mappings.clone(),
            "sync_schedule": source.sync_schedule,
        });
        records.push(Source {
            id: source.name.clone(),
            kind: source.source_type,
            host_label: source.host.clone(),
            machine_id: None,
            platform,
            config_json: Some(config_json),
            created_at: None,
            updated_at: None,
        });
    }

    let sync_result =
        persist::with_ephemeral_writer(storage, false, "syncing configured sources", |writer| {
            for record in &records {
                // Keep going after a failed upsert; each record is independent.
                if let Err(upsert_err) = writer.upsert_source(record) {
                    tracing::warn!(
                        source_id = %record.id,
                        error = %upsert_err,
                        "failed to upsert configured source into db"
                    );
                }
            }
            Ok(())
        });
    if let Err(err) = sync_result {
        tracing::warn!(
            error = %err,
            "failed to sync configured sources with a short-lived writer"
        );
    }
}
fn expand_local_scan_root_path(path: &str) -> PathBuf {
if let Some(stripped) = path.strip_prefix("~/")
&& let Some(home) = dirs::home_dir()
{
return home.join(stripped);
}
if path == "~"
&& let Some(home) = dirs::home_dir()
{
return home;
}
PathBuf::from(path)
}
/// Returns every scan root except the implicit local data-directory root.
///
/// The sources configuration is synced into the database first, then the
/// roots are built with `build_scan_roots` and the entry whose source id is
/// the local source and whose path equals `data_dir` is removed.
fn additional_scan_roots_for_scan_or_watch(
    storage: &FrankenStorage,
    data_dir: &Path,
) -> Vec<ScanRoot> {
    sync_sources_config_to_db(storage);
    let mut roots = build_scan_roots(storage, data_dir);
    roots.retain(|root| root.origin.source_id != LOCAL_SOURCE_ID || root.path != data_dir);
    roots
}
/// Builds the list of scan roots for indexing.
///
/// The list always starts with a local root for `data_dir`. The remaining
/// roots come from exactly one of two places:
///
/// 1. The sources configuration, when `CASS_IGNORE_SOURCES_CONFIG` is not set,
///    the configuration loads, and it lists at least one source. In that case
///    the database is not consulted at all.
/// 2. Otherwise, the sources stored in the database (`storage.list_sources()`);
///    a failure to list them is ignored.
///
/// Remote sources are scanned through their local mirror under
/// `<data_dir>/remotes/<source>/mirror/...`; local sources are scanned at their
/// `~`-expanded path. Paths that do not exist on disk are skipped. Every root
/// carries the source's origin, platform and workspace rewrites.
pub fn build_scan_roots(storage: &FrankenStorage, data_dir: &Path) -> Vec<ScanRoot> {
let mut roots = Vec::new();
roots.push(ScanRoot::local(data_dir.to_path_buf()));
// --- Preferred path: roots from the sources configuration ---
if dotenvy::var("CASS_IGNORE_SOURCES_CONFIG").is_err()
&& let Ok(config) = SourcesConfig::load()
&& !config.sources.is_empty()
{
for source in &config.sources {
let origin = Origin {
source_id: source.name.clone(),
kind: source.source_type,
host: source.host.clone(),
};
let platform = source.platform;
let workspace_rewrites = source.path_mappings.clone();
for path in &source.paths {
if source.is_remote() {
// Normalize "~foo" to "~/foo" before deriving the mirror directory name.
let expanded_path = if path.starts_with("~/") {
path.to_string()
} else if path.starts_with('~') {
path.replacen('~', "~/", 1)
} else {
path.to_string()
};
let safe_name = path_to_safe_dirname(&expanded_path);
let mirror_base = data_dir.join("remotes").join(&source.name).join("mirror");
let mirror_path = mirror_base.join(&safe_name);
if mirror_path.exists() {
let mut scan_root = ScanRoot::remote(mirror_path, origin.clone(), platform);
scan_root.workspace_rewrites = workspace_rewrites.clone();
roots.push(scan_root);
continue;
}
// Fallback for "~/" paths: use the first mirror subdirectory whose name
// ends with the safe form of the part after "~/".
// NOTE(review): presumably this covers mirrors named after the expanded
// remote home directory - confirm against the sync code.
if path.starts_with("~/") {
let suffix = path.trim_start_matches("~/");
let safe_suffix = path_to_safe_dirname(suffix);
if let Ok(entries) = std::fs::read_dir(&mirror_base) {
for entry in entries.flatten() {
let name = entry.file_name();
let name_str = name.to_string_lossy();
if name_str.ends_with(&safe_suffix) && entry.path().is_dir() {
let mut scan_root =
ScanRoot::remote(entry.path(), origin.clone(), platform);
scan_root.workspace_rewrites = workspace_rewrites.clone();
roots.push(scan_root);
break;
}
}
}
}
} else {
let local_path = expand_local_scan_root_path(path);
if !local_path.exists() {
continue;
}
let mut scan_root = ScanRoot::local(local_path);
scan_root.origin = origin.clone();
scan_root.platform = platform;
scan_root.workspace_rewrites = workspace_rewrites.clone();
roots.push(scan_root);
}
}
}
// Configuration-derived roots win; the database is not consulted.
return roots;
}
// --- Fallback path: roots from sources stored in the database ---
if let Ok(sources) = storage.list_sources() {
for source in sources {
// Unrecognized platform strings are treated as "no platform".
let platform =
source
.platform
.as_deref()
.and_then(|p| match p.to_lowercase().as_str() {
"macos" => Some(Platform::Macos),
"linux" => Some(Platform::Linux),
"windows" => Some(Platform::Windows),
_ => None,
});
// Rebuild path mappings from `config_json.path_mappings`; entries without
// string `from`/`to` fields are dropped, `agents` is optional.
let workspace_rewrites = source
.config_json
.as_ref()
.and_then(|cfg| cfg.get("path_mappings"))
.and_then(|arr| arr.as_array())
.map(|items| {
items
.iter()
.filter_map(|item| {
let from = item.get("from")?.as_str()?.to_string();
let to = item.get("to")?.as_str()?.to_string();
let agents = item.get("agents").and_then(|a| {
a.as_array().map(|arr| {
arr.iter()
.filter_map(|v| v.as_str().map(String::from))
.collect()
})
});
Some(crate::sources::config::PathMapping { from, to, agents })
})
.collect::<Vec<_>>()
})
.unwrap_or_default();
if let Some(paths) = source
.config_json
.as_ref()
.and_then(|cfg| cfg.get("paths"))
.and_then(|arr| arr.as_array())
{
for path_val in paths {
let Some(path) = path_val.as_str() else {
continue;
};
if source.kind.is_remote() {
let expanded_path = if path.starts_with("~/") {
path.to_string()
} else if path.starts_with('~') {
path.replacen('~', "~/", 1)
} else {
path.to_string()
};
let safe_name = path_to_safe_dirname(&expanded_path);
let mirror_path = data_dir
.join("remotes")
.join(&source.id)
.join("mirror")
.join(&safe_name);
// NOTE(review): unlike the configuration branch above, there is no
// suffix-based fallback here when the exact mirror directory is missing.
if !mirror_path.exists() {
continue;
}
let origin = Origin {
source_id: source.id.clone(),
kind: source.kind,
host: source.host_label.clone(),
};
let mut scan_root = ScanRoot::remote(mirror_path, origin, platform);
scan_root.workspace_rewrites = workspace_rewrites.clone();
roots.push(scan_root);
} else {
let local_path = expand_local_scan_root_path(path);
if !local_path.exists() {
continue;
}
let origin = Origin {
source_id: source.id.clone(),
kind: source.kind,
host: source.host_label.clone(),
};
let mut scan_root = ScanRoot::local(local_path);
scan_root.origin = origin;
scan_root.platform = platform;
scan_root.workspace_rewrites = workspace_rewrites.clone();
roots.push(scan_root);
}
}
// A `paths` array was present (even if empty): do not fall through to the
// whole-mirror root below.
continue;
}
// No `paths` array recorded: a remote source is scanned at its whole mirror
// directory, if that exists.
let mirror_path = data_dir.join("remotes").join(&source.id).join("mirror");
if source.kind.is_remote() && mirror_path.exists() {
let origin = Origin {
source_id: source.id.clone(),
kind: source.kind,
host: source.host_label.clone(),
};
let mut scan_root = ScanRoot::remote(mirror_path, origin, platform);
scan_root.workspace_rewrites = workspace_rewrites;
roots.push(scan_root);
}
}
}
roots
}
/// Records the conversation's origin under `metadata.cass.origin`.
///
/// Metadata that is not a JSON object is replaced by an empty object first.
/// An existing `cass` object is reused and its `origin` entry overwritten. If
/// `cass` exists but is not an object, nothing is written.
fn inject_provenance(conv: &mut NormalizedConversation, origin: &Origin) {
    if !conv.metadata.is_object() {
        conv.metadata = serde_json::json!({});
    }
    let Some(metadata) = conv.metadata.as_object_mut() else {
        return;
    };
    let cass_slot = metadata
        .entry("cass".to_string())
        .or_insert_with(|| serde_json::json!({}));
    let Some(cass_fields) = cass_slot.as_object_mut() else {
        return;
    };
    let origin_value = serde_json::json!({
        "source_id": origin.source_id,
        "kind": origin.kind.as_str(),
        "host": origin.host
    });
    cass_fields.insert("origin".to_string(), origin_value);
}
/// Copies a connector's source files into the raw mirror before they are
/// parsed.
///
/// The connector is asked to enumerate its source files. When it returns a
/// non-empty list, each file is captured unless the active-session filter
/// rejects it, or it is a primary session log and there are more primary logs
/// than `PREPARSE_PRIMARY_SOURCE_CAPTURE_LIMIT` (those are deferred). When the
/// list is empty or discovery fails (logged as a warning), capture falls back
/// to `capture_scan_sources_before_parse` for each of `fallback_roots`.
fn capture_connector_sources_before_parse(
    connector: &(dyn crate::connectors::Connector + Send),
    ctx: &crate::connectors::ScanContext,
    data_dir: &Path,
    provider: &str,
    fallback_roots: &[ScanRoot],
    since_ts: Option<i64>,
    active_source_filter: &ActiveSessionSourceFilter,
) {
    let sources = match connector.discover_source_files(ctx) {
        Ok(sources) if !sources.is_empty() => sources,
        outcome => {
            // Nothing discovered, or discovery failed: use the explicit roots.
            if let Err(error) = outcome {
                tracing::warn!(
                    provider,
                    error = %error,
                    "provider source discovery failed; falling back to legacy explicit-root preparse capture"
                );
            }
            for root in fallback_roots {
                capture_scan_sources_before_parse(
                    data_dir,
                    provider,
                    root,
                    since_ts,
                    active_source_filter,
                );
            }
            return;
        }
    };

    let mut primary_source_count = 0usize;
    for source in &sources {
        if source.role == crate::connectors::DiscoveredSourceRole::PrimarySessionLog {
            primary_source_count += 1;
        }
    }
    let defer_primary_sources = primary_source_count > PREPARSE_PRIMARY_SOURCE_CAPTURE_LIMIT;
    if defer_primary_sources {
        tracing::info!(
            provider,
            primary_source_count,
            limit = PREPARSE_PRIMARY_SOURCE_CAPTURE_LIMIT,
            "deferring large primary source raw-mirror capture to per-conversation streaming path"
        );
    }

    for source in sources {
        let skip_active = should_skip_active_session_source(
            active_source_filter,
            &source.origin.source_id,
            &source.source_path,
        );
        if skip_active {
            continue;
        }
        let is_primary =
            source.role == crate::connectors::DiscoveredSourceRole::PrimarySessionLog;
        if defer_primary_sources && is_primary {
            continue;
        }
        capture_discovered_source_file_before_parse(data_dir, provider, &source);
    }
}
/// Tells whether `path` does not name a capturable file on disk.
///
/// Returns `true` when the path exists but is neither a regular file nor a
/// symlink (the link itself is inspected, not its target), and when the lookup
/// fails with `NotFound` or `NotADirectory`. Any other lookup error returns
/// `false`, so the capture is still attempted.
fn should_skip_raw_mirror_capture_for_logical_source(path: &Path) -> bool {
    let metadata = match std::fs::symlink_metadata(path) {
        Ok(metadata) => metadata,
        Err(error) => {
            return matches!(
                error.kind(),
                std::io::ErrorKind::NotFound | std::io::ErrorKind::NotADirectory
            );
        }
    };
    let entry_type = metadata.file_type();
    !(entry_type.is_file() || entry_type.is_symlink())
}
/// Captures one discovered provider source file into the raw mirror.
///
/// The file is first checked with `validate_discovered_source_path`; a
/// rejected path is logged and not captured. Capture failures are logged as
/// warnings and never returned to the caller.
fn capture_discovered_source_file_before_parse(
    data_dir: &Path,
    provider: &str,
    source: &crate::connectors::DiscoveredSourceFile,
) {
    if let Err(reason) = validate_discovered_source_path(source) {
        tracing::warn!(
            provider,
            discovered_provider = %source.provider_slug,
            role = source.role.as_str(),
            source_id = %source.origin.source_id,
            scan_root = %source.scan_root.display(),
            path = %source.source_path.display(),
            reason,
            "refusing to raw-mirror discovered provider source outside its scan root"
        );
        return;
    }

    let capture_input = crate::raw_mirror::RawMirrorCaptureInput {
        data_dir,
        provider,
        source_id: &source.origin.source_id,
        origin_kind: source.origin.kind.as_str(),
        origin_host: source.origin.host.as_deref(),
        source_path: &source.source_path,
        db_links: &[],
    };
    let record = match crate::raw_mirror::capture_source_file(capture_input) {
        Ok(record) => record,
        Err(error) => {
            tracing::warn!(
                provider,
                discovered_provider = %source.provider_slug,
                role = source.role.as_str(),
                source_id = %source.origin.source_id,
                path = %source.source_path.display(),
                error = %error,
                "failed to capture discovered provider source into raw mirror before connector parse"
            );
            return;
        }
    };
    tracing::debug!(
        provider,
        discovered_provider = %source.provider_slug,
        role = source.role.as_str(),
        source_id = %source.origin.source_id,
        manifest_id = %record.manifest_id,
        blob_blake3 = %record.blob_blake3,
        already_present = record.already_present,
        required_for_reconstruction = source.required_for_reconstruction,
        "captured discovered provider source into raw mirror before connector parse"
    );
}
/// Checks that a discovered source file may be captured into the raw mirror.
///
/// Both the scan root and the source path must be absolute and free of `..`
/// components, the source path must equal or lie below the scan root, and
/// neither the scan root nor any component below it may be a symlink.
///
/// # Errors
/// Returns a static description of the first rule that is violated.
fn validate_discovered_source_path(
    source: &crate::connectors::DiscoveredSourceFile,
) -> Result<(), &'static str> {
    let scan_root = &source.scan_root;
    let source_path = &source.source_path;

    if scan_root.as_os_str().is_empty() {
        return Err("empty scan root");
    }
    if !scan_root.is_absolute() {
        return Err("scan root is not absolute");
    }
    if !source_path.is_absolute() {
        return Err("source path is not absolute");
    }
    if !path_has_no_relative_escape(scan_root) {
        return Err("scan root contains relative components");
    }
    if !path_has_no_relative_escape(source_path) {
        return Err("source path contains relative components");
    }

    let contained = source_path == scan_root || source_path.starts_with(scan_root);
    if !contained {
        return Err("source path is not contained by scan root");
    }
    // Touches the filesystem, so it runs only after the lexical checks pass.
    if discovered_source_path_has_symlink_component(scan_root, source_path) {
        return Err("source path contains symlink component");
    }
    Ok(())
}
/// Tells whether the scan root, or any path component between the scan root
/// and `source_path`, is a symlink.
///
/// Components above the scan root are not inspected. Returns `false` when
/// `source_path` does not lie below `scan_root`. Only normal components are
/// walked; each one is appended to the scan root and tested in turn, and the
/// walk stops at the first symlink.
fn discovered_source_path_has_symlink_component(scan_root: &Path, source_path: &Path) -> bool {
    if path_component_is_symlink(scan_root) {
        return true;
    }
    let Ok(relative) = source_path.strip_prefix(scan_root) else {
        return false;
    };
    // Grows one component at a time: root/a, root/a/b, ...
    let mut current = scan_root.to_path_buf();
    for component in relative.components() {
        if let std::path::Component::Normal(part) = component {
            current.push(part);
            if path_component_is_symlink(&current) {
                return true;
            }
        }
    }
    false
}
/// Returns `true` when `path` itself is a symlink.
///
/// The link is inspected without following it. A path whose metadata cannot
/// be read (for example because it does not exist) counts as "not a symlink".
fn path_component_is_symlink(path: &Path) -> bool {
    matches!(
        std::fs::symlink_metadata(path),
        Ok(metadata) if metadata.file_type().is_symlink()
    )
}
/// Returns `true` when `path` contains no `..` component.
///
/// Prefixes, the root directory, `.` and normal components are all accepted;
/// an empty path is accepted too.
fn path_has_no_relative_escape(path: &Path) -> bool {
    !path
        .components()
        .any(|component| matches!(component, std::path::Component::ParentDir))
}
/// Captures the source files behind one scan root into the raw mirror.
///
/// The root is expanded into file-level capture roots by
/// `preparse_capture_roots`, and each of them is captured individually.
fn capture_scan_sources_before_parse(
    data_dir: &Path,
    provider: &str,
    root: &ScanRoot,
    since_ts: Option<i64>,
    active_source_filter: &ActiveSessionSourceFilter,
) {
    let capture_roots = preparse_capture_roots(provider, root, since_ts);
    capture_roots.iter().for_each(|capture_root| {
        capture_scan_root_file_before_parse(data_dir, provider, capture_root, active_source_filter)
    });
}
/// Expands a scan root into the file-level roots to capture before parsing.
///
/// * A root that is already a file is returned as the only element.
/// * For the `codex` provider, a non-file root is expanded through
///   `preflight_codex_explicit_file_roots`, keeping only entries that are
///   files.
/// * Every other non-file root yields an empty list.
fn preparse_capture_roots(provider: &str, root: &ScanRoot, since_ts: Option<i64>) -> Vec<ScanRoot> {
    if root.path.is_file() {
        return vec![root.clone()];
    }
    if provider != "codex" {
        return Vec::new();
    }

    let preflight = crate::connectors::preflight_codex_explicit_file_roots(
        std::slice::from_ref(root),
        since_ts,
    );
    let mut capture_roots: Vec<ScanRoot> = Vec::new();
    for candidate in preflight.scan_roots {
        if candidate.path.is_file() {
            capture_roots.push(candidate);
        }
    }
    if !capture_roots.is_empty() {
        tracing::debug!(
            provider,
            scan_root = %root.path.display(),
            capture_files = capture_roots.len(),
            fallback_roots = preflight.fallback_roots,
            "expanded codex directory root into explicit raw-mirror preparse capture files"
        );
    }
    capture_roots
}
/// Captures a single-file scan root into the raw mirror.
///
/// Does nothing when the root is not a file or when the active-session filter
/// rejects it. The outcome is only logged; capture errors are not returned.
fn capture_scan_root_file_before_parse(
    data_dir: &Path,
    provider: &str,
    root: &ScanRoot,
    active_source_filter: &ActiveSessionSourceFilter,
) {
    let is_file_root = root.path.is_file();
    if !is_file_root
        || should_skip_active_session_source(
            active_source_filter,
            &root.origin.source_id,
            &root.path,
        )
    {
        return;
    }

    let capture_input = crate::raw_mirror::RawMirrorCaptureInput {
        data_dir,
        provider,
        source_id: &root.origin.source_id,
        origin_kind: root.origin.kind.as_str(),
        origin_host: root.origin.host.as_deref(),
        source_path: &root.path,
        db_links: &[],
    };
    let record = match crate::raw_mirror::capture_source_file(capture_input) {
        Ok(record) => record,
        Err(error) => {
            tracing::warn!(
                provider,
                source_id = %root.origin.source_id,
                path = %root.path.display(),
                error = %error,
                "failed to capture explicit scan-root source into raw mirror before connector parse"
            );
            return;
        }
    };
    tracing::debug!(
        provider,
        source_id = %root.origin.source_id,
        manifest_id = %record.manifest_id,
        blob_blake3 = %record.blob_blake3,
        already_present = record.already_present,
        "captured explicit scan-root source into raw mirror before connector parse"
    );
}
/// Captures the source file of a parsed conversation into the raw mirror and, on success,
/// records the capture details under the conversation's `cass.raw_mirror` metadata.
///
/// Conversations whose source is flagged as a logical (non-file) source are skipped.
/// Capture failures are logged as warnings; the conversation is left untouched.
fn attach_raw_mirror_capture(data_dir: &Path, conv: &mut NormalizedConversation) {
    if should_skip_raw_mirror_capture_for_logical_source(&conv.source_path) {
        tracing::debug!(
            agent = %conv.agent_slug,
            source_path = %conv.source_path.display(),
            "skipping raw-mirror capture for logical non-file parsed conversation source"
        );
        return;
    }

    let (source_id, origin_kind, origin_host) = raw_mirror_origin_from_metadata(&conv.metadata);
    let db_link = raw_mirror_db_link_for_conversation(conv);
    let captured = crate::raw_mirror::capture_source_file(crate::raw_mirror::RawMirrorCaptureInput {
        data_dir,
        provider: &conv.agent_slug,
        source_id: &source_id,
        origin_kind: &origin_kind,
        origin_host: origin_host.as_deref(),
        source_path: &conv.source_path,
        db_links: std::slice::from_ref(&db_link),
    });

    let record = match captured {
        Ok(record) => record,
        Err(error) => {
            tracing::warn!(
                agent = %conv.agent_slug,
                source_id = %source_id,
                source_path = %conv.source_path.display(),
                error = %error,
                "failed to capture parsed conversation source into raw mirror before archive upsert"
            );
            return;
        }
    };

    attach_raw_mirror_metadata(conv, &record);
    tracing::debug!(
        agent = %conv.agent_slug,
        source_id = %source_id,
        manifest_id = %record.manifest_id,
        blob_blake3 = %record.blob_blake3,
        already_present = record.already_present,
        "captured parsed conversation source into raw mirror before archive upsert"
    );
}
/// Builds the raw-mirror DB link for a conversation that has not been persisted yet.
///
/// `conversation_id` is left as `None`; the remaining fields are copied from `conv`.
fn raw_mirror_db_link_for_conversation(
    conv: &NormalizedConversation,
) -> crate::raw_mirror::RawMirrorDbLink {
    let message_count = Some(conv.messages.len());
    let source_path = Some(conv.source_path.display().to_string());
    let started_at_ms = conv.started_at;
    crate::raw_mirror::RawMirrorDbLink {
        conversation_id: None,
        message_count,
        source_path,
        started_at_ms,
    }
}
/// Reads `(source_id, origin_kind, origin_host)` from the `cass.origin` metadata object.
///
/// Every field is trimmed and treated as absent when empty or not a string. A missing
/// `source_id` falls back to `LOCAL_SOURCE_ID`; a missing `kind` becomes `"local"` when
/// the resolved source id is the local one and `"ssh"` otherwise.
fn raw_mirror_origin_from_metadata(
    metadata: &serde_json::Value,
) -> (String, String, Option<String>) {
    /// Returns the trimmed, non-empty string stored under `key` of the origin object.
    fn trimmed_field<'a>(origin: Option<&'a serde_json::Value>, key: &str) -> Option<&'a str> {
        origin
            .and_then(|origin| origin.get(key))
            .and_then(serde_json::Value::as_str)
            .map(str::trim)
            .filter(|text| !text.is_empty())
    }

    let origin = metadata.get("cass").and_then(|cass| cass.get("origin"));

    let source_id = trimmed_field(origin, "source_id")
        .unwrap_or(LOCAL_SOURCE_ID)
        .to_string();
    let origin_kind = match trimmed_field(origin, "kind") {
        Some(kind) => kind,
        None if source_id == LOCAL_SOURCE_ID => "local",
        None => "ssh",
    }
    .to_string();
    let origin_host = trimmed_field(origin, "host").map(ToOwned::to_owned);

    (source_id, origin_kind, origin_host)
}
/// Stores the raw-mirror capture details under `metadata.cass.raw_mirror`.
///
/// Non-object `metadata` and non-object `cass` values are replaced with empty objects
/// before the record is written, so the insert always lands.
fn attach_raw_mirror_metadata(
    conv: &mut NormalizedConversation,
    record: &crate::raw_mirror::RawMirrorCaptureRecord,
) {
    if !conv.metadata.is_object() {
        conv.metadata = serde_json::Value::Object(serde_json::Map::new());
    }
    if let serde_json::Value::Object(root) = &mut conv.metadata {
        let cass = root
            .entry("cass")
            .or_insert_with(|| serde_json::Value::Object(serde_json::Map::new()));
        if !cass.is_object() {
            *cass = serde_json::Value::Object(serde_json::Map::new());
        }
        if let serde_json::Value::Object(cass_fields) = cass {
            let raw_mirror = serde_json::json!({
                "schema_version": 1,
                "manifest_id": record.manifest_id,
                "manifest_relative_path": record.manifest_relative_path,
                "blob_relative_path": record.blob_relative_path,
                "blob_blake3": record.blob_blake3,
                "blob_size_bytes": record.blob_size_bytes,
                "captured_at_ms": record.captured_at_ms,
                "source_mtime_ms": record.source_mtime_ms,
            });
            cass_fields.insert("raw_mirror".to_string(), raw_mirror);
        }
    }
}
/// Compacts per-message `extra` payloads based on the on-disk size of the conversation's
/// source file. An unreadable source file is treated as having an unknown size.
fn compact_large_connector_extras(connector_name: &str, conv: &mut NormalizedConversation) {
    let source_size = match fs::metadata(&conv.source_path) {
        Ok(metadata) => Some(metadata.len()),
        Err(_) => None,
    };
    compact_large_connector_extras_for_size(connector_name, conv, source_size);
}
/// Replaces every message's `extra` with its compact form when
/// `should_compact_connector_extra` approves compaction for the given source size.
fn compact_large_connector_extras_for_size(
    connector_name: &str,
    conv: &mut NormalizedConversation,
    source_size: Option<u64>,
) {
    if should_compact_connector_extra(connector_name, conv, source_size) {
        conv.messages.iter_mut().for_each(|message| {
            message.extra = compact_indexer_message_extra(&message.extra);
        });
    }
}
/// Decides whether message extras should be compacted.
///
/// Returns `true` only when the source size is known, is at least
/// `CODEX_INDEXER_EXTRA_COMPACT_THRESHOLD_BYTES`, and either the connector name or the
/// conversation's agent slug equals `codex` (ASCII case-insensitively).
fn should_compact_connector_extra(
    connector_name: &str,
    conv: &NormalizedConversation,
    source_size: Option<u64>,
) -> bool {
    let large_enough = matches!(
        source_size,
        Some(size) if size >= CODEX_INDEXER_EXTRA_COMPACT_THRESHOLD_BYTES
    );
    large_enough
        && (connector_name.eq_ignore_ascii_case("codex")
            || conv.agent_slug.eq_ignore_ascii_case("codex"))
}
fn compact_indexer_message_extra(raw: &serde_json::Value) -> serde_json::Value {
let mut cass = raw
.get("cass")
.and_then(serde_json::Value::as_object)
.cloned()
.unwrap_or_default();
if !cass.contains_key("model")
&& let Some(model) = raw
.get("model")
.or_else(|| raw.pointer("/response/model"))
.and_then(serde_json::Value::as_str)
.filter(|value| !value.trim().is_empty())
{
cass.insert(
"model".to_string(),
serde_json::Value::String(model.to_string()),
);
}
if !cass.contains_key("attachments")
&& let Some(attachments) = raw
.get("attachment_refs")
.or_else(|| raw.get("attachments"))
.cloned()
{
cass.insert("attachments".to_string(), attachments);
}
if cass.is_empty() {
serde_json::json!({})
} else {
let mut out = serde_json::Map::new();
out.insert("cass".to_string(), serde_json::Value::Object(cass));
serde_json::Value::Object(out)
}
}
/// Applies the scan root's workspace rewrite rules to a conversation.
///
/// Nothing happens when the root has no rewrite rules, the conversation has no workspace,
/// or the rewrite leaves the workspace unchanged. Otherwise the original workspace is
/// recorded under `metadata.cass.workspace_original` and `conv.workspace` is replaced with
/// the rewritten path.
///
/// Non-object `metadata` and non-object `cass` values are replaced with empty objects so
/// the original workspace is always recorded. This matches how the raw-mirror metadata
/// attachment treats the same keys; previously a non-object `cass` value silently dropped
/// the original path while the workspace was still rewritten.
pub fn apply_workspace_rewrite(conv: &mut NormalizedConversation, root: &ScanRoot) {
    if root.workspace_rewrites.is_empty() {
        return;
    }
    let Some(workspace) = conv.workspace.as_ref() else {
        return;
    };
    let original_workspace = workspace.to_string_lossy().to_string();
    let rewritten = root.rewrite_workspace(&original_workspace, Some(&conv.agent_slug));
    if rewritten == original_workspace {
        return;
    }

    if !conv.metadata.is_object() {
        conv.metadata = serde_json::json!({});
    }
    if let Some(obj) = conv.metadata.as_object_mut() {
        let cass = obj
            .entry("cass".to_string())
            .or_insert_with(|| serde_json::json!({}));
        // A pre-existing non-object `cass` cannot hold the provenance key; reset it so
        // the original workspace is never lost.
        if !cass.is_object() {
            *cass = serde_json::json!({});
        }
        if let Some(cass_obj) = cass.as_object_mut() {
            cass_obj.insert(
                "workspace_original".to_string(),
                serde_json::Value::String(original_workspace.clone()),
            );
        }
    }

    conv.workspace = Some(std::path::PathBuf::from(&rewritten));
    tracing::debug!(
        original = %original_workspace,
        rewritten = %rewritten,
        agent = %conv.agent_slug,
        "workspace_rewritten"
    );
}
pub mod persist {
use super::{LexicalPopulationStrategy, lexical_population_strategy_requires_inline_tantivy};
use std::collections::{HashMap, HashSet};
use std::ops::Range;
use std::path::Path;
use std::time::Duration;
#[cfg(test)]
use std::time::Instant;
use anyhow::{Context, Result, anyhow};
use frankensqlite::FrankenError;
use frankensqlite::compat::{ConnectionExt, ParamValue, RowExt};
use rand::RngExt;
use rayon::prelude::*;
use crate::connectors::NormalizedConversation;
use crate::indexer::semantic::{
EmbeddingInput, packet_embedding_inputs_from_storage_for_message_ids,
};
use crate::model::conversation_packet::{ConversationPacket, ConversationPacketProvenance};
use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole, Snippet};
use crate::search::tantivy::TantivyIndex;
#[cfg(test)]
use crate::sources::provenance::{Source, SourceKind};
use crate::storage::sqlite::{FrankenStorage, IndexingCache, InsertOutcome};
/// Builds the conversation packet used for lexical indexing during persistence, tagged
/// with local provenance.
fn lexical_packet_for_persist(conv: &NormalizedConversation) -> ConversationPacket {
    let provenance = ConversationPacketProvenance::local();
    ConversationPacket::from_normalized_conversation(conv, provenance)
}
/// Maps inserted message `idx` values to their positions within the packet's message
/// list, in packet order. Returns an empty vector when nothing was inserted.
fn positional_indices_for_inserted(
    packet: &ConversationPacket,
    inserted_indices: &[i64],
) -> Vec<usize> {
    if inserted_indices.is_empty() {
        return Vec::new();
    }
    let wanted: HashSet<i64> = inserted_indices.iter().copied().collect();
    packet
        .payload
        .messages
        .iter()
        .enumerate()
        .filter_map(|(position, message)| wanted.contains(&message.idx).then_some(position))
        .collect()
}
/// Test-only accumulator of per-stage timings for conversation persistence.
#[cfg(test)]
#[derive(Debug, Clone, Default)]
struct PersistConversationPerfProfile {
    invocations: usize,
    messages: usize,
    inserted_messages: usize,
    total_duration: Duration,
    db_duration: Duration,
    packet_duration: Duration,
    positional_duration: Duration,
    tantivy_add_duration: Duration,
}

#[cfg(test)]
impl PersistConversationPerfProfile {
    /// Converts a duration to fractional milliseconds.
    fn millis(duration: Duration) -> f64 {
        1000.0 * duration.as_secs_f64()
    }

    /// Prints a single-line stage profile to stderr.
    ///
    /// Averages divide by the invocation count, treating zero invocations as one. The
    /// residual is the total minus the accounted stages, saturating at zero.
    fn log_summary(&self, label: &str) {
        let calls = self.invocations.max(1) as f64;
        let accounted: Duration = [
            self.db_duration,
            self.packet_duration,
            self.positional_duration,
            self.tantivy_add_duration,
        ]
        .into_iter()
        .sum();
        let residual = self.total_duration.saturating_sub(accounted);

        let total_ms = Self::millis(self.total_duration);
        let db_ms = Self::millis(self.db_duration);
        let packet_ms = Self::millis(self.packet_duration);
        let positional_ms = Self::millis(self.positional_duration);
        let tantivy_add_ms = Self::millis(self.tantivy_add_duration);
        let residual_ms = Self::millis(residual);

        eprintln!(
            concat!(
                "CASS_PERSIST_STAGE_PROFILE ",
                "label={} calls={} messages={} inserted_messages={} ",
                "total_ms={:.3} db_ms={:.3} packet_ms={:.3} positional_ms={:.3} ",
                "tantivy_add_ms={:.3} residual_ms={:.3} ",
                "avg_total_ms={:.3} avg_db_ms={:.3} avg_packet_ms={:.3} ",
                "avg_positional_ms={:.3} avg_tantivy_add_ms={:.3}"
            ),
            label,
            self.invocations,
            self.messages,
            self.inserted_messages,
            total_ms,
            db_ms,
            packet_ms,
            positional_ms,
            tantivy_add_ms,
            residual_ms,
            total_ms / calls,
            db_ms / calls,
            packet_ms / calls,
            positional_ms / calls,
            tantivy_add_ms / calls,
        );
    }
}
/// Aggregated result of persisting a batch of conversations.
#[derive(Debug, Clone, Default)]
pub(super) struct PersistBatchOutcome {
/// Number of insert outcomes that reported a newly inserted conversation.
pub inserted_conversations: usize,
/// Total number of newly inserted message indices across the batch.
pub inserted_messages: usize,
/// Largest message id seen among the captured semantic delta rows, if any.
pub semantic_delta_max_message_id: Option<i64>,
/// Embedding inputs collected for the newly inserted messages.
pub semantic_delta_inputs: Vec<EmbeddingInput>,
/// Set when an inline lexical update was deferred after an error.
pub lexical_update_deferred: bool,
/// Message of the first error that caused a lexical update to be deferred.
pub lexical_update_error: Option<String>,
}
impl PersistBatchOutcome {
    /// Adds one insert outcome's conversation and message counts (saturating).
    fn record_insert_outcome(&mut self, outcome: &InsertOutcome) {
        let new_conversations = usize::from(outcome.conversation_inserted);
        let new_messages = outcome.inserted_indices.len();
        self.inserted_conversations = self.inserted_conversations.saturating_add(new_conversations);
        self.inserted_messages = self.inserted_messages.saturating_add(new_messages);
    }

    /// Appends embedding inputs and raises the tracked maximum message id when
    /// `max_message_id` is present and larger than the current one.
    fn extend_semantic_delta(
        &mut self,
        inputs: Vec<EmbeddingInput>,
        max_message_id: Option<i64>,
    ) {
        self.semantic_delta_inputs.extend(inputs);
        self.semantic_delta_max_message_id =
            match (self.semantic_delta_max_message_id, max_message_id) {
                (Some(current), Some(incoming)) => Some(current.max(incoming)),
                (current, None) => current,
                (None, incoming) => incoming,
            };
    }

    /// Flags the batch as having a deferred lexical update, remembering only the first
    /// error message.
    fn record_deferred_lexical_update(&mut self, error: &anyhow::Error) {
        self.lexical_update_deferred = true;
        let _ = self
            .lexical_update_error
            .get_or_insert_with(|| error.to_string());
    }

    /// Folds `other` into `self`: counts are added (saturating), semantic deltas are
    /// concatenated, and the deferred-lexical flag is propagated. `other`'s error
    /// message is adopted only when `other` was deferred and `self` has no error yet.
    pub(super) fn merge(&mut self, other: Self) {
        let Self {
            inserted_conversations,
            inserted_messages,
            semantic_delta_max_message_id,
            semantic_delta_inputs,
            lexical_update_deferred,
            lexical_update_error,
        } = other;

        self.inserted_conversations = self
            .inserted_conversations
            .saturating_add(inserted_conversations);
        self.inserted_messages = self.inserted_messages.saturating_add(inserted_messages);
        self.extend_semantic_delta(semantic_delta_inputs, semantic_delta_max_message_id);

        if lexical_update_deferred {
            self.lexical_update_deferred = true;
            if self.lexical_update_error.is_none() {
                self.lexical_update_error = lexical_update_error;
            }
        }
    }
}
/// Returns whether a failed incremental lexical update should be deferred rather than
/// treated as fatal. The decision is delegated to `super::error_is_out_of_memory`.
fn should_defer_incremental_lexical_update_after_error(error: &anyhow::Error) -> bool {
super::error_is_out_of_memory(error)
}
#[cfg(test)]
fn should_inject_incremental_lexical_update_oom() -> bool {
dotenvy::var("CASS_TEST_INCREMENTAL_LEXICAL_UPDATE_OOM").is_ok()
}
#[cfg(not(test))]
fn should_inject_incremental_lexical_update_oom() -> bool {
false
}
/// Looks up the persisted message ids for the given message `idx` values of one
/// conversation.
///
/// Returns a map from message `idx` to message `id`. An empty `inserted_indices` slice
/// short-circuits to an empty map without touching the database. Indices that have no
/// matching row are simply absent from the result.
///
/// # Errors
/// Propagates any error returned by the underlying query.
fn load_inserted_message_ids_by_idx(
    storage: &FrankenStorage,
    conversation_id: i64,
    inserted_indices: &[i64],
) -> Result<HashMap<i64, i64>> {
    if inserted_indices.is_empty() {
        return Ok(HashMap::new());
    }
    let mut sql = String::from(
        "SELECT id, idx
FROM messages
WHERE conversation_id = ?1
AND idx IN (",
    );
    // `?1` is the conversation id; the idx placeholders start at `?2`.
    let mut params = Vec::with_capacity(inserted_indices.len() + 1);
    params.push(ParamValue::from(conversation_id));
    for (offset, idx) in inserted_indices.iter().enumerate() {
        if offset > 0 {
            sql.push_str(", ");
        }
        sql.push_str(&format!("?{}", offset + 2));
        params.push(ParamValue::from(*idx));
    }
    sql.push_str(") ORDER BY id ASC");
    // The parameter slice must be passed by reference (`&params`).
    let rows: Vec<(i64, i64)> = storage.raw().query_map_collect(&sql, &params, |row| {
        Ok((row.get_typed(0)?, row.get_typed(1)?))
    })?;
    Ok(rows.into_iter().map(|(id, idx)| (idx, id)).collect())
}
/// Collects embedding inputs for the messages newly inserted by `outcome`.
///
/// Returns the inputs together with the largest resolved message id. When some inserted
/// indices cannot be resolved to persisted ids a warning is logged and only the resolved
/// ones are used.
fn packet_semantic_delta_for_outcome(
    storage: &FrankenStorage,
    outcome: &InsertOutcome,
) -> Result<(Vec<EmbeddingInput>, Option<i64>)> {
    let expected = outcome.inserted_indices.len();
    if expected == 0 {
        return Ok((Vec::new(), None));
    }

    let ids_by_idx = load_inserted_message_ids_by_idx(
        storage,
        outcome.conversation_id,
        &outcome.inserted_indices,
    )?;
    let resolved = ids_by_idx.len();
    if resolved != expected {
        tracing::warn!(
            conversation_id = outcome.conversation_id,
            expected_inserted_indices = expected,
            resolved_canonical_message_ids = resolved,
            "skipping packet semantic delta rows without persisted canonical ids"
        );
    }

    let inserted_message_ids: HashSet<i64> = ids_by_idx.into_values().collect();
    let max_message_id = inserted_message_ids.iter().max().copied();
    let conversation_ids = [outcome.conversation_id];
    let inputs = packet_embedding_inputs_from_storage_for_message_ids(
        storage,
        &conversation_ids,
        &inserted_message_ids,
    )?;
    Ok((inputs, max_message_id))
}
fn raw_mirror_manifest_relative_path(conv: &NormalizedConversation) -> Option<&str> {
conv.metadata
.get("cass")
.and_then(|cass| cass.get("raw_mirror"))
.and_then(|raw_mirror| raw_mirror.get("manifest_relative_path"))
.and_then(serde_json::Value::as_str)
.map(str::trim)
.filter(|value| !value.is_empty())
}
/// Builds the raw-mirror DB link for a conversation that has been persisted, carrying
/// the conversation id reported by `outcome`.
fn persisted_raw_mirror_db_link(
    conv: &NormalizedConversation,
    outcome: &InsertOutcome,
) -> crate::raw_mirror::RawMirrorDbLink {
    let conversation_id = Some(outcome.conversation_id);
    let message_count = Some(conv.messages.len());
    let source_path = Some(conv.source_path.display().to_string());
    crate::raw_mirror::RawMirrorDbLink {
        conversation_id,
        message_count,
        source_path,
        started_at_ms: conv.started_at,
    }
}
/// Merges the persisted conversation link into the conversation's raw-mirror manifest.
///
/// Conversations without a manifest path in their metadata are ignored. Merge failures
/// are logged as warnings and not propagated.
fn record_persisted_raw_mirror_db_link(
    data_dir: &Path,
    conv: &NormalizedConversation,
    outcome: &InsertOutcome,
) {
    if let Some(manifest_path) = raw_mirror_manifest_relative_path(conv) {
        let link = persisted_raw_mirror_db_link(conv, outcome);
        let merged = crate::raw_mirror::merge_manifest_db_links(
            data_dir,
            manifest_path,
            std::slice::from_ref(&link),
        );
        if let Err(error) = merged {
            tracing::warn!(
                agent = %conv.agent_slug,
                conversation_id = outcome.conversation_id,
                manifest_relative_path = manifest_path,
                error = %error,
                "failed to record persisted raw mirror conversation link"
            );
        }
    }
}
/// Records persisted raw-mirror links for each `(conversation, outcome)` pair, pairing
/// the two slices positionally. A missing `data_dir` makes this a no-op.
fn record_persisted_raw_mirror_db_links(
    data_dir: Option<&Path>,
    convs: &[NormalizedConversation],
    outcomes: &[InsertOutcome],
) {
    if let Some(data_dir) = data_dir {
        convs
            .iter()
            .zip(outcomes)
            .for_each(|(conv, outcome)| record_persisted_raw_mirror_db_link(data_dir, conv, outcome));
    }
}
/// Reports whether `CASS_INDEXER_BEGIN_CONCURRENT` enables begin-concurrent writes.
///
/// Unset (or unreadable) means disabled; any value other than `0` or a case-insensitive
/// `false` means enabled.
fn begin_concurrent_writes_enabled() -> bool {
    match dotenvy::var("CASS_INDEXER_BEGIN_CONCURRENT") {
        Ok(value) => value != "0" && !value.eq_ignore_ascii_case("false"),
        Err(_) => false,
    }
}
/// Upper bound applied to the `CASS_INDEXER_BEGIN_CONCURRENT_RETRIES` override.
const BEGIN_CONCURRENT_RETRY_MAX: usize = 32;
/// Upper bound applied to the `CASS_INDEXER_BEGIN_CONCURRENT_CHUNK_SIZE` override.
const BEGIN_CONCURRENT_CHUNK_SIZE_MAX: usize = 512;
/// Upper bound (in KiB) applied to the `CASS_INDEXER_BEGIN_CONCURRENT_WRITER_CACHE_KIB`
/// override.
const BEGIN_CONCURRENT_WRITER_CACHE_KIB_MAX: i64 = 65_536;
/// Reads a `usize` setting from the environment variable `var`.
///
/// Missing, unparsable, or zero values yield `default`. Values above `max` are clamped
/// to `max` with a warning.
fn env_usize_bounded(var: &str, default: usize, max: usize) -> usize {
    let parsed = dotenvy::var(var)
        .ok()
        .and_then(|raw| raw.parse::<usize>().ok());
    let Some(requested) = parsed else {
        return default;
    };
    if requested == 0 {
        return default;
    }
    if requested <= max {
        return requested;
    }
    tracing::warn!(
        env_var = var,
        requested,
        cap = max,
        "env var exceeds safe cap; clamping"
    );
    max
}
/// Reads an `i64` setting from the environment variable `var`.
///
/// Missing, unparsable, zero, or negative values yield `default`. Values above `max` are
/// clamped to `max` with a warning.
fn env_i64_bounded(var: &str, default: i64, max: i64) -> i64 {
    let parsed = dotenvy::var(var)
        .ok()
        .and_then(|raw| raw.parse::<i64>().ok());
    let Some(requested) = parsed else {
        return default;
    };
    if requested <= 0 {
        return default;
    }
    if requested <= max {
        return requested;
    }
    tracing::warn!(
        env_var = var,
        requested,
        cap = max,
        "env var exceeds safe cap; clamping"
    );
    max
}
/// Retry budget for begin-concurrent writes: defaults to 6 and can be overridden through
/// `CASS_INDEXER_BEGIN_CONCURRENT_RETRIES`, capped at `BEGIN_CONCURRENT_RETRY_MAX`.
pub(super) fn begin_concurrent_retry_limit() -> usize {
    const ENV_VAR: &str = "CASS_INDEXER_BEGIN_CONCURRENT_RETRIES";
    const DEFAULT_RETRIES: usize = 6;
    env_usize_bounded(ENV_VAR, DEFAULT_RETRIES, BEGIN_CONCURRENT_RETRY_MAX)
}
/// Conversations per begin-concurrent chunk: defaults to 32 and can be overridden through
/// `CASS_INDEXER_BEGIN_CONCURRENT_CHUNK_SIZE`, capped at `BEGIN_CONCURRENT_CHUNK_SIZE_MAX`.
fn begin_concurrent_chunk_size() -> usize {
    const ENV_VAR: &str = "CASS_INDEXER_BEGIN_CONCURRENT_CHUNK_SIZE";
    const DEFAULT_CHUNK_SIZE: usize = 32;
    env_usize_bounded(ENV_VAR, DEFAULT_CHUNK_SIZE, BEGIN_CONCURRENT_CHUNK_SIZE_MAX)
}
/// Page-cache size in KiB for begin-concurrent writers: defaults to 4096 and can be
/// overridden through `CASS_INDEXER_BEGIN_CONCURRENT_WRITER_CACHE_KIB`, capped at
/// `BEGIN_CONCURRENT_WRITER_CACHE_KIB_MAX`.
fn begin_concurrent_writer_cache_kib() -> i64 {
    const ENV_VAR: &str = "CASS_INDEXER_BEGIN_CONCURRENT_WRITER_CACHE_KIB";
    const DEFAULT_CACHE_KIB: i64 = 4096;
    env_i64_bounded(ENV_VAR, DEFAULT_CACHE_KIB, BEGIN_CONCURRENT_WRITER_CACHE_KIB_MAX)
}
/// Chunk size for serial batch persistence: a positive `CASS_INDEXER_SERIAL_CHUNK_SIZE`
/// value, otherwise 128.
fn serial_batch_chunk_size() -> usize {
    let parsed = dotenvy::var("CASS_INDEXER_SERIAL_CHUNK_SIZE")
        .ok()
        .and_then(|raw| raw.parse::<usize>().ok());
    match parsed {
        Some(size) if size > 0 => size,
        _ => 128,
    }
}
/// Busy timeout in milliseconds for index writers: a positive
/// `CASS_INDEX_WRITER_BUSY_TIMEOUT_MS` value, otherwise 60 000.
fn index_writer_busy_timeout_ms() -> u64 {
    let parsed = dotenvy::var("CASS_INDEX_WRITER_BUSY_TIMEOUT_MS")
        .ok()
        .and_then(|raw| raw.parse::<u64>().ok());
    match parsed {
        Some(timeout_ms) if timeout_ms > 0 => timeout_ms,
        _ => 60_000,
    }
}
/// WAL auto-checkpoint page count for index writers.
///
/// A non-negative `CASS_INDEX_WRITER_WAL_AUTOCHECKPOINT_PAGES` value wins; otherwise the
/// default is 0 when checkpoints are deferred and 1000 when they are not.
fn index_writer_wal_autocheckpoint_pages(defer_checkpoints: bool) -> i64 {
    let fallback = if defer_checkpoints { 0 } else { 1000 };
    let parsed = dotenvy::var("CASS_INDEX_WRITER_WAL_AUTOCHECKPOINT_PAGES")
        .ok()
        .and_then(|raw| raw.parse::<i64>().ok());
    match parsed {
        Some(pages) if pages >= 0 => pages,
        _ => fallback,
    }
}
/// Reports whether `CASS_DEFER_LEXICAL_UPDATES` asks for lexical updates to be deferred.
///
/// Unset (or unreadable) means disabled; any value other than `0` or a case-insensitive
/// `false` means enabled.
fn defer_lexical_updates_enabled() -> bool {
    dotenvy::var("CASS_DEFER_LEXICAL_UPDATES")
        .is_ok_and(|value| value != "0" && !value.eq_ignore_ascii_case("false"))
}
/// Applies per-writer tuning for begin-concurrent mode: the configured cache size (set
/// via a negative `cache_size` pragma value) followed by the checkpoint policy.
///
/// A failure to set the cache size is logged at debug level and otherwise ignored.
fn apply_begin_concurrent_writer_tuning(storage: &FrankenStorage, defer_checkpoints: bool) {
    let cache_kib = begin_concurrent_writer_cache_kib();
    let pragma = format!("PRAGMA cache_size = -{cache_kib};");
    let applied = storage.raw().execute(&pragma);
    if let Err(err) = applied {
        tracing::debug!(
            cache_kib,
            error = %err,
            "failed_to_apply_begin_concurrent_writer_cache_size"
        );
    }
    apply_index_writer_checkpoint_policy(storage, defer_checkpoints);
}
/// Ensures the connection uses the configured index-writer busy timeout.
///
/// Skips the pragma when the storage already records the same timeout. On success the
/// applied value is recorded on the storage; on failure the error is logged at debug
/// level and nothing is recorded.
pub(super) fn apply_index_writer_busy_timeout(storage: &FrankenStorage) {
    let busy_timeout_ms = index_writer_busy_timeout_ms();
    let already_applied = storage.index_writer_busy_timeout_ms() == Some(busy_timeout_ms);
    if already_applied {
        return;
    }
    let pragma = format!("PRAGMA busy_timeout = {busy_timeout_ms};");
    // Bind the result first so no connection temporary outlives the statement.
    let applied = storage.raw().execute(&pragma);
    match applied {
        Ok(_) => {
            storage.mark_index_writer_busy_timeout_ms(busy_timeout_ms);
        }
        Err(err) => {
            tracing::debug!(
                busy_timeout_ms,
                error = %err,
                "failed_to_apply_index_writer_busy_timeout"
            );
        }
    }
}
/// Ensures the connection uses the configured WAL auto-checkpoint page count.
///
/// Skips the pragma when the storage already records the same value. On success the
/// applied value is recorded on the storage; on failure the error is logged at debug
/// level and nothing is recorded.
pub(super) fn apply_index_writer_checkpoint_policy(
    storage: &FrankenStorage,
    defer_checkpoints: bool,
) {
    let wal_autocheckpoint_pages = index_writer_wal_autocheckpoint_pages(defer_checkpoints);
    let already_applied =
        storage.index_writer_checkpoint_pages() == Some(wal_autocheckpoint_pages);
    if already_applied {
        return;
    }
    let pragma = format!("PRAGMA wal_autocheckpoint = {wal_autocheckpoint_pages};");
    // Bind the result first so no connection temporary outlives the statement.
    let applied = storage.raw().execute(&pragma);
    match applied {
        Ok(_) => {
            storage.mark_index_writer_checkpoint_pages(wal_autocheckpoint_pages);
        }
        Err(err) => {
            tracing::debug!(
                wal_autocheckpoint_pages,
                error = %err,
                "failed_to_apply_index_writer_checkpoint_policy"
            );
        }
    }
}
/// Runs `f` against a short-lived writer connection obtained from `storage`.
///
/// `context` is a human-readable label used only in error and log messages. The writer
/// is acquired through `acquire_cached_ephemeral_writer`; when that call reports the
/// writer as reusable it is handed back to `storage` afterwards, otherwise it is closed.
///
/// # Errors
/// Fails when the database path cannot be resolved, the writer cannot be acquired, the
/// one-time preflight write fails, `f` fails, or (on the success path) closing a
/// non-reusable writer fails.
pub(super) fn with_ephemeral_writer<T, F>(
storage: &FrankenStorage,
defer_checkpoints: bool,
context: &str,
f: F,
) -> Result<T>
where
F: FnOnce(&FrankenStorage) -> Result<T>,
{
let db_path = storage
.database_path()
.with_context(|| format!("resolving database path for {context}"))?;
// The checkpoint policy is applied to the owning storage as well as to the writer below.
apply_index_writer_checkpoint_policy(storage, defer_checkpoints);
let (writer, reusable) = storage.acquire_cached_ephemeral_writer().with_context(|| {
format!(
"opening short-lived frankensqlite writer for {context}: {}",
db_path.display()
)
})?;
// Normal hand-back: reusable writers return to `storage`, others are closed and any
// close error is surfaced to the caller.
let release_writer = |writer: FrankenStorage| -> Result<()> {
if reusable {
storage.release_cached_ephemeral_writer(writer);
Ok(())
} else {
writer.close().with_context(|| {
format!(
"closing short-lived frankensqlite writer for {context}: {}",
db_path.display()
)
})
}
};
// Failure hand-back used when the writer itself is suspect: never returns an error.
let discard_writer = |mut writer: FrankenStorage| {
if reusable {
storage.discard_cached_ephemeral_writer(writer);
} else {
writer.close_best_effort_in_place();
}
};
// One-time preflight per storage: a no-op UPDATE that verifies the writer can write.
if !storage.ephemeral_writer_preflight_verified() {
if let Err(err) = writer
.raw()
.execute("UPDATE meta SET value = value WHERE key = 'schema_version'")
{
discard_writer(writer);
anyhow::bail!(
"ephemeral writer preflight write failed for {context} at {}: {err}. \
The database may be locked by another process or opened in \
readonly mode. Try closing other cass instances and retrying.",
db_path.display()
);
}
storage.mark_ephemeral_writer_preflight_verified();
}
apply_index_writer_busy_timeout(&writer);
apply_index_writer_checkpoint_policy(&writer, defer_checkpoints);
// Best effort: a failure to disable FK enforcement is only logged.
if let Err(err) = writer.raw().execute("PRAGMA foreign_keys = OFF") {
tracing::debug!(
error = %err,
context,
"failed to disable FK enforcement on ephemeral writer"
);
}
let result = f(&writer);
match result {
Ok(value) => {
release_writer(writer)?;
Ok(value)
}
Err(err) => {
// The original error wins; a close failure is only logged.
// NOTE(review): the failed writer goes through `release_writer`, not
// `discard_writer`, so a reusable writer is handed back after an error — confirm
// that is safe.
if let Err(close_err) = release_writer(writer) {
tracing::warn!(
error = %close_err,
db_path = %db_path.display(),
context,
"failed to close short-lived writer cleanly after write error"
);
}
Err(err)
}
}
}
/// Finds a `FrankenError` in `err`, checking the error itself first and then its root
/// cause.
fn transient_franken_error(err: &anyhow::Error) -> Option<&FrankenError> {
    if let Some(direct) = err.downcast_ref::<FrankenError>() {
        return Some(direct);
    }
    err.root_cause().downcast_ref::<FrankenError>()
}
/// Reports whether `err` carries a `FrankenError` variant that is treated as retryable:
/// busy, busy-recovery, busy-snapshot, write-conflict, or serialization-failure.
fn is_retryable_franken_error(err: &anyhow::Error) -> bool {
    let Some(inner) = transient_franken_error(err) else {
        return false;
    };
    matches!(
        inner,
        FrankenError::Busy
            | FrankenError::BusyRecovery
            | FrankenError::BusySnapshot { .. }
            | FrankenError::WriteConflict { .. }
            | FrankenError::SerializationFailure { .. }
    )
}
/// Runs `f`, retrying on retryable `FrankenError`s up to `max_retries` additional times.
///
/// Between attempts the thread sleeps for the current backoff plus a random jitter of up
/// to the same amount. The backoff starts at 4 ms and doubles per retry, capped at
/// 256 ms. Non-retryable errors, and the error from the final attempt, are returned
/// unchanged.
pub(super) fn with_concurrent_retry<F, T>(max_retries: usize, mut f: F) -> Result<T>
where
    F: FnMut() -> Result<T>,
{
    let mut rng = rand::rng();
    let mut backoff_ms = 4_u64;
    let mut attempt = 0_usize;
    loop {
        let err = match f() {
            Ok(val) => return Ok(val),
            Err(err) => err,
        };
        let retries_left = attempt < max_retries;
        if !(retries_left && is_retryable_franken_error(&err)) {
            return Err(err);
        }

        let jitter_ms = rng.random_range(0..=backoff_ms);
        let sleep_ms = backoff_ms + jitter_ms;
        tracing::debug!(
            attempt = attempt + 1,
            max_retries,
            backoff_ms = sleep_ms,
            error = %err,
            "begin_concurrent_retry"
        );
        std::thread::sleep(Duration::from_millis(sleep_ms));
        backoff_ms = (backoff_ms * 2).min(256);
        attempt += 1;
    }
}
/// Outcome of persisting one chunk of conversations with a single writer.
enum ChunkPersistResult {
/// Every conversation in the chunk was persisted; entries pair the conversation's
/// index in the batch with its insert outcome.
Completed(Vec<(usize, InsertOutcome)>),
/// Retries were exhausted on a retryable error part-way through the chunk.
RetryableFallback {
/// Outcomes for the conversations persisted before the failure.
completed: Vec<(usize, InsertOutcome)>,
/// Batch index range still to be persisted, starting at the failed conversation.
remaining_range: Range<usize>,
/// The retryable error that stopped the chunk.
error: anyhow::Error,
},
}
/// Persists one chunk of conversations through the given writer.
///
/// `chunk` and `internal_chunk` must be parallel slices (same length, same order);
/// `base_idx` is the batch index of the chunk's first conversation. Each conversation is
/// inserted under `with_concurrent_retry`, with agent and workspace ids resolved through
/// caches local to this call.
///
/// Returns `Completed` when every conversation was inserted, or `RetryableFallback` when
/// an insert still failed with a retryable error after `max_retries` retries.
///
/// # Errors
/// Propagates the first non-retryable error.
fn persist_chunk_with_writer(
franken: &FrankenStorage,
base_idx: usize,
chunk: &[NormalizedConversation],
internal_chunk: &[Conversation],
max_retries: usize,
) -> Result<ChunkPersistResult> {
debug_assert_eq!(
chunk.len(),
internal_chunk.len(),
"parallel pre-map must produce one Conversation per NormalizedConversation"
);
let mut outcomes = Vec::with_capacity(chunk.len());
// Per-chunk caches avoid repeating agent and workspace lookups on this writer.
let mut agent_cache: HashMap<String, i64> = HashMap::new();
let mut workspace_cache: HashMap<std::path::PathBuf, i64> = HashMap::new();
for (offset, (conv, internal)) in chunk.iter().zip(internal_chunk.iter()).enumerate() {
// Batch index of this conversation.
let idx = base_idx + offset;
let agent_slug = conv.agent_slug.clone();
let workspace = conv.workspace.clone();
// The whole lookup-and-insert sequence is retried as one unit; ids cached by an
// earlier attempt are reused on later attempts.
match with_concurrent_retry(max_retries, || {
let agent_id = if let Some(id) = agent_cache.get(&agent_slug) {
*id
} else {
let agent = Agent {
id: None,
slug: agent_slug.clone(),
name: agent_slug.clone(),
version: None,
kind: AgentKind::Cli,
};
let id = franken.ensure_agent(&agent)?;
agent_cache.insert(agent_slug.clone(), id);
id
};
let workspace_id = if let Some(ws) = &workspace {
if let Some(id) = workspace_cache.get(ws) {
Some(*id)
} else {
let id = franken.ensure_workspace(ws, None)?;
workspace_cache.insert(ws.clone(), id);
Some(id)
}
} else {
None
};
franken.insert_conversation_tree(agent_id, workspace_id, internal)
}) {
Ok(outcome) => outcomes.push((idx, outcome)),
// Retries exhausted on a retryable error: report what is done and what is left
// (the failed conversation included) so the caller can replay it.
Err(err) if is_retryable_franken_error(&err) => {
return Ok(ChunkPersistResult::RetryableFallback {
completed: outcomes,
remaining_range: idx..(base_idx + chunk.len()),
error: err,
});
}
Err(err) => return Err(err),
}
}
Ok(ChunkPersistResult::Completed(outcomes))
}
/// Replays a range of conversations through a dedicated writer after a begin-concurrent
/// chunk gave up on retryable conflicts.
///
/// Opens its own writer at `db_path`, applies the begin-concurrent writer tuning, and
/// persists the chunk with at least 12 retries per conversation. The writer is always
/// closed before the result is interpreted.
///
/// # Errors
/// Fails when the writer cannot be opened, when persistence fails (including exhausting
/// retryable conflicts again), or when closing the writer fails after a fully successful
/// replay. Close failures after another error are only logged.
fn persist_chunk_serial_fallback(
db_path: &std::path::Path,
base_idx: usize,
chunk: &[NormalizedConversation],
internal_chunk: &[Conversation],
max_retries: usize,
defer_checkpoints: bool,
) -> Result<Vec<(usize, InsertOutcome)>> {
let franken = FrankenStorage::open_writer(db_path).with_context(|| {
format!(
"opening frankensqlite writer for begin-concurrent serial fallback: {}",
db_path.display()
)
})?;
apply_begin_concurrent_writer_tuning(&franken, defer_checkpoints);
// Best effort: a failure to disable FK enforcement is only logged.
if let Err(err) = franken.raw().execute("PRAGMA foreign_keys = OFF") {
tracing::debug!(
error = %err,
"failed to disable FK enforcement on serial fallback writer"
);
}
// The fallback uses a retry budget of at least 12, regardless of the caller's limit.
let fallback_retries = max_retries.max(12);
let result =
persist_chunk_with_writer(&franken, base_idx, chunk, internal_chunk, fallback_retries);
// Close before inspecting `result` so the writer is released on every path.
let close_result = franken.close().with_context(|| {
format!(
"closing frankensqlite writer for begin-concurrent serial fallback: {}",
db_path.display()
)
});
match result {
Ok(ChunkPersistResult::Completed(outcomes)) => {
close_result?;
Ok(outcomes)
}
// A second retry exhaustion is terminal: it is turned into a hard error.
Ok(ChunkPersistResult::RetryableFallback {
completed,
remaining_range,
error,
}) => {
if let Err(close_err) = close_result {
tracing::warn!(
error = %close_err,
db_path = %db_path.display(),
"failed to close serial fallback writer cleanly after retry exhaustion"
);
}
ordered_bail_serial_fallback(completed.len(), remaining_range, error)
}
Err(err) => {
if let Err(close_err) = close_result {
tracing::warn!(
error = %close_err,
db_path = %db_path.display(),
"failed to close serial fallback writer cleanly after index error"
);
}
Err(err)
}
}
}
/// Builds the terminal error for a serial fallback that exhausted its retryable
/// conflicts. Always returns `Err`; the message reports how many conversations were
/// persisted and which batch index range remains.
fn ordered_bail_serial_fallback(
    completed: usize,
    remaining_range: Range<usize>,
    error: anyhow::Error,
) -> Result<Vec<(usize, InsertOutcome)>> {
    let Range { start, end } = remaining_range;
    let failure = anyhow!(
        "begin-concurrent serial fallback exhausted retryable conflicts after persisting {completed} conversations; remaining range {}..{}: {error}",
        start,
        end
    );
    Err(failure)
}
/// Reports whether two conversations in the batch share the same identity key.
///
/// With an external id the key is `(agent, source id, external id, started_at)`; without
/// one it is `(agent, source id, source path)`.
fn duplicate_conversation_keys_present(convs: &[NormalizedConversation]) -> bool {
    let mut seen = HashSet::with_capacity(convs.len());
    convs.iter().any(|conv| {
        let (source_id, _) = extract_provenance(&conv.metadata);
        let agent = conv.agent_slug.clone();
        let key = match conv.external_id.as_deref() {
            Some(external_id) => (
                agent,
                source_id,
                Some(external_id.to_owned()),
                None,
                conv.started_at,
            ),
            None => (
                agent,
                source_id,
                None,
                Some(conv.source_path.to_string_lossy().to_string()),
                None,
            ),
        };
        // `insert` returns false when the key was already present.
        !seen.insert(key)
    })
}
/// Persists a batch of conversations using parallel begin-concurrent writers, then
/// applies lexical (Tantivy) and semantic-delta bookkeeping in batch order.
///
/// Phases:
/// 1. Map every conversation to its internal form in parallel.
/// 2. Split the batch into chunks and persist each chunk in parallel, one writer per
///    chunk.
/// 3. Replay the unfinished tail of any chunk that exhausted retryable conflicts through
///    the serial fallback.
/// 4. Sort outcomes by batch index, record raw-mirror DB links when
///    `raw_mirror_data_dir` is set, then update the lexical index per `lexical_strategy`
///    and collect semantic delta inputs when `capture_semantic_delta` is set.
///
/// # Errors
/// Fails when the lexical strategy needs an inline Tantivy writer but `t_index` is
/// `None`, when any chunk or fallback replay fails, when collecting the semantic delta
/// fails, or when an inline lexical update fails with an error that is not deferred.
#[allow(clippy::too_many_arguments)]
fn persist_conversations_batched_begin_concurrent(
storage: &FrankenStorage,
db_path: &Path,
mut t_index: Option<&mut TantivyIndex>,
convs: &[NormalizedConversation],
lexical_strategy: LexicalPopulationStrategy,
defer_checkpoints: bool,
capture_semantic_delta: bool,
raw_mirror_data_dir: Option<&Path>,
) -> Result<PersistBatchOutcome> {
if lexical_population_strategy_requires_inline_tantivy(lexical_strategy)
&& t_index.is_none()
{
anyhow::bail!(
"begin-concurrent batched persistence requires a Tantivy writer for {}",
lexical_strategy.as_str()
);
}
let max_retries = begin_concurrent_retry_limit();
// Never use a chunk size larger than the batch; `max(1)` keeps it non-zero for an
// empty batch.
let chunk_size = begin_concurrent_chunk_size().min(convs.len().max(1));
// Phase 1: parallel pre-map, with one memoizing redactor per rayon worker init.
let internal_convs: Vec<Conversation> = convs
.par_iter()
.map_init(
super::redact_secrets::MemoizingRedactor::new,
|redactor, conv| map_to_internal_with_redactor(conv, Some(redactor)),
)
.collect();
// Phase 2: each chunk opens, tunes, uses, and closes its own writer.
let indexed_chunks: Vec<Result<ChunkPersistResult>> = convs
.par_chunks(chunk_size)
.enumerate()
.map(|(chunk_idx, chunk)| {
let base_idx = chunk_idx * chunk_size;
let internal_chunk = &internal_convs[base_idx..base_idx + chunk.len()];
let shadow_guard = crate::indexer::parallel_wal_shadow::start_chunk(
chunk_idx,
base_idx,
chunk.len(),
);
let franken = FrankenStorage::open_writer(db_path).with_context(|| {
format!(
"opening frankensqlite writer for begin-concurrent mode: {}",
db_path.display()
)
})?;
apply_begin_concurrent_writer_tuning(&franken, defer_checkpoints);
if let Err(err) = franken.raw().execute("PRAGMA foreign_keys = OFF") {
tracing::debug!(
error = %err,
chunk_idx,
"failed to disable FK enforcement on begin-concurrent writer"
);
}
let result = persist_chunk_with_writer(
&franken,
base_idx,
chunk,
internal_chunk,
max_retries,
);
// Close before inspecting `result` so the writer is released on every path.
let close_result = franken.close().with_context(|| {
format!(
"closing frankensqlite writer for begin-concurrent mode: {}",
db_path.display()
)
});
match result {
// `Ok` here covers both `Completed` and `RetryableFallback`.
Ok(outcomes) => {
close_result?;
if let Some(g) = shadow_guard {
g.finish_ok();
}
Ok(outcomes)
}
Err(err) => {
if let Err(close_err) = close_result {
tracing::warn!(
error = %close_err,
db_path = %db_path.display(),
"failed to close begin-concurrent writer cleanly after index error"
);
}
if let Some(g) = shadow_guard {
g.finish_err();
}
Err(err)
}
}
})
.collect();
let mut ordered = Vec::with_capacity(convs.len());
let mut fallback_ranges = Vec::new();
for chunk in indexed_chunks {
// The first hard chunk error aborts the whole batch here.
match chunk? {
ChunkPersistResult::Completed(outcomes) => ordered.extend(outcomes),
ChunkPersistResult::RetryableFallback {
completed,
remaining_range,
error,
} => {
tracing::warn!(
error = %error,
completed = completed.len(),
remaining = remaining_range.len(),
start = remaining_range.start,
end = remaining_range.end,
"begin-concurrent chunk exhausted retryable conflicts; falling back to serial replay"
);
ordered.extend(completed);
fallback_ranges.push(remaining_range);
}
}
}
// Phase 3: replay unfinished ranges one at a time.
for remaining_range in fallback_ranges {
let fallback_outcomes = persist_chunk_serial_fallback(
db_path,
remaining_range.start,
&convs[remaining_range.clone()],
&internal_convs[remaining_range.clone()],
max_retries,
defer_checkpoints,
)?;
ordered.extend(fallback_outcomes);
}
// Fallback outcomes were appended out of order; restore batch order.
ordered.sort_by_key(|(idx, _)| *idx);
if let Some(data_dir) = raw_mirror_data_dir {
for (idx, outcome) in &ordered {
if let Some(conv) = convs.get(*idx) {
record_persisted_raw_mirror_db_link(data_dir, conv, outcome);
}
}
}
// Phase 4: lexical and semantic bookkeeping.
let defer_lexical_updates = defer_lexical_updates_enabled();
let mut batch_outcome = PersistBatchOutcome::default();
// Set after an out-of-memory lexical failure; later conversations skip inline updates.
let mut skip_inline_lexical_updates = false;
for (idx, outcome) in ordered {
let conv = &convs[idx];
batch_outcome.record_insert_outcome(&outcome);
if defer_lexical_updates || skip_inline_lexical_updates {
if capture_semantic_delta {
let (inputs, max_message_id) =
packet_semantic_delta_for_outcome(storage, &outcome)?;
batch_outcome.extend_semantic_delta(inputs, max_message_id);
}
continue;
}
match lexical_strategy {
// NOTE(review): this `continue` also skips the semantic delta capture at the
// bottom of the loop, unlike the deferred branch above — confirm intended.
LexicalPopulationStrategy::DeferredAuthoritativeDbRebuild => continue,
LexicalPopulationStrategy::InlineRebuildFromScan => {
let packet = lexical_packet_for_persist(conv);
t_index
.as_deref_mut()
.expect("inline rebuild requires Tantivy writer")
.add_messages_from_packet(
&packet,
None,
Some(outcome.conversation_id),
|_| Ok(()),
)?;
}
LexicalPopulationStrategy::IncrementalInline => {
// Only newly inserted messages are added to the lexical index.
if !outcome.inserted_indices.is_empty() {
let packet = lexical_packet_for_persist(conv);
let positional =
positional_indices_for_inserted(&packet, &outcome.inserted_indices);
if !positional.is_empty() {
let add_result = if should_inject_incremental_lexical_update_oom() {
Err(anyhow::anyhow!("out of memory"))
} else {
t_index
.as_deref_mut()
.expect("incremental inline updates require Tantivy writer")
.add_messages_from_packet(
&packet,
Some(&positional),
Some(outcome.conversation_id),
|_| Ok(()),
)
};
if let Err(error) = add_result {
if should_defer_incremental_lexical_update_after_error(&error) {
batch_outcome.record_deferred_lexical_update(&error);
skip_inline_lexical_updates = true;
tracing::warn!(
error = %error,
"incremental lexical update ran out of memory; preserving SQLite ingest and deferring lexical repair"
);
} else {
return Err(error);
}
}
}
}
}
}
if capture_semantic_delta {
let (inputs, max_message_id) =
packet_semantic_delta_for_outcome(storage, &outcome)?;
batch_outcome.extend_semantic_delta(inputs, max_message_id);
}
}
Ok(batch_outcome)
}
/// Derives `(source_id, origin_host)` from the `cass.origin` metadata object.
///
/// The host is normalized first. The source id is then normalized from the raw source
/// id, the raw origin kind, and the normalized host.
fn extract_provenance(metadata: &serde_json::Value) -> (String, Option<String>) {
    /// Returns the string stored under `key` of the origin object, if any.
    fn origin_str<'a>(origin: Option<&'a serde_json::Value>, key: &str) -> Option<&'a str> {
        origin
            .and_then(|origin| origin.get(key))
            .and_then(serde_json::Value::as_str)
    }

    let origin = metadata.get("cass").and_then(|cass| cass.get("origin"));

    let origin_host =
        crate::search::tantivy::normalized_index_origin_host(origin_str(origin, "host"));
    let source_id = crate::search::tantivy::normalized_index_source_id(
        origin_str(origin, "source_id"),
        origin_str(origin, "kind"),
        origin_host.as_deref(),
    );
    (source_id, origin_host)
}
/// Converts a normalized conversation into the internal `Conversation` model without a
/// caller-supplied redactor. Delegates to `map_to_internal_with_redactor` with `None`.
pub fn map_to_internal(conv: &NormalizedConversation) -> Conversation {
map_to_internal_with_redactor(conv, None)
}
/// Converts a normalized conversation into the internal `Conversation` model, redacting
/// secrets when redaction is enabled.
///
/// When `redactor` is supplied its methods are used for every redaction; otherwise the
/// free functions in `redact_secrets` are used. With redaction enabled the title,
/// metadata, message content, message extras, and snippet text are redacted; with it
/// disabled they are cloned verbatim. `source_id` and `origin_host` come from the
/// conversation metadata via `extract_provenance`.
///
/// Metadata is redacted as serialized text and parsed back. If the redacted text is no
/// longer valid JSON, the metadata is redacted structurally with `redact_json` instead.
/// The unredacted metadata is never used as a fallback, because that would write the
/// secrets redaction was meant to remove.
pub(crate) fn map_to_internal_with_redactor(
    conv: &NormalizedConversation,
    mut redactor: Option<&mut super::redact_secrets::MemoizingRedactor>,
) -> Conversation {
    let (source_id, origin_host) = extract_provenance(&conv.metadata);
    let should_redact = super::redact_secrets::redaction_enabled();
    Conversation {
        id: None,
        agent_slug: conv.agent_slug.clone(),
        workspace: conv.workspace.clone(),
        external_id: conv.external_id.clone(),
        title: if should_redact {
            conv.title.as_ref().map(|t| {
                if let Some(r) = redactor.as_mut() {
                    r.redact_text(t)
                } else {
                    super::redact_secrets::redact_text(t).into_owned()
                }
            })
        } else {
            conv.title.clone()
        },
        source_path: conv.source_path.clone(),
        started_at: conv.started_at,
        ended_at: conv.ended_at,
        approx_tokens: None,
        metadata_json: if should_redact {
            let s = serde_json::to_string(&conv.metadata).unwrap_or_default();
            let redacted = if let Some(r) = redactor.as_mut() {
                r.redact_text(&s)
            } else {
                super::redact_secrets::redact_text(&s).into_owned()
            };
            match serde_json::from_str(&redacted) {
                Ok(value) => value,
                // Text redaction broke the JSON (or serialization failed): redact the
                // structured value rather than leaking the raw metadata.
                Err(_) => {
                    if let Some(r) = redactor.as_mut() {
                        r.redact_json(&conv.metadata)
                    } else {
                        super::redact_secrets::redact_json(&conv.metadata)
                    }
                }
            }
        } else {
            conv.metadata.clone()
        },
        messages: conv
            .messages
            .iter()
            .map(|m| {
                let content = if should_redact {
                    if let Some(r) = redactor.as_mut() {
                        r.redact_text(&m.content)
                    } else {
                        super::redact_secrets::redact_text(&m.content).into_owned()
                    }
                } else {
                    m.content.clone()
                };
                let extra_json = if should_redact {
                    if let Some(r) = redactor.as_mut() {
                        r.redact_json(&m.extra)
                    } else {
                        super::redact_secrets::redact_json(&m.extra)
                    }
                } else {
                    m.extra.clone()
                };
                Message {
                    id: None,
                    idx: m.idx,
                    role: map_role(&m.role),
                    author: m.author.clone(),
                    created_at: m.created_at,
                    content,
                    extra_json,
                    snippets: m
                        .snippets
                        .iter()
                        .map(|s| Snippet {
                            id: None,
                            file_path: s.file_path.clone(),
                            start_line: s.start_line,
                            end_line: s.end_line,
                            language: s.language.clone(),
                            snippet_text: s.snippet_text.as_ref().map(|snippet_text| {
                                if should_redact {
                                    if let Some(r) = redactor.as_mut() {
                                        r.redact_text(snippet_text)
                                    } else {
                                        super::redact_secrets::redact_text(snippet_text)
                                            .into_owned()
                                    }
                                } else {
                                    snippet_text.clone()
                                }
                            }),
                        })
                        .collect(),
                }
            })
            .collect(),
        source_id,
        origin_host,
    }
}
pub fn persist_conversation(
storage: &FrankenStorage,
t_index: &mut TantivyIndex,
conv: &NormalizedConversation,
) -> Result<()> {
tracing::info!(agent = %conv.agent_slug, messages = conv.messages.len(), "persist_conversation");
let InsertOutcome {
conversation_id,
conversation_inserted: _conversation_inserted,
inserted_indices,
} = with_ephemeral_writer(storage, false, "persist_conversation", |writer| {
let internal_conv = map_to_internal(conv);
let agent = Agent {
id: None,
slug: conv.agent_slug.clone(),
name: conv.agent_slug.clone(),
version: None,
kind: AgentKind::Cli,
};
let agent_id = writer.ensure_agent(&agent)?;
let workspace_id = if let Some(ws) = &conv.workspace {
Some(writer.ensure_workspace(ws, None)?)
} else {
None
};
writer.insert_conversation_tree(agent_id, workspace_id, &internal_conv)
})?;
if !defer_lexical_updates_enabled() && !inserted_indices.is_empty() {
let packet = lexical_packet_for_persist(conv);
let positional = positional_indices_for_inserted(&packet, &inserted_indices);
if !positional.is_empty() {
t_index.add_messages_from_packet(
&packet,
Some(&positional),
Some(conversation_id),
|_| Ok(()),
)?;
}
}
Ok(())
}
/// Test-only variant of `persist_conversation` that accumulates per-phase
/// timings and counters into `profile`.
///
/// Timed phases: the database write, lexical packet construction, positional
/// index mapping, and the Tantivy add. Counters are updated with saturating
/// arithmetic. The Tantivy duration is only recorded when the add succeeds.
#[cfg(test)]
fn persist_conversation_with_profile(
    storage: &FrankenStorage,
    t_index: &mut TantivyIndex,
    conv: &NormalizedConversation,
    profile: &mut PersistConversationPerfProfile,
) -> Result<()> {
    let overall_timer = Instant::now();

    let db_timer = Instant::now();
    let outcome: InsertOutcome =
        with_ephemeral_writer(storage, false, "persist_conversation", |writer| {
            let internal_conv = map_to_internal(conv);
            let agent = Agent {
                id: None,
                slug: conv.agent_slug.clone(),
                name: conv.agent_slug.clone(),
                version: None,
                kind: AgentKind::Cli,
            };
            let agent_id = writer.ensure_agent(&agent)?;
            let workspace_id = match &conv.workspace {
                Some(ws) => Some(writer.ensure_workspace(ws, None)?),
                None => None,
            };
            writer.insert_conversation_tree(agent_id, workspace_id, &internal_conv)
        })?;
    profile.db_duration += db_timer.elapsed();

    let inline_lexical = !defer_lexical_updates_enabled();
    if inline_lexical && !outcome.inserted_indices.is_empty() {
        let packet_timer = Instant::now();
        let packet = lexical_packet_for_persist(conv);
        profile.packet_duration += packet_timer.elapsed();

        let positional_timer = Instant::now();
        let positional = positional_indices_for_inserted(&packet, &outcome.inserted_indices);
        profile.positional_duration += positional_timer.elapsed();

        if !positional.is_empty() {
            let tantivy_timer = Instant::now();
            t_index.add_messages_from_packet(
                &packet,
                Some(&positional),
                Some(outcome.conversation_id),
                |_| Ok(()),
            )?;
            profile.tantivy_add_duration += tantivy_timer.elapsed();
        }
    }

    profile.invocations = profile.invocations.saturating_add(1);
    profile.messages = profile.messages.saturating_add(conv.messages.len());
    profile.inserted_messages = profile
        .inserted_messages
        .saturating_add(outcome.inserted_indices.len());
    profile.total_duration += overall_timer.elapsed();
    Ok(())
}
/// Test-only entry point for batched persistence with semantic-delta capture
/// disabled and no raw-mirror data directory.
#[cfg(test)]
pub(super) fn persist_conversations_batched(
    storage: &FrankenStorage,
    t_index: Option<&mut TantivyIndex>,
    convs: &[NormalizedConversation],
    lexical_strategy: LexicalPopulationStrategy,
    defer_checkpoints: bool,
) -> Result<PersistBatchOutcome> {
    let capture_semantic_delta = false;
    let raw_mirror_data_dir = None;
    persist_conversations_batched_inner(
        storage,
        t_index,
        convs,
        lexical_strategy,
        defer_checkpoints,
        capture_semantic_delta,
        raw_mirror_data_dir,
    )
}
/// Batched persistence that passes `data_dir` through as the raw-mirror data
/// directory, with semantic-delta capture disabled.
pub(super) fn persist_conversations_batched_with_raw_mirror_links(
    storage: &FrankenStorage,
    t_index: Option<&mut TantivyIndex>,
    data_dir: &Path,
    convs: &[NormalizedConversation],
    lexical_strategy: LexicalPopulationStrategy,
    defer_checkpoints: bool,
) -> Result<PersistBatchOutcome> {
    let capture_semantic_delta = false;
    let raw_mirror_data_dir = Some(data_dir);
    persist_conversations_batched_inner(
        storage,
        t_index,
        convs,
        lexical_strategy,
        defer_checkpoints,
        capture_semantic_delta,
        raw_mirror_data_dir,
    )
}
/// Batched persistence that both captures the semantic delta and passes
/// `data_dir` through as the raw-mirror data directory.
pub(super) fn persist_conversations_batched_with_semantic_delta_and_raw_mirror_links(
    storage: &FrankenStorage,
    t_index: Option<&mut TantivyIndex>,
    data_dir: &Path,
    convs: &[NormalizedConversation],
    lexical_strategy: LexicalPopulationStrategy,
    defer_checkpoints: bool,
) -> Result<PersistBatchOutcome> {
    let capture_semantic_delta = true;
    let raw_mirror_data_dir = Some(data_dir);
    persist_conversations_batched_inner(
        storage,
        t_index,
        convs,
        lexical_strategy,
        defer_checkpoints,
        capture_semantic_delta,
        raw_mirror_data_dir,
    )
}
/// Shared implementation behind the batched persistence entry points.
///
/// Persists `convs` to `storage`, optionally applies inline lexical (Tantivy)
/// updates according to `lexical_strategy`, and optionally collects a semantic
/// delta for the persisted conversations.
///
/// * `t_index` - Tantivy writer; must be `Some` when the strategy requires
///   inline Tantivy writes, otherwise the function bails.
/// * `defer_checkpoints` - forwarded to `with_ephemeral_writer` (serial path)
///   or to the begin-concurrent path.
/// * `capture_semantic_delta` - when true, semantic inputs are gathered for
///   every insert outcome via `packet_semantic_delta_for_outcome`.
/// * `raw_mirror_data_dir` - forwarded to
///   `record_persisted_raw_mirror_db_links` / the begin-concurrent path.
///
/// Returns a default outcome for an empty `convs` slice.
fn persist_conversations_batched_inner(
storage: &FrankenStorage,
mut t_index: Option<&mut TantivyIndex>,
convs: &[NormalizedConversation],
lexical_strategy: LexicalPopulationStrategy,
defer_checkpoints: bool,
capture_semantic_delta: bool,
raw_mirror_data_dir: Option<&Path>,
) -> Result<PersistBatchOutcome> {
// Nothing to persist: skip all setup.
if convs.is_empty() {
return Ok(PersistBatchOutcome::default());
}
// Fail early instead of hitting the `expect` calls further down.
if lexical_population_strategy_requires_inline_tantivy(lexical_strategy)
&& t_index.is_none()
{
anyhow::bail!(
"batched persistence requires a Tantivy writer for {}",
lexical_strategy.as_str()
);
}
let begin_concurrent_enabled = begin_concurrent_writes_enabled();
// The duplicate-key scan only runs when begin-concurrent mode is enabled.
let duplicate_keys_present =
begin_concurrent_enabled && duplicate_conversation_keys_present(convs);
// Begin-concurrent path: taken only when enabled and the batch has no duplicate keys.
if begin_concurrent_enabled && !duplicate_keys_present {
let db_path = storage
.database_path()
.with_context(|| "resolving database path for begin-concurrent write mode")?;
tracing::info!(
conversations = convs.len(),
"using begin-concurrent write path for indexing"
);
return persist_conversations_batched_begin_concurrent(
storage,
&db_path,
t_index,
convs,
lexical_strategy,
defer_checkpoints,
capture_semantic_delta,
raw_mirror_data_dir,
);
}
if duplicate_keys_present {
tracing::info!(
conversations = convs.len(),
"duplicate conversation keys detected; falling back to serial batched indexing path"
);
}
use rayon::prelude::*;
// Map (and redact) conversations in parallel before opening the writer;
// `map_init` builds redactors with `MemoizingRedactor::new` for the parallel workers.
let internal_convs: Vec<Conversation> = convs
.par_iter()
.map_init(
super::redact_secrets::MemoizingRedactor::new,
|redactor, conv| map_to_internal_with_redactor(conv, Some(redactor)),
)
.collect();
let outcomes = with_ephemeral_writer(
storage,
defer_checkpoints,
"serial batched indexing",
|writer| {
let cache_enabled = IndexingCache::is_enabled();
let mut cache = IndexingCache::new();
// (agent_id, workspace_id, conversation) triples in input order.
let mut prepared: Vec<(i64, Option<i64>, Conversation)> =
Vec::with_capacity(convs.len());
for (conv, internal_conv) in convs.iter().zip(internal_convs) {
let agent = Agent {
id: None,
slug: conv.agent_slug.clone(),
name: conv.agent_slug.clone(),
version: None,
kind: AgentKind::Cli,
};
// Resolve ids through the cache when it is enabled, otherwise directly.
let agent_id = if cache_enabled {
cache.get_or_insert_agent(writer, &agent)?
} else {
writer.ensure_agent(&agent)?
};
let workspace_id = if let Some(ws) = &conv.workspace {
if cache_enabled {
Some(cache.get_or_insert_workspace(writer, ws, None)?)
} else {
Some(writer.ensure_workspace(ws, None)?)
}
} else {
None
};
prepared.push((agent_id, workspace_id, internal_conv));
}
if cache_enabled {
let (hits, misses, hit_rate) = cache.stats();
tracing::debug!(
hits,
misses,
hit_rate = format!("{:.1}%", hit_rate * 100.0),
agents = cache.agent_count(),
workspaces = cache.workspace_count(),
"IndexingCache stats"
);
}
let refs: Vec<(i64, Option<i64>, &Conversation)> =
prepared.iter().map(|(a, w, c)| (*a, *w, c)).collect();
// NOTE(review): `step_by` panics on a zero step, so this relies on
// `serial_batch_chunk_size()` never returning 0 — confirm.
let chunk_size = serial_batch_chunk_size().min(refs.len().max(1));
let mut outcomes = Vec::with_capacity(refs.len());
// Insert in chunks of at most `chunk_size` conversations per batched call.
for start in (0..refs.len()).step_by(chunk_size) {
let end = (start + chunk_size).min(refs.len());
let chunk_refs = &refs[start..end];
outcomes.extend(writer.insert_conversations_batched(chunk_refs)?);
}
Ok(outcomes)
},
)?;
let defer_lexical_updates = defer_lexical_updates_enabled();
let mut batch_outcome = PersistBatchOutcome::default();
record_persisted_raw_mirror_db_links(raw_mirror_data_dir, convs, &outcomes);
if !defer_lexical_updates {
// Set once an incremental update is deferred after an error; later
// conversations are then recorded but not indexed inline.
let mut skip_inline_lexical_updates = false;
for (conv, outcome) in convs.iter().zip(outcomes.iter()) {
batch_outcome.record_insert_outcome(outcome);
if skip_inline_lexical_updates {
continue;
}
match lexical_strategy {
LexicalPopulationStrategy::DeferredAuthoritativeDbRebuild => continue,
LexicalPopulationStrategy::InlineRebuildFromScan => {
// Rebuild mode passes `None` for the positional filter (no per-message selection here).
let packet = lexical_packet_for_persist(conv);
t_index
.as_deref_mut()
.expect("inline rebuild requires Tantivy writer")
.add_messages_from_packet(
&packet,
None,
Some(outcome.conversation_id),
|_| Ok(()),
)?;
}
LexicalPopulationStrategy::IncrementalInline => {
// Incremental mode only indexes the messages reported as newly inserted.
if !outcome.inserted_indices.is_empty() {
let packet = lexical_packet_for_persist(conv);
let positional =
positional_indices_for_inserted(&packet, &outcome.inserted_indices);
if !positional.is_empty() {
// The injection hook substitutes a synthetic "out of memory" error for the real add.
let add_result = if should_inject_incremental_lexical_update_oom() {
Err(anyhow::anyhow!("out of memory"))
} else {
t_index
.as_deref_mut()
.expect("incremental inline updates require Tantivy writer")
.add_messages_from_packet(
&packet,
Some(&positional),
Some(outcome.conversation_id),
|_| Ok(()),
)
};
if let Err(error) = add_result {
// Deferrable errors keep the database ingest and stop further inline
// updates for this batch; any other error aborts the call.
if should_defer_incremental_lexical_update_after_error(&error) {
batch_outcome.record_deferred_lexical_update(&error);
skip_inline_lexical_updates = true;
tracing::warn!(
error = %error,
"incremental lexical update ran out of memory; preserving SQLite ingest and deferring lexical repair"
);
} else {
return Err(error);
}
}
}
}
}
}
}
} else {
// Lexical updates deferred globally: only record the insert outcomes.
for outcome in &outcomes {
batch_outcome.record_insert_outcome(outcome);
}
}
if capture_semantic_delta {
// Gather semantic inputs for every outcome, regardless of the lexical path taken.
for outcome in outcomes.iter() {
let (inputs, max_message_id) = packet_semantic_delta_for_outcome(storage, outcome)?;
batch_outcome.extend_semantic_delta(inputs, max_message_id);
}
}
Ok(batch_outcome)
}
/// Maps a connector role string onto [`MessageRole`].
///
/// Matching is exact and case-sensitive. `"assistant"` and `"agent"` both map
/// to `MessageRole::Agent`; any unrecognized role is preserved verbatim in
/// `MessageRole::Other`.
fn map_role(role: &str) -> MessageRole {
    if role == "user" {
        return MessageRole::User;
    }
    if role == "assistant" || role == "agent" {
        return MessageRole::Agent;
    }
    if role == "tool" {
        return MessageRole::Tool;
    }
    if role == "system" {
        return MessageRole::System;
    }
    MessageRole::Other(role.to_string())
}
#[cfg(test)]
mod persist_internal_tests {
use super::*;
use crate::connectors::NormalizedMessage;
use fsqlite_types::value::SqliteValue;
use serial_test::serial;
/// Process-wide mutex that serializes environment-variable mutation in tests.
static ENV_LOCK: std::sync::LazyLock<std::sync::Mutex<()>> =
    std::sync::LazyLock::new(|| std::sync::Mutex::new(()));
std::thread_local! {
    // Per-thread nesting depth: incremented by `acquire_env_lock`, decremented
    // by `EnvGuard::drop`. Only the 0 -> 1 transition takes `ENV_LOCK`.
    static ENV_LOCK_DEPTH: std::cell::Cell<usize> = const { std::cell::Cell::new(0) };
}

/// RAII guard that restores one environment variable when dropped.
struct EnvGuard {
    /// Name of the variable to restore.
    key: &'static str,
    /// Value to restore on drop; `None` means the variable is removed.
    previous: Option<String>,
    /// The env lock, held only by the outermost guard on a thread.
    _lock: Option<std::sync::MutexGuard<'static, ()>>,
}

impl Drop for EnvGuard {
    fn drop(&mut self) {
        // SAFETY: presumably sound because env mutation in these tests goes
        // through guards that hold `ENV_LOCK` (directly or via an enclosing
        // guard on this thread) — confirm no other code mutates the
        // environment concurrently.
        unsafe {
            match &self.previous {
                Some(value) => std::env::set_var(self.key, value),
                None => std::env::remove_var(self.key),
            }
        }
        ENV_LOCK_DEPTH.with(|depth| {
            let current = depth.get();
            debug_assert!(current > 0, "env lock depth underflow");
            depth.set(current.saturating_sub(1));
        });
    }
}

/// Registers one more env-guard level on the current thread.
///
/// Returns `Some(guard)` when this is the outermost level (the lock is taken)
/// and `None` for nested levels, which rely on the outer guard.
///
/// A poisoned lock is recovered rather than treated as fatal: the mutex
/// protects no data, so a panic in one test that held the lock should not
/// make every later env-dependent test fail as well.
fn acquire_env_lock() -> Option<std::sync::MutexGuard<'static, ()>> {
    ENV_LOCK_DEPTH.with(|depth| {
        let current = depth.get();
        let guard = (current == 0).then(|| {
            ENV_LOCK
                .lock()
                .unwrap_or_else(std::sync::PoisonError::into_inner)
        });
        depth.set(current + 1);
        guard
    })
}
/// Sets environment variable `key` to `value` for the lifetime of the returned
/// guard.
///
/// The env lock is acquired before the current value is captured, so the
/// guard can restore (or remove) the variable when dropped.
fn set_env(key: &'static str, value: &str) -> EnvGuard {
    let lock = acquire_env_lock();
    let prior = dotenvy::var(key).ok();
    unsafe {
        std::env::set_var(key, value);
    }
    EnvGuard {
        key,
        previous: prior,
        _lock: lock,
    }
}
/// Setting `CASS_INDEXER_BEGIN_CONCURRENT` to `1` enables begin-concurrent writes.
#[test]
fn begin_concurrent_flag_parsing() {
    let _env = set_env("CASS_INDEXER_BEGIN_CONCURRENT", "1");
    let enabled = begin_concurrent_writes_enabled();
    assert!(enabled);
}
/// A numeric chunk-size override is read back unchanged.
#[test]
fn begin_concurrent_chunk_size_parsing() {
    let _env = set_env("CASS_INDEXER_BEGIN_CONCURRENT_CHUNK_SIZE", "7");
    let chunk_size = begin_concurrent_chunk_size();
    assert_eq!(chunk_size, 7);
}
/// A numeric retry-limit override is read back unchanged.
#[test]
fn begin_concurrent_retry_limit_parsing() {
    let _env = set_env("CASS_INDEXER_BEGIN_CONCURRENT_RETRIES", "9");
    let retry_limit = begin_concurrent_retry_limit();
    assert_eq!(retry_limit, 9);
}
/// A numeric writer-cache override (in KiB) is read back unchanged.
#[test]
fn begin_concurrent_writer_cache_parsing() {
    let _env = set_env("CASS_INDEXER_BEGIN_CONCURRENT_WRITER_CACHE_KIB", "2048");
    let cache_kib = begin_concurrent_writer_cache_kib();
    assert_eq!(cache_kib, 2048);
}
/// A writer-cache override of `0` yields 4096 KiB instead.
#[test]
fn begin_concurrent_writer_cache_invalid_defaults() {
    let _env = set_env("CASS_INDEXER_BEGIN_CONCURRENT_WRITER_CACHE_KIB", "0");
    let cache_kib = begin_concurrent_writer_cache_kib();
    assert_eq!(cache_kib, 4096);
}
/// Oversized knob values are clamped to their `*_MAX` constants, and a
/// negative writer-cache value yields 4096 KiB.
///
/// Each override lives in its own scope so its guard is dropped before the
/// next override is installed.
#[test]
fn begin_concurrent_knobs_are_clamped_to_safe_caps() {
    {
        let _retries = set_env("CASS_INDEXER_BEGIN_CONCURRENT_RETRIES", "1000000000");
        let observed = begin_concurrent_retry_limit();
        assert_eq!(observed, BEGIN_CONCURRENT_RETRY_MAX);
    }
    {
        let _chunk = set_env("CASS_INDEXER_BEGIN_CONCURRENT_CHUNK_SIZE", "1000000000");
        let observed = begin_concurrent_chunk_size();
        assert_eq!(observed, BEGIN_CONCURRENT_CHUNK_SIZE_MAX);
    }
    {
        let _cache = set_env(
            "CASS_INDEXER_BEGIN_CONCURRENT_WRITER_CACHE_KIB",
            "9999999999",
        );
        let observed = begin_concurrent_writer_cache_kib();
        assert_eq!(observed, BEGIN_CONCURRENT_WRITER_CACHE_KIB_MAX);
    }
    {
        let _cache = set_env("CASS_INDEXER_BEGIN_CONCURRENT_WRITER_CACHE_KIB", "-42");
        let observed = begin_concurrent_writer_cache_kib();
        assert_eq!(observed, 4096);
    }
}
/// With the override set to `-1`, the autocheckpoint page count is 0 when the
/// flag argument is `true` and 1000 when it is `false`.
#[test]
#[serial]
fn wal_autocheckpoint_defaults_follow_bulk_import_mode() {
    let _env = set_env("CASS_INDEX_WRITER_WAL_AUTOCHECKPOINT_PAGES", "-1");
    let with_flag = index_writer_wal_autocheckpoint_pages(true);
    assert_eq!(with_flag, 0);
    let without_flag = index_writer_wal_autocheckpoint_pages(false);
    assert_eq!(without_flag, 1000);
}
/// Setting `CASS_DEFER_LEXICAL_UPDATES` to `1` enables deferred lexical updates.
#[test]
fn defer_lexical_updates_flag_parsing() {
    let _env = set_env("CASS_DEFER_LEXICAL_UPDATES", "1");
    let deferred = defer_lexical_updates_enabled();
    assert!(deferred);
}
/// `BusySnapshot` errors are classified as retryable, while
/// `ConcurrentUnavailable` is not.
#[test]
fn retryable_franken_errors_are_detected() {
    let retryable = anyhow::Error::new(FrankenError::BusySnapshot {
        conflicting_pages: "1,2".to_string(),
    });
    assert!(is_retryable_franken_error(&retryable));

    // The argument was previously garbled into `¬_retryable`, which is not
    // valid Rust; it must be a plain borrow of `not_retryable`.
    let not_retryable = anyhow::Error::new(FrankenError::ConcurrentUnavailable);
    assert!(!is_retryable_franken_error(&not_retryable));
}
/// Opens a `FrankenStorage` at `path` and runs migrations on it.
///
/// Panics if opening or migrating fails.
fn create_franken_db(path: &std::path::Path) -> FrankenStorage {
    let storage = FrankenStorage::open(path).expect("open frankensqlite db");
    storage.run_migrations().expect("run migrations");
    storage
}
/// Seeds two conversations (ids 4 and 9) and two messages (ids 10 and 11) and
/// checks the fingerprint computed with argument `2` is `content-v1:2:9:11`.
#[test]
fn lexical_rebuild_content_fingerprint_uses_table_max_ids() {
    let tmp = tempfile::TempDir::new().unwrap();
    let storage = create_franken_db(&tmp.path().join("fingerprint.db"));

    let codex = Agent {
        id: None,
        slug: "codex".into(),
        name: "Codex".into(),
        version: None,
        kind: AgentKind::Cli,
    };
    let agent_id = storage.ensure_agent(&codex).unwrap();
    let workspace_id = storage
        .ensure_workspace(std::path::Path::new("/tmp/fingerprint"), None)
        .unwrap();

    // SQL literals are kept verbatim, including their line layout.
    let insert_conversations_sql = "INSERT INTO conversations(
id, agent_id, workspace_id, source_id, title, source_path, metadata_json
) VALUES
(4, ?1, ?2, 'local', 'older', '/tmp/fingerprint/older.jsonl', '{}'),
(9, ?1, ?2, 'local', 'newer', '/tmp/fingerprint/newer.jsonl', '{}')";
    storage
        .raw()
        .execute_compat(
            insert_conversations_sql,
            &[ParamValue::from(agent_id), ParamValue::from(workspace_id)],
        )
        .unwrap();

    let insert_messages_sql =
        "INSERT INTO messages(id, conversation_id, idx, role, content, extra_json)
VALUES
(10, 4, 0, 'user', 'older message', '{}'),
(11, 9, 0, 'assistant', 'newer message', '{}')";
    storage
        .raw()
        .execute_compat(insert_messages_sql, &[])
        .unwrap();

    let observed = crate::indexer::lexical_rebuild_content_fingerprint(&storage, 2).unwrap();
    assert_eq!(observed, "content-v1:2:9:11");
}
/// Commits `index`, reloads a reader, and returns the searcher's document count.
///
/// Panics if the commit, reader creation, or reload fails.
fn tantivy_doc_count(index: &mut crate::search::tantivy::TantivyIndex) -> u64 {
    index.commit().expect("commit tantivy");
    let fresh_reader = index.reader().expect("reader");
    fresh_reader.reload().expect("reload");
    let searcher = fresh_reader.searcher();
    searcher.num_docs()
}
/// Inserts a two-message conversation, then rewrites its provenance and the
/// second message directly in the database, and checks that
/// `packet_semantic_delta_for_outcome` reflects the persisted rows rather than
/// the originally inserted values.
#[test]
fn packet_semantic_delta_for_outcome_replays_persisted_canonical_state() {
    let tmp = tempfile::TempDir::new().unwrap();
    let storage = create_franken_db(&tmp.path().join("semantic-delta-replay.db"));

    let codex = Agent {
        id: None,
        slug: "codex".into(),
        name: "Codex".into(),
        version: None,
        kind: AgentKind::Cli,
    };
    let agent_id = storage.ensure_agent(&codex).unwrap();
    let workspace_id = storage
        .ensure_workspace(std::path::Path::new("/tmp/persist-semantic"), None)
        .unwrap();

    let user_message = crate::model::types::Message {
        id: None,
        idx: 0,
        role: MessageRole::User,
        author: Some("user".into()),
        created_at: Some(1_700_000_200_010),
        content: "original user".into(),
        extra_json: serde_json::json!({}),
        snippets: Vec::new(),
    };
    let assistant_message = crate::model::types::Message {
        id: None,
        idx: 1,
        role: MessageRole::Agent,
        author: Some("assistant".into()),
        created_at: Some(1_700_000_200_020),
        content: "original assistant".into(),
        extra_json: serde_json::json!({}),
        snippets: Vec::new(),
    };
    let seed_conversation = Conversation {
        id: None,
        agent_slug: "codex".into(),
        workspace: Some(std::path::PathBuf::from("/tmp/persist-semantic")),
        external_id: Some("semantic-delta-replay".into()),
        title: Some("Semantic delta replay".into()),
        source_path: std::path::PathBuf::from("/tmp/persist-semantic.jsonl"),
        started_at: Some(1_700_000_200_000),
        ended_at: Some(1_700_000_200_200),
        approx_tokens: Some(32),
        metadata_json: serde_json::json!({}),
        messages: vec![user_message, assistant_message],
        source_id: "initial-remote".into(),
        origin_host: Some("initial-host".into()),
    };

    let outcome = storage
        .insert_conversation_tree(agent_id, Some(workspace_id), &seed_conversation)
        .unwrap();
    let message_ids_by_idx = load_inserted_message_ids_by_idx(
        &storage,
        outcome.conversation_id,
        &outcome.inserted_indices,
    )
    .unwrap();
    let assistant_id = message_ids_by_idx
        .get(&1)
        .copied()
        .expect("assistant message id");

    // Register the replacement source before pointing the conversation at it.
    let replayed_source = Source {
        id: "replayed-remote".into(),
        kind: SourceKind::Ssh,
        host_label: Some("replayed-host".into()),
        machine_id: None,
        platform: None,
        config_json: None,
        created_at: None,
        updated_at: None,
    };
    storage.upsert_source(&replayed_source).unwrap();

    storage
        .raw()
        .execute_compat(
            "UPDATE conversations SET source_id = ?1, origin_host = ?2 WHERE id = ?3",
            &[
                ParamValue::from("replayed-remote"),
                ParamValue::from("replayed-host"),
                ParamValue::from(outcome.conversation_id),
            ],
        )
        .unwrap();
    storage
        .raw()
        .execute_compat(
            "UPDATE messages SET role = ?1, content = ?2 WHERE id = ?3",
            &[
                ParamValue::from("tool"),
                ParamValue::from("persisted tool reply"),
                ParamValue::from(assistant_id),
            ],
        )
        .unwrap();

    let (inputs, max_message_id) =
        packet_semantic_delta_for_outcome(&storage, &outcome).unwrap();

    assert_eq!(max_message_id, Some(assistant_id));
    assert_eq!(inputs.len(), 2);

    // First message is untouched.
    assert_eq!(inputs[0].content, "original user");
    assert_eq!(
        crate::indexer::semantic_role_name(inputs[0].role),
        Some("user")
    );
    // Second message carries the values written by the UPDATE above.
    assert_eq!(inputs[1].content, "persisted tool reply");
    assert_eq!(
        crate::indexer::semantic_role_name(inputs[1].role),
        Some("tool")
    );

    let expected_source_id = crate::search::tantivy::normalized_index_source_id(
        Some("replayed-remote"),
        None,
        Some("replayed-host"),
    );
    let expected_source_hash = crc32fast::hash(expected_source_id.as_bytes());
    let all_match = inputs
        .iter()
        .all(|input| input.source_id == expected_source_hash);
    assert!(
        all_match,
        "semantic delta should use canonical replay provenance: {inputs:#?}"
    );
}
/// Applying the checkpoint policy with `true` then `false` is reflected both
/// in `PRAGMA wal_autocheckpoint` (0, then 1000) and in the value the storage
/// reports; a freshly created database reports 4096.
#[test]
fn apply_index_writer_checkpoint_policy_round_trips_pragma() {
    let tmp = tempfile::TempDir::new().unwrap();
    let storage = create_franken_db(&tmp.path().join("checkpoint-policy.db"));
    assert_eq!(storage.index_writer_checkpoint_pages(), Some(4096));

    apply_index_writer_checkpoint_policy(&storage, true);
    let pragma_rows = storage.raw().query("PRAGMA wal_autocheckpoint;").unwrap();
    assert_eq!(pragma_rows.len(), 1);
    assert_eq!(pragma_rows[0].get(0).unwrap(), &SqliteValue::Integer(0));
    assert_eq!(storage.index_writer_checkpoint_pages(), Some(0));

    apply_index_writer_checkpoint_policy(&storage, false);
    let pragma_rows = storage.raw().query("PRAGMA wal_autocheckpoint;").unwrap();
    assert_eq!(pragma_rows.len(), 1);
    assert_eq!(pragma_rows[0].get(0).unwrap(), &SqliteValue::Integer(1000));
    assert_eq!(storage.index_writer_checkpoint_pages(), Some(1000));
}
/// The preflight-verified flag is unset on a fresh database, set after the
/// first successful `with_ephemeral_writer` call, and still set after a second.
#[test]
fn with_ephemeral_writer_marks_preflight_verified_after_first_success() {
    let tmp = tempfile::TempDir::new().unwrap();
    let storage = create_franken_db(&tmp.path().join("preflight-cache.db"));
    assert!(!storage.ephemeral_writer_preflight_verified());

    for _round in 0..2 {
        with_ephemeral_writer(&storage, false, "preflight-cache-test", |_writer| Ok(()))
            .unwrap();
        assert!(storage.ephemeral_writer_preflight_verified());
    }
}
/// A TEMP table created in one `with_ephemeral_writer` call is still visible
/// in the next call, which the test takes as evidence the writer is reused.
#[test]
fn with_ephemeral_writer_reuses_cached_connection_when_idle() {
    let tmp = tempfile::TempDir::new().unwrap();
    let storage = create_franken_db(&tmp.path().join("ephemeral-writer-reuse.db"));

    // First call: create and populate a connection-local temp table.
    let setup = with_ephemeral_writer(&storage, false, "ephemeral-writer-reuse", |writer| {
        writer
            .raw()
            .execute("CREATE TEMP TABLE temp_writer_reuse(marker INTEGER NOT NULL);")?;
        writer
            .raw()
            .execute("INSERT INTO temp_writer_reuse(marker) VALUES (1);")?;
        Ok(())
    });
    setup.unwrap();

    // Second call: the row must still be there.
    let marker_rows: i64 =
        with_ephemeral_writer(&storage, false, "ephemeral-writer-reuse", |writer| {
            Ok(writer.raw().query_row_map(
                "SELECT COUNT(*) FROM temp_writer_reuse;",
                &[],
                |row| row.get_typed(0),
            )?)
        })
        .unwrap();
    assert_eq!(marker_rows, 1, "temp table should persist on the reused writer");
}
/// Persists 10 conversations (3 messages each, 3 distinct agent slugs) through
/// the begin-concurrent path with a chunk size of 3 and checks the row counts.
#[test]
fn begin_concurrent_persist_writes_all_conversations() {
    use crate::connectors::NormalizedConversation;
    use crate::search::tantivy::TantivyIndex;
    use frankensqlite::compat::{ConnectionExt, RowExt};

    let tmp = tempfile::TempDir::new().unwrap();
    let db_path = tmp.path().join("test.db");
    let index_path = tmp.path().join("tantivy");

    // Create the schema, then close the handle before persisting.
    drop(create_franken_db(&db_path));
    let mut t_index = TantivyIndex::open_or_create(&index_path).unwrap();

    let convs: Vec<NormalizedConversation> = (0..10)
        .map(|i| {
            let slug = format!("agent-{}", i % 3);
            let messages = (0..3)
                .map(|j| NormalizedMessage {
                    idx: j,
                    role: if j % 2 == 0 { "user" } else { "assistant" }.to_string(),
                    author: Some("tester".into()),
                    created_at: Some(1000 + i * 100 + j * 10),
                    content: format!("begin-concurrent-test conv={i} msg={j}"),
                    extra: serde_json::json!({}),
                    snippets: vec![],
                    invocations: Vec::new(),
                })
                .collect();
            NormalizedConversation {
                agent_slug: slug,
                external_id: Some(format!("conv-{i}")),
                title: Some(format!("Conversation {i}")),
                workspace: Some(std::path::PathBuf::from(format!("/ws/{i}"))),
                source_path: std::path::PathBuf::from(format!("/log/{i}.jsonl")),
                started_at: Some(1000 + i * 100),
                ended_at: Some(1000 + i * 100 + 50),
                metadata: serde_json::json!({}),
                messages,
            }
        })
        .collect();

    let _chunk_env = set_env("CASS_INDEXER_BEGIN_CONCURRENT_CHUNK_SIZE", "3");
    persist_conversations_batched_begin_concurrent(
        &FrankenStorage::open(&db_path).unwrap(),
        &db_path,
        Some(&mut t_index),
        &convs,
        LexicalPopulationStrategy::InlineRebuildFromScan,
        false,
        false,
        None,
    )
    .expect("begin-concurrent persist should succeed");

    let verify_db = FrankenStorage::open(&db_path).unwrap();
    let conversation_total: i64 = verify_db
        .raw()
        .query_row_map("SELECT COUNT(*) FROM conversations", &[], |row| {
            row.get_typed(0)
        })
        .unwrap();
    // Collected only to make assertion failures easier to diagnose.
    let persisted_conversations: Vec<(i64, i64, Option<String>, String)> = verify_db
        .raw()
        .query_map_collect(
            "SELECT id, agent_id, external_id, source_path FROM conversations ORDER BY id",
            &[],
            |row| {
                Ok((
                    row.get_typed(0)?,
                    row.get_typed(1)?,
                    row.get_typed(2)?,
                    row.get_typed(3)?,
                ))
            },
        )
        .unwrap();
    let persisted_message_counts: Vec<(i64, i64)> = verify_db
        .raw()
        .query_map_collect(
            "SELECT conversation_id, COUNT(*) FROM messages GROUP BY conversation_id ORDER BY conversation_id",
            &[],
            |row| Ok((row.get_typed(0)?, row.get_typed(1)?)),
        )
        .unwrap();
    assert_eq!(
        conversation_total, 10,
        "all 10 conversations should be persisted; rows={persisted_conversations:?}; per_conversation_messages={persisted_message_counts:?}"
    );

    let message_total: i64 = verify_db
        .raw()
        .query_row_map("SELECT COUNT(*) FROM messages", &[], |row| row.get_typed(0))
        .unwrap();
    assert_eq!(
        message_total, 30,
        "all 30 messages should be persisted; per_conversation={persisted_message_counts:?}"
    );

    let distinct_agents: i64 = verify_db
        .raw()
        .query_row_map("SELECT COUNT(DISTINCT slug) FROM agents", &[], |row| {
            row.get_typed(0)
        })
        .unwrap();
    assert_eq!(distinct_agents, 3, "3 distinct agent slugs should exist");

    t_index.commit().unwrap();
}
/// The begin-concurrent path persists a batch of exactly one conversation
/// with one message.
#[test]
fn begin_concurrent_single_conversation_works() {
    use crate::connectors::NormalizedConversation;
    use crate::search::tantivy::TantivyIndex;
    use frankensqlite::compat::{ConnectionExt, RowExt};

    let tmp = tempfile::TempDir::new().unwrap();
    let db_path = tmp.path().join("test.db");
    let index_path = tmp.path().join("tantivy");

    // Create the schema, then close the handle before persisting.
    drop(create_franken_db(&db_path));
    let mut t_index = TantivyIndex::open_or_create(&index_path).unwrap();

    let only_message = NormalizedMessage {
        idx: 0,
        role: "user".into(),
        author: Some("tester".into()),
        created_at: Some(5000),
        content: "single-conv-begin-concurrent-test".into(),
        extra: serde_json::json!({}),
        snippets: vec![],
        invocations: Vec::new(),
    };
    let convs = vec![NormalizedConversation {
        agent_slug: "solo-agent".into(),
        external_id: Some("solo-1".into()),
        title: Some("Solo test".into()),
        workspace: None,
        source_path: std::path::PathBuf::from("/log/solo.jsonl"),
        started_at: Some(5000),
        ended_at: Some(5050),
        metadata: serde_json::json!({}),
        messages: vec![only_message],
    }];

    persist_conversations_batched_begin_concurrent(
        &FrankenStorage::open(&db_path).unwrap(),
        &db_path,
        Some(&mut t_index),
        &convs,
        LexicalPopulationStrategy::InlineRebuildFromScan,
        false,
        false,
        None,
    )
    .expect("single conversation begin-concurrent persist should succeed");

    let verify_db = FrankenStorage::open(&db_path).unwrap();
    let conversation_total: i64 = verify_db
        .raw()
        .query_row_map("SELECT COUNT(*) FROM conversations", &[], |row| {
            row.get_typed(0)
        })
        .unwrap();
    assert_eq!(conversation_total, 1);

    let message_total: i64 = verify_db
        .raw()
        .query_row_map("SELECT COUNT(*) FROM messages", &[], |row| row.get_typed(0))
        .unwrap();
    assert_eq!(message_total, 1);
}
/// With begin-concurrent disabled and the deferred-rebuild strategy, the
/// batched persist writes the rows to the database but leaves the Tantivy
/// index empty.
#[test]
#[serial]
fn persist_conversations_batched_can_defer_inline_lexical_updates() {
    use crate::connectors::NormalizedConversation;
    use crate::search::tantivy::TantivyIndex;
    use frankensqlite::compat::{ConnectionExt, RowExt};

    let _serial_env = set_env("CASS_INDEXER_BEGIN_CONCURRENT", "0");

    let tmp = tempfile::TempDir::new().unwrap();
    let db_path = tmp.path().join("serial-deferred.db");
    let index_path = tmp.path().join("tantivy");
    let storage = create_franken_db(&db_path);
    let mut t_index = TantivyIndex::open_or_create(&index_path).unwrap();

    let first_message = NormalizedMessage {
        idx: 0,
        role: "user".into(),
        author: Some("tester".into()),
        created_at: Some(10),
        content: "serial deferred first".into(),
        extra: serde_json::json!({}),
        snippets: vec![],
        invocations: Vec::new(),
    };
    let second_message = NormalizedMessage {
        idx: 1,
        role: "assistant".into(),
        author: Some("tester".into()),
        created_at: Some(11),
        content: "serial deferred second".into(),
        extra: serde_json::json!({}),
        snippets: vec![],
        invocations: Vec::new(),
    };
    let convs = vec![NormalizedConversation {
        agent_slug: "serial-agent".into(),
        external_id: Some("serial-1".into()),
        title: Some("Serial Deferred".into()),
        workspace: Some(std::path::PathBuf::from("/ws/serial")),
        source_path: std::path::PathBuf::from("/log/serial.jsonl"),
        started_at: Some(10),
        ended_at: Some(20),
        metadata: serde_json::json!({}),
        messages: vec![first_message, second_message],
    }];

    persist_conversations_batched(
        &storage,
        Some(&mut t_index),
        &convs,
        LexicalPopulationStrategy::DeferredAuthoritativeDbRebuild,
        false,
    )
    .expect("serial batched persist should succeed");

    let conversation_total: i64 = storage
        .raw()
        .query_row_map("SELECT COUNT(*) FROM conversations", &[], |row| {
            row.get_typed(0)
        })
        .unwrap();
    let message_total: i64 = storage
        .raw()
        .query_row_map("SELECT COUNT(*) FROM messages", &[], |row| row.get_typed(0))
        .unwrap();

    assert_eq!(conversation_total, 1);
    assert_eq!(message_total, 2);
    // Deferred strategy: nothing was written to the lexical index.
    assert_eq!(tantivy_doc_count(&mut t_index), 0);
}
/// With the deferred-rebuild strategy, the begin-concurrent persist writes the
/// rows to the database but leaves the Tantivy index empty.
#[test]
#[serial]
fn begin_concurrent_persist_can_defer_inline_lexical_updates() {
    use crate::connectors::NormalizedConversation;
    use crate::search::tantivy::TantivyIndex;
    use frankensqlite::compat::{ConnectionExt, RowExt};

    let _begin_env = set_env("CASS_INDEXER_BEGIN_CONCURRENT", "1");
    let _chunk_env = set_env("CASS_INDEXER_BEGIN_CONCURRENT_CHUNK_SIZE", "1");

    let tmp = tempfile::TempDir::new().unwrap();
    let db_path = tmp.path().join("begin-deferred.db");
    let index_path = tmp.path().join("tantivy");

    // Create the schema, then close the handle before persisting.
    drop(create_franken_db(&db_path));
    let mut t_index = TantivyIndex::open_or_create(&index_path).unwrap();

    let first_message = NormalizedMessage {
        idx: 0,
        role: "user".into(),
        author: Some("tester".into()),
        created_at: Some(50),
        content: "begin deferred first".into(),
        extra: serde_json::json!({}),
        snippets: vec![],
        invocations: Vec::new(),
    };
    let second_message = NormalizedMessage {
        idx: 1,
        role: "assistant".into(),
        author: Some("tester".into()),
        created_at: Some(51),
        content: "begin deferred second".into(),
        extra: serde_json::json!({}),
        snippets: vec![],
        invocations: Vec::new(),
    };
    let convs = vec![NormalizedConversation {
        agent_slug: "begin-agent".into(),
        external_id: Some("begin-1".into()),
        title: Some("Begin Deferred".into()),
        workspace: Some(std::path::PathBuf::from("/ws/begin")),
        source_path: std::path::PathBuf::from("/log/begin.jsonl"),
        started_at: Some(50),
        ended_at: Some(60),
        metadata: serde_json::json!({}),
        messages: vec![first_message, second_message],
    }];

    persist_conversations_batched_begin_concurrent(
        &FrankenStorage::open(&db_path).unwrap(),
        &db_path,
        Some(&mut t_index),
        &convs,
        LexicalPopulationStrategy::DeferredAuthoritativeDbRebuild,
        false,
        false,
        None,
    )
    .expect("begin-concurrent deferred persist should succeed");

    let verify_db = FrankenStorage::open(&db_path).unwrap();
    let conversation_total: i64 = verify_db
        .raw()
        .query_row_map("SELECT COUNT(*) FROM conversations", &[], |row| {
            row.get_typed(0)
        })
        .unwrap();
    let message_total: i64 = verify_db
        .raw()
        .query_row_map("SELECT COUNT(*) FROM messages", &[], |row| row.get_typed(0))
        .unwrap();

    assert_eq!(conversation_total, 1);
    assert_eq!(message_total, 2);
    // Deferred strategy: nothing was written to the lexical index.
    assert_eq!(tantivy_doc_count(&mut t_index), 0);
}
/// Checks `select_lexical_population_strategy` for all four combinations of
/// its two boolean arguments; whenever the second argument is `true` the
/// deferred authoritative rebuild is selected.
#[test]
fn lexical_population_strategy_prefers_single_authoritative_pass() {
    let cases = [
        (
            (false, false),
            LexicalPopulationStrategy::IncrementalInline,
        ),
        (
            (true, false),
            LexicalPopulationStrategy::InlineRebuildFromScan,
        ),
        (
            (false, true),
            LexicalPopulationStrategy::DeferredAuthoritativeDbRebuild,
        ),
        (
            (true, true),
            LexicalPopulationStrategy::DeferredAuthoritativeDbRebuild,
        ),
    ];
    for ((first, second), expected) in cases {
        assert_eq!(
            crate::indexer::select_lexical_population_strategy(first, second),
            expected
        );
    }
}
/// Checks that `resolve_lexical_population_strategy` returns the expected
/// strategy together with its reason string for four representative inputs.
#[test]
fn lexical_population_strategy_reason_covers_full_stale_salvage_and_incremental_modes() {
    let cases = [
        (
            (false, false, 0),
            (
                LexicalPopulationStrategy::IncrementalInline,
                "incremental_scan_applies_inline_lexical_updates_only_for_new_messages",
            ),
        ),
        (
            (true, false, 0),
            (
                LexicalPopulationStrategy::InlineRebuildFromScan,
                "lexical_index_needs_rebuild_so_scan_results_repopulate_tantivy_directly",
            ),
        ),
        (
            (false, true, 0),
            (
                LexicalPopulationStrategy::DeferredAuthoritativeDbRebuild,
                "full_refresh_defers_inline_lexical_writes_to_authoritative_db_rebuild",
            ),
        ),
        (
            (true, false, 7),
            (
                LexicalPopulationStrategy::DeferredAuthoritativeDbRebuild,
                "historical_salvage_imported_messages_require_authoritative_db_rebuild",
            ),
        ),
    ];
    for ((first, second, third), expected) in cases {
        assert_eq!(
            crate::indexer::resolve_lexical_population_strategy(first, second, third),
            expected
        );
    }
}
// The repair evaluation must run for the baseline context and must be skipped
// as soon as any one of the "other path applies" fields is set.
#[test]
fn incremental_canonical_lexical_repair_short_circuits_when_full_or_force_paths_apply() {
    use crate::indexer::{
        IncrementalCanonicalLexicalRepairContext as RepairContext,
        should_evaluate_incremental_canonical_lexical_repair as should_evaluate,
    };

    // Baseline: nothing else claims the run, and tantivy needs a rebuild.
    let baseline = RepairContext {
        full_refresh: false,
        force_rebuild: false,
        resume_lexical_rebuild: false,
        targeted_watch_once_only: false,
        salvage_messages_imported: 0,
        canonical_messages: 0,
        tantivy_requires_rebuild: true,
        observed_tantivy_docs: None,
        published_index_validated_for_current_data: false,
    };
    assert!(should_evaluate(&baseline));

    // Each variant differs from the baseline in exactly one field.
    let short_circuiting = [
        (
            "full_refresh",
            RepairContext {
                full_refresh: true,
                ..baseline
            },
        ),
        (
            "force_rebuild",
            RepairContext {
                force_rebuild: true,
                ..baseline
            },
        ),
        (
            "resume_lexical_rebuild",
            RepairContext {
                resume_lexical_rebuild: true,
                ..baseline
            },
        ),
        (
            "targeted_watch_once_only",
            RepairContext {
                targeted_watch_once_only: true,
                ..baseline
            },
        ),
        (
            "salvage_messages_imported",
            RepairContext {
                salvage_messages_imported: 1,
                ..baseline
            },
        ),
    ];

    for (label, context) in &short_circuiting {
        assert!(
            !should_evaluate(context),
            "setting {label} should skip the incremental repair evaluation"
        );
    }
}
// When tantivy is flagged as requiring a rebuild and no doc count was
// observed, the planner must return the "missing or invalid" repair plan.
#[test]
fn incremental_canonical_lexical_repair_plan_prefers_authoritative_db_for_invalid_tantivy()
{
    use crate::indexer::{
        IncrementalCanonicalLexicalRepairContext as RepairContext,
        IncrementalCanonicalLexicalRepairPlan as RepairPlan,
        choose_incremental_canonical_lexical_repair_plan as choose_plan,
    };

    let invalid_index = RepairContext {
        full_refresh: false,
        force_rebuild: false,
        resume_lexical_rebuild: false,
        targeted_watch_once_only: false,
        salvage_messages_imported: 0,
        canonical_messages: 42,
        tantivy_requires_rebuild: true,
        observed_tantivy_docs: None,
        published_index_validated_for_current_data: false,
    };

    // The plan echoes the canonical message count and the (absent) doc count.
    let expected_plan = Some(RepairPlan {
        canonical_messages: 42,
        observed_tantivy_docs: None,
        reason: "incremental_index_repairs_missing_or_invalid_tantivy_from_authoritative_canonical_db_before_scan",
    });

    assert_eq!(choose_plan(invalid_index), expected_plan);
}
// A readable index that holds far fewer docs (3) than the canonical message
// count (42) must produce the "sparse tantivy" repair plan.
#[test]
fn incremental_canonical_lexical_repair_plan_prefers_authoritative_db_for_sparse_tantivy() {
    use crate::indexer::{
        IncrementalCanonicalLexicalRepairContext as RepairContext,
        IncrementalCanonicalLexicalRepairPlan as RepairPlan,
        choose_incremental_canonical_lexical_repair_plan as choose_plan,
    };

    let sparse_index = RepairContext {
        full_refresh: false,
        force_rebuild: false,
        resume_lexical_rebuild: false,
        targeted_watch_once_only: false,
        salvage_messages_imported: 0,
        canonical_messages: 42,
        tantivy_requires_rebuild: false,
        observed_tantivy_docs: Some(3),
        published_index_validated_for_current_data: false,
    };

    // The plan carries both counts through unchanged.
    let expected_plan = Some(RepairPlan {
        canonical_messages: 42,
        observed_tantivy_docs: Some(3),
        reason: "incremental_index_repairs_sparse_tantivy_from_authoritative_canonical_db_before_scan",
    });

    assert_eq!(choose_plan(sparse_index), expected_plan);
}
// Recording a repair plan against an `IndexingProgress` must populate
// `stats.lexical_repair`, and that record must survive JSON serialization.
#[test]
fn incremental_canonical_lexical_repair_progress_records_authoritative_repair_stats() {
    use crate::indexer::{
        IncrementalCanonicalLexicalRepairPlan as RepairPlan,
        record_incremental_canonical_lexical_repair as record_repair,
    };

    let sparse_reason =
        "incremental_index_repairs_sparse_tantivy_from_authoritative_canonical_db_before_scan";
    let progress = std::sync::Arc::new(crate::indexer::IndexingProgress::default());
    let plan = RepairPlan {
        canonical_messages: 42,
        observed_tantivy_docs: Some(3),
        reason: sparse_reason,
    };

    // The trailing 7 is expected to come back as `canonical_conversations`.
    record_repair(Some(&progress), &plan, 7);

    // A poisoned lock still yields the inner stats rather than failing the test.
    let stats = progress
        .stats
        .lock()
        .unwrap_or_else(|poisoned| poisoned.into_inner());

    let expected_repair = Some(crate::indexer::LexicalRepairStats {
        kind: "authoritative_canonical_db_rebuild".to_string(),
        reason: sparse_reason.to_string(),
        canonical_conversations: 7,
        canonical_messages: 42,
        observed_tantivy_docs: Some(3),
    });
    assert_eq!(stats.lexical_repair, expected_repair);

    // The same values must be visible in the serialized form.
    let json = serde_json::to_value(&*stats)
        .unwrap_or_else(|err| panic!("indexing stats should serialize: {err}"));
    let repair_json = &json["lexical_repair"];
    assert_eq!(repair_json["kind"], "authoritative_canonical_db_rebuild");
    assert_eq!(repair_json["canonical_conversations"], 7);
    assert_eq!(repair_json["canonical_messages"], 42);
    assert_eq!(repair_json["observed_tantivy_docs"], 3);
}
// When the observed doc count equals the canonical message count and no
// rebuild is required, the planner must not propose any repair.
#[test]
fn incremental_canonical_lexical_repair_plan_stays_incremental_when_tantivy_covers_db() {
    use crate::indexer::{
        IncrementalCanonicalLexicalRepairContext as RepairContext,
        choose_incremental_canonical_lexical_repair_plan as choose_plan,
    };

    let fully_covered = RepairContext {
        full_refresh: false,
        force_rebuild: false,
        resume_lexical_rebuild: false,
        targeted_watch_once_only: false,
        salvage_messages_imported: 0,
        canonical_messages: 42,
        tantivy_requires_rebuild: false,
        observed_tantivy_docs: Some(42),
        published_index_validated_for_current_data: false,
    };

    let plan = choose_plan(fully_covered);
    assert_eq!(plan, None);
}
// Setting `published_index_validated_for_current_data` must not suppress a
// repair: neither for a sparse index nor for one flagged as needing a rebuild.
#[test]
fn incremental_canonical_lexical_repair_plan_repairs_sparse_live_index_despite_checkpoint()
{
    use crate::indexer::{
        IncrementalCanonicalLexicalRepairContext as RepairContext,
        IncrementalCanonicalLexicalRepairPlan as RepairPlan,
        choose_incremental_canonical_lexical_repair_plan as choose_plan,
    };

    // Sparse index (3 docs for 42 messages) with the validation flag set.
    let sparse_but_validated = RepairContext {
        full_refresh: false,
        force_rebuild: false,
        resume_lexical_rebuild: false,
        targeted_watch_once_only: false,
        salvage_messages_imported: 0,
        canonical_messages: 42,
        tantivy_requires_rebuild: false,
        observed_tantivy_docs: Some(3),
        published_index_validated_for_current_data: true,
    };
    // Same context, but the index is flagged for rebuild and has no doc count.
    let invalid_but_validated = RepairContext {
        tantivy_requires_rebuild: true,
        observed_tantivy_docs: None,
        ..sparse_but_validated
    };

    let sparse_plan = choose_plan(sparse_but_validated);
    assert_eq!(
        sparse_plan,
        Some(RepairPlan {
            canonical_messages: 42,
            observed_tantivy_docs: Some(3),
            reason: "incremental_index_repairs_sparse_tantivy_from_authoritative_canonical_db_before_scan",
        }),
        "a matching checkpoint is not proof that the current live derived index covers SQLite"
    );

    let invalid_plan = choose_plan(invalid_but_validated);
    assert!(
        invalid_plan.is_some(),
        "a missing/invalid tantivy index must still rebuild even when the validation flag is set"
    );
}
// Persists one batch containing two conversations that share the same agent
// slug and external id ("dup-session") but come from different source files,
// then checks that the database holds a single conversation whose messages are
// the union of both inputs (idx 0, 1 and 2).
#[test]
#[serial]
fn persist_conversations_batched_falls_back_for_duplicate_keys() {
use crate::connectors::NormalizedConversation;
use crate::search::tantivy::TantivyIndex;
use crate::sources::provenance::{Source, SourceKind};
use frankensqlite::compat::{ConnectionExt, RowExt};
// Env overrides for this test. The guards are bound to names so they stay
// alive until the function returns (presumably restoring the previous values
// on drop; confirm against `set_env`).
let _begin_guard = set_env("CASS_INDEXER_BEGIN_CONCURRENT", "1");
// Chunk size 1 with a two-conversation batch; presumably this places the two
// duplicate-key conversations in different chunks. TODO confirm.
let _chunk_guard = set_env("CASS_INDEXER_BEGIN_CONCURRENT_CHUNK_SIZE", "1");
let dir = tempfile::TempDir::new().unwrap();
let db_path = dir.path().join("test.db");
let index_path = dir.path().join("tantivy");
let storage = create_franken_db(&db_path);
let mut t_index = TantivyIndex::open_or_create(&index_path).unwrap();
// Unlike the "registers_missing_remote_source" tests, the remote source is
// registered up front here, before any conversation refers to it.
storage
.upsert_source(&Source {
id: "remote-source".into(),
kind: SourceKind::Ssh,
host_label: Some("example-host".into()),
machine_id: None,
platform: None,
config_json: None,
created_at: None,
updated_at: None,
})
.unwrap();
// Provenance metadata shared by both conversations.
let metadata = serde_json::json!({
"cass": {
"origin": {
"source_id": "remote-source",
"host": "example-host"
}
}
});
let convs = vec![
// First duplicate: carries only the message with idx 2.
NormalizedConversation {
agent_slug: "shared-agent".into(),
external_id: Some("dup-session".into()),
title: Some("Shared Session".into()),
workspace: Some(std::path::PathBuf::from("/ws/shared")),
source_path: std::path::PathBuf::from("/log/first.jsonl"),
started_at: Some(1_000),
ended_at: Some(1_010),
metadata: metadata.clone(),
messages: vec![NormalizedMessage {
idx: 2,
role: "user".into(),
author: Some("tester".into()),
created_at: Some(1_002),
content: "third".into(),
extra: serde_json::json!({}),
snippets: vec![],
invocations: Vec::new(),
}],
},
// Second duplicate: same agent slug and external id, different source file,
// carrying the messages with idx 0 and 1.
NormalizedConversation {
agent_slug: "shared-agent".into(),
external_id: Some("dup-session".into()),
title: Some("Shared Session".into()),
workspace: Some(std::path::PathBuf::from("/ws/shared")),
source_path: std::path::PathBuf::from("/log/second.jsonl"),
started_at: Some(1_000),
ended_at: Some(1_020),
metadata,
messages: vec![
NormalizedMessage {
idx: 0,
role: "user".into(),
author: Some("tester".into()),
created_at: Some(1_000),
content: "first".into(),
extra: serde_json::json!({}),
snippets: vec![],
invocations: Vec::new(),
},
NormalizedMessage {
idx: 1,
role: "assistant".into(),
author: Some("tester".into()),
created_at: Some(1_001),
content: "second".into(),
extra: serde_json::json!({}),
snippets: vec![],
invocations: Vec::new(),
},
],
},
];
persist_conversations_batched(
&storage,
Some(&mut t_index),
&convs,
LexicalPopulationStrategy::IncrementalInline,
false,
)
.expect("duplicate-key batch should fall back to serial path");
// Verify through a freshly opened handle on the same database file.
let reader = FrankenStorage::open(&db_path).unwrap();
let conversation_count: i64 = reader
.raw()
.query_row_map("SELECT COUNT(*) FROM conversations", &[], |row| {
row.get_typed(0)
})
.unwrap();
// Both inputs must collapse into one conversation row.
assert_eq!(conversation_count, 1);
let stored_indices: Vec<i64> = reader
.raw()
.query_map_collect("SELECT idx FROM messages ORDER BY idx", &[], |row| {
row.get_typed(0)
})
.unwrap();
// Messages from both inputs are present exactly once each.
assert_eq!(stored_indices, vec![0, 1, 2]);
t_index.commit().unwrap();
}
// Persists 32 generated conversations with CASS_INDEXER_BEGIN_CONCURRENT set
// to "0" and CASS_REDACT_SECRETS set to "1", then checks the row counts and
// that no stored title still contains the "sk_live_" marker.
// NOTE(review): despite the name, the assertions below cover only counts and
// title redaction; message content and row order are not asserted.
#[test]
#[serial]
fn persist_conversations_batched_parallel_pre_map_preserves_content_and_order() {
use crate::connectors::NormalizedConversation;
use crate::search::tantivy::TantivyIndex;
use frankensqlite::compat::{ConnectionExt, RowExt};
// Named guards keep the env overrides alive for the whole test.
let _begin_guard = set_env("CASS_INDEXER_BEGIN_CONCURRENT", "0");
let _redact_guard = set_env("CASS_REDACT_SECRETS", "1");
let dir = tempfile::TempDir::new().unwrap();
let db_path = dir.path().join("parallel-pre-map.db");
let index_path = dir.path().join("tantivy");
let storage = create_franken_db(&db_path);
let mut t_index = TantivyIndex::open_or_create(&index_path).unwrap();
// 32 conversations spread over 4 agent slugs and 3 workspaces, each with a
// unique external id and two messages.
let convs: Vec<NormalizedConversation> = (0..32)
.map(|i| NormalizedConversation {
agent_slug: format!("agent-{}", i % 4),
external_id: Some(format!("session-{i}")),
// Every title embeds a secret-shaped token so redaction can be observed.
title: Some(format!(
"Title {i} with sk_live_ABCdef0123456789AAAAbbbb{i:04}"
)),
workspace: Some(std::path::PathBuf::from(format!("/ws/proj-{}", i % 3))),
source_path: std::path::PathBuf::from(format!("/log/s{i}.jsonl")),
started_at: Some(1_000 + i as i64),
ended_at: Some(1_010 + i as i64),
metadata: serde_json::json!({}),
messages: vec![
NormalizedMessage {
idx: 0,
role: "user".into(),
author: Some("tester".into()),
created_at: Some(1_000 + i as i64),
content: format!("hello from conv {i}"),
extra: serde_json::json!({}),
snippets: vec![],
invocations: Vec::new(),
},
NormalizedMessage {
idx: 1,
role: "assistant".into(),
author: Some("tester".into()),
created_at: Some(1_001 + i as i64),
content: format!("reply {i}"),
extra: serde_json::json!({}),
snippets: vec![],
invocations: Vec::new(),
},
],
})
.collect();
persist_conversations_batched(
&storage,
Some(&mut t_index),
&convs,
LexicalPopulationStrategy::IncrementalInline,
false,
)
.expect("parallel pre-map serial batched path should persist all conversations");
// Verify through a freshly opened handle on the same database file.
let reader = FrankenStorage::open(&db_path).unwrap();
let conversation_count: i64 = reader
.raw()
.query_row_map("SELECT COUNT(*) FROM conversations", &[], |row| {
row.get_typed(0)
})
.unwrap();
assert_eq!(conversation_count, convs.len() as i64);
let message_count: i64 = reader
.raw()
.query_row_map("SELECT COUNT(*) FROM messages", &[], |row| row.get_typed(0))
.unwrap();
// Two messages per conversation.
assert_eq!(message_count, (convs.len() * 2) as i64);
let titles: Vec<String> = reader
.raw()
.query_map_collect(
"SELECT title FROM conversations WHERE title IS NOT NULL ORDER BY id",
&[],
|row| row.get_typed(0),
)
.unwrap();
// Every conversation must still have a (non-NULL) title after persistence.
assert_eq!(titles.len(), convs.len());
for title in &titles {
assert!(
!title.contains("sk_live_"),
"title should have been redacted but contained a live secret marker: {title}"
);
}
t_index.commit().unwrap();
}
// Counterpart of the "0" variant with CASS_INDEXER_BEGIN_CONCURRENT set to "1"
// and a chunk size of 8: persists 32 generated conversations and checks the
// conversation count and that stored titles no longer contain "sk_live_".
#[test]
#[serial]
fn persist_conversations_batched_parallel_pre_map_preserves_content_in_begin_concurrent_path()
{
use crate::connectors::NormalizedConversation;
use crate::search::tantivy::TantivyIndex;
use frankensqlite::compat::{ConnectionExt, RowExt};
// Named guards keep the env overrides alive for the whole test.
let _begin_guard = set_env("CASS_INDEXER_BEGIN_CONCURRENT", "1");
// 32 conversations with chunk size 8; presumably four chunks. TODO confirm
// against the chunking logic.
let _chunk_guard = set_env("CASS_INDEXER_BEGIN_CONCURRENT_CHUNK_SIZE", "8");
let _redact_guard = set_env("CASS_REDACT_SECRETS", "1");
let dir = tempfile::TempDir::new().unwrap();
let db_path = dir.path().join("parallel-pre-map-concurrent.db");
let index_path = dir.path().join("tantivy");
let storage = create_franken_db(&db_path);
let mut t_index = TantivyIndex::open_or_create(&index_path).unwrap();
// One message per conversation; titles embed a secret-shaped token.
let convs: Vec<NormalizedConversation> = (0..32)
.map(|i| NormalizedConversation {
agent_slug: format!("agent-bc-{}", i % 4),
external_id: Some(format!("bc-session-{i}")),
title: Some(format!(
"Title bc-{i} sk_live_ABCdef0123456789AAAAbbbb{i:04}"
)),
workspace: Some(std::path::PathBuf::from(format!("/ws/bc-{}", i % 3))),
source_path: std::path::PathBuf::from(format!("/log/bc{i}.jsonl")),
started_at: Some(2_000 + i as i64),
ended_at: Some(2_010 + i as i64),
metadata: serde_json::json!({}),
messages: vec![NormalizedMessage {
idx: 0,
role: "user".into(),
author: Some("tester".into()),
created_at: Some(2_000 + i as i64),
content: format!("bc hello {i}"),
extra: serde_json::json!({}),
snippets: vec![],
invocations: Vec::new(),
}],
})
.collect();
persist_conversations_batched(
&storage,
Some(&mut t_index),
&convs,
LexicalPopulationStrategy::IncrementalInline,
false,
)
.expect("begin-concurrent path with parallel pre-map should persist all conversations");
// Verify through a freshly opened handle on the same database file.
let reader = FrankenStorage::open(&db_path).unwrap();
let conversation_count: i64 = reader
.raw()
.query_row_map("SELECT COUNT(*) FROM conversations", &[], |row| {
row.get_typed(0)
})
.unwrap();
assert_eq!(conversation_count, convs.len() as i64);
let titles: Vec<String> = reader
.raw()
.query_map_collect(
"SELECT title FROM conversations WHERE title IS NOT NULL ORDER BY id",
&[],
|row| row.get_typed(0),
)
.unwrap();
// Every conversation must still have a (non-NULL) title after persistence.
assert_eq!(titles.len(), convs.len());
for title in &titles {
assert!(
!title.contains("sk_live_"),
"begin-concurrent hoist must preserve secret redaction; found raw token: {title}"
);
}
t_index.commit().unwrap();
}
// Runs the same 16-conversation persist twice, once without touching
// CASS_INDEXER_PARALLEL_WAL and once with it set to "shadow", and asserts that
// both runs leave the same set of (external_id, started_at) rows behind.
#[test]
#[serial]
fn parallel_wal_shadow_observer_does_not_change_persisted_state() {
    use crate::connectors::NormalizedConversation;
    use crate::search::tantivy::TantivyIndex;
    use frankensqlite::compat::{ConnectionExt, RowExt};

    // Persists the fixed corpus into a fresh temp database and returns the
    // stored (external_id, started_at) pairs in `id` order. `parallel_wal`,
    // when `Some`, is the value CASS_INDEXER_PARALLEL_WAL is set to for the
    // duration of the run.
    fn run_once(parallel_wal: Option<&str>) -> Vec<(String, i64)> {
        let _begin_guard = set_env("CASS_INDEXER_BEGIN_CONCURRENT", "1");
        let _chunk_guard = set_env("CASS_INDEXER_BEGIN_CONCURRENT_CHUNK_SIZE", "4");
        // The optional guard stays bound until `run_once` returns, so the
        // override covers the whole persist call below. (The former
        // `let _ = _wal_guard;` was removed: a wildcard pattern neither moves
        // nor drops the value, so it had no effect.)
        // NOTE(review): for `None` the variable is left exactly as inherited
        // from the process environment; the "off" run therefore assumes it is
        // not already set to "shadow" outside the test.
        let _wal_guard = parallel_wal.map(|v| set_env("CASS_INDEXER_PARALLEL_WAL", v));

        let dir = tempfile::TempDir::new().unwrap();
        let db_path = dir.path().join("shadow-parity.db");
        let index_path = dir.path().join("tantivy");
        let storage = create_franken_db(&db_path);
        let mut t_index = TantivyIndex::open_or_create(&index_path).unwrap();

        // 16 single-message conversations over 3 agent slugs and 2 workspaces.
        let convs: Vec<NormalizedConversation> = (0..16)
            .map(|i| NormalizedConversation {
                agent_slug: format!("agent-{}", i % 3),
                external_id: Some(format!("shadow-parity-{i}")),
                title: Some(format!("Title {i}")),
                workspace: Some(std::path::PathBuf::from(format!("/ws/{}", i % 2))),
                source_path: std::path::PathBuf::from(format!("/log/{i}.jsonl")),
                started_at: Some(1_000 + i as i64),
                ended_at: Some(1_010 + i as i64),
                metadata: serde_json::json!({}),
                messages: vec![NormalizedMessage {
                    idx: 0,
                    role: "user".into(),
                    author: Some("tester".into()),
                    created_at: Some(1_000 + i as i64),
                    content: format!("body {i}"),
                    extra: serde_json::json!({}),
                    snippets: vec![],
                    invocations: Vec::new(),
                }],
            })
            .collect();

        persist_conversations_batched(
            &storage,
            Some(&mut t_index),
            &convs,
            LexicalPopulationStrategy::IncrementalInline,
            false,
        )
        .expect("begin-concurrent path should persist all conversations");

        // Read back through a freshly opened handle; NULL columns fall back to
        // "" and 0 so the result is a plain, sortable tuple.
        let reader = FrankenStorage::open(&db_path).unwrap();
        reader
            .raw()
            .query_map_collect(
                "SELECT external_id, started_at FROM conversations ORDER BY id",
                &[],
                |row| {
                    let ext: Option<String> = row.get_typed(0)?;
                    let started: Option<i64> = row.get_typed(1)?;
                    Ok((ext.unwrap_or_default(), started.unwrap_or(0)))
                },
            )
            .unwrap()
    }

    let mut off = run_once(None);
    let mut shadow = run_once(Some("shadow"));
    // Compare as sets: sorting removes any dependence on insertion order.
    off.sort();
    shadow.sort();
    assert_eq!(off.len(), 16);
    assert_eq!(shadow.len(), 16);
    assert_eq!(
        off, shadow,
        "shadow-mode observer must NOT change the SET of persisted rows"
    );
}
// With CASS_INDEXER_PARALLEL_WAL set to "shadow", a batched persist must leave
// the shadow telemetry marked active and with a larger observed-chunk counter
// than before the call.
#[test]
#[serial]
fn parallel_wal_shadow_observer_emits_chunk_telemetry() {
    use crate::connectors::NormalizedConversation;
    use crate::indexer::parallel_wal_shadow::telemetry_snapshot;
    use crate::search::tantivy::TantivyIndex;

    // Named guards keep the env overrides alive for the whole test.
    let _begin_guard = set_env("CASS_INDEXER_BEGIN_CONCURRENT", "1");
    let _chunk_guard = set_env("CASS_INDEXER_BEGIN_CONCURRENT_CHUNK_SIZE", "4");
    let _wal_guard = set_env("CASS_INDEXER_PARALLEL_WAL", "shadow");

    let dir = tempfile::TempDir::new().unwrap();
    let db_path = dir.path().join("shadow-tele.db");
    let index_path = dir.path().join("tantivy");
    let storage = create_franken_db(&db_path);
    let mut t_index = TantivyIndex::open_or_create(&index_path).unwrap();

    // Eight single-message conversations, all for the same agent slug.
    let mut convs: Vec<NormalizedConversation> = Vec::with_capacity(8);
    for i in 0..8 {
        let started = 1_000 + i as i64;
        let only_message = NormalizedMessage {
            idx: 0,
            role: "user".into(),
            author: Some("tester".into()),
            created_at: Some(started),
            content: format!("body {i}"),
            extra: serde_json::json!({}),
            snippets: Vec::new(),
            invocations: Vec::new(),
        };
        convs.push(NormalizedConversation {
            agent_slug: "codex".into(),
            external_id: Some(format!("shadow-tele-{i}")),
            title: Some(format!("Title {i}")),
            workspace: None,
            source_path: std::path::PathBuf::from(format!("/log/{i}.jsonl")),
            started_at: Some(started),
            ended_at: Some(started + 10),
            metadata: serde_json::json!({}),
            messages: vec![only_message],
        });
    }

    // The counter is compared against a snapshot taken just before the call,
    // so earlier activity in the same process does not matter.
    let baseline = telemetry_snapshot();
    persist_conversations_batched(
        &storage,
        Some(&mut t_index),
        &convs,
        LexicalPopulationStrategy::IncrementalInline,
        false,
    )
    .expect("shadow-mode begin-concurrent should persist");
    let after = telemetry_snapshot();

    assert!(after.active, "active must be true under shadow");
    assert!(
        after.chunks_observed > baseline.chunks_observed,
        "shadow observer must record ≥1 chunk when begin-concurrent path runs"
    );
}
// A conversation whose metadata names a source that was never registered must
// still persist with CASS_INDEXER_BEGIN_CONCURRENT set to "0": afterwards the
// source is listed and the conversation row carries its id and host.
#[test]
#[serial]
fn persist_conversations_batched_registers_missing_remote_source_in_serial_path() {
    use crate::connectors::NormalizedConversation;
    use crate::search::tantivy::TantivyIndex;
    use frankensqlite::compat::{ConnectionExt, RowExt};

    // Named guard keeps the env override alive for the whole test.
    let _begin_guard = set_env("CASS_INDEXER_BEGIN_CONCURRENT", "0");

    let scratch = tempfile::TempDir::new().unwrap();
    let db_file = scratch.path().join("serial-source.db");
    let lexical_dir = scratch.path().join("tantivy");
    let storage = create_franken_db(&db_file);
    let mut lexical_index = TantivyIndex::open_or_create(&lexical_dir).unwrap();

    // Provenance embedded in the conversation; nothing registers
    // "remote-source" explicitly before the persist call.
    let origin_metadata = serde_json::json!({
        "cass": {
            "origin": {
                "source_id": "remote-source",
                "host": "builder-1"
            }
        }
    });
    let assistant_reply = NormalizedMessage {
        idx: 0,
        role: "assistant".into(),
        author: Some("tester".into()),
        created_at: Some(1_005),
        content: "serial remote content".into(),
        extra: serde_json::json!({}),
        snippets: Vec::new(),
        invocations: Vec::new(),
    };
    let batch = vec![NormalizedConversation {
        agent_slug: "codex".into(),
        external_id: Some("remote-serial-session".into()),
        title: Some("Remote serial session".into()),
        workspace: Some(std::path::PathBuf::from("/ws/remote")),
        source_path: std::path::PathBuf::from("/log/remote-serial.jsonl"),
        started_at: Some(1_000),
        ended_at: Some(1_010),
        metadata: origin_metadata,
        messages: vec![assistant_reply],
    }];

    persist_conversations_batched(
        &storage,
        Some(&mut lexical_index),
        &batch,
        LexicalPopulationStrategy::IncrementalInline,
        false,
    )
    .expect("serial batched path should auto-register embedded remote sources");

    // Verify through a freshly opened handle on the same database file.
    let verifier = FrankenStorage::open(&db_file).unwrap();
    let registered_sources = verifier.get_source_ids().unwrap();
    assert_eq!(registered_sources, vec!["remote-source".to_string()]);

    let provenance_sql = "SELECT source_id, origin_host FROM conversations";
    let stored_provenance: Vec<(String, Option<String>)> = verifier
        .raw()
        .query_map_collect(provenance_sql, &[], |row| {
            Ok((row.get_typed(0)?, row.get_typed(1)?))
        })
        .unwrap();
    assert_eq!(
        stored_provenance,
        vec![("remote-source".to_string(), Some("builder-1".to_string()))]
    );
}
// Same scenario as the serial variant, but with CASS_INDEXER_BEGIN_CONCURRENT
// set to "1" and a chunk size of 1: an unregistered source named in the
// conversation metadata must end up listed and attached to the stored row.
#[test]
#[serial]
fn persist_conversations_batched_registers_missing_remote_source_in_begin_concurrent_path()
{
    use crate::connectors::NormalizedConversation;
    use crate::search::tantivy::TantivyIndex;
    use frankensqlite::compat::{ConnectionExt, RowExt};

    // Named guards keep the env overrides alive for the whole test.
    let _begin_guard = set_env("CASS_INDEXER_BEGIN_CONCURRENT", "1");
    let _chunk_guard = set_env("CASS_INDEXER_BEGIN_CONCURRENT_CHUNK_SIZE", "1");

    let scratch = tempfile::TempDir::new().unwrap();
    let db_file = scratch.path().join("begin-concurrent-source.db");
    let lexical_dir = scratch.path().join("tantivy");
    let storage = create_franken_db(&db_file);
    let mut lexical_index = TantivyIndex::open_or_create(&lexical_dir).unwrap();

    // Provenance embedded in the conversation; nothing registers
    // "remote-begin-source" explicitly before the persist call.
    let origin_metadata = serde_json::json!({
        "cass": {
            "origin": {
                "source_id": "remote-begin-source",
                "host": "builder-2"
            }
        }
    });
    let assistant_reply = NormalizedMessage {
        idx: 0,
        role: "assistant".into(),
        author: Some("tester".into()),
        created_at: Some(2_005),
        content: "begin-concurrent remote content".into(),
        extra: serde_json::json!({}),
        snippets: Vec::new(),
        invocations: Vec::new(),
    };
    let batch = vec![NormalizedConversation {
        agent_slug: "codex".into(),
        external_id: Some("remote-begin-session".into()),
        title: Some("Remote begin-concurrent session".into()),
        workspace: Some(std::path::PathBuf::from("/ws/remote")),
        source_path: std::path::PathBuf::from("/log/remote-begin.jsonl"),
        started_at: Some(2_000),
        ended_at: Some(2_010),
        metadata: origin_metadata,
        messages: vec![assistant_reply],
    }];

    persist_conversations_batched(
        &storage,
        Some(&mut lexical_index),
        &batch,
        LexicalPopulationStrategy::IncrementalInline,
        false,
    )
    .expect("begin-concurrent path should auto-register embedded remote sources");

    // Verify through a freshly opened handle on the same database file.
    let verifier = FrankenStorage::open(&db_file).unwrap();
    let registered_sources = verifier.get_source_ids().unwrap();
    assert_eq!(registered_sources, vec!["remote-begin-source".to_string()]);

    let provenance_sql = "SELECT source_id, origin_host FROM conversations";
    let stored_provenance: Vec<(String, Option<String>)> = verifier
        .raw()
        .query_map_collect(provenance_sql, &[], |row| {
            Ok((row.get_typed(0)?, row.get_typed(1)?))
        })
        .unwrap();
    assert_eq!(
        stored_provenance,
        vec![(
            "remote-begin-source".to_string(),
            Some("builder-2".to_string())
        )]
    );
}
// Calls `persist_conversations_batched` twice against the same storage and
// index, each time with one conversation naming the same unregistered remote
// source, with CASS_INDEXER_BEGIN_CONCURRENT set to "0". Afterwards exactly one
// non-local source row must exist, both conversations must carry its
// provenance, and the foreign-key check must report nothing.
#[test]
#[serial]
fn persist_conversations_batched_reuses_auto_registered_remote_source_across_serial_runs() {
use crate::connectors::NormalizedConversation;
use crate::search::tantivy::TantivyIndex;
use frankensqlite::compat::{ConnectionExt, RowExt};
// Named guard keeps the env override alive for the whole test.
let _begin_guard = set_env("CASS_INDEXER_BEGIN_CONCURRENT", "0");
let dir = tempfile::TempDir::new().unwrap();
let db_path = dir.path().join("serial-source-reuse.db");
let index_path = dir.path().join("tantivy");
let storage = create_franken_db(&db_path);
let mut t_index = TantivyIndex::open_or_create(&index_path).unwrap();
// Provenance shared by both runs; the source is never registered explicitly.
let metadata = serde_json::json!({
"cass": {
"origin": {
"source_id": "remote-source-reused",
"host": "builder-reuse-1"
}
}
});
// One persist call per tuple: (external id, start timestamp, message content,
// source file path).
for (external_id, started_at, content, source_path) in [
(
"remote-serial-session-1",
10_000_i64,
"serial remote content one",
"/log/remote-serial-1.jsonl",
),
(
"remote-serial-session-2",
20_000_i64,
"serial remote content two",
"/log/remote-serial-2.jsonl",
),
] {
let convs = vec![NormalizedConversation {
agent_slug: "codex".into(),
external_id: Some(external_id.into()),
title: Some(format!("Remote serial session {external_id}")),
workspace: Some(std::path::PathBuf::from("/ws/remote")),
source_path: std::path::PathBuf::from(source_path),
started_at: Some(started_at),
ended_at: Some(started_at + 10),
metadata: metadata.clone(),
messages: vec![NormalizedMessage {
idx: 0,
role: "assistant".into(),
author: Some("tester".into()),
created_at: Some(started_at + 5),
content: content.into(),
extra: serde_json::json!({}),
snippets: vec![],
invocations: Vec::new(),
}],
}];
persist_conversations_batched(
&storage,
Some(&mut t_index),
&convs,
LexicalPopulationStrategy::IncrementalInline,
false,
)
.expect("serial batched path should keep reusing the auto-registered source");
}
// Verify through a freshly opened handle on the same database file.
let reader = FrankenStorage::open(&db_path).unwrap();
// The query excludes the row with id 'local'; only the remote source
// should remain.
let source_rows: Vec<(String, Option<String>)> = reader
.raw()
.query_map_collect(
"SELECT id, host_label FROM sources WHERE id <> 'local' ORDER BY id",
&[],
|row| Ok((row.get_typed(0)?, row.get_typed(1)?)),
)
.unwrap();
assert_eq!(
source_rows,
vec![(
"remote-source-reused".to_string(),
Some("builder-reuse-1".to_string())
)],
"serial path should upsert the missing remote source once and then reuse it"
);
// Ordered by external id so the expected vector below is deterministic.
let provenance: Vec<(String, Option<String>, String)> = reader
.raw()
.query_map_collect(
"SELECT source_id, origin_host, external_id FROM conversations ORDER BY external_id",
&[],
|row| Ok((row.get_typed(0)?, row.get_typed(1)?, row.get_typed(2)?)),
)
.unwrap();
assert_eq!(
provenance,
vec![
(
"remote-source-reused".to_string(),
Some("builder-reuse-1".to_string()),
"remote-serial-session-1".to_string()
),
(
"remote-source-reused".to_string(),
Some("builder-reuse-1".to_string()),
"remote-serial-session-2".to_string()
)
],
"every persisted conversation should retain the recovered source provenance"
);
let conversation_count: i64 = reader
.raw()
.query_row_map("SELECT COUNT(*) FROM conversations", &[], |row| {
row.get_typed(0)
})
.unwrap();
assert_eq!(conversation_count, 2);
// An empty result from the foreign-key check means no dangling references.
let fk_violations = reader.raw().query("PRAGMA foreign_key_check").unwrap();
assert!(
fk_violations.is_empty(),
"serial path should not leave any foreign-key violations after source auto-registration"
);
}
// Counterpart of the serial reuse test with CASS_INDEXER_BEGIN_CONCURRENT set
// to "1" and a chunk size of 1: two persist calls name the same unregistered
// remote source. Afterwards exactly one non-local source row must exist, both
// conversations must carry its provenance, and the foreign-key check must
// report nothing.
#[test]
#[serial]
fn persist_conversations_batched_reuses_auto_registered_remote_source_across_begin_concurrent_runs()
{
use crate::connectors::NormalizedConversation;
use crate::search::tantivy::TantivyIndex;
use frankensqlite::compat::{ConnectionExt, RowExt};
// Named guards keep the env overrides alive for the whole test.
let _begin_guard = set_env("CASS_INDEXER_BEGIN_CONCURRENT", "1");
let _chunk_guard = set_env("CASS_INDEXER_BEGIN_CONCURRENT_CHUNK_SIZE", "1");
let dir = tempfile::TempDir::new().unwrap();
let db_path = dir.path().join("begin-source-reuse.db");
let index_path = dir.path().join("tantivy");
let storage = create_franken_db(&db_path);
let mut t_index = TantivyIndex::open_or_create(&index_path).unwrap();
// Provenance shared by both runs; the source is never registered explicitly.
let metadata = serde_json::json!({
"cass": {
"origin": {
"source_id": "remote-begin-reused",
"host": "builder-reuse-2"
}
}
});
// One persist call per tuple: (external id, start timestamp, message content,
// source file path).
for (external_id, started_at, content, source_path) in [
(
"remote-begin-session-1",
30_000_i64,
"begin-concurrent content one",
"/log/remote-begin-1.jsonl",
),
(
"remote-begin-session-2",
40_000_i64,
"begin-concurrent content two",
"/log/remote-begin-2.jsonl",
),
] {
let convs = vec![NormalizedConversation {
agent_slug: "codex".into(),
external_id: Some(external_id.into()),
title: Some(format!("Remote begin session {external_id}")),
workspace: Some(std::path::PathBuf::from("/ws/remote")),
source_path: std::path::PathBuf::from(source_path),
started_at: Some(started_at),
ended_at: Some(started_at + 10),
metadata: metadata.clone(),
messages: vec![NormalizedMessage {
idx: 0,
role: "assistant".into(),
author: Some("tester".into()),
created_at: Some(started_at + 5),
content: content.into(),
extra: serde_json::json!({}),
snippets: vec![],
invocations: Vec::new(),
}],
}];
persist_conversations_batched(
&storage,
Some(&mut t_index),
&convs,
LexicalPopulationStrategy::IncrementalInline,
false,
)
.expect("begin-concurrent path should keep reusing the auto-registered source");
}
// Verify through a freshly opened handle on the same database file.
let reader = FrankenStorage::open(&db_path).unwrap();
// The query excludes the row with id 'local'; only the remote source
// should remain.
let source_rows: Vec<(String, Option<String>)> = reader
.raw()
.query_map_collect(
"SELECT id, host_label FROM sources WHERE id <> 'local' ORDER BY id",
&[],
|row| Ok((row.get_typed(0)?, row.get_typed(1)?)),
)
.unwrap();
assert_eq!(
source_rows,
vec![(
"remote-begin-reused".to_string(),
Some("builder-reuse-2".to_string())
)],
"begin-concurrent path should upsert the missing remote source once and then reuse it"
);
// Ordered by external id so the expected vector below is deterministic.
let provenance: Vec<(String, Option<String>, String)> = reader
.raw()
.query_map_collect(
"SELECT source_id, origin_host, external_id FROM conversations ORDER BY external_id",
&[],
|row| Ok((row.get_typed(0)?, row.get_typed(1)?, row.get_typed(2)?)),
)
.unwrap();
assert_eq!(
provenance,
vec![
(
"remote-begin-reused".to_string(),
Some("builder-reuse-2".to_string()),
"remote-begin-session-1".to_string()
),
(
"remote-begin-reused".to_string(),
Some("builder-reuse-2".to_string()),
"remote-begin-session-2".to_string()
)
],
"every begin-concurrent persist should retain the recovered source provenance"
);
let conversation_count: i64 = reader
.raw()
.query_row_map("SELECT COUNT(*) FROM conversations", &[], |row| {
row.get_typed(0)
})
.unwrap();
assert_eq!(conversation_count, 2);
// An empty result from the foreign-key check means no dangling references.
let fk_violations = reader.raw().query("PRAGMA foreign_key_check").unwrap();
assert!(
fk_violations.is_empty(),
"begin-concurrent path should not leave any foreign-key violations after source auto-registration"
);
}
// Single-conversation entry point: persisting a conversation whose metadata
// names an unregistered source must leave that source listed and attached to
// the stored conversation row.
#[test]
fn persist_conversation_registers_missing_remote_source() {
    use crate::connectors::NormalizedConversation;
    use crate::search::tantivy::TantivyIndex;
    use frankensqlite::compat::{ConnectionExt, RowExt};

    let scratch = tempfile::TempDir::new().unwrap();
    let db_file = scratch.path().join("single-remote-source.db");
    let lexical_dir = scratch.path().join("tantivy");
    let storage = create_franken_db(&db_file);
    let mut lexical_index = TantivyIndex::open_or_create(&lexical_dir).unwrap();

    // Provenance embedded in the conversation; nothing registers
    // "remote-single-source" explicitly before the persist call.
    let origin_metadata = serde_json::json!({
        "cass": {
            "origin": {
                "source_id": "remote-single-source",
                "host": "builder-3"
            }
        }
    });
    let assistant_reply = NormalizedMessage {
        idx: 0,
        role: "assistant".into(),
        author: Some("tester".into()),
        created_at: Some(3_005),
        content: "single remote content".into(),
        extra: serde_json::json!({}),
        snippets: Vec::new(),
        invocations: Vec::new(),
    };
    let remote_conversation = NormalizedConversation {
        agent_slug: "codex".into(),
        external_id: Some("remote-single-session".into()),
        title: Some("Remote single session".into()),
        workspace: Some(std::path::PathBuf::from("/ws/remote")),
        source_path: std::path::PathBuf::from("/log/remote-single.jsonl"),
        started_at: Some(3_000),
        ended_at: Some(3_010),
        metadata: origin_metadata,
        messages: vec![assistant_reply],
    };

    persist_conversation(&storage, &mut lexical_index, &remote_conversation)
        .expect("single conversation path should auto-register embedded remote sources");

    // Verify through a freshly opened handle on the same database file.
    let verifier = FrankenStorage::open(&db_file).unwrap();
    let registered_sources = verifier.get_source_ids().unwrap();
    assert_eq!(registered_sources, vec!["remote-single-source".to_string()]);

    let provenance_sql = "SELECT source_id, origin_host FROM conversations";
    let stored_provenance: Vec<(String, Option<String>)> = verifier
        .raw()
        .query_map_collect(provenance_sql, &[], |row| {
            Ok((row.get_typed(0)?, row.get_typed(1)?))
        })
        .unwrap();
    assert_eq!(
        stored_provenance,
        vec![(
            "remote-single-source".to_string(),
            Some("builder-3".to_string())
        )]
    );
}
// When the embedded `source_id` is whitespace-only but a host is present, the
// stored source id and the conversation's `source_id` must both equal the host.
#[test]
fn persist_conversation_host_only_remote_source_infers_source_id_from_host() {
    use crate::connectors::NormalizedConversation;
    use crate::search::tantivy::TantivyIndex;
    use frankensqlite::compat::{ConnectionExt, RowExt};

    let scratch = tempfile::TempDir::new().unwrap();
    let db_file = scratch.path().join("single-host-only-remote.db");
    let lexical_dir = scratch.path().join("tantivy");
    let storage = create_franken_db(&db_file);
    let mut lexical_index = TantivyIndex::open_or_create(&lexical_dir).unwrap();

    // `source_id` is deliberately blank (whitespace only); only `host` is usable.
    let origin_metadata = serde_json::json!({
        "cass": {
            "origin": {
                "source_id": "   ",
                "host": "builder-4"
            }
        }
    });
    let assistant_reply = NormalizedMessage {
        idx: 0,
        role: "assistant".into(),
        author: Some("tester".into()),
        created_at: Some(3_105),
        content: "host only remote content".into(),
        extra: serde_json::json!({}),
        snippets: Vec::new(),
        invocations: Vec::new(),
    };
    let host_only_conversation = NormalizedConversation {
        agent_slug: "codex".into(),
        external_id: Some("host-only-remote-session".into()),
        title: Some("Host only remote session".into()),
        workspace: Some(std::path::PathBuf::from("/ws/remote")),
        source_path: std::path::PathBuf::from("/log/host-only-remote.jsonl"),
        started_at: Some(3_100),
        ended_at: Some(3_110),
        metadata: origin_metadata,
        messages: vec![assistant_reply],
    };

    persist_conversation(&storage, &mut lexical_index, &host_only_conversation)
        .expect("host-only remote provenance should be auto-registered as remote");

    // Verify through a freshly opened handle on the same database file.
    let verifier = FrankenStorage::open(&db_file).unwrap();
    let registered_sources = verifier.get_source_ids().unwrap();
    assert_eq!(registered_sources, vec!["builder-4".to_string()]);

    let provenance_sql = "SELECT source_id, origin_host FROM conversations";
    let stored_provenance: Vec<(String, Option<String>)> = verifier
        .raw()
        .query_map_collect(provenance_sql, &[], |row| {
            Ok((row.get_typed(0)?, row.get_typed(1)?))
        })
        .unwrap();
    assert_eq!(
        stored_provenance,
        vec![("builder-4".to_string(), Some("builder-4".to_string()))]
    );
}
fn make_profiled_remote_conversation(
external_id: i64,
msg_count: usize,
) -> NormalizedConversation {
NormalizedConversation {
agent_slug: "codex".into(),
external_id: Some(format!("profiled-remote-{external_id}")),
title: Some(format!("Profiled remote conversation {external_id}")),
workspace: Some(std::path::PathBuf::from("/ws/profiled-remote")),
source_path: std::path::PathBuf::from(format!(
"/log/profiled-remote-{external_id}.jsonl"
)),
started_at: Some(10_000 + external_id * 100),
ended_at: Some(10_000 + external_id * 100 + msg_count as i64),
metadata: serde_json::json!({
"cass": {
"origin": {
"source_id": "profiled-remote-source",
"host": "builder-profile"
}
}
}),
messages: (0..msg_count)
.map(|idx| NormalizedMessage {
idx: idx as i64,
role: if idx % 2 == 0 { "user" } else { "assistant" }.into(),
author: Some("tester".into()),
created_at: Some(20_000 + external_id * 100 + idx as i64),
content: format!(
"profiled remote content ext={external_id} idx={idx} {}",
"x".repeat(64)
),
extra: serde_json::json!({ "idx": idx }),
snippets: vec![],
invocations: Vec::new(),
})
.collect(),
}
}
/// For several (message count, iteration count) scenarios, persists one
/// unprofiled conversation and then `iterations` profiled ones, checking the
/// profile's counters and that the per-stage durations never exceed the total.
///
/// Setting `CASS_PERSIST_STAGE_PROFILE` makes the test also log each summary.
#[test]
fn persist_conversation_stage_profile_tracks_steady_state_remote_reuse() {
    use crate::search::tantivy::TantivyIndex;

    let _defer_guard = set_env("CASS_DEFER_LEXICAL_UPDATES", "0");
    let log_profile = std::env::var_os("CASS_PERSIST_STAGE_PROFILE").is_some();

    let scenarios = [(5usize, 80usize), (20, 50), (50, 24)];
    for &(msg_count, iterations) in scenarios.iter() {
        let dir = tempfile::TempDir::new().unwrap();
        let db_path = dir.path().join(format!("profile-{msg_count}.db"));
        let index_path = dir.path().join(format!("tantivy-{msg_count}"));
        let storage = create_franken_db(&db_path);
        let mut t_index = TantivyIndex::open_or_create(&index_path).unwrap();

        // The first conversation (id 0) is persisted outside the profile.
        let unprofiled = make_profiled_remote_conversation(0, msg_count);
        persist_conversation(&storage, &mut t_index, &unprofiled).unwrap();

        let mut profile = PersistConversationPerfProfile::default();
        for external_id in 1..=iterations {
            let profiled = make_profiled_remote_conversation(external_id as i64, msg_count);
            persist_conversation_with_profile(&storage, &mut t_index, &profiled, &mut profile)
                .unwrap();
        }

        let expected_messages = iterations * msg_count;
        assert_eq!(profile.invocations, iterations);
        assert_eq!(profile.messages, expected_messages);
        assert_eq!(profile.inserted_messages, expected_messages);
        assert!(
            profile.total_duration >= profile.db_duration,
            "db stage cannot exceed total duration"
        );

        let accounted_duration = profile.db_duration
            + profile.packet_duration
            + profile.positional_duration
            + profile.tantivy_add_duration;
        assert!(
            profile.total_duration >= accounted_duration,
            "accounted stage durations cannot exceed total duration"
        );

        if log_profile {
            profile.log_summary(&format!("remote_reuse_{msg_count}_msgs"));
        }
    }
}
/// Two conversations with the same agent and source path and no
/// `external_id` must be reported by `duplicate_conversation_keys_present`,
/// even though their timestamps and messages differ.
#[test]
fn duplicate_conversation_keys_present_for_shared_source_path_without_external_id() {
    use crate::connectors::NormalizedConversation;

    // Everything except the timestamps and the single message is shared.
    let shared_path_conversation =
        |started_at: i64, ended_at: i64, message: NormalizedMessage| NormalizedConversation {
            agent_slug: "shared-agent".into(),
            external_id: None,
            title: Some("Shared Session".into()),
            workspace: Some(std::path::PathBuf::from("/ws/shared")),
            source_path: std::path::PathBuf::from("/log/shared.jsonl"),
            started_at: Some(started_at),
            ended_at: Some(ended_at),
            metadata: serde_json::json!({}),
            messages: vec![message],
        };

    let first_message = NormalizedMessage {
        idx: 0,
        role: "user".into(),
        author: Some("tester".into()),
        created_at: Some(1_000),
        content: "first".into(),
        extra: serde_json::json!({}),
        snippets: vec![],
        invocations: Vec::new(),
    };
    let second_message = NormalizedMessage {
        idx: 1,
        role: "assistant".into(),
        author: Some("tester".into()),
        created_at: Some(1_001),
        content: "second".into(),
        extra: serde_json::json!({}),
        snippets: vec![],
        invocations: Vec::new(),
    };

    let convs = vec![
        shared_path_conversation(1_000, 1_010, first_message),
        shared_path_conversation(9_999, 10_010, second_message),
    ];
    assert!(duplicate_conversation_keys_present(&convs));
}
/// `CASS_INDEXER_BEGIN_CONCURRENT` set to `0` or `false` must leave
/// concurrent writes disabled.
#[test]
fn begin_concurrent_disabled_falls_through_to_default() {
    // Both guards stay alive until the end of the test, so the second
    // override is applied while the first is still in place.
    let _zero_guard = set_env("CASS_INDEXER_BEGIN_CONCURRENT", "0");
    let enabled_with_zero = begin_concurrent_writes_enabled();
    assert!(!enabled_with_zero);

    let _false_guard = set_env("CASS_INDEXER_BEGIN_CONCURRENT", "false");
    let enabled_with_false = begin_concurrent_writes_enabled();
    assert!(!enabled_with_false);
}
/// Builds a three-message conversation whose `idx` values are sparse
/// (0, 5, 7) and checks that `positional_indices_for_inserted` translates
/// `idx` values into positions within the lexical packet: empty input,
/// a single match, unordered input, unmatched values, all values, and
/// duplicated values.
#[test]
fn positional_indices_for_inserted_maps_idx_values_to_packet_positions() {
    use serde_json::Value;

    let message = |idx: i64, role: &str, created_at: i64, content: &str| NormalizedMessage {
        idx,
        role: role.to_string(),
        author: None,
        created_at: Some(created_at),
        content: content.to_string(),
        extra: Value::Null,
        snippets: Vec::new(),
        invocations: Vec::new(),
    };

    let sparse_conv = NormalizedConversation {
        agent_slug: "codex".to_string(),
        external_id: Some("idx-mapping".to_string()),
        title: None,
        workspace: None,
        source_path: std::path::PathBuf::from("/tmp/idx-mapping.jsonl"),
        started_at: Some(1_700_000_000_000),
        ended_at: Some(1_700_000_010_000),
        metadata: Value::Null,
        messages: vec![
            message(0, "user", 1_700_000_000_000, "first"),
            message(5, "assistant", 1_700_000_001_000, "second"),
            message(7, "tool", 1_700_000_002_000, "third"),
        ],
    };

    let packet = lexical_packet_for_persist(&sparse_conv);
    assert_eq!(
        packet.payload.messages.len(),
        3,
        "packet must preserve all conversation messages"
    );

    // No inserted idx values -> no positions.
    let positional_empty = positional_indices_for_inserted(&packet, &[]);
    assert_eq!(positional_empty, Vec::<usize>::new());

    // A single idx maps to the position of the message carrying it.
    let positional_one = positional_indices_for_inserted(&packet, &[5]);
    assert_eq!(
        positional_one,
        vec![1],
        "idx=5 must map to packet position 1, got {positional_one:?}"
    );

    // Input order is irrelevant; output follows packet order.
    let positional_two_unordered = positional_indices_for_inserted(&packet, &[7, 0]);
    assert_eq!(
        positional_two_unordered,
        vec![0, 2],
        "result must be in source order (positions 0, 2) regardless of \
         inserted_indices ordering, got {positional_two_unordered:?}"
    );

    // idx values absent from the packet contribute nothing.
    let positional_unmatched = positional_indices_for_inserted(&packet, &[99, 100]);
    assert_eq!(
        positional_unmatched,
        Vec::<usize>::new(),
        "unmatched idx values must produce an empty positional set"
    );

    let positional_all = positional_indices_for_inserted(&packet, &[0, 5, 7]);
    assert_eq!(positional_all, vec![0, 1, 2]);

    let positional_dupe = positional_indices_for_inserted(&packet, &[5, 5, 5]);
    assert_eq!(
        positional_dupe,
        vec![1],
        "duplicate idx values must collapse to a single position"
    );
}
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::connectors::{
Connector, DetectionResult, NormalizedConversation, NormalizedMessage,
};
use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
use crate::sources::provenance::SourceKind;
use frankensqlite::compat::{ConnectionExt, ParamValue, RowExt};
use fsqlite_types::value::SqliteValue;
use serial_test::serial;
use tempfile::TempDir;
/// An absent value or one made only of whitespace, commas and newlines is
/// inactive; a value containing real paths is active.
#[test]
fn scan_path_exclusions_value_active_handles_commas_and_newlines() {
    let unset_is_active = scan_path_exclusions_value_active(None);
    assert!(!unset_is_active);

    let separators_only_is_active = scan_path_exclusions_value_active(Some(" \n , "));
    assert!(!separators_only_is_active);

    let real_paths_is_active =
        scan_path_exclusions_value_active(Some(" /tmp/active.jsonl ,\n/var/log/cass "));
    assert!(real_paths_is_active);
}
/// After `attach_raw_mirror_capture`, the conversation metadata must carry a
/// `cass.raw_mirror` entry pointing at an on-disk manifest and blob, the blob
/// must equal the source bytes, the manifest's first `db_links` entry must
/// describe the conversation, and the source file must be unchanged.
#[test]
fn raw_mirror_capture_attaches_conversation_metadata_before_persist() {
    let temp = TempDir::new().expect("tempdir");
    let data_dir = temp.path().join("cass-data");
    let source_path = temp.path().join("rollout-test.jsonl");
    let source_bytes = b"{\"type\":\"message\",\"role\":\"user\",\"content\":\"hello\"}\n";
    std::fs::write(&source_path, source_bytes).expect("write source");

    let hello_message = NormalizedMessage {
        idx: 0,
        role: "user".to_string(),
        author: None,
        created_at: Some(1_733_000_000_000),
        content: "hello".to_string(),
        extra: serde_json::json!({}),
        snippets: Vec::new(),
        invocations: Vec::new(),
    };
    let mut conv = NormalizedConversation {
        agent_slug: "codex".to_string(),
        external_id: Some("raw-mirror-metadata".to_string()),
        title: Some("Raw mirror metadata".to_string()),
        workspace: None,
        source_path: source_path.clone(),
        started_at: Some(1_733_000_000_000),
        ended_at: Some(1_733_000_000_100),
        metadata: serde_json::json!({}),
        messages: vec![hello_message],
    };
    inject_provenance(&mut conv, &Origin::local());
    attach_raw_mirror_capture(&data_dir, &mut conv);

    // Metadata attached to the in-memory conversation.
    let raw_mirror = &conv.metadata["cass"]["raw_mirror"];
    let manifest_id = raw_mirror["manifest_id"]
        .as_str()
        .expect("manifest id metadata");
    assert!(manifest_id.starts_with("doctor-raw-mirror-manifest-id-v1-"));
    assert_eq!(
        raw_mirror["blob_size_bytes"].as_u64(),
        Some(source_bytes.len() as u64)
    );
    let manifest_relative = raw_mirror["manifest_relative_path"]
        .as_str()
        .expect("manifest relative path");
    let blob_relative = raw_mirror["blob_relative_path"]
        .as_str()
        .expect("blob relative path");

    // Files referenced by that metadata, resolved under the mirror root.
    let mirror_root = data_dir.join("raw-mirror/v1");
    let manifest_path = mirror_root.join(manifest_relative);
    assert!(manifest_path.exists());

    let blob_bytes = std::fs::read(mirror_root.join(blob_relative)).expect("raw mirror blob");
    assert_eq!(blob_bytes, source_bytes);

    let manifest_bytes = std::fs::read(&manifest_path).expect("raw mirror manifest");
    let manifest: serde_json::Value =
        serde_json::from_slice(&manifest_bytes).expect("manifest json");
    assert_eq!(manifest["db_links"][0]["message_count"].as_u64(), Some(1));
    assert_eq!(
        manifest["db_links"][0]["started_at_ms"].as_i64(),
        Some(1_733_000_000_000)
    );

    // The original source must not have been modified by the capture.
    let source_after = std::fs::read(&source_path).expect("source bytes");
    assert_eq!(source_after, source_bytes);
}
/// A scan root that is itself a file (with unparseable contents) must yield
/// exactly one `captured` manifest for provider `codex` / source `local`,
/// and the source file must be left as written.
#[test]
fn raw_mirror_capture_handles_explicit_file_root_before_parse() {
    let temp = TempDir::new().expect("tempdir");
    let data_dir = temp.path().join("cass-data");
    let source_path = temp.path().join("parse-failure-candidate.jsonl");
    let unparseable_bytes = b"{not valid connector json\n";
    std::fs::write(&source_path, unparseable_bytes).expect("write source");

    let file_root = ScanRoot::local(source_path.clone());
    let active_filter = ActiveSessionSourceFilter::default();
    capture_scan_root_file_before_parse(&data_dir, "codex", &file_root, &active_filter);

    let manifest_dir = data_dir.join("raw-mirror/v1/manifests");
    let manifest_entries = std::fs::read_dir(&manifest_dir)
        .expect("manifest dir")
        .collect::<std::io::Result<Vec<_>>>()
        .expect("manifest entries");
    assert_eq!(manifest_entries.len(), 1);

    let manifest_bytes = std::fs::read(manifest_entries[0].path()).expect("manifest bytes");
    let manifest: serde_json::Value =
        serde_json::from_slice(&manifest_bytes).expect("manifest json");
    assert_eq!(manifest["provider"].as_str(), Some("codex"));
    assert_eq!(manifest["source_id"].as_str(), Some("local"));
    assert_eq!(
        manifest["verification"]["status"].as_str(),
        Some("captured")
    );

    let source_after = std::fs::read(&source_path).expect("source remains untouched");
    assert_eq!(source_after, unparseable_bytes);
}
/// With a `.codex` directory as scan root containing two `rollout-*` files
/// and one `notes.jsonl`, pre-parse capture must produce exactly two
/// manifests (one per rollout file) and leave all three files untouched.
#[test]
fn raw_mirror_capture_expands_codex_directory_root_before_parse() {
    let temp = TempDir::new().expect("tempdir");
    let data_dir = temp.path().join("cass-data");
    let codex_root = temp.path().join(".codex");
    let sessions = codex_root.join("sessions");
    std::fs::create_dir_all(&sessions).expect("sessions dir");

    let first_path = sessions.join("rollout-first.jsonl");
    let second_path = sessions.join("rollout-second.json");
    let ignored_path = sessions.join("notes.jsonl");
    let first_bytes = b"{\"type\":\"event_msg\",\"payload\":{\"type\":\"user_message\",\"message\":\"first\"}}\n";
    let second_bytes =
        b"{\"type\":\"event_msg\",\"payload\":{\"type\":\"user_message\",\"message\":\"second\"}}\n";
    let ignored_bytes = b"{\"not\":\"a rollout file\"}\n";
    std::fs::write(&first_path, first_bytes).expect("first source");
    std::fs::write(&second_path, second_bytes).expect("second source");
    std::fs::write(&ignored_path, ignored_bytes).expect("ignored source");

    let root = ScanRoot::local(codex_root);
    let active_filter = ActiveSessionSourceFilter::default();
    capture_scan_sources_before_parse(&data_dir, "codex", &root, None, &active_filter);

    let manifest_dir = data_dir.join("raw-mirror/v1/manifests");
    let manifest_entries = std::fs::read_dir(&manifest_dir)
        .expect("manifest dir")
        .collect::<std::io::Result<Vec<_>>>()
        .expect("manifest entries");
    assert_eq!(
        manifest_entries.len(),
        2,
        "codex directory preparse capture should mirror rollout files only"
    );

    // Validate each manifest and collect the path it claims to mirror.
    let mut manifest_sources = Vec::with_capacity(manifest_entries.len());
    for entry in &manifest_entries {
        let manifest_bytes = std::fs::read(entry.path()).expect("manifest bytes");
        let manifest: serde_json::Value =
            serde_json::from_slice(&manifest_bytes).expect("manifest json");
        assert_eq!(manifest["provider"].as_str(), Some("codex"));
        assert_eq!(manifest["source_id"].as_str(), Some("local"));
        assert_eq!(
            manifest["verification"]["status"].as_str(),
            Some("captured")
        );
        let original_path = manifest["original_path"]
            .as_str()
            .expect("original path")
            .to_string();
        manifest_sources.push(original_path);
    }
    manifest_sources.sort();
    assert_eq!(
        manifest_sources,
        vec![
            first_path.display().to_string(),
            second_path.display().to_string()
        ]
    );

    // None of the source files may have been altered.
    let first_after = std::fs::read(&first_path).expect("first bytes");
    assert_eq!(first_after, first_bytes);
    let second_after = std::fs::read(&second_path).expect("second bytes");
    assert_eq!(second_after, second_bytes);
    let ignored_after = std::fs::read(&ignored_path).expect("ignored bytes");
    assert_eq!(ignored_after, ignored_bytes);
}
/// Test-only connector: its `Connector` impl returns `sources` from
/// `discover_source_files` and always returns an error from `scan`.
struct FailingDiscoveryConnector {
    /// Discovered source files returned (cloned) by `discover_source_files`.
sources: Vec<crate::connectors::DiscoveredSourceFile>,
}
impl Connector for FailingDiscoveryConnector {
    /// Always reports "not found".
    fn detect(&self) -> DetectionResult {
        DetectionResult::not_found()
    }

    /// Always fails, standing in for a parser that breaks after discovery.
    fn scan(
        &self,
        _ctx: &crate::connectors::ScanContext,
    ) -> anyhow::Result<Vec<NormalizedConversation>> {
        anyhow::bail!("intentional parser failure after source discovery")
    }

    /// Returns a copy of the preconfigured source list.
    fn discover_source_files(
        &self,
        _ctx: &crate::connectors::ScanContext,
    ) -> anyhow::Result<Vec<crate::connectors::DiscoveredSourceFile>> {
        let discovered = self.sources.to_vec();
        Ok(discovered)
    }
}
/// Builds a `DiscoveredSourceFile` for provider `synthetic` under `root`
/// with the given path and role, then applies `with_fs_metadata`.
fn discovered_test_source(
    root: &ScanRoot,
    source_path: std::path::PathBuf,
    role: crate::connectors::DiscoveredSourceRole,
) -> crate::connectors::DiscoveredSourceFile {
    let discovered =
        crate::connectors::DiscoveredSourceFile::new("synthetic", root, source_path, role, true);
    discovered.with_fs_metadata()
}
/// Loads every manifest under `<data_dir>/raw-mirror/v1/manifests` as JSON,
/// sorted by its `original_path` string.
///
/// Returns an empty vector when the manifest directory does not exist.
/// Panics if the directory or any manifest cannot be read or parsed.
fn raw_mirror_manifest_values(data_dir: &Path) -> Vec<serde_json::Value> {
    let manifest_root = data_dir.join("raw-mirror/v1/manifests");
    if !manifest_root.exists() {
        return Vec::new();
    }

    let entries = std::fs::read_dir(&manifest_root)
        .expect("manifest dir")
        .collect::<std::io::Result<Vec<_>>>()
        .expect("manifest entries");

    let mut manifests: Vec<serde_json::Value> = Vec::with_capacity(entries.len());
    for entry in entries {
        let manifest_bytes = std::fs::read(entry.path()).expect("manifest bytes");
        let manifest = serde_json::from_slice::<serde_json::Value>(&manifest_bytes)
            .expect("manifest json");
        manifests.push(manifest);
    }

    manifests.sort_by(|left, right| {
        let left_path = left["original_path"].as_str();
        let right_path = right["original_path"].as_str();
        left_path.cmp(&right_path)
    });
    manifests
}
/// A source reported by `discover_source_files` must be mirrored by the
/// pre-parse capture even though the connector's `scan` subsequently fails;
/// the resulting manifest has no `db_links` and the source is unchanged.
#[test]
fn raw_mirror_capture_uses_discovered_sources_before_parser_failure() {
    let temp = TempDir::new().expect("tempdir");
    let data_dir = temp.path().join("cass-data");
    let provider_root = temp.path().join("provider-root");
    std::fs::create_dir_all(&provider_root).expect("provider root");
    let source_path = provider_root.join("parse-fails.jsonl");
    let source_bytes = b"{this connector will fail after discovery\n";
    std::fs::write(&source_path, source_bytes).expect("write source");

    let root = ScanRoot::local(provider_root);
    let failing_connector = FailingDiscoveryConnector {
        sources: vec![discovered_test_source(
            &root,
            source_path.clone(),
            crate::connectors::DiscoveredSourceRole::PrimarySessionLog,
        )],
    };
    let scan_ctx =
        crate::connectors::ScanContext::with_roots(temp.path().to_path_buf(), vec![root], None);
    let active_filter = ActiveSessionSourceFilter::default();
    capture_connector_sources_before_parse(
        &failing_connector,
        &scan_ctx,
        &data_dir,
        "synthetic",
        &[],
        None,
        &active_filter,
    );

    let scan_outcome = failing_connector.scan(&scan_ctx);
    assert!(
        scan_outcome.is_err(),
        "test connector must fail after discovery to model parser failure"
    );

    let manifest_dir = data_dir.join("raw-mirror/v1/manifests");
    let manifest_entries = std::fs::read_dir(&manifest_dir)
        .expect("manifest dir")
        .collect::<std::io::Result<Vec<_>>>()
        .expect("manifest entries");
    assert_eq!(
        manifest_entries.len(),
        1,
        "discovered source should be captured even when later parsing fails"
    );

    let manifest_bytes = std::fs::read(manifest_entries[0].path()).expect("manifest bytes");
    let manifest: serde_json::Value =
        serde_json::from_slice(&manifest_bytes).expect("manifest json");
    assert_eq!(manifest["provider"].as_str(), Some("synthetic"));
    assert_eq!(manifest["source_id"].as_str(), Some("local"));
    assert_eq!(manifest["db_links"].as_array().map(Vec::len), Some(0));
    assert_eq!(
        manifest["verification"]["status"].as_str(),
        Some("captured")
    );

    let source_after = std::fs::read(&source_path).expect("source remains untouched");
    assert_eq!(source_after, source_bytes);
}
/// Neither a source under a relative scan root nor a relative source path
/// under an absolute root may result in a raw-mirror manifest.
#[test]
fn raw_mirror_capture_rejects_relative_discovered_source_paths() {
    let temp = TempDir::new().expect("tempdir");
    let data_dir = temp.path().join("cass-data");
    let relative_root = ScanRoot::local(std::path::PathBuf::from("relative-provider-root"));
    let absolute_root = ScanRoot::local(temp.path().join("provider-root"));

    // Case 1: relative root with a relative path beneath it.
    let under_relative_root = discovered_test_source(
        &relative_root,
        std::path::PathBuf::from("relative-provider-root/session.jsonl"),
        crate::connectors::DiscoveredSourceRole::PrimarySessionLog,
    );
    // Case 2: absolute root paired with a relative source path.
    let relative_under_absolute_root = discovered_test_source(
        &absolute_root,
        std::path::PathBuf::from("relative-session.jsonl"),
        crate::connectors::DiscoveredSourceRole::PrimarySessionLog,
    );
    let failing_connector = FailingDiscoveryConnector {
        sources: vec![under_relative_root, relative_under_absolute_root],
    };

    let scan_ctx = crate::connectors::ScanContext::with_roots(
        temp.path().to_path_buf(),
        vec![relative_root, absolute_root],
        None,
    );
    let active_filter = ActiveSessionSourceFilter::default();
    capture_connector_sources_before_parse(
        &failing_connector,
        &scan_ctx,
        &data_dir,
        "synthetic",
        &[],
        None,
        &active_filter,
    );

    let published = raw_mirror_manifest_values(&data_dir);
    assert!(
        published.is_empty(),
        "relative discovered roots or source paths must not publish raw-mirror manifests"
    );
}
/// Discovered sources located outside the scan root — one reached through a
/// `..` component, one given as an unrelated absolute path — must not yield
/// manifests, and both files must remain as written.
#[test]
fn raw_mirror_capture_rejects_discovered_sources_that_escape_scan_root() {
    let temp = TempDir::new().expect("tempdir");
    let data_dir = temp.path().join("cass-data");
    let provider_root = temp.path().join("provider-root");
    std::fs::create_dir_all(&provider_root).expect("provider root");

    let outside_dotdot = provider_root.join("../outside-dotdot.jsonl");
    let outside_absolute = temp.path().join("outside-absolute.jsonl");
    let dotdot_bytes = b"escaped through parent component\n";
    let absolute_bytes = b"escaped through absolute path\n";
    std::fs::write(&outside_dotdot, dotdot_bytes).expect("dotdot source");
    std::fs::write(&outside_absolute, absolute_bytes).expect("absolute source");

    let root = ScanRoot::local(provider_root);
    let via_dotdot = discovered_test_source(
        &root,
        outside_dotdot.clone(),
        crate::connectors::DiscoveredSourceRole::PrimarySessionLog,
    );
    let via_absolute = discovered_test_source(
        &root,
        outside_absolute.clone(),
        crate::connectors::DiscoveredSourceRole::MetadataSidecar,
    );
    let failing_connector = FailingDiscoveryConnector {
        sources: vec![via_dotdot, via_absolute],
    };

    let scan_ctx =
        crate::connectors::ScanContext::with_roots(temp.path().to_path_buf(), vec![root], None);
    let active_filter = ActiveSessionSourceFilter::default();
    capture_connector_sources_before_parse(
        &failing_connector,
        &scan_ctx,
        &data_dir,
        "synthetic",
        &[],
        None,
        &active_filter,
    );

    let published = raw_mirror_manifest_values(&data_dir);
    assert!(
        published.is_empty(),
        "escaped discovered sources must not publish raw-mirror manifests"
    );

    let dotdot_after = std::fs::read(&outside_dotdot).expect("dotdot source remains");
    assert_eq!(dotdot_after, dotdot_bytes);
    let absolute_after = std::fs::read(&outside_absolute).expect("absolute source remains");
    assert_eq!(absolute_after, absolute_bytes);
}
/// A discovered source whose file does not exist on disk must not result in
/// any raw-mirror manifest.
#[test]
fn raw_mirror_capture_handles_deleted_after_discovery_source_without_manifest() {
    let temp = TempDir::new().expect("tempdir");
    let data_dir = temp.path().join("cass-data");
    let provider_root = temp.path().join("provider-root");
    std::fs::create_dir_all(&provider_root).expect("provider root");

    // This path is never created on disk.
    let missing_source = provider_root.join("deleted-after-discovery.jsonl");
    let root = ScanRoot::local(provider_root);
    let vanished = discovered_test_source(
        &root,
        missing_source,
        crate::connectors::DiscoveredSourceRole::PrimarySessionLog,
    );
    let failing_connector = FailingDiscoveryConnector {
        sources: vec![vanished],
    };

    let scan_ctx =
        crate::connectors::ScanContext::with_roots(temp.path().to_path_buf(), vec![root], None);
    let active_filter = ActiveSessionSourceFilter::default();
    capture_connector_sources_before_parse(
        &failing_connector,
        &scan_ctx,
        &data_dir,
        "synthetic",
        &[],
        None,
        &active_filter,
    );

    let published = raw_mirror_manifest_values(&data_dir);
    assert!(
        published.is_empty(),
        "missing discovered sources must not publish partial raw-mirror manifests"
    );
}
/// When the number of discovered primary session logs exceeds
/// `PREPARSE_PRIMARY_SOURCE_CAPTURE_LIMIT` by one, pre-parse capture must
/// publish a manifest only for the metadata sidecar.
#[test]
fn raw_mirror_preparse_defers_large_primary_source_sets_but_keeps_sidecars() {
    let temp = TempDir::new().expect("tempdir");
    let data_dir = temp.path().join("cass-data");
    let provider_root = temp.path().join("provider-root");
    std::fs::create_dir_all(&provider_root).expect("provider root");
    let root = ScanRoot::local(provider_root.clone());

    // Inclusive range: one more primary source than the limit.
    let mut sources: Vec<_> = (0..=PREPARSE_PRIMARY_SOURCE_CAPTURE_LIMIT)
        .map(|i| {
            let source_path = provider_root.join(format!("session-{i}.jsonl"));
            std::fs::write(&source_path, format!("primary session bytes {i}\n"))
                .expect("primary source");
            discovered_test_source(
                &root,
                source_path,
                crate::connectors::DiscoveredSourceRole::PrimarySessionLog,
            )
        })
        .collect();

    let sidecar = provider_root.join("metadata.json");
    std::fs::write(&sidecar, b"{\"metadata\":true}\n").expect("sidecar source");
    let sidecar_source = discovered_test_source(
        &root,
        sidecar.clone(),
        crate::connectors::DiscoveredSourceRole::MetadataSidecar,
    );
    sources.push(sidecar_source);

    let failing_connector = FailingDiscoveryConnector { sources };
    let scan_ctx =
        crate::connectors::ScanContext::with_roots(temp.path().to_path_buf(), vec![root], None);
    let active_filter = ActiveSessionSourceFilter::default();
    capture_connector_sources_before_parse(
        &failing_connector,
        &scan_ctx,
        &data_dir,
        "synthetic",
        &[],
        None,
        &active_filter,
    );

    let manifests = raw_mirror_manifest_values(&data_dir);
    assert_eq!(
        manifests.len(),
        1,
        "large primary session-log sets should stream raw-mirror capture per parsed conversation instead of blocking preparse"
    );

    let only_manifest = &manifests[0];
    assert_eq!(only_manifest["provider"].as_str(), Some("synthetic"));
    let sidecar_display = sidecar.display().to_string();
    assert_eq!(
        only_manifest["original_path"].as_str(),
        Some(sidecar_display.as_str())
    );
    assert_eq!(
        only_manifest["verification"]["status"].as_str(),
        Some("captured")
    );
}
/// The same primary source listed twice plus one sidecar must yield exactly
/// two manifests (sidecar and primary), each `captured` with no `db_links`,
/// and both source files must be unchanged.
#[test]
fn raw_mirror_capture_deduplicates_duplicate_discovered_sources_and_keeps_multi_file_set() {
    let temp = TempDir::new().expect("tempdir");
    let data_dir = temp.path().join("cass-data");
    let provider_root = temp.path().join("provider-root");
    std::fs::create_dir_all(&provider_root).expect("provider root");

    let primary = provider_root.join("session.jsonl");
    let sidecar = provider_root.join("metadata.json");
    let primary_bytes = b"primary session bytes\n";
    let sidecar_bytes = b"{\"metadata\":true}\n";
    std::fs::write(&primary, primary_bytes).expect("primary source");
    std::fs::write(&sidecar, sidecar_bytes).expect("sidecar source");

    let root = ScanRoot::local(provider_root);
    let primary_source = discovered_test_source(
        &root,
        primary.clone(),
        crate::connectors::DiscoveredSourceRole::PrimarySessionLog,
    );
    let duplicate_primary_source = primary_source.clone();
    let sidecar_source = discovered_test_source(
        &root,
        sidecar.clone(),
        crate::connectors::DiscoveredSourceRole::MetadataSidecar,
    );
    let failing_connector = FailingDiscoveryConnector {
        sources: vec![duplicate_primary_source, primary_source, sidecar_source],
    };

    let scan_ctx =
        crate::connectors::ScanContext::with_roots(temp.path().to_path_buf(), vec![root], None);
    let active_filter = ActiveSessionSourceFilter::default();
    capture_connector_sources_before_parse(
        &failing_connector,
        &scan_ctx,
        &data_dir,
        "synthetic",
        &[],
        None,
        &active_filter,
    );

    let manifests = raw_mirror_manifest_values(&data_dir);
    assert_eq!(
        manifests.len(),
        2,
        "duplicate discovered sources should collapse to one manifest while multi-file sessions keep each source file"
    );

    // The helper sorts by `original_path`, so the sidecar comes first.
    let original_paths = manifests
        .iter()
        .map(|manifest| manifest["original_path"].as_str().expect("original path"))
        .collect::<Vec<_>>();
    assert_eq!(
        original_paths,
        vec![sidecar.display().to_string(), primary.display().to_string()]
    );

    for manifest in &manifests {
        assert_eq!(manifest["provider"].as_str(), Some("synthetic"));
        assert_eq!(
            manifest["verification"]["status"].as_str(),
            Some("captured")
        );
        assert_eq!(manifest["db_links"].as_array().map(Vec::len), Some(0));
    }

    let primary_after = std::fs::read(&primary).expect("primary remains");
    assert_eq!(primary_after, primary_bytes);
    let sidecar_after = std::fs::read(&sidecar).expect("sidecar remains");
    assert_eq!(sidecar_after, sidecar_bytes);
}
/// A conversation whose `source_path` is a child path of a regular file
/// (`opencode.db/<session id>`) must receive no `cass.raw_mirror` metadata
/// and publish no manifest; the underlying file must be unchanged.
#[test]
fn raw_mirror_capture_skips_logical_non_file_conversation_sources() {
    let temp = TempDir::new().expect("tempdir");
    let data_dir = temp.path().join("cass-data");
    let db_path = temp.path().join("opencode.db");
    let db_bytes = b"not a real sqlite fixture for this test";
    std::fs::write(&db_path, db_bytes).expect("logical db source");

    let hello_message = NormalizedMessage {
        idx: 0,
        role: "user".to_string(),
        author: None,
        created_at: Some(1_733_000_000_000),
        content: "hello".to_string(),
        extra: serde_json::json!({}),
        snippets: Vec::new(),
        invocations: Vec::new(),
    };
    let mut conv = NormalizedConversation {
        agent_slug: "opencode".to_string(),
        external_id: Some("ses_2b0314216ffeBBw5c7WZ41fFMl".to_string()),
        title: Some("OpenCode logical DB row".to_string()),
        workspace: None,
        // Path "inside" a regular file: an identifier, not a real file.
        source_path: db_path.join("ses_2b0314216ffeBBw5c7WZ41fFMl"),
        started_at: Some(1_733_000_000_000),
        ended_at: Some(1_733_000_000_100),
        metadata: serde_json::json!({}),
        messages: vec![hello_message],
    };
    attach_raw_mirror_capture(&data_dir, &mut conv);

    let raw_mirror_entry = conv
        .metadata
        .get("cass")
        .and_then(|cass| cass.get("raw_mirror"));
    assert!(
        raw_mirror_entry.is_none(),
        "logical DB-backed conversation paths must not receive file raw-mirror metadata"
    );

    let published = raw_mirror_manifest_values(&data_dir);
    assert!(
        published.is_empty(),
        "logical non-file conversation paths must not publish failed raw-mirror manifests"
    );

    let db_after = std::fs::read(&db_path).expect("db source remains");
    assert_eq!(db_after, db_bytes);
}
/// `should_skip_raw_mirror_capture_for_logical_source` must return `false`
/// for a symlink to a real file and `true` for a child path of a regular
/// file.
#[cfg(unix)]
#[test]
fn raw_mirror_logical_source_skip_preserves_symlink_validation_path() {
    let temp = TempDir::new().expect("tempdir");

    // Symlinked source: must not be skipped here.
    let real_source = temp.path().join("real.jsonl");
    let symlink_source = temp.path().join("link.jsonl");
    std::fs::write(&real_source, b"real source bytes\n").expect("real source");
    std::os::unix::fs::symlink(&real_source, &symlink_source).expect("source symlink");
    let skips_symlink = should_skip_raw_mirror_capture_for_logical_source(&symlink_source);
    assert!(
        !skips_symlink,
        "symlink source paths should still reach raw-mirror's validator"
    );

    // Child path of a regular file: treated as a logical identifier.
    let db_path = temp.path().join("opencode.db");
    std::fs::write(&db_path, b"not a directory").expect("db source");
    let logical_row_path = db_path.join("session-row-id");
    let skips_logical_row = should_skip_raw_mirror_capture_for_logical_source(&logical_row_path);
    assert!(
        skips_logical_row,
        "logical DB row paths should be treated as non-file source identifiers"
    );
}
/// A discovered source that is a symlink to a real file inside the scan
/// root must not yield a manifest; the link target must be unchanged.
#[cfg(unix)]
#[test]
fn raw_mirror_capture_rejects_discovered_symlink_source_without_manifest() {
    let temp = TempDir::new().expect("tempdir");
    let data_dir = temp.path().join("cass-data");
    let provider_root = temp.path().join("provider-root");
    std::fs::create_dir_all(&provider_root).expect("provider root");

    let real_source = provider_root.join("real.jsonl");
    let symlink_source = provider_root.join("link.jsonl");
    let real_bytes = b"real source bytes\n";
    std::fs::write(&real_source, real_bytes).expect("real source");
    std::os::unix::fs::symlink(&real_source, &symlink_source).expect("source symlink");

    let root = ScanRoot::local(provider_root);
    let linked = discovered_test_source(
        &root,
        symlink_source,
        crate::connectors::DiscoveredSourceRole::PrimarySessionLog,
    );
    let failing_connector = FailingDiscoveryConnector {
        sources: vec![linked],
    };

    let scan_ctx =
        crate::connectors::ScanContext::with_roots(temp.path().to_path_buf(), vec![root], None);
    let active_filter = ActiveSessionSourceFilter::default();
    capture_connector_sources_before_parse(
        &failing_connector,
        &scan_ctx,
        &data_dir,
        "synthetic",
        &[],
        None,
        &active_filter,
    );

    let published = raw_mirror_manifest_values(&data_dir);
    assert!(
        published.is_empty(),
        "symlink discovered sources must not publish raw-mirror manifests"
    );

    let real_after = std::fs::read(&real_source).expect("real source remains");
    assert_eq!(real_after, real_bytes);
}
/// A discovered source reached through a symlinked directory that points
/// outside the scan root must not yield a manifest; the real file must be
/// unchanged.
#[cfg(unix)]
#[test]
fn raw_mirror_capture_rejects_discovered_source_under_symlink_parent() {
    let temp = TempDir::new().expect("tempdir");
    let data_dir = temp.path().join("cass-data");
    let provider_root = temp.path().join("provider-root");
    let outside_root = temp.path().join("outside-root");
    std::fs::create_dir_all(&provider_root).expect("provider root");
    std::fs::create_dir_all(&outside_root).expect("outside root");

    // `provider-root/linkdir` -> `outside-root`; the file lives in the latter.
    let real_source = outside_root.join("escaped.jsonl");
    let symlink_parent = provider_root.join("linkdir");
    let discovered_source = symlink_parent.join("escaped.jsonl");
    let real_bytes = b"escaped through symlink parent\n";
    std::fs::write(&real_source, real_bytes).expect("real source");
    std::os::unix::fs::symlink(&outside_root, &symlink_parent).expect("parent symlink");

    let root = ScanRoot::local(provider_root);
    let through_link = discovered_test_source(
        &root,
        discovered_source,
        crate::connectors::DiscoveredSourceRole::PrimarySessionLog,
    );
    let failing_connector = FailingDiscoveryConnector {
        sources: vec![through_link],
    };

    let scan_ctx =
        crate::connectors::ScanContext::with_roots(temp.path().to_path_buf(), vec![root], None);
    let active_filter = ActiveSessionSourceFilter::default();
    capture_connector_sources_before_parse(
        &failing_connector,
        &scan_ctx,
        &data_dir,
        "synthetic",
        &[],
        None,
        &active_filter,
    );

    let published = raw_mirror_manifest_values(&data_dir);
    assert!(
        published.is_empty(),
        "sources under symlink parents must not publish raw-mirror manifests"
    );

    let real_after = std::fs::read(&real_source).expect("real source remains");
    assert_eq!(real_after, real_bytes);
}
/// Captures a file root before parsing and then attaches raw-mirror capture
/// to a conversation for the same source: there must still be exactly one
/// manifest, now with one `db_links` entry describing the conversation, and
/// the source must be unchanged.
#[test]
fn raw_mirror_capture_enriches_preparse_manifest_after_successful_parse() {
    let temp = TempDir::new().expect("tempdir");
    let data_dir = temp.path().join("cass-data");
    let source_path = temp.path().join("preparse-then-parsed.jsonl");
    let source_bytes = b"{\"type\":\"message\",\"role\":\"user\",\"content\":\"hello\"}\n";
    std::fs::write(&source_path, source_bytes).expect("write source");

    // Step 1: pre-parse capture of the file root.
    let file_root = ScanRoot::local(source_path.clone());
    let active_filter = ActiveSessionSourceFilter::default();
    capture_scan_root_file_before_parse(&data_dir, "codex", &file_root, &active_filter);

    // Step 2: capture attached to the conversation built from that source.
    let hello_message = NormalizedMessage {
        idx: 0,
        role: "user".to_string(),
        author: None,
        created_at: Some(1_733_000_000_000),
        content: "hello".to_string(),
        extra: serde_json::json!({}),
        snippets: Vec::new(),
        invocations: Vec::new(),
    };
    let mut conv = NormalizedConversation {
        agent_slug: "codex".to_string(),
        external_id: Some("raw-mirror-preparse-enrichment".to_string()),
        title: Some("Raw mirror preparse enrichment".to_string()),
        workspace: None,
        source_path: source_path.clone(),
        started_at: Some(1_733_000_000_000),
        ended_at: Some(1_733_000_000_100),
        metadata: serde_json::json!({}),
        messages: vec![hello_message],
    };
    inject_provenance(&mut conv, &Origin::local());
    attach_raw_mirror_capture(&data_dir, &mut conv);

    let manifest_dir = data_dir.join("raw-mirror/v1/manifests");
    let manifest_entries = std::fs::read_dir(&manifest_dir)
        .expect("manifest dir")
        .collect::<std::io::Result<Vec<_>>>()
        .expect("manifest entries");
    assert_eq!(manifest_entries.len(), 1);

    let manifest_bytes = std::fs::read(manifest_entries[0].path()).expect("manifest bytes");
    let manifest: serde_json::Value =
        serde_json::from_slice(&manifest_bytes).expect("manifest json");
    assert_eq!(manifest["db_links"].as_array().map(Vec::len), Some(1));
    assert_eq!(manifest["db_links"][0]["message_count"].as_u64(), Some(1));
    assert_eq!(
        manifest["db_links"][0]["started_at_ms"].as_i64(),
        Some(1_733_000_000_000)
    );

    let source_after = std::fs::read(&source_path).expect("source bytes");
    assert_eq!(source_after, source_bytes);
}
/// Checks that `ingest_batch` writes the persisted conversation id back into
/// the raw-mirror manifest referenced by the conversation metadata.
///
/// After ingesting one conversation, the manifest's `db_links` must contain an
/// entry whose `conversation_id` equals the row id found in the `conversations`
/// table, with the matching message count and start timestamp. The source file
/// must remain byte-identical.
#[test]
fn ingest_batch_records_persisted_raw_mirror_conversation_id_link() {
    const EXTERNAL_ID: &str = "raw-mirror-persisted-link";

    let sandbox = TempDir::new().expect("tempdir");
    let data_dir = sandbox.path().join("cass-data");
    std::fs::create_dir_all(&data_dir).expect("data dir");
    let db_path = data_dir.join("raw-mirror-link.db");
    let storage = FrankenStorage::open(&db_path).expect("storage");
    ensure_fts_schema(&storage);

    let source_path = sandbox.path().join("persisted-link.jsonl");
    let source_bytes = b"{\"type\":\"message\",\"role\":\"user\",\"content\":\"hello\"}\n";
    std::fs::write(&source_path, source_bytes).expect("write source");

    let only_message = NormalizedMessage {
        idx: 0,
        role: "user".to_string(),
        author: None,
        created_at: Some(1_733_000_000_000),
        content: "hello".to_string(),
        extra: serde_json::json!({}),
        snippets: Vec::new(),
        invocations: Vec::new(),
    };
    let mut conversation = NormalizedConversation {
        agent_slug: "codex".to_string(),
        external_id: Some(EXTERNAL_ID.to_string()),
        title: Some("Raw mirror persisted link".to_string()),
        workspace: None,
        source_path: source_path.clone(),
        started_at: Some(1_733_000_000_000),
        ended_at: Some(1_733_000_000_100),
        metadata: serde_json::json!({}),
        messages: vec![only_message],
    };
    inject_provenance(&mut conversation, &Origin::local());
    attach_raw_mirror_capture(&data_dir, &mut conversation);

    // Remember where the capture put its manifest before the conversation is
    // moved into the batch.
    let manifest_relative = conversation.metadata["cass"]["raw_mirror"]["manifest_relative_path"]
        .as_str()
        .expect("manifest relative path")
        .to_string();

    let mutations = ingest_batch(
        &storage,
        None,
        &data_dir,
        &[conversation],
        &None,
        LexicalPopulationStrategy::DeferredAuthoritativeDbRebuild,
        false,
    )
    .expect("ingest batch");
    assert_eq!(mutations.inserted_conversations, 1);
    assert_eq!(mutations.inserted_messages, 1);

    // Look up the row id the database assigned to the conversation.
    let conversation_ids: Vec<i64> = storage
        .raw()
        .query_map_collect(
            "SELECT id FROM conversations WHERE external_id = ?1",
            &[ParamValue::from(EXTERNAL_ID)],
            |row| row.get_typed(0),
        )
        .expect("conversation id query");
    assert_eq!(conversation_ids.len(), 1);
    let conversation_id = conversation_ids[0];

    let manifest_path = data_dir.join("raw-mirror/v1").join(manifest_relative);
    let manifest_bytes = std::fs::read(manifest_path).expect("manifest bytes");
    let manifest: serde_json::Value =
        serde_json::from_slice(&manifest_bytes).expect("manifest json");
    let db_links = manifest["db_links"].as_array().expect("db links");
    let has_persisted_link = db_links.iter().any(|entry| {
        entry["conversation_id"].as_i64() == Some(conversation_id)
            && entry["message_count"].as_u64() == Some(1)
            && entry["started_at_ms"].as_i64() == Some(1_733_000_000_000)
    });
    assert!(
        has_persisted_link,
        "manifest should include persisted conversation_id link: {manifest:#}"
    );

    let source_after = std::fs::read(&source_path).expect("source bytes");
    assert_eq!(source_after, source_bytes);
}
/// Checks that an explicit file root is mirrored before the connector scan runs,
/// so a scan failure still leaves a raw-mirror manifest behind.
///
/// The test points `FAILING_EXPLICIT_FILE_ROOT` at a file, runs a batch index
/// with `failing_explicit_file_root_connector_factory`, and expects: the batch
/// call to succeed with default (empty) mutation counts, exactly one manifest
/// with provider `codex` and verification status `captured`, and an untouched
/// source file.
#[test]
#[serial]
fn batch_index_captures_explicit_file_root_before_failed_scan() {
    /// Clears `FAILING_EXPLICIT_FILE_ROOT` when dropped, so the process-wide
    /// value is reset even if this test panics before its explicit cleanup.
    struct FailingRootReset;
    impl Drop for FailingRootReset {
        fn drop(&mut self) {
            *FAILING_EXPLICIT_FILE_ROOT
                .lock()
                .unwrap_or_else(|e| e.into_inner()) = None;
        }
    }

    let temp = TempDir::new().expect("tempdir");
    let data_dir = temp.path().join("cass-data");
    std::fs::create_dir_all(&data_dir).expect("data dir");
    let source_path = temp.path().join("parse-failure-candidate.jsonl");
    let source_bytes = b"{not valid connector json\n";
    std::fs::write(&source_path, source_bytes).expect("write source");

    // Arm the guard before publishing the path: every exit from here on,
    // including an unwinding panic, resets the shared static.
    let failing_root_reset = FailingRootReset;
    *FAILING_EXPLICIT_FILE_ROOT
        .lock()
        .unwrap_or_else(|e| e.into_inner()) = Some(source_path.clone());

    let db_path = data_dir.join("db.sqlite");
    let storage = FrankenStorage::open(&db_path).expect("storage");
    ensure_fts_schema(&storage);
    let opts = IndexOptions {
        full: true,
        force_rebuild: false,
        watch: false,
        watch_once_paths: None,
        db_path,
        data_dir: data_dir.clone(),
        semantic: false,
        build_hnsw: false,
        embedder: "fastembed".to_string(),
        progress: None,
        watch_interval_secs: 30,
    };
    let mutations = run_batch_index_with_connector_factories(
        &storage,
        None,
        &opts,
        None,
        LexicalPopulationStrategy::DeferredAuthoritativeDbRebuild,
        Vec::new(),
        vec![("codex", failing_explicit_file_root_connector_factory)],
        FrankenStorage::now_millis(),
    )
    .expect("failed scan should not abort batch indexing");
    // Reset at the same point the original test did on the success path.
    drop(failing_root_reset);

    assert_eq!(mutations, CanonicalMutationCounts::default());
    let manifest_root = data_dir.join("raw-mirror/v1/manifests");
    let manifests = std::fs::read_dir(&manifest_root)
        .expect("manifest dir")
        .collect::<std::io::Result<Vec<_>>>()
        .expect("manifest entries");
    assert_eq!(
        manifests.len(),
        1,
        "source should be mirrored before the connector parse failure"
    );
    let manifest: serde_json::Value =
        serde_json::from_slice(&std::fs::read(manifests[0].path()).expect("manifest bytes"))
            .expect("manifest json");
    assert_eq!(manifest["provider"].as_str(), Some("codex"));
    assert_eq!(
        manifest["verification"]["status"].as_str(),
        Some("captured")
    );
    assert_eq!(
        std::fs::read(&source_path).expect("source bytes"),
        source_bytes
    );
}
/// Checks the embedder-id to semantic-tier mapping for the two known ids and
/// confirms an unrecognised id maps to `None`.
#[test]
fn semantic_tier_for_embedder_id_maps_known_ids() {
    let expectations = [
        ("minilm-384", Some(super::SemanticTierKind::Quality)),
        ("fnv1a-384", Some(super::SemanticTierKind::Fast)),
        ("unknown", None),
    ];
    for (embedder_id, expected_tier) in expectations {
        assert_eq!(
            super::semantic_tier_for_embedder_id(embedder_id),
            expected_tier
        );
    }
}
/// Checks that the hash embedder reports the revision `"hash"` and that the
/// MiniLM embedder resolves to a non-empty revision string.
#[test]
fn semantic_model_revision_for_embedder_id_known_ids() {
    let hash_revision = super::semantic_model_revision_for_embedder_id("fnv1a-384");
    assert_eq!(hash_revision, "hash");

    let minilm_revision = super::semantic_model_revision_for_embedder_id("minilm-384");
    assert!(
        !minilm_revision.is_empty(),
        "minilm revision should resolve to ModelManifest::minilm_v2().revision"
    );
}
/// Checks that `staged_field_or_null` passes the value through as JSON when the
/// flag is `true` (including zero) and yields JSON `null` when it is `false`.
#[test]
fn staged_field_or_null_emits_null_outside_rebuild() {
    for staged in [6, 0] {
        assert_eq!(staged_field_or_null(true, staged), serde_json::json!(staged));
        assert_eq!(staged_field_or_null(false, staged), serde_json::Value::Null);
    }
}
/// Restores one environment variable to a remembered state when dropped.
struct EnvGuard {
    /// Name of the environment variable being guarded.
    key: &'static str,
    /// Value to put back on drop; `None` means the variable is removed instead.
    previous: Option<String>,
}

impl Drop for EnvGuard {
    /// Writes `previous` back into the process environment, or removes the
    /// variable when there was no previous value.
    fn drop(&mut self) {
        match self.previous.as_deref() {
            // SAFETY: NOTE(review): presumably callers keep environment-mutating
            // tests from running concurrently — confirm at the call sites.
            Some(value) => unsafe {
                std::env::set_var(self.key, value);
            },
            // SAFETY: same assumption as the branch above.
            None => unsafe {
                std::env::remove_var(self.key);
            },
        }
    }
}
/// Drop guard that clears the `ACTIVE_SESSION_SOURCE_SKIP_OBSERVED` flag when it
/// goes out of scope.
struct ActiveSessionSkipReset;
impl Drop for ActiveSessionSkipReset {
/// Stores `false` into `ACTIVE_SESSION_SOURCE_SKIP_OBSERVED` with relaxed ordering.
fn drop(&mut self) {
ACTIVE_SESSION_SOURCE_SKIP_OBSERVED.store(false, Ordering::Relaxed);
}
}
/// Sets the environment variable `key` to `value` and returns a guard that
/// restores the prior state (as read through `dotenvy::var`) when dropped.
fn set_env_var(key: &'static str, value: impl AsRef<str>) -> EnvGuard {
    // Snapshot the old value first so the guard can undo this write.
    let guard = EnvGuard {
        key,
        previous: dotenvy::var(key).ok(),
    };
    // SAFETY: NOTE(review): presumably callers keep environment-mutating tests
    // from running concurrently — confirm at the call sites.
    unsafe {
        std::env::set_var(key, value.as_ref());
    }
    guard
}
/// Removes the environment variable `key` and returns a guard that restores the
/// prior state (as read through `dotenvy::var`) when dropped.
fn unset_env_var(key: &'static str) -> EnvGuard {
    // Snapshot the old value first so the guard can undo this removal.
    let guard = EnvGuard {
        key,
        previous: dotenvy::var(key).ok(),
    };
    // SAFETY: NOTE(review): presumably callers keep environment-mutating tests
    // from running concurrently — confirm at the call sites.
    unsafe {
        std::env::remove_var(key);
    }
    guard
}
fn ignore_sources_config() -> EnvGuard {
let key = "CASS_IGNORE_SOURCES_CONFIG";
let previous = dotenvy::var(key).ok();
unsafe {
std::env::set_var(key, "1");
}
EnvGuard { key, previous }
}
/// Connector factory that must never be invoked: it panics unconditionally.
///
/// # Panics
///
/// Always panics with a message saying connector factories should not be
/// constructed while filtering by config.
fn never_constructed_connector_factory() -> Box<dyn Connector + Send> {
panic!("test should not construct connector factories while filtering by config");
}
fn set_env(key: &'static str, value: &str) -> EnvGuard {
let previous = dotenvy::var(key).ok();
unsafe {
std::env::set_var(key, value);
}
EnvGuard { key, previous }
}
/// Checks that a freshly written file is only reported as recently modified
/// when the filter was built with a recent-write window.
#[test]
fn active_session_recent_write_detection_is_watch_opt_in() {
    let tmp = TempDir::new().expect("tempdir");
    let source_path = tmp.path().join("active-session.jsonl");
    std::fs::write(&source_path, b"{\"type\":\"session\"}\n").expect("write source");

    // Without a window the filter reports no active writer.
    let without_window = ActiveSessionSourceFilter::with_recent_write_window_for_test(None);
    assert_eq!(without_window.active_writer_reason(&source_path), None);

    // With a one-hour window the file written just above counts as recent.
    let one_hour = Duration::from_secs(3_600);
    let with_window =
        ActiveSessionSourceFilter::with_recent_write_window_for_test(Some(one_hour));
    assert_eq!(
        with_window.active_writer_reason(&source_path),
        Some(ActiveSessionSourceReason::RecentlyModified)
    );
}
/// Checks that `error_is_out_of_memory` accepts the typed OOM error and the
/// exact "out of memory" message, but rejects an unrelated message that only
/// mentions memory in passing.
#[test]
fn out_of_memory_classifier_rejects_contextual_substrings() {
    let typed_oom = anyhow::Error::from(frankensqlite::FrankenError::OutOfMemory);
    assert!(error_is_out_of_memory(&typed_oom));

    let exact_message = anyhow::anyhow!("out of memory");
    assert!(error_is_out_of_memory(&exact_message));

    let incidental_mention =
        anyhow::anyhow!("connector parse failed: not enough memory in record");
    let misclassified = error_is_out_of_memory(&incidental_mention);
    assert!(
        !misclassified,
        "contextual text must not be promoted into permanent OOM quarantine"
    );
}
/// Checks that a poison-quarantine record stores the whole error chain.
///
/// An OOM error is wrapped in two layers of context and recorded through
/// `record_poison_conversation`. The first non-blank line of the quarantine
/// file must be JSON whose `last_error` string mentions both contexts and the
/// innermost cause, and contains the ` | ` separator.
#[test]
fn poison_quarantine_record_preserves_full_error_chain() {
    use crate::connectors::{NormalizedConversation, NormalizedMessage};

    let tmp = TempDir::new().expect("tempdir");
    let data_dir = tmp.path().join("poison-chain");

    let trigger_message = NormalizedMessage {
        idx: 0,
        role: "user".into(),
        author: None,
        created_at: Some(1_700_000_000_005),
        content: "trigger".into(),
        extra: serde_json::json!({}),
        snippets: vec![],
        invocations: Vec::new(),
    };
    let conversation = NormalizedConversation {
        agent_slug: "codex".into(),
        external_id: Some("chain-capture-conv".into()),
        title: Some("chain capture".into()),
        workspace: Some(std::path::PathBuf::from("/ws/chain")),
        source_path: std::path::PathBuf::from("/log/chain.jsonl"),
        started_at: Some(1_700_000_000_000),
        ended_at: Some(1_700_000_000_010),
        metadata: serde_json::json!({}),
        messages: vec![trigger_message],
    };

    // Innermost cause first, then two layers of context on top of it.
    let error: anyhow::Error = anyhow::Error::from(frankensqlite::FrankenError::OutOfMemory)
        .context("vdbe register allocation")
        .context("ingest_batch_with_semantic_delta");

    super::record_poison_conversation(
        &data_dir,
        WATCH_INGEST_POISON_FILE,
        "watch-ingest-out-of-memory",
        &conversation,
        &error,
    )
    .expect("record poison");

    let quarantine_file = data_dir.join("quarantine").join(WATCH_INGEST_POISON_FILE);
    let contents = std::fs::read_to_string(&quarantine_file).expect("read quarantine file");
    let first_record = contents
        .lines()
        .find(|candidate| !candidate.trim().is_empty())
        .expect("at least one record");
    let record: serde_json::Value = serde_json::from_str(first_record).expect("parse record");
    let last_error = record
        .get("last_error")
        .and_then(serde_json::Value::as_str)
        .expect("last_error string");

    assert!(
        last_error.contains("ingest_batch_with_semantic_delta"),
        "last_error must include outer context: {last_error}"
    );
    assert!(
        last_error.contains("vdbe register allocation"),
        "last_error must include intermediate context: {last_error}"
    );
    assert!(
        last_error.contains("out of memory"),
        "last_error must include innermost cause: {last_error}"
    );
    assert!(
        last_error.contains(" | "),
        "last_error must be ` | `-joined chain: {last_error}"
    );
}
/// Checks that two heartbeats over a hand-written lock file keep the identity
/// fields, leave exactly one `updated_at_ms` line whose value has not moved
/// backwards, and leave no `.index-run.lock.tmp.*` files in the directory.
#[test]
fn heartbeat_index_run_lock_refreshes_updated_at_without_touching_identity_fields() {
    let tmp = TempDir::new().unwrap();
    let lock_path = tmp.path().join("index-run.lock");
    let seeded_lock = "pid=123\nstarted_at_ms=111\nupdated_at_ms=222\ndb_path=/tmp/db.sqlite\nmode=index\njob_id=lexical_refresh-123\njob_kind=lexical_refresh\nphase=index\n";
    std::fs::write(&lock_path, seeded_lock).unwrap();

    // Heartbeat twice so a line-appending implementation would be caught below.
    for _ in 0..2 {
        heartbeat_index_run_lock(tmp.path()).unwrap();
    }

    let refreshed = std::fs::read_to_string(&lock_path).unwrap();
    for identity_field in [
        "pid=123",
        "started_at_ms=111",
        "db_path=/tmp/db.sqlite",
        "job_id=lexical_refresh-123",
    ] {
        assert!(refreshed.contains(identity_field));
    }

    let updated_at_raw_values: Vec<&str> = refreshed
        .lines()
        .filter_map(|entry| entry.strip_prefix("updated_at_ms="))
        .collect();
    assert_eq!(
        updated_at_raw_values.len(),
        1,
        "heartbeat refresh should replace the existing updated_at_ms line"
    );
    let parsed_updated_at = updated_at_raw_values
        .first()
        .and_then(|raw| raw.parse::<i64>().ok());
    assert!(
        parsed_updated_at.is_some_and(|millis| millis >= 222),
        "updated_at_ms should be parseable and should not move backwards: {updated_at_raw_values:?}"
    );

    let temp_artifacts: Vec<_> = std::fs::read_dir(tmp.path())
        .unwrap()
        .filter_map(Result::ok)
        .filter_map(|entry| {
            let name = entry.file_name().to_string_lossy().into_owned();
            name.starts_with(".index-run.lock.tmp.").then_some(name)
        })
        .collect();
    assert!(
        temp_artifacts.is_empty(),
        "successful heartbeat refresh should not leave temp files: {temp_artifacts:?}"
    );
}
/// Checks that acquiring the index-run lock writes exactly one
/// `last_progress_at_ms` line, that its value falls between timestamps taken
/// immediately before and after the acquire call, and that
/// `read_search_maintenance_snapshot` reports the same value.
#[test]
fn acquire_index_run_lock_writes_last_progress_at_ms_field() -> Result<()> {
    use crate::search::asset_state::read_search_maintenance_snapshot;

    let tmp = TempDir::new()?;
    let db_path = tmp.path().join("agent_search.db");
    std::fs::write(&db_path, b"placeholder")?;

    // Bracket the acquire call with wall-clock readings.
    let before_ms = crate::storage::sqlite::FrankenStorage::now_millis();
    let guard = acquire_index_run_lock(tmp.path(), &db_path, SearchMaintenanceMode::Index)?;
    let after_ms = crate::storage::sqlite::FrankenStorage::now_millis();

    let raw = std::fs::read_to_string(tmp.path().join("index-run.lock"))?;
    let last_progress_lines: Vec<&str> = raw
        .lines()
        .filter_map(|entry| entry.strip_prefix("last_progress_at_ms="))
        .collect();
    assert_eq!(
        last_progress_lines.len(),
        1,
        "lock file must contain exactly one last_progress_at_ms line; got {raw:?}",
    );

    let raw_value = last_progress_lines
        .first()
        .copied()
        .context("last_progress_at_ms line missing")?;
    let value: i64 = raw_value
        .parse()
        .context("last_progress_at_ms must parse as i64")?;
    assert!(
        before_ms <= value && value <= after_ms,
        "last_progress_at_ms ({value}) must be within [before_ms={before_ms}, after_ms={after_ms}] of the acquire_index_run_lock call",
    );

    let snapshot = read_search_maintenance_snapshot(tmp.path());
    assert_eq!(
        snapshot.last_progress_at_ms,
        Some(value),
        "snapshot reader must surface last_progress_at_ms; #258 stall detector is inert otherwise",
    );

    drop(guard);
    Ok(())
}
/// Checks that two heartbeats over a hand-written lock file leave the
/// `last_progress_at_ms=333` line unchanged while moving `updated_at_ms` past
/// its seeded value of 222.
#[test]
fn heartbeat_preserves_last_progress_at_ms_field_for_stall_detection() {
    let tmp = TempDir::new().unwrap();
    let lock_path = tmp.path().join("index-run.lock");
    let seeded_lock = "pid=123\nstarted_at_ms=111\nupdated_at_ms=222\nlast_progress_at_ms=333\ndb_path=/tmp/db.sqlite\nmode=watch_startup\njob_id=lexical_refresh-123\njob_kind=lexical_refresh\nphase=watch_startup\n";
    std::fs::write(&lock_path, seeded_lock).unwrap();

    for _ in 0..2 {
        heartbeat_index_run_lock(tmp.path()).unwrap();
    }

    let refreshed = std::fs::read_to_string(&lock_path).unwrap();
    let progress_values: Vec<&str> = refreshed
        .lines()
        .filter_map(|entry| entry.strip_prefix("last_progress_at_ms="))
        .collect();
    assert_eq!(
        progress_values.len(),
        1,
        "heartbeat must keep exactly one last_progress_at_ms line, got {refreshed:?}",
    );
    assert_eq!(
        progress_values.first().copied(),
        Some("333"),
        "heartbeat must preserve last_progress_at_ms verbatim — refreshing it would silently disable the #258 stall detector",
    );

    // First `updated_at_ms=` line, parsed; `None` if absent or unparseable.
    let updated_at: Option<i64> = refreshed
        .lines()
        .find_map(|entry| entry.strip_prefix("updated_at_ms="))
        .and_then(|raw| raw.parse().ok());
    assert!(
        updated_at.is_some_and(|millis| millis > 222),
        "updated_at_ms must advance past initial 222, got {refreshed:?}",
    );
}
/// Checks that a progress value stored in the guard's atomic is written into the
/// lock file's `last_progress_at_ms` line by the progress-aware heartbeat.
#[test]
fn heartbeat_folds_indexer_progress_atomic_into_last_progress_at_ms() {
    let tmp = TempDir::new().unwrap();
    let db_path = tmp.path().join("agent_search.db");
    std::fs::write(&db_path, b"placeholder").unwrap();
    let guard = acquire_index_run_lock(tmp.path(), &db_path, SearchMaintenanceMode::Index)
        .expect("acquire index run lock");
    let lock_path = tmp.path().join("index-run.lock");

    // Read the progress value the acquire call wrote, then bump it by a second.
    let initial_contents = std::fs::read_to_string(&lock_path).unwrap();
    let initial_progress = initial_contents
        .lines()
        .find_map(|entry| entry.strip_prefix("last_progress_at_ms="))
        .and_then(|raw| raw.parse::<i64>().ok())
        .expect("initial progress line");
    let bumped_progress = initial_progress.saturating_add(1_000);
    guard
        .last_progress_at_ms_atomic
        .store(bumped_progress, Ordering::Relaxed);

    let progress_snapshot = guard.last_progress_at_ms_atomic.load(Ordering::Relaxed);
    heartbeat_index_run_lock_with_lock_and_progress(
        tmp.path(),
        Some(&guard.metadata_write_lock),
        progress_snapshot,
    )
    .expect("heartbeat should persist indexer-owned progress");

    let refreshed = std::fs::read_to_string(&lock_path).unwrap();
    let expected_line = format!("last_progress_at_ms={bumped_progress}");
    let persisted = refreshed.lines().any(|entry| entry == expected_line);
    assert!(
        persisted,
        "heartbeat must persist the indexer-owned progress bump, got {refreshed:?}"
    );
    drop(guard);
}
/// Checks that `write_index_run_lock_heartbeat_in_place` returns an error
/// mentioning "opening index-run lock heartbeat" when the lock path's parent is
/// a regular file rather than a directory.
#[test]
fn heartbeat_index_run_lock_surfaces_in_place_open_errors() {
    let tmp = TempDir::new().unwrap();

    // A regular file standing where the lock's parent directory should be.
    let blocker = tmp.path().join("not-a-directory");
    std::fs::write(&blocker, b"not a directory").unwrap();
    let lock_path = blocker.join("index-run.lock");

    let err = write_index_run_lock_heartbeat_in_place(&lock_path, "updated_at_ms=123\n")
        .expect_err("non-directory parent should make heartbeat open fail");
    let err_text = format!("{err:#}");
    let mentions_open = err_text.contains("opening index-run lock heartbeat");
    assert!(mentions_open, "unexpected heartbeat error: {err_text}");
}
/// Checks that a heartbeat leaves `index-run.lock` on the same device and inode,
/// i.e. the file is rewritten rather than replaced.
#[cfg(unix)]
#[test]
fn heartbeat_index_run_lock_preserves_lock_file_inode() {
    use std::os::unix::fs::MetadataExt;

    let tmp = TempDir::new().unwrap();
    let db_path = tmp.path().join("agent_search.db");
    std::fs::write(&db_path, b"placeholder").unwrap();
    let guard = acquire_index_run_lock(tmp.path(), &db_path, SearchMaintenanceMode::Index)
        .expect("acquire index run lock");
    let lock_path = tmp.path().join("index-run.lock");

    let before = std::fs::metadata(&lock_path).expect("lock metadata before heartbeat");
    let identity_before = (before.dev(), before.ino());

    heartbeat_index_run_lock_with_lock(tmp.path(), Some(&guard.metadata_write_lock))
        .expect("heartbeat refresh should succeed");

    let after = std::fs::metadata(&lock_path).expect("lock metadata after heartbeat");
    let identity_after = (after.dev(), after.ino());

    assert_eq!(
        identity_before, identity_after,
        "heartbeat refresh must not replace index-run.lock while a flock is held"
    );
    drop(guard);
}
/// Checks that a heartbeat given the guard's metadata write lock does not
/// complete while another holder has that lock.
///
/// The test holds the lock on the main thread, starts a heartbeat on a second
/// thread, waits 50 ms, and asserts the heartbeat has not finished. After the
/// lock is released the thread must join and report completion.
#[test]
fn heartbeat_index_run_lock_waits_for_shared_metadata_write_lock() {
let tmp = TempDir::new().unwrap();
let db_path = tmp.path().join("agent_search.db");
std::fs::write(&db_path, b"placeholder").unwrap();
let guard = acquire_index_run_lock(tmp.path(), &db_path, SearchMaintenanceMode::Index)
.expect("acquire index run lock");
// Take the same lock the heartbeat will be handed, and keep it held.
let metadata_write_lock = Arc::clone(&guard.metadata_write_lock);
let held_lock = metadata_write_lock
.lock()
.expect("hold metadata write lock");
// `started` flips when the worker thread begins; `finished` flips only after
// the heartbeat call has returned.
let started = Arc::new(std::sync::atomic::AtomicBool::new(false));
let finished = Arc::new(std::sync::atomic::AtomicBool::new(false));
let started_flag = Arc::clone(&started);
let finished_flag = Arc::clone(&finished);
let data_dir = tmp.path().to_path_buf();
let metadata_write_lock_for_thread = Arc::clone(&metadata_write_lock);
let handle = thread::spawn(move || {
started_flag.store(true, Ordering::SeqCst);
heartbeat_index_run_lock_with_lock(&data_dir, Some(&metadata_write_lock_for_thread))
.expect("heartbeat refresh with shared lock");
finished_flag.store(true, Ordering::SeqCst);
});
// Spin until the worker is running, then give it 50 ms in which it would
// finish if it did not wait for the lock.
while !started.load(Ordering::SeqCst) {
thread::yield_now();
}
thread::sleep(Duration::from_millis(50));
assert!(
!finished.load(Ordering::SeqCst),
"heartbeat must wait for the shared metadata write lock before rewriting the lock file"
);
// Releasing the lock lets the heartbeat proceed; join confirms it completed.
drop(held_lock);
handle.join().expect("heartbeat thread should join cleanly");
assert!(finished.load(Ordering::SeqCst));
drop(guard);
}
/// Cloneable in-memory byte sink; every clone appends to the same shared buffer.
#[derive(Clone, Default)]
struct LogBuffer(std::sync::Arc<std::sync::Mutex<Vec<u8>>>);

impl std::io::Write for LogBuffer {
    /// Appends all of `buf` to the shared buffer and reports it fully written.
    ///
    /// # Panics
    ///
    /// Panics if the buffer's mutex is poisoned.
    fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> {
        let mut captured = self.0.lock().expect("log buffer lock");
        captured.extend_from_slice(buf);
        Ok(buf.len())
    }

    /// No-op: bytes are stored in memory as soon as they are written.
    fn flush(&mut self) -> std::io::Result<()> {
        Ok(())
    }
}
/// Runs `f` with a temporary default `tracing` subscriber and returns everything
/// that subscriber wrote, decoded lossily as UTF-8.
///
/// The subscriber is configured without ANSI colours or targets and with a
/// maximum level of `INFO`.
fn capture_logs<F: FnOnce()>(f: F) -> String {
    let sink = LogBuffer::default();
    // The subscriber asks for a fresh writer per event; each one is a clone that
    // shares `sink`'s underlying buffer.
    let make_writer = {
        let shared = sink.clone();
        move || shared.clone()
    };
    let subscriber = tracing_subscriber::fmt()
        .with_writer(make_writer)
        .with_ansi(false)
        .with_target(false)
        .with_max_level(tracing::Level::INFO)
        .finish();
    tracing::subscriber::with_default(subscriber, f);

    let captured = sink.0.lock().expect("log buffer lock").clone();
    String::from_utf8_lossy(&captured).into_owned()
}
/// Makes sure an `fts_messages` FTS5 table exists in `storage` and is queryable.
///
/// If `sqlite_master` has no entry named `fts_messages`, the virtual table is
/// created with the porter tokenizer. Either way, the function then asserts that
/// a trivial `SELECT` against the table succeeds.
///
/// # Panics
///
/// Panics if the existence query or the `CREATE VIRTUAL TABLE` fails, or if the
/// final probe query returns an error.
fn ensure_fts_schema(storage: &FrankenStorage) {
// Count catalog entries named `fts_messages`; zero means it must be created.
let count: i64 = storage
.raw()
.query_row_map(
"SELECT COUNT(*) FROM sqlite_master WHERE name = 'fts_messages'",
&[] as &[ParamValue],
|row| row.get_typed(0),
)
.unwrap();
if count == 0 {
storage
.raw()
.execute(
"CREATE VIRTUAL TABLE fts_messages USING fts5(
content,
title,
agent,
workspace,
source_path,
created_at UNINDEXED,
message_id UNINDEXED,
tokenize='porter'
)",
)
.unwrap();
}
// Probe the table so a broken or unreadable FTS table fails the test early.
assert!(
storage
.raw()
.query("SELECT rowid FROM fts_messages LIMIT 1")
.is_ok(),
"fts_messages should remain queryable via frankensqlite in tests"
);
}
/// Builds a user-role `NormalizedMessage` fixture with author `"u"`, content
/// `msg-{idx}`, the given timestamp, and empty extras, snippets and invocations.
fn norm_msg(idx: i64, created_at: i64) -> NormalizedMessage {
    let content = format!("msg-{idx}");
    NormalizedMessage {
        idx,
        role: "user".into(),
        author: Some("u".into()),
        created_at: Some(created_at),
        content,
        extra: serde_json::json!({}),
        snippets: Vec::new(),
        invocations: Vec::new(),
    }
}
/// Builds a `NormalizedConversation` fixture for agent `"tester"` around `msgs`.
///
/// `started_at` and `ended_at` are taken from the `created_at` of the first and
/// last message respectively, and are `None` when `msgs` is empty.
fn norm_conv(
    external_id: Option<&str>,
    msgs: Vec<NormalizedMessage>,
) -> NormalizedConversation {
    // Read the bounds before `msgs` is moved into the struct.
    let started_at = msgs.first().and_then(|first| first.created_at);
    let ended_at = msgs.last().and_then(|last| last.created_at);
    NormalizedConversation {
        agent_slug: "tester".into(),
        external_id: external_id.map(str::to_owned),
        title: Some("Demo".into()),
        workspace: Some(PathBuf::from("/workspace/demo")),
        source_path: PathBuf::from("/logs/demo.jsonl"),
        started_at,
        ended_at,
        metadata: serde_json::json!({}),
        messages: msgs,
    }
}
/// Inserts two `codex` conversations, each with one user and one agent message,
/// into `storage` as a fixture for lexical-rebuild tests.
///
/// # Panics
///
/// Panics if registering the agent or inserting a conversation fails.
fn seed_lexical_rebuild_fixture(storage: &FrankenStorage) {
    let agent_id = storage
        .ensure_agent(&Agent {
            id: None,
            slug: "codex".into(),
            name: "Codex".into(),
            version: Some("0.2.3".into()),
            kind: AgentKind::Cli,
        })
        .unwrap();

    // (external id, base timestamp in ms) for each seeded conversation.
    let fixtures = [
        ("lexical-fixture-1", 1_700_000_000_000_i64),
        ("lexical-fixture-2", 1_700_000_001_000_i64),
    ];
    for (external_id, base_ts) in fixtures {
        let user_turn = Message {
            id: None,
            idx: 0,
            role: MessageRole::User,
            author: Some("user".into()),
            created_at: Some(base_ts + 10),
            content: format!("{external_id}-first"),
            extra_json: serde_json::json!({"opaque": true}),
            snippets: Vec::new(),
        };
        let agent_turn = Message {
            id: None,
            idx: 1,
            role: MessageRole::Agent,
            author: Some("assistant".into()),
            created_at: Some(base_ts + 20),
            content: format!("{external_id}-second"),
            extra_json: serde_json::json!({"opaque": true}),
            snippets: Vec::new(),
        };
        let conversation = Conversation {
            id: None,
            agent_slug: "codex".into(),
            workspace: Some(PathBuf::from("/tmp/workspace")),
            external_id: Some(external_id.to_string()),
            title: Some("Lexical rebuild fixture".into()),
            source_path: PathBuf::from(format!("/tmp/{external_id}.jsonl")),
            started_at: Some(base_ts),
            ended_at: Some(base_ts + 100),
            approx_tokens: Some(64),
            metadata_json: serde_json::Value::Null,
            messages: vec![user_turn, agent_turn],
            source_id: LOCAL_SOURCE_ID.into(),
            origin_host: None,
        };
        storage
            .insert_conversation_tree(agent_id, None, &conversation)
            .unwrap();
    }
}
/// Checks that a packet built from canonical storage rows and a packet built
/// directly from the normalized conversation agree, for a conversation whose
/// origin has a whitespace-only `source_id` and a host name.
///
/// Both packets must have equal `semantic_view()` and `fingerprint_input()`.
/// Their diagnostics are expected to differ: the canonical packet reports
/// `SourceMapLookup` and the normalized packet reports `HostFallback`.
#[test]
fn lexical_rebuild_packet_matches_canonical_and_normalized_semantics_for_host_only_remote() {
let dir = TempDir::new().unwrap();
let db_path = dir.path().join("packet-contract.db");
let storage = FrankenStorage::open(&db_path).unwrap();
let agent = Agent {
id: None,
slug: "codex".into(),
name: "Codex".into(),
version: Some("0.2.3".into()),
kind: AgentKind::Cli,
};
let agent_id = storage.ensure_agent(&agent).unwrap();
// Normalized (connector-side) form of the conversation. The origin metadata
// carries a whitespace-only source id alongside a host.
let raw_conv = NormalizedConversation {
agent_slug: "codex".into(),
external_id: Some("packet-host-only".into()),
title: Some("Packet host-only remote".into()),
workspace: Some(PathBuf::from("/tmp/workspace")),
source_path: PathBuf::from("/tmp/packet-host-only.jsonl"),
started_at: Some(1_700_000_100_000),
ended_at: Some(1_700_000_100_100),
metadata: serde_json::json!({
"cass": {
"origin": {
"source_id": " ",
"host": "builder-packet"
}
}
}),
messages: vec![
NormalizedMessage {
idx: 0,
role: "user".into(),
author: Some("user".into()),
created_at: Some(1_700_000_100_010),
content: "first packet message".into(),
extra: serde_json::json!({}),
snippets: Vec::new(),
invocations: Vec::new(),
},
NormalizedMessage {
idx: 1,
role: "assistant".into(),
author: Some("assistant".into()),
created_at: Some(1_700_000_100_020),
content: "second packet message".into(),
extra: serde_json::json!({}),
snippets: Vec::new(),
invocations: Vec::new(),
},
],
};
// Storage-side form of the same conversation, mirroring the fields above.
let canonical = Conversation {
id: None,
agent_slug: raw_conv.agent_slug.clone(),
workspace: raw_conv.workspace.clone(),
external_id: raw_conv.external_id.clone(),
title: raw_conv.title.clone(),
source_path: raw_conv.source_path.clone(),
started_at: raw_conv.started_at,
ended_at: raw_conv.ended_at,
approx_tokens: Some(64),
metadata_json: raw_conv.metadata.clone(),
messages: vec![
Message {
id: None,
idx: 0,
role: MessageRole::User,
author: Some("user".into()),
created_at: Some(1_700_000_100_010),
content: "first packet message".into(),
extra_json: serde_json::json!({}),
snippets: Vec::new(),
},
Message {
id: None,
idx: 1,
role: MessageRole::Agent,
author: Some("assistant".into()),
created_at: Some(1_700_000_100_020),
content: "second packet message".into(),
extra_json: serde_json::json!({}),
snippets: Vec::new(),
},
],
source_id: " ".into(),
origin_host: Some("builder-packet".into()),
};
let workspace_id = storage
.ensure_workspace(raw_conv.workspace.as_deref().expect("workspace path"), None)
.unwrap();
let inserted = storage
.insert_conversation_tree(agent_id, Some(workspace_id), &canonical)
.unwrap();
// Read the conversation back through the lexical-rebuild replay queries.
let (agent_slugs, workspace_paths) = storage.build_lexical_rebuild_lookups().unwrap();
let replay_row = storage
.list_conversations_for_lexical_rebuild_after_id(10, 0, &agent_slugs, &workspace_paths)
.unwrap()
.into_iter()
.next()
.expect("canonical replay row");
let mut fetched = storage
.fetch_messages_for_lexical_rebuild_batch(&[inserted.conversation_id], None, None)
.unwrap();
let replay_messages = fetched
.remove(&inserted.conversation_id)
.expect("canonical replay messages");
// Source id -> (kind, host label); an error listing sources yields an empty map.
let source_map = storage
.list_sources()
.unwrap_or_default()
.into_iter()
.map(|source| (source.id, (source.kind, source.host_label)))
.collect::<HashMap<_, _>>();
let canonical_packet = LexicalRebuildConversationPacket::from_canonical_replay_messages(
replay_row,
replay_messages,
&source_map,
)
.unwrap();
let normalized_packet =
LexicalRebuildConversationPacket::from_normalized_conversation(&raw_conv);
assert_eq!(
canonical_packet.semantic_view(),
normalized_packet.semantic_view(),
"canonical replay and normalized builders should agree on the hot-path semantics"
);
assert_eq!(
canonical_packet.fingerprint_input(),
normalized_packet.fingerprint_input(),
"canonical replay and normalized builders should agree on the fingerprint input contract"
);
// The two builders are expected to report different provenance modes.
assert_eq!(
canonical_packet.diagnostics.provenance_mode,
LexicalRebuildPacketProvenanceMode::SourceMapLookup
);
assert_eq!(
normalized_packet.diagnostics.provenance_mode,
LexicalRebuildPacketProvenanceMode::HostFallback
);
}
/// Checks that a canonical-replay packet built from a conversation row with no
/// messages reports zero messages and bytes, yields no prebuilt docs, and does
/// not flag a missing conversation id.
#[test]
fn lexical_rebuild_packet_empty_conversation_has_zero_budget_and_no_docs() {
    use crate::storage::sqlite::{
        LexicalRebuildConversationRow, LexicalRebuildGroupedMessageRows,
    };

    let empty_row = LexicalRebuildConversationRow {
        id: Some(77),
        agent_slug: "codex".into(),
        workspace: Some(PathBuf::from("/tmp/workspace")),
        external_id: Some("empty-packet".into()),
        title: Some("Empty packet".into()),
        source_path: PathBuf::from("/tmp/empty-packet.jsonl"),
        started_at: Some(1_700_000_200_000),
        ended_at: Some(1_700_000_200_000),
        source_id: LOCAL_SOURCE_ID.into(),
        origin_host: None,
    };
    let no_messages = LexicalRebuildGroupedMessageRows::new();

    let packet = LexicalRebuildConversationPacket::from_canonical_replay(
        empty_row,
        no_messages,
        None,
        &HashMap::new(),
    );

    assert_eq!(packet.message_count, 0);
    assert_eq!(packet.message_bytes, 0);
    assert!(packet.prebuilt_docs().is_empty());
    assert!(!packet.diagnostics.missing_conversation_id);
}
/// Checks that a canonical-replay packet whose row has no id flags
/// `missing_conversation_id`, still counts its message and bytes, and yields no
/// prebuilt docs.
#[test]
fn lexical_rebuild_packet_missing_conversation_id_is_explicit_and_non_indexable() {
    use crate::storage::sqlite::{
        LexicalRebuildConversationRow, LexicalRebuildGroupedMessageRow,
        LexicalRebuildGroupedMessageRows,
    };

    let body = "missing-id packet body";
    let mut grouped = LexicalRebuildGroupedMessageRows::new();
    grouped.push(LexicalRebuildGroupedMessageRow {
        idx: 0,
        is_tool_role: false,
        created_at: Some(1_700_000_300_010),
        content: body.into(),
    });

    // `id: None` is the condition under test.
    let row_without_id = LexicalRebuildConversationRow {
        id: None,
        agent_slug: "codex".into(),
        workspace: Some(PathBuf::from("/tmp/workspace")),
        external_id: Some("missing-id-packet".into()),
        title: Some("Missing id packet".into()),
        source_path: PathBuf::from("/tmp/missing-id-packet.jsonl"),
        started_at: Some(1_700_000_300_000),
        ended_at: Some(1_700_000_300_100),
        source_id: LOCAL_SOURCE_ID.into(),
        origin_host: None,
    };

    let packet = LexicalRebuildConversationPacket::from_canonical_replay(
        row_without_id,
        grouped,
        None,
        &HashMap::new(),
    );

    assert!(packet.diagnostics.missing_conversation_id);
    assert_eq!(packet.message_count, 1);
    assert_eq!(packet.message_bytes, body.len());
    assert!(packet.prebuilt_docs().is_empty());
}
/// Checks byte accounting and per-doc fields for a packet holding a 32 KiB
/// message without a timestamp followed by a short timestamped message.
///
/// The byte total must equal the sum of both bodies, the supplied last message
/// id must be kept, and the first doc's `created_at` must equal the
/// conversation's `started_at` even though that message had no timestamp.
#[test]
fn lexical_rebuild_packet_large_payload_budget_is_exact_and_preserves_doc_fallbacks() {
    use crate::storage::sqlite::{
        LexicalRebuildConversationRow, LexicalRebuildGroupedMessageRow,
        LexicalRebuildGroupedMessageRows,
    };

    let large_body = "x".repeat(32 * 1024);
    let tail_body = "tail";

    let mut grouped = LexicalRebuildGroupedMessageRows::new();
    grouped.push(LexicalRebuildGroupedMessageRow {
        idx: 0,
        is_tool_role: false,
        created_at: None,
        content: large_body.clone(),
    });
    grouped.push(LexicalRebuildGroupedMessageRow {
        idx: 1,
        is_tool_role: false,
        created_at: Some(1_700_000_400_020),
        content: tail_body.into(),
    });

    let row = LexicalRebuildConversationRow {
        id: Some(88),
        agent_slug: "codex".into(),
        workspace: Some(PathBuf::from("/tmp/workspace")),
        external_id: Some("large-packet".into()),
        title: Some("Large packet".into()),
        source_path: PathBuf::from("/tmp/large-packet.jsonl"),
        started_at: Some(1_700_000_400_000),
        ended_at: Some(1_700_000_400_100),
        source_id: LOCAL_SOURCE_ID.into(),
        origin_host: None,
    };

    let packet = LexicalRebuildConversationPacket::from_canonical_replay(
        row,
        grouped,
        Some(123),
        &HashMap::new(),
    );
    let docs = packet.prebuilt_docs();

    // Packet-level accounting.
    assert_eq!(packet.message_count, 2);
    assert_eq!(packet.message_bytes, large_body.len() + tail_body.len());
    assert_eq!(packet.last_message_id, Some(123));

    // Per-document fields.
    assert_eq!(docs.len(), 2);
    let (first_doc, second_doc) = (&docs[0], &docs[1]);
    assert_eq!(first_doc.conversation_id, Some(88));
    assert_eq!(first_doc.created_at, Some(1_700_000_400_000));
    assert_eq!(first_doc.content.len(), large_body.len());
    assert_eq!(first_doc.source_id, LOCAL_SOURCE_ID);
    assert_eq!(second_doc.created_at, Some(1_700_000_400_020));
}
/// Checks that `prepare_lexical_rebuild_packet_batch` gives the same semantic
/// views with and without a worker pool, and that output follows input order.
///
/// The page has three conversations: one with two messages and an SSH source,
/// one with no messages, and one local conversation with a single tool message.
#[test]
fn prepare_lexical_rebuild_packet_batch_preserves_order_and_parallel_equivalence() {
// Source map with a single SSH source keyed by `builder-prep`.
let source_map = HashMap::from([(
"builder-prep".to_string(),
(SourceKind::Ssh, Some("builder-prep".to_string())),
)]);
let conversation_page = vec![
crate::storage::sqlite::LexicalRebuildConversationRow {
id: Some(11),
agent_slug: "codex".into(),
workspace: Some(PathBuf::from("/tmp/workspace-a")),
external_id: Some("prep-a".into()),
title: Some("Prep A".into()),
source_path: PathBuf::from("/tmp/prep-a.jsonl"),
started_at: Some(1_700_000_500_000),
ended_at: Some(1_700_000_500_100),
source_id: "builder-prep".into(),
origin_host: Some("builder-prep".into()),
},
crate::storage::sqlite::LexicalRebuildConversationRow {
id: Some(22),
agent_slug: "codex".into(),
workspace: Some(PathBuf::from("/tmp/workspace-b")),
external_id: Some("prep-empty".into()),
title: Some("Prep Empty".into()),
source_path: PathBuf::from("/tmp/prep-empty.jsonl"),
started_at: Some(1_700_000_500_200),
ended_at: Some(1_700_000_500_200),
source_id: LOCAL_SOURCE_ID.into(),
origin_host: None,
},
crate::storage::sqlite::LexicalRebuildConversationRow {
id: Some(33),
agent_slug: "codex".into(),
workspace: None,
external_id: Some("prep-c".into()),
title: Some("Prep C".into()),
source_path: PathBuf::from("/tmp/prep-c.jsonl"),
started_at: Some(1_700_000_500_400),
ended_at: Some(1_700_000_500_500),
source_id: LOCAL_SOURCE_ID.into(),
origin_host: None,
},
];
// Messages keyed by conversation id; conversation 22 deliberately has no entry.
let grouped_messages = HashMap::from([
(
11_i64,
vec![
Message {
id: Some(501),
idx: 0,
role: MessageRole::User,
author: Some("user".into()),
created_at: Some(1_700_000_500_010),
content: "prep-a-0".into(),
extra_json: serde_json::json!({}),
snippets: Vec::new(),
},
Message {
id: Some(502),
idx: 1,
role: MessageRole::Agent,
author: Some("assistant".into()),
created_at: Some(1_700_000_500_020),
content: "prep-a-1".into(),
extra_json: serde_json::json!({}),
snippets: Vec::new(),
},
],
),
(
33_i64,
vec![Message {
id: Some(601),
idx: 0,
role: MessageRole::Tool,
author: Some("tool".into()),
created_at: Some(1_700_000_500_410),
content: "prep-c-0".into(),
extra_json: serde_json::json!({}),
snippets: Vec::new(),
}],
),
]);
// First run: no worker pool.
let serial_packets = prepare_lexical_rebuild_packet_batch(
conversation_page.clone(),
grouped_messages.clone(),
&source_map,
None,
)
.unwrap();
// Second run: same inputs with a two-thread rayon pool.
let worker_pool = ThreadPoolBuilder::new().num_threads(2).build().unwrap();
let parallel_packets = prepare_lexical_rebuild_packet_batch(
conversation_page,
grouped_messages,
&source_map,
Some(&worker_pool),
)
.unwrap();
let serial_views = serial_packets
.iter()
.map(LexicalRebuildConversationPacket::semantic_view)
.collect::<Vec<_>>();
let parallel_views = parallel_packets
.iter()
.map(LexicalRebuildConversationPacket::semantic_view)
.collect::<Vec<_>>();
assert_eq!(serial_views, parallel_views);
// Output order must match the order of `conversation_page`.
assert_eq!(
parallel_packets
.iter()
.map(|packet| packet.identity.external_id.as_deref())
.collect::<Vec<_>>(),
vec![Some("prep-a"), Some("prep-empty"), Some("prep-c")]
);
assert_eq!(parallel_packets[0].last_message_id, Some(502));
assert_eq!(parallel_packets[1].message_count, 0);
assert_eq!(parallel_packets[2].provenance.origin_kind, "local");
}
/// Builds the `source id -> (kind, host label)` lookup that the lexical rebuild
/// test helpers pass to `prepare_lexical_rebuild_packet_batch`.
///
/// When `list_sources` returns an error the default value of its listing type
/// is used instead, so the failure is not propagated to the caller.
fn lexical_rebuild_test_source_map(
    storage: &FrankenStorage,
) -> HashMap<String, (SourceKind, Option<String>)> {
    let listed_sources = storage.list_sources().unwrap_or_default();
    let mut source_map = HashMap::new();
    for source in listed_sources {
        // A repeated id overwrites the earlier entry, matching `collect()` semantics.
        source_map.insert(source.id, (source.kind, source.host_label));
    }
    source_map
}
/// Replays every conversation through the legacy offset-paginated listing,
/// preparing one packet per conversation, and returns the packets in page order.
///
/// `page_size` is forwarded unchanged to the storage listing call; paging stops
/// at the first empty page.
///
/// # Errors
/// Propagates storage and packet-preparation failures. Also fails when a row
/// has no conversation id, when a page length cannot be represented as `i64`,
/// or when preparation yields no packet for a conversation.
fn legacy_offset_lexical_rebuild_packets(
    storage: &FrankenStorage,
    page_size: i64,
) -> Result<Vec<LexicalRebuildConversationPacket>> {
    let source_map = lexical_rebuild_test_source_map(storage);
    let (agent_slugs, workspace_paths) = storage.build_lexical_rebuild_lookups()?;
    let mut packets = Vec::new();
    let mut offset = 0_i64;
    loop {
        let conversation_page = storage.list_conversations_for_lexical_rebuild_by_offset(
            page_size,
            offset,
            &agent_slugs,
            &workspace_paths,
        )?;
        if conversation_page.is_empty() {
            break;
        }
        // Falling back to 0 here would leave the offset unchanged and re-read the
        // same page forever, so a failed conversion is reported instead.
        let page_len = i64::try_from(conversation_page.len()).map_err(|_| {
            anyhow::anyhow!("legacy lexical rebuild page length does not fit in i64")
        })?;
        offset = offset.saturating_add(page_len);
        for conversation in conversation_page {
            let conversation_id = conversation.id.ok_or_else(|| {
                anyhow::anyhow!("legacy lexical rebuild row missing conversation id")
            })?;
            // The legacy path fetches messages for exactly one conversation at a time.
            let grouped_messages = HashMap::from([(
                conversation_id,
                storage.fetch_messages_for_lexical_rebuild(conversation_id)?,
            )]);
            let mut prepared = prepare_lexical_rebuild_packet_batch(
                vec![conversation],
                grouped_messages,
                &source_map,
                None,
            )?;
            let packet = prepared.pop().ok_or_else(|| {
                anyhow::anyhow!("single-conversation legacy packet should be prepared")
            })?;
            packets.push(packet);
        }
    }
    Ok(packets)
}
/// Replays every conversation through the keyset-paginated listing, fetching
/// messages for up to `batch_limit` conversations per batch call, and returns
/// the prepared packets in page order.
///
/// `batch_limit` is clamped to at least 1. The last row of each page supplies
/// the cursor for the next page, so a page whose last row has no id is an error.
fn keyset_batched_lexical_rebuild_packets(
    storage: &FrankenStorage,
    page_size: i64,
    batch_limit: usize,
) -> Result<Vec<LexicalRebuildConversationPacket>> {
    let source_map = lexical_rebuild_test_source_map(storage);
    let (agent_slugs, workspace_paths) = storage.build_lexical_rebuild_lookups()?;
    let chunk_size = batch_limit.max(1);
    let mut collected = Vec::new();
    let mut cursor_id = 0_i64;
    loop {
        let page = storage.list_conversations_for_lexical_rebuild_after_id(
            page_size,
            cursor_id,
            &agent_slugs,
            &workspace_paths,
        )?;
        if page.is_empty() {
            return Ok(collected);
        }
        let terminal_id = page.last().and_then(|row| row.id);
        cursor_id = terminal_id.ok_or_else(|| {
            anyhow::anyhow!("keyset lexical rebuild page missing terminal id")
        })?;
        for rows in page.chunks(chunk_size) {
            // Rows without an id contribute no entry to the message fetch.
            let ids = rows.iter().filter_map(|row| row.id).collect::<Vec<_>>();
            let grouped_messages =
                storage.fetch_messages_for_lexical_rebuild_batch(&ids, None, None)?;
            let prepared = prepare_lexical_rebuild_packet_batch(
                rows.to_vec(),
                grouped_messages,
                &source_map,
                None,
            )?;
            collected.extend(prepared);
        }
    }
}
/// Summary used by the tests to compare two lexical rebuild replays for equality.
#[derive(Debug, Clone, PartialEq, Eq)]
struct LexicalRebuildEquivalenceEvidence {
/// Total number of prebuilt documents across all packets.
document_count: usize,
/// Digest of the JSON artifact produced by `lexical_rebuild_packet_artifact`.
manifest_fingerprint: String,
/// Digest produced by `lexical_rebuild_golden_query_digest`.
golden_query_digest: String,
}
/// Renders `packets` as a JSON object with two arrays:
///
/// * `packets`: one row per packet (identity, provenance, counters, messages);
/// * `documents`: one row per prebuilt document, in packet order.
fn lexical_rebuild_packet_artifact(
    packets: &[LexicalRebuildConversationPacket],
) -> serde_json::Value {
    let mut packet_rows = Vec::with_capacity(packets.len());
    let mut document_rows = Vec::new();
    for packet in packets {
        let message_rows = packet
            .messages
            .iter()
            .map(|message| {
                serde_json::json!({
                    "idx": message.idx,
                    "is_tool_role": message.is_tool_role,
                    "created_at": message.created_at,
                    "content": &message.content,
                })
            })
            .collect::<Vec<_>>();
        packet_rows.push(serde_json::json!({
            "conversation_id": packet.identity.conversation_id,
            "external_id": packet.identity.external_id.as_deref(),
            "agent": &packet.identity.agent,
            "workspace": packet.identity.workspace.as_deref(),
            "source_path": &packet.identity.source_path,
            "source_id": &packet.provenance.source_id,
            "origin_kind": &packet.provenance.origin_kind,
            "origin_host": packet.provenance.origin_host.as_deref(),
            "message_count": packet.message_count,
            "message_bytes": packet.message_bytes,
            "last_message_id": packet.last_message_id,
            "messages": message_rows,
        }));
        for doc in packet.prebuilt_docs() {
            document_rows.push(serde_json::json!({
                "conversation_id": doc.conversation_id,
                "agent": doc.agent,
                "workspace": doc.workspace,
                "source_path": doc.source_path,
                "msg_idx": doc.msg_idx,
                "created_at": doc.created_at,
                "title": doc.title,
                "content": doc.content,
                "source_id": doc.source_id,
                "origin_kind": doc.origin_kind,
                "origin_host": doc.origin_host,
            }));
        }
    }
    serde_json::json!({
        "packets": packet_rows,
        "documents": document_rows,
    })
}
/// Serializes `value` to JSON bytes and returns the hex-encoded BLAKE3 hash of
/// those bytes.
///
/// Panics if the value cannot be serialized.
fn stable_json_digest(value: &serde_json::Value) -> String {
    let serialized = serde_json::to_vec(value).expect("equivalence artifact should serialize");
    let digest = blake3::hash(&serialized);
    digest.to_hex().to_string()
}
/// Digests the substring hits of a fixed set of golden queries over every
/// prebuilt document of `packets`.
///
/// A document is a hit when the query occurs in its content, title, workspace,
/// or source path. Hits are recorded query-by-query, then in packet and
/// document order, and hashed together with the query list.
fn lexical_rebuild_golden_query_digest(packets: &[LexicalRebuildConversationPacket]) -> String {
    let queries = [
        "lexical-fixture-1",
        "lexical-fixture-2-second",
        "missing-golden-query",
    ];
    let mut hits = Vec::new();
    for query in &queries {
        for packet in packets {
            for doc in packet.prebuilt_docs() {
                // A missing title or workspace is searched as the empty string.
                let title = doc.title.unwrap_or("");
                let workspace = doc.workspace.unwrap_or("");
                let matched = doc.content.contains(query)
                    || title.contains(query)
                    || workspace.contains(query)
                    || doc.source_path.contains(query);
                if !matched {
                    continue;
                }
                hits.push(serde_json::json!({
                    "query": query,
                    "conversation_id": doc.conversation_id,
                    "agent": doc.agent,
                    "source_path": doc.source_path,
                    "msg_idx": doc.msg_idx,
                    "created_at": doc.created_at,
                    "content": doc.content,
                }));
            }
        }
    }
    stable_json_digest(&serde_json::json!({
        "queries": queries,
        "hits": hits,
    }))
}
/// Condenses `packets` into the evidence record compared by the equivalence
/// test: total prebuilt document count, artifact digest, and golden-query digest.
fn lexical_rebuild_equivalence_evidence(
    packets: &[LexicalRebuildConversationPacket],
) -> LexicalRebuildEquivalenceEvidence {
    let manifest_fingerprint = stable_json_digest(&lexical_rebuild_packet_artifact(packets));
    let document_count = packets
        .iter()
        .fold(0usize, |total, packet| total + packet.prebuilt_docs().len());
    let golden_query_digest = lexical_rebuild_golden_query_digest(packets);
    LexicalRebuildEquivalenceEvidence {
        document_count,
        manifest_fingerprint,
        golden_query_digest,
    }
}
/// The keyset-batched replay must produce the same packets and the same
/// equivalence evidence as the legacy offset replay over the seeded fixture.
#[test]
fn keyset_batched_lexical_rebuild_matches_legacy_offset_replay_evidence() {
    let tmp = TempDir::new().unwrap();
    let db_path = tmp.path().join("lexical-equivalence.db");
    let storage = FrankenStorage::open(&db_path).unwrap();
    ensure_fts_schema(&storage);
    seed_lexical_rebuild_fixture(&storage);

    let legacy_packets = legacy_offset_lexical_rebuild_packets(&storage, 1).unwrap();
    let keyset_packets = keyset_batched_lexical_rebuild_packets(&storage, 2, 2).unwrap();

    let legacy_evidence = lexical_rebuild_equivalence_evidence(&legacy_packets);
    let keyset_evidence = lexical_rebuild_equivalence_evidence(&keyset_packets);

    // Pretty-printed artifacts are only used to make assertion failures readable.
    let legacy_json = lexical_rebuild_packet_artifact(&legacy_packets);
    let keyset_json = lexical_rebuild_packet_artifact(&keyset_packets);
    let legacy_artifact = serde_json::to_string_pretty(&legacy_json).unwrap();
    let keyset_artifact = serde_json::to_string_pretty(&keyset_json).unwrap();

    let legacy_views = legacy_packets
        .iter()
        .map(LexicalRebuildConversationPacket::semantic_view)
        .collect::<Vec<_>>();
    let keyset_views = keyset_packets
        .iter()
        .map(LexicalRebuildConversationPacket::semantic_view)
        .collect::<Vec<_>>();

    assert_eq!(
        legacy_views, keyset_views,
        "legacy offset replay and keyset batched replay diverged\nlegacy_artifact={legacy_artifact}\nkeyset_artifact={keyset_artifact}"
    );
    assert_eq!(
        legacy_evidence, keyset_evidence,
        "equivalence evidence diverged\nlegacy_artifact={legacy_artifact}\nkeyset_artifact={keyset_artifact}"
    );
    assert_eq!(keyset_evidence.document_count, 4);
}
/// Distributing a 17-byte reservation over three packets (10 bytes, empty,
/// 30 bytes) must sum to exactly 17, give the empty packet nothing, and give
/// the largest packet at least as much as the smaller one.
#[test]
fn assign_lexical_rebuild_flow_reservation_bytes_preserves_exact_total() {
    use crate::storage::sqlite::{
        LexicalRebuildConversationRow, LexicalRebuildGroupedMessageRow,
        LexicalRebuildGroupedMessageRows,
    };

    // Builds a packet holding a single message, or no message when `content` is empty.
    let packet_with_content = |conversation_id: i64, content: &str| {
        let conversation = LexicalRebuildConversationRow {
            id: Some(conversation_id),
            agent_slug: "codex".into(),
            workspace: Some(PathBuf::from("/tmp/workspace")),
            external_id: Some(format!("reservation-{conversation_id}")),
            title: Some("Reservation packet".into()),
            source_path: PathBuf::from(format!("/tmp/reservation-{conversation_id}.jsonl")),
            started_at: Some(1_700_000_600_000),
            ended_at: Some(1_700_000_600_100),
            source_id: LOCAL_SOURCE_ID.into(),
            origin_host: None,
        };
        let messages = if content.is_empty() {
            LexicalRebuildGroupedMessageRows::new()
        } else {
            std::iter::once(LexicalRebuildGroupedMessageRow {
                idx: 0,
                is_tool_role: false,
                created_at: Some(1_700_000_600_000 + conversation_id),
                content: content.to_string(),
            })
            .collect::<LexicalRebuildGroupedMessageRows>()
        };
        LexicalRebuildConversationPacket::from_canonical_replay(
            conversation,
            messages,
            Some(conversation_id),
            &HashMap::new(),
        )
    };

    let mut packets = vec![
        packet_with_content(1, "abcdefghij"),
        packet_with_content(2, ""),
        packet_with_content(3, "0123456789abcdefghij0123456789"),
    ];
    assign_lexical_rebuild_flow_reservation_bytes(&mut packets, 17);

    let reserved_total: usize = packets
        .iter()
        .map(|packet| packet.flow_reservation_bytes)
        .sum();
    assert_eq!(reserved_total, 17);
    assert_eq!(packets[1].flow_reservation_bytes, 0);
    assert!(packets[2].flow_reservation_bytes >= packets[0].flow_reservation_bytes);
}
/// A flow reservation guard must return its bytes to the limiter when dropped,
/// and calling `release_now` more than once (followed by the drop) must not
/// release anything twice.
#[test]
fn streaming_batch_flow_reservation_releases_on_drop_and_is_idempotent() {
    let limiter = StreamingByteLimiter::new(16);

    // Case 1: the bytes stay in flight while the guard lives and are freed on drop.
    let dropped_bytes = limiter.acquire(7).unwrap();
    let drop_guard = StreamingBatchFlowReservation::new(Some(&limiter), dropped_bytes);
    assert_eq!(limiter.bytes_in_flight(), dropped_bytes);
    drop(drop_guard);
    assert_eq!(limiter.bytes_in_flight(), 0);

    // Case 2: two explicit releases plus the drop still leave the limiter at zero.
    let released_bytes = limiter.acquire(5).unwrap();
    let mut release_guard = StreamingBatchFlowReservation::new(Some(&limiter), released_bytes);
    release_guard.release_now();
    release_guard.release_now();
    drop(release_guard);
    assert_eq!(limiter.bytes_in_flight(), 0);
}
// Flushing a pending batch of two single-message packets must return every
// reserved byte to the limiter, empty the pending batch, and add the flushed
// documents, messages, and bytes to the running counters.
#[test]
fn flush_streamed_lexical_rebuild_batch_releases_flow_reservation_bytes() {
let tmp = TempDir::new().unwrap();
let index_path = tmp.path().join("tantivy");
let mut t_index = TantivyIndex::open_or_create(&index_path).unwrap();
// Two packets, each carrying exactly one non-tool message.
let mut pending_batch = vec![
LexicalRebuildConversationPacket::from_canonical_replay(
crate::storage::sqlite::LexicalRebuildConversationRow {
id: Some(101),
agent_slug: "codex".into(),
workspace: Some(PathBuf::from("/tmp/workspace")),
external_id: Some("flush-a".into()),
title: Some("Flush A".into()),
source_path: PathBuf::from("/tmp/flush-a.jsonl"),
started_at: Some(1_700_000_700_000),
ended_at: Some(1_700_000_700_100),
source_id: LOCAL_SOURCE_ID.into(),
origin_host: None,
},
vec![crate::storage::sqlite::LexicalRebuildGroupedMessageRow {
idx: 0,
is_tool_role: false,
created_at: Some(1_700_000_700_010),
content: "alpha".into(),
}]
.into_iter()
.collect::<crate::storage::sqlite::LexicalRebuildGroupedMessageRows>(),
Some(501),
&HashMap::new(),
),
LexicalRebuildConversationPacket::from_canonical_replay(
crate::storage::sqlite::LexicalRebuildConversationRow {
id: Some(102),
agent_slug: "codex".into(),
workspace: Some(PathBuf::from("/tmp/workspace")),
external_id: Some("flush-b".into()),
title: Some("Flush B".into()),
source_path: PathBuf::from("/tmp/flush-b.jsonl"),
started_at: Some(1_700_000_700_200),
ended_at: Some(1_700_000_700_300),
source_id: LOCAL_SOURCE_ID.into(),
origin_host: None,
},
vec![crate::storage::sqlite::LexicalRebuildGroupedMessageRow {
idx: 0,
is_tool_role: false,
created_at: Some(1_700_000_700_210),
content: "beta".into(),
}]
.into_iter()
.collect::<crate::storage::sqlite::LexicalRebuildGroupedMessageRows>(),
Some(502),
&HashMap::new(),
),
];
// Totals are captured before the flush so the counters can be compared afterwards.
let total_messages = pending_batch
.iter()
.map(|packet| packet.message_count)
.sum::<usize>();
let total_message_bytes = pending_batch
.iter()
.map(|packet| packet.message_bytes)
.sum::<usize>();
// `.max(1)` keeps the limiter's constructor argument non-zero.
let limiter = StreamingByteLimiter::new(total_message_bytes.max(1));
let reserved = limiter.acquire(total_message_bytes).unwrap();
// Spread the acquired reservation across the pending packets.
assign_lexical_rebuild_flow_reservation_bytes(&mut pending_batch, reserved);
assert_eq!(limiter.bytes_in_flight(), reserved);
let mut pending_batch_message_count = total_messages;
let mut pending_batch_message_bytes = total_message_bytes;
let mut indexed_docs = 0usize;
let mut messages_since_commit = 0usize;
let mut message_bytes_since_commit = 0usize;
let mut current_batch_conversation_limit = 8usize;
// NOTE(review): the meaning of the two `None` arguments and the two trailing
// `8` values is defined by the callee's signature, which is not visible here.
flush_streamed_lexical_rebuild_batch(
&mut pending_batch,
&mut pending_batch_message_count,
&mut pending_batch_message_bytes,
Some(&limiter),
None,
&mut t_index,
&mut indexed_docs,
&mut messages_since_commit,
&mut message_bytes_since_commit,
&mut current_batch_conversation_limit,
8,
8,
None,
)
.unwrap();
// After the flush nothing may remain reserved or pending.
assert_eq!(limiter.bytes_in_flight(), 0);
assert!(pending_batch.is_empty());
assert_eq!(indexed_docs, 2);
assert_eq!(messages_since_commit, total_messages);
assert_eq!(message_bytes_since_commit, total_message_bytes);
}
// A planned shard boundary must flush the pending batch even though it holds
// only two packets while the active batch conversation limit is three, and the
// flush must leave that limit unchanged.
#[test]
fn flush_streamed_lexical_rebuild_batch_for_planned_shard_boundary_flushes_even_below_batch_limit()
{
let tmp = TempDir::new().unwrap();
let index_path = tmp.path().join("tantivy");
let mut t_index = TantivyIndex::open_or_create(&index_path).unwrap();
// Two packets, each carrying exactly one non-tool message.
let mut pending_batch = vec![
LexicalRebuildConversationPacket::from_canonical_replay(
crate::storage::sqlite::LexicalRebuildConversationRow {
id: Some(201),
agent_slug: "codex".into(),
workspace: Some(PathBuf::from("/tmp/workspace")),
external_id: Some("boundary-a".into()),
title: Some("Boundary A".into()),
source_path: PathBuf::from("/tmp/boundary-a.jsonl"),
started_at: Some(1_700_000_800_000),
ended_at: Some(1_700_000_800_100),
source_id: LOCAL_SOURCE_ID.into(),
origin_host: None,
},
vec![crate::storage::sqlite::LexicalRebuildGroupedMessageRow {
idx: 0,
is_tool_role: false,
created_at: Some(1_700_000_800_010),
content: "alpha".into(),
}]
.into_iter()
.collect::<crate::storage::sqlite::LexicalRebuildGroupedMessageRows>(),
Some(601),
&HashMap::new(),
),
LexicalRebuildConversationPacket::from_canonical_replay(
crate::storage::sqlite::LexicalRebuildConversationRow {
id: Some(202),
agent_slug: "codex".into(),
workspace: Some(PathBuf::from("/tmp/workspace")),
external_id: Some("boundary-b".into()),
title: Some("Boundary B".into()),
source_path: PathBuf::from("/tmp/boundary-b.jsonl"),
started_at: Some(1_700_000_800_200),
ended_at: Some(1_700_000_800_300),
source_id: LOCAL_SOURCE_ID.into(),
origin_host: None,
},
vec![crate::storage::sqlite::LexicalRebuildGroupedMessageRow {
idx: 0,
is_tool_role: false,
created_at: Some(1_700_000_800_210),
content: "beta".into(),
}]
.into_iter()
.collect::<crate::storage::sqlite::LexicalRebuildGroupedMessageRows>(),
Some(602),
&HashMap::new(),
),
];
// Totals are captured before the flush so the counters can be compared afterwards.
let total_messages = pending_batch
.iter()
.map(|packet| packet.message_count)
.sum::<usize>();
let total_message_bytes = pending_batch
.iter()
.map(|packet| packet.message_bytes)
.sum::<usize>();
// `.max(1)` keeps the limiter's constructor argument non-zero.
let limiter = StreamingByteLimiter::new(total_message_bytes.max(1));
let reserved = limiter.acquire(total_message_bytes).unwrap();
assign_lexical_rebuild_flow_reservation_bytes(&mut pending_batch, reserved);
let mut pending_batch_message_count = total_messages;
let mut pending_batch_message_bytes = total_message_bytes;
let mut indexed_docs = 0usize;
let mut messages_since_commit = 0usize;
let mut message_bytes_since_commit = 0usize;
// The limit (3) is above the pending packet count (2), so a flush here cannot
// be explained by the batch being full.
let mut current_batch_conversation_limit = 3usize;
// NOTE(review): the leading `Some(4)` and `true` arguments presumably describe
// the planned shard boundary; confirm against the callee's signature.
let flushed = flush_streamed_lexical_rebuild_batch_for_planned_shard_boundary(
Some(4),
true,
&mut pending_batch,
&mut pending_batch_message_count,
&mut pending_batch_message_bytes,
Some(&limiter),
None,
&mut t_index,
&mut indexed_docs,
&mut messages_since_commit,
&mut message_bytes_since_commit,
&mut current_batch_conversation_limit,
8,
8,
None,
)
.unwrap();
assert!(
flushed,
"planned shard boundary should flush below the batch limit"
);
// The flush must release every reserved byte and drain the pending batch.
assert_eq!(limiter.bytes_in_flight(), 0);
assert!(pending_batch.is_empty());
assert_eq!(indexed_docs, 2);
assert_eq!(messages_since_commit, total_messages);
assert_eq!(message_bytes_since_commit, total_message_bytes);
assert_eq!(
current_batch_conversation_limit, 3,
"shard-boundary flush should not silently promote the active batch limit"
);
}
/// Two shard indices of four documents each must merge into one index that
/// holds all eight documents in a single segment.
#[test]
fn lexical_rebuild_shard_indices_can_be_built_and_merged_from_packet_batches() {
    use crate::storage::sqlite::{
        LexicalRebuildConversationRow, LexicalRebuildGroupedMessageRow,
        LexicalRebuildGroupedMessageRows,
    };

    let tmp = TempDir::new().unwrap();
    let shard_a_path = tmp.path().join("shard-a");
    let shard_b_path = tmp.path().join("shard-b");
    let merged_path = tmp.path().join("merged");

    // Builds a packet with two non-tool messages for the given conversation.
    let two_message_packet =
        |conversation_id: i64, external_id: &str, message_a: &str, message_b: &str| {
            let conversation = LexicalRebuildConversationRow {
                id: Some(conversation_id),
                agent_slug: "codex".into(),
                workspace: Some(PathBuf::from("/tmp/workspace")),
                external_id: Some(external_id.into()),
                title: Some(format!("Shard {external_id}")),
                source_path: PathBuf::from(format!("/tmp/{external_id}.jsonl")),
                started_at: Some(1_700_000_900_000 + conversation_id),
                ended_at: Some(1_700_000_900_100 + conversation_id),
                source_id: LOCAL_SOURCE_ID.into(),
                origin_host: None,
            };
            let messages = vec![
                LexicalRebuildGroupedMessageRow {
                    idx: 0,
                    is_tool_role: false,
                    created_at: Some(1_700_000_900_010 + conversation_id),
                    content: message_a.into(),
                },
                LexicalRebuildGroupedMessageRow {
                    idx: 1,
                    is_tool_role: false,
                    created_at: Some(1_700_000_900_020 + conversation_id),
                    content: message_b.into(),
                },
            ]
            .into_iter()
            .collect::<LexicalRebuildGroupedMessageRows>();
            LexicalRebuildConversationPacket::from_canonical_replay(
                conversation,
                messages,
                Some(conversation_id * 10),
                &HashMap::new(),
            )
        };

    let shard_a = vec![
        two_message_packet(1, "segment-a1", "alpha-1", "alpha-2"),
        two_message_packet(2, "segment-a2", "alpha-3", "alpha-4"),
    ];
    let shard_b = vec![
        two_message_packet(3, "segment-b1", "beta-1", "beta-2"),
        two_message_packet(4, "segment-b2", "beta-3", "beta-4"),
    ];

    // Each shard holds two conversations with two messages, hence four documents.
    for (shard_path, shard_packets) in [(&shard_a_path, &shard_a), (&shard_b_path, &shard_b)] {
        let built_docs =
            build_lexical_rebuild_shard_index(shard_path, shard_packets, None).unwrap();
        assert_eq!(built_docs, 4);
    }

    let merged_index = crate::search::tantivy::TantivyIndex::merge_compatible_index_directories(
        &merged_path,
        &[&shard_a_path, &shard_b_path],
    )
    .unwrap();
    let reader = merged_index.reader().unwrap();
    reader.reload().unwrap();
    assert_eq!(reader.searcher().num_docs(), 8);
    assert_eq!(
        merged_index.segment_count(),
        1,
        "merged shard indices should collapse into a single final segment"
    );
}
/// A shard whose build result matches its plan must pass the per-shard
/// validator, and the resulting artifact must pass the plan-level validator.
#[test]
fn lexical_rebuild_validates_built_shard_before_merge_frontier() {
    use crate::storage::sqlite::{
        LexicalRebuildConversationRow, LexicalRebuildGroupedMessageRow,
        LexicalRebuildGroupedMessageRows,
    };

    let tmp = TempDir::new().unwrap();
    let shard_path = tmp.path().join("validated-shard");
    // Combined byte length of the two message bodies indexed below.
    let payload_bytes = "validated alpha".len() + "validated beta".len();

    let conversation = LexicalRebuildConversationRow {
        id: Some(7),
        agent_slug: "codex".into(),
        workspace: Some(PathBuf::from("/tmp/workspace")),
        external_id: Some("validated-shard".into()),
        title: Some("Validated shard".into()),
        source_path: PathBuf::from("/tmp/validated-shard.jsonl"),
        started_at: Some(1_700_000_935_000),
        ended_at: Some(1_700_000_935_100),
        source_id: LOCAL_SOURCE_ID.into(),
        origin_host: None,
    };
    let messages = vec![
        LexicalRebuildGroupedMessageRow {
            idx: 0,
            is_tool_role: false,
            created_at: Some(1_700_000_935_010),
            content: "validated alpha".into(),
        },
        LexicalRebuildGroupedMessageRow {
            idx: 1,
            is_tool_role: false,
            created_at: Some(1_700_000_935_020),
            content: "validated beta".into(),
        },
    ]
    .into_iter()
    .collect::<LexicalRebuildGroupedMessageRows>();
    let packet = LexicalRebuildConversationPacket::from_canonical_replay(
        conversation,
        messages,
        Some(70),
        &HashMap::new(),
    );
    let indexed_docs = build_lexical_rebuild_shard_index(&shard_path, &[packet], None).unwrap();

    let planned_shard = LexicalShardPlanShard {
        shard_index: 0,
        first_conversation_id: 7,
        last_conversation_id: 7,
        conversation_count: 1,
        message_count: 2,
        message_bytes: payload_bytes,
        conversation_id_fingerprint: lexical_shard_conversation_ids_fingerprint(&[7]),
        oversized_single_conversation: false,
    };
    let build_result = LexicalRebuildShardBuildResult {
        shard: planned_shard.clone(),
        indexed_docs,
        segments: 1,
        shard_index_path: shard_path.clone(),
        message_bytes: payload_bytes,
        index_size_bytes: 0,
        build_duration_ms: 0,
        amplification_milli: None,
    };

    let artifact = validate_lexical_rebuild_shard_build_result(&build_result).unwrap();
    assert_eq!(artifact.first_shard_index, 0);
    assert_eq!(artifact.last_shard_index, 0);
    assert_eq!(artifact.index_path, shard_path);

    let plan = LexicalShardPlan {
        planner_version: LEXICAL_SHARD_PLAN_VERSION,
        plan_id: "validated-plan".into(),
        budgets: LexicalShardPlannerBudgets {
            max_conversations_per_shard: 1,
            max_messages_per_shard: 2,
            max_message_bytes_per_shard: 10_000,
        },
        total_conversations: 1,
        total_messages: 2,
        total_message_bytes: payload_bytes,
        oversized_conversation_ids: Vec::new(),
        shards: vec![planned_shard],
    };
    validate_complete_lexical_rebuild_shard_artifacts(&plan, &[artifact]).unwrap();
}
/// A build result that reports one document while the on-disk shard index
/// holds two must be rejected with the doc-count mismatch error.
#[test]
fn lexical_rebuild_rejects_shard_doc_count_mismatch_before_merge_frontier() {
    use crate::storage::sqlite::{
        LexicalRebuildConversationRow, LexicalRebuildGroupedMessageRow,
        LexicalRebuildGroupedMessageRows,
    };

    let tmp = TempDir::new().unwrap();
    let shard_path = tmp.path().join("mismatched-shard");
    // Combined byte length of the two message bodies indexed below.
    let payload_bytes = "mismatch alpha".len() + "mismatch beta".len();

    let conversation = LexicalRebuildConversationRow {
        id: Some(8),
        agent_slug: "codex".into(),
        workspace: Some(PathBuf::from("/tmp/workspace")),
        external_id: Some("mismatched-shard".into()),
        title: Some("Mismatched shard".into()),
        source_path: PathBuf::from("/tmp/mismatched-shard.jsonl"),
        started_at: Some(1_700_000_936_000),
        ended_at: Some(1_700_000_936_100),
        source_id: LOCAL_SOURCE_ID.into(),
        origin_host: None,
    };
    let messages = vec![
        LexicalRebuildGroupedMessageRow {
            idx: 0,
            is_tool_role: false,
            created_at: Some(1_700_000_936_010),
            content: "mismatch alpha".into(),
        },
        LexicalRebuildGroupedMessageRow {
            idx: 1,
            is_tool_role: false,
            created_at: Some(1_700_000_936_020),
            content: "mismatch beta".into(),
        },
    ]
    .into_iter()
    .collect::<LexicalRebuildGroupedMessageRows>();
    let packet = LexicalRebuildConversationPacket::from_canonical_replay(
        conversation,
        messages,
        Some(80),
        &HashMap::new(),
    );
    let built_docs = build_lexical_rebuild_shard_index(&shard_path, &[packet], None).unwrap();
    assert_eq!(built_docs, 2);

    let planned_shard = LexicalShardPlanShard {
        shard_index: 0,
        first_conversation_id: 8,
        last_conversation_id: 8,
        conversation_count: 1,
        message_count: 2,
        message_bytes: payload_bytes,
        conversation_id_fingerprint: lexical_shard_conversation_ids_fingerprint(&[8]),
        oversized_single_conversation: false,
    };
    // `indexed_docs` deliberately under-reports the two documents actually built.
    let build_result = LexicalRebuildShardBuildResult {
        shard: planned_shard,
        indexed_docs: 1,
        segments: 1,
        shard_index_path: shard_path,
        message_bytes: payload_bytes,
        index_size_bytes: 0,
        build_duration_ms: 0,
        amplification_milli: None,
    };

    let err = validate_lexical_rebuild_shard_build_result(&build_result)
        .unwrap_err()
        .to_string();
    assert!(
        err.contains("reported 1 docs but a fresh Tantivy reader sees 2"),
        "expected doc-count mismatch error, got {err}"
    );
}
// A shard plan that counts three source messages while the built index holds
// only two documents must still pass both the per-shard validator and the
// plan-level validator.
#[test]
fn shard_validate_tolerates_filter_induced_doc_lt_message_count_gap() {
let tmp = TempDir::new().unwrap();
let shard_path = tmp.path().join("filter-tolerance-shard");
// The packet itself carries only two messages.
let packet = LexicalRebuildConversationPacket::from_canonical_replay(
crate::storage::sqlite::LexicalRebuildConversationRow {
id: Some(80),
agent_slug: "codex".into(),
workspace: None,
external_id: Some("filter-gap".into()),
title: Some("filter gap".into()),
source_path: PathBuf::from("/tmp/filter-gap.jsonl"),
started_at: Some(1_700_000_940_000),
ended_at: Some(1_700_000_940_100),
source_id: LOCAL_SOURCE_ID.into(),
origin_host: None,
},
vec![
crate::storage::sqlite::LexicalRebuildGroupedMessageRow {
idx: 0,
is_tool_role: false,
created_at: Some(1_700_000_940_010),
content: "filter alpha".into(),
},
crate::storage::sqlite::LexicalRebuildGroupedMessageRow {
idx: 1,
is_tool_role: false,
created_at: Some(1_700_000_940_020),
content: "filter beta".into(),
},
]
.into_iter()
.collect::<crate::storage::sqlite::LexicalRebuildGroupedMessageRows>(),
Some(80),
&HashMap::new(),
);
let indexed_docs = build_lexical_rebuild_shard_index(&shard_path, &[packet], None).unwrap();
assert_eq!(
indexed_docs, 2,
"fixture must build exactly 2 docs (precondition)"
);
// The plan claims one more message (and its "dropped" bytes) than was indexed,
// which creates the docs-below-messages gap under test.
let shard = LexicalShardPlanShard {
shard_index: 0,
first_conversation_id: 80,
last_conversation_id: 80,
conversation_count: 1,
message_count: 3,
message_bytes: "filter alpha".len() + "filter beta".len() + "dropped".len(),
conversation_id_fingerprint: lexical_shard_conversation_ids_fingerprint(&[80]),
oversized_single_conversation: false,
};
let result = LexicalRebuildShardBuildResult {
shard: shard.clone(),
indexed_docs,
segments: 1,
shard_index_path: shard_path.clone(),
message_bytes: shard.message_bytes,
index_size_bytes: 0,
build_duration_ms: 0,
amplification_milli: None,
};
let artifact = validate_lexical_rebuild_shard_build_result(&result)
.expect("filter-induced doc<message gap must NOT fail validation");
assert_eq!(artifact.first_shard_index, 0);
assert_eq!(artifact.last_shard_index, 0);
assert_eq!(artifact.index_path, shard_path);
// The plan totals mirror the shard's three-message claim.
validate_complete_lexical_rebuild_shard_artifacts(
&LexicalShardPlan {
planner_version: LEXICAL_SHARD_PLAN_VERSION,
plan_id: "filter-tolerance".into(),
budgets: LexicalShardPlannerBudgets {
max_conversations_per_shard: 1,
max_messages_per_shard: 3,
max_message_bytes_per_shard: 10_000,
},
total_conversations: 1,
total_messages: 3,
total_message_bytes: shard.message_bytes,
oversized_conversation_ids: Vec::new(),
shards: vec![shard],
},
&[artifact],
)
.expect("plan-level validator must tolerate filter-induced gaps too");
}
/// A shard index holding more documents (2) than its plan has source
/// messages (1) must be rejected, with an error naming both counts.
#[test]
fn shard_validate_rejects_doc_count_exceeding_shard_plan_message_count() {
    use crate::storage::sqlite::{
        LexicalRebuildConversationRow, LexicalRebuildGroupedMessageRow,
        LexicalRebuildGroupedMessageRows,
    };

    let tmp = TempDir::new().unwrap();
    let shard_path = tmp.path().join("inflation-shard");

    let conversation = LexicalRebuildConversationRow {
        id: Some(81),
        agent_slug: "codex".into(),
        workspace: None,
        external_id: Some("inflation".into()),
        title: Some("inflation".into()),
        source_path: PathBuf::from("/tmp/inflation.jsonl"),
        started_at: Some(1_700_000_950_000),
        ended_at: Some(1_700_000_950_100),
        source_id: LOCAL_SOURCE_ID.into(),
        origin_host: None,
    };
    let messages = vec![
        LexicalRebuildGroupedMessageRow {
            idx: 0,
            is_tool_role: false,
            created_at: Some(1_700_000_950_010),
            content: "inflation alpha".into(),
        },
        LexicalRebuildGroupedMessageRow {
            idx: 1,
            is_tool_role: false,
            created_at: Some(1_700_000_950_020),
            content: "inflation beta".into(),
        },
    ]
    .into_iter()
    .collect::<LexicalRebuildGroupedMessageRows>();
    let packet = LexicalRebuildConversationPacket::from_canonical_replay(
        conversation,
        messages,
        Some(81),
        &HashMap::new(),
    );
    let indexed_docs = build_lexical_rebuild_shard_index(&shard_path, &[packet], None).unwrap();
    assert_eq!(indexed_docs, 2);

    // The plan only accounts for the first message, so the two indexed
    // documents exceed it.
    let planned_bytes = "inflation alpha".len();
    let planned_shard = LexicalShardPlanShard {
        shard_index: 0,
        first_conversation_id: 81,
        last_conversation_id: 81,
        conversation_count: 1,
        message_count: 1,
        message_bytes: planned_bytes,
        conversation_id_fingerprint: lexical_shard_conversation_ids_fingerprint(&[81]),
        oversized_single_conversation: false,
    };
    let build_result = LexicalRebuildShardBuildResult {
        shard: planned_shard,
        indexed_docs,
        segments: 1,
        shard_index_path: shard_path,
        message_bytes: planned_bytes,
        index_size_bytes: 0,
        build_duration_ms: 0,
        amplification_milli: None,
    };

    let err = validate_lexical_rebuild_shard_build_result(&build_result)
        .unwrap_err()
        .to_string();
    assert!(
        err.contains("EXCEEDS"),
        "expected inflation hard-error to use the explicit EXCEEDS phrasing; got: {err}"
    );
    let names_both_counts = err.contains("2 docs") && err.contains("1 source messages");
    assert!(
        names_both_counts,
        "error must name observed (2) and planned (1) counts; got: {err}"
    );
}
/// Five two-document shard indices merged through the tree merger (called with
/// a trailing argument of 2) must yield one index with ten documents in a
/// single segment.
#[test]
fn lexical_rebuild_shard_index_tree_merges_multiple_rounds() {
    use crate::storage::sqlite::{
        LexicalRebuildConversationRow, LexicalRebuildGroupedMessageRow,
        LexicalRebuildGroupedMessageRows,
    };

    let tmp = TempDir::new().unwrap();
    let merge_stage_root = tmp.path().join("merge-stage");
    let merged_path = tmp.path().join("merged");

    // Builds a packet with two non-tool messages for the given conversation.
    let two_message_packet =
        |conversation_id: i64, external_id: &str, message_a: &str, message_b: &str| {
            let conversation = LexicalRebuildConversationRow {
                id: Some(conversation_id),
                agent_slug: "codex".into(),
                workspace: Some(PathBuf::from("/tmp/workspace")),
                external_id: Some(external_id.into()),
                title: Some(format!("Shard {external_id}")),
                source_path: PathBuf::from(format!("/tmp/{external_id}.jsonl")),
                started_at: Some(1_700_000_910_000 + conversation_id),
                ended_at: Some(1_700_000_910_100 + conversation_id),
                source_id: LOCAL_SOURCE_ID.into(),
                origin_host: None,
            };
            let messages = vec![
                LexicalRebuildGroupedMessageRow {
                    idx: 0,
                    is_tool_role: false,
                    created_at: Some(1_700_000_910_010 + conversation_id),
                    content: message_a.into(),
                },
                LexicalRebuildGroupedMessageRow {
                    idx: 1,
                    is_tool_role: false,
                    created_at: Some(1_700_000_910_020 + conversation_id),
                    content: message_b.into(),
                },
            ]
            .into_iter()
            .collect::<LexicalRebuildGroupedMessageRows>();
            LexicalRebuildConversationPacket::from_canonical_replay(
                conversation,
                messages,
                Some(conversation_id * 10),
                &HashMap::new(),
            )
        };

    // One shard per conversation, each expected to index exactly two documents.
    let mut shard_paths = Vec::new();
    for idx in 0..5 {
        let shard_path = tmp.path().join(format!("tree-shard-{idx}"));
        let packet = two_message_packet(
            i64::from(idx + 1),
            &format!("tree-segment-{idx}"),
            &format!("alpha-{idx}"),
            &format!("beta-{idx}"),
        );
        let built_docs =
            build_lexical_rebuild_shard_index(&shard_path, &[packet], None).unwrap();
        assert_eq!(built_docs, 2);
        shard_paths.push(shard_path);
    }

    let merged_index = merge_lexical_rebuild_shard_index_tree(
        &merged_path,
        &shard_paths,
        &merge_stage_root,
        2,
    )
    .unwrap();
    let reader = merged_index.reader().unwrap();
    reader.reload().unwrap();
    assert_eq!(reader.searcher().num_docs(), 10);
    assert_eq!(
        merged_index.segment_count(),
        1,
        "tree-merged shard indices should collapse into a single final segment"
    );
}
/// Finalizing with a single input shard must publish that shard's directory
/// as-is and must not create the requested merged directory.
#[test]
fn finalize_staged_lexical_rebuild_publish_artifact_reuses_single_input_without_remerge() {
    use crate::storage::sqlite::{
        LexicalRebuildConversationRow, LexicalRebuildGroupedMessageRow,
        LexicalRebuildGroupedMessageRows,
    };

    let tmp = TempDir::new().unwrap();
    let merge_stage_root = tmp.path().join("merge-stage");
    let merged_path = tmp.path().join("merged");
    let shard_path = tmp.path().join("single-shard");

    let conversation = LexicalRebuildConversationRow {
        id: Some(1),
        agent_slug: "codex".into(),
        workspace: Some(PathBuf::from("/tmp/workspace")),
        external_id: Some("single-segment".into()),
        title: Some("Single shard".into()),
        source_path: PathBuf::from("/tmp/single-segment.jsonl"),
        started_at: Some(1_700_000_915_000),
        ended_at: Some(1_700_000_915_100),
        source_id: LOCAL_SOURCE_ID.into(),
        origin_host: None,
    };
    let messages = vec![
        LexicalRebuildGroupedMessageRow {
            idx: 0,
            is_tool_role: false,
            created_at: Some(1_700_000_915_010),
            content: "alpha".into(),
        },
        LexicalRebuildGroupedMessageRow {
            idx: 1,
            is_tool_role: false,
            created_at: Some(1_700_000_915_020),
            content: "beta".into(),
        },
    ]
    .into_iter()
    .collect::<LexicalRebuildGroupedMessageRows>();
    let packet = LexicalRebuildConversationPacket::from_canonical_replay(
        conversation,
        messages,
        Some(10),
        &HashMap::new(),
    );
    let built_docs = build_lexical_rebuild_shard_index(&shard_path, &[packet], None).unwrap();
    assert_eq!(built_docs, 2);

    // The input list is a one-element slice borrowed from `shard_path`.
    let single_input = std::slice::from_ref(&shard_path);
    let final_artifact = finalize_staged_lexical_rebuild_publish_artifact(
        &merged_path,
        single_input,
        &merge_stage_root,
        2,
    )
    .unwrap();

    assert_eq!(final_artifact.docs, 2);
    assert_eq!(
        final_artifact.publish_path, shard_path,
        "single-input finalization should reuse the existing shard artifact directly"
    );
    assert!(
        !merged_path.exists(),
        "single-input finalization should skip materializing a redundant merged directory"
    );
}
/// With three staged shards as input, finalization must publish a federated bundle at
/// the requested output path, keep one segment per shard, and avoid creating a
/// merge-tree round directory.
#[test]
fn finalize_staged_lexical_rebuild_publish_artifact_publishes_federated_multi_input_frontier_without_doc_remerge()
{
    let scratch = TempDir::new().unwrap();
    let stage_root = scratch.path().join("merge-stage");
    let publish_target = scratch.path().join("merged");

    // Builds a two-message conversation packet; timestamps are offset by the id.
    let make_packet =
        |conversation_id: i64, external_id: &str, first_body: &str, second_body: &str| {
            let conversation = crate::storage::sqlite::LexicalRebuildConversationRow {
                id: Some(conversation_id),
                agent_slug: "codex".into(),
                workspace: Some(PathBuf::from("/tmp/workspace")),
                external_id: Some(external_id.into()),
                title: Some(format!("Shard {external_id}")),
                source_path: PathBuf::from(format!("/tmp/{external_id}.jsonl")),
                started_at: Some(1_700_000_916_000 + conversation_id),
                ended_at: Some(1_700_000_916_100 + conversation_id),
                source_id: LOCAL_SOURCE_ID.into(),
                origin_host: None,
            };
            let messages = vec![
                crate::storage::sqlite::LexicalRebuildGroupedMessageRow {
                    idx: 0,
                    is_tool_role: false,
                    created_at: Some(1_700_000_916_010 + conversation_id),
                    content: first_body.into(),
                },
                crate::storage::sqlite::LexicalRebuildGroupedMessageRow {
                    idx: 1,
                    is_tool_role: false,
                    created_at: Some(1_700_000_916_020 + conversation_id),
                    content: second_body.into(),
                },
            ]
            .into_iter()
            .collect::<crate::storage::sqlite::LexicalRebuildGroupedMessageRows>();
            LexicalRebuildConversationPacket::from_canonical_replay(
                conversation,
                messages,
                Some(conversation_id * 10),
                &HashMap::new(),
            )
        };

    // Stage three independent shards, each holding a single two-message conversation.
    let mut shard_paths = Vec::with_capacity(3);
    for idx in 0..3 {
        let shard_path = scratch.path().join(format!("finalize-shard-{idx}"));
        let packet = make_packet(
            i64::from(idx + 1),
            &format!("finalize-{idx}"),
            &format!("alpha-{idx}"),
            &format!("beta-{idx}"),
        );
        let indexed_docs =
            build_lexical_rebuild_shard_index(&shard_path, &[packet], None).unwrap();
        assert_eq!(indexed_docs, 2);
        shard_paths.push(shard_path);
    }

    let published = finalize_staged_lexical_rebuild_publish_artifact(
        &publish_target,
        &shard_paths,
        &stage_root,
        3,
    )
    .unwrap();

    assert_eq!(published.docs, 6);
    assert_eq!(
        published.segments, 3,
        "multi-input finalization should preserve the final shard frontier without remerging docs"
    );
    assert_eq!(
        published.publish_path, publish_target,
        "multi-input finalization should materialize the federated publish bundle at the requested output path"
    );

    let federated_reader_count = crate::search::tantivy::open_federated_search_readers(
        &publish_target,
        frankensearch::lexical::ReloadPolicy::Manual,
    )
    .unwrap()
    .expect("federated readers")
    .len();
    assert_eq!(
        federated_reader_count, 3,
        "multi-input finalization should publish the three final shard artifacts as a federated lexical bundle"
    );

    let fallback_round_dir = stage_root.join("round-00000");
    assert!(
        !fallback_round_dir.exists(),
        "multi-input finalization should not materialize a fallback merge-tree round"
    );
}
/// Merging a three-shard frontier through the merge tree must yield all six documents
/// without creating a `round-00000` directory under the merge stage root.
#[test]
fn merge_lexical_rebuild_shard_index_tree_merges_small_frontier_without_round_directory() {
    let scratch = TempDir::new().unwrap();
    let stage_root = scratch.path().join("merge-stage");
    let merged_output = scratch.path().join("merged");

    // Builds a two-message conversation packet; timestamps are offset by the id.
    let make_packet =
        |conversation_id: i64, external_id: &str, first_body: &str, second_body: &str| {
            let conversation = crate::storage::sqlite::LexicalRebuildConversationRow {
                id: Some(conversation_id),
                agent_slug: "codex".into(),
                workspace: Some(PathBuf::from("/tmp/workspace")),
                external_id: Some(external_id.into()),
                title: Some(format!("Shard {external_id}")),
                source_path: PathBuf::from(format!("/tmp/{external_id}.jsonl")),
                started_at: Some(1_700_000_918_000 + conversation_id),
                ended_at: Some(1_700_000_918_100 + conversation_id),
                source_id: LOCAL_SOURCE_ID.into(),
                origin_host: None,
            };
            let messages = vec![
                crate::storage::sqlite::LexicalRebuildGroupedMessageRow {
                    idx: 0,
                    is_tool_role: false,
                    created_at: Some(1_700_000_918_010 + conversation_id),
                    content: first_body.into(),
                },
                crate::storage::sqlite::LexicalRebuildGroupedMessageRow {
                    idx: 1,
                    is_tool_role: false,
                    created_at: Some(1_700_000_918_020 + conversation_id),
                    content: second_body.into(),
                },
            ]
            .into_iter()
            .collect::<crate::storage::sqlite::LexicalRebuildGroupedMessageRows>();
            LexicalRebuildConversationPacket::from_canonical_replay(
                conversation,
                messages,
                Some(conversation_id * 10),
                &HashMap::new(),
            )
        };

    // Stage three shards, each expected to index exactly two documents.
    let mut shard_paths = Vec::with_capacity(3);
    for idx in 0..3 {
        let shard_path = scratch.path().join(format!("small-frontier-shard-{idx}"));
        let packet = make_packet(
            i64::from(idx + 1),
            &format!("small-frontier-{idx}"),
            &format!("alpha-{idx}"),
            &format!("beta-{idx}"),
        );
        let indexed_docs =
            build_lexical_rebuild_shard_index(&shard_path, &[packet], None).unwrap();
        assert_eq!(indexed_docs, 2);
        shard_paths.push(shard_path);
    }

    let merged_index =
        merge_lexical_rebuild_shard_index_tree(&merged_output, &shard_paths, &stage_root, 3)
            .unwrap();

    // Reload before counting so the reader reflects the merged state.
    let reader = merged_index.reader().unwrap();
    reader.reload().unwrap();
    let merged_doc_count = reader.searcher().num_docs();
    assert_eq!(merged_doc_count, 6);

    let round_dir = stage_root.join("round-00000");
    assert!(
        !round_dir.exists(),
        "small final frontiers should merge directly without materializing a merge-tree round directory"
    );
}
/// Feeds a six-artifact frontier through the worker-backed final-frontier reducer and
/// checks that exactly one artifact comes back, spanning shard indices 0..=14 and
/// containing all twelve documents.
#[test]
fn reduce_staged_lexical_final_merge_frontier_via_workers_reduces_large_frontier_to_single_artifact()
{
let tmp = TempDir::new().unwrap();
let stage_root = tmp.path().join("final-frontier-stage");
// Job channel (capacity 2) and result channel (capacity 8) shared with two merge workers.
let (merge_work_tx, merge_work_rx) = bounded::<LexicalRebuildShardMergeJob>(2);
let (merge_result_tx, merge_result_rx) = bounded::<LexicalRebuildShardMergeMessage>(8);
let merge_worker_handles =
spawn_lexical_rebuild_shard_merge_workers(2, merge_work_rx, merge_result_tx);
// Builds a two-message conversation packet; timestamps are offset by the conversation id.
let make_packet =
|conversation_id: i64, external_id: &str, message_a: &str, message_b: &str| {
LexicalRebuildConversationPacket::from_canonical_replay(
crate::storage::sqlite::LexicalRebuildConversationRow {
id: Some(conversation_id),
agent_slug: "codex".into(),
workspace: Some(PathBuf::from("/tmp/workspace")),
external_id: Some(external_id.into()),
title: Some(format!("Shard {external_id}")),
source_path: PathBuf::from(format!("/tmp/{external_id}.jsonl")),
started_at: Some(1_700_000_930_000 + conversation_id),
ended_at: Some(1_700_000_930_100 + conversation_id),
source_id: LOCAL_SOURCE_ID.into(),
origin_host: None,
},
vec![
crate::storage::sqlite::LexicalRebuildGroupedMessageRow {
idx: 0,
is_tool_role: false,
created_at: Some(1_700_000_930_010 + conversation_id),
content: message_a.into(),
},
crate::storage::sqlite::LexicalRebuildGroupedMessageRow {
idx: 1,
is_tool_role: false,
created_at: Some(1_700_000_930_020 + conversation_id),
content: message_b.into(),
},
]
.into_iter()
.collect::<crate::storage::sqlite::LexicalRebuildGroupedMessageRows>(),
Some(conversation_id * 10),
&HashMap::new(),
)
};
// Inclusive (first, last) shard-index ranges claimed by each frontier artifact:
// three four-shard spans followed by three single-shard tails, covering 0..=14.
let frontier_ranges = [
(0usize, 3usize),
(4, 7),
(8, 11),
(12, 12),
(13, 13),
(14, 14),
];
// Each artifact is backed by a freshly built shard index holding one conversation
// (two docs); only the recorded shard range differs between artifacts.
let frontier = frontier_ranges
.iter()
.enumerate()
.map(|(idx, (first_shard_index, last_shard_index))| {
let shard_path = tmp.path().join(format!("frontier-artifact-{idx}"));
let packet = make_packet(
i64::try_from(idx + 1).unwrap(),
&format!("frontier-{idx}"),
&format!("alpha-{idx}"),
&format!("beta-{idx}"),
);
assert_eq!(
build_lexical_rebuild_shard_index(&shard_path, &[packet], None).unwrap(),
2
);
LexicalRebuildShardMergeArtifact {
first_shard_index: *first_shard_index,
last_shard_index: *last_shard_index,
index_path: shard_path,
docs: 2,
segments: 1,
}
})
.collect::<Vec<_>>();
// NOTE(review): the literal `2` is presumably a worker/parallelism budget matching
// the two spawned workers — confirm against the reducer's signature.
let reduced = reduce_staged_lexical_final_merge_frontier_via_workers(
frontier,
&stage_root,
2,
&merge_work_tx,
&merge_result_rx,
)
.unwrap();
// The whole frontier must collapse to one artifact covering the full shard range.
assert_eq!(reduced.len(), 1);
assert_eq!(reduced[0].first_shard_index, 0);
assert_eq!(reduced[0].last_shard_index, 14);
// Open the reduced index and confirm no documents were lost (6 artifacts x 2 docs).
let merged_index =
crate::search::tantivy::TantivyIndex::open_or_create(&reduced[0].index_path).unwrap();
let reader = merged_index.reader().unwrap();
reader.reload().unwrap();
assert_eq!(reader.searcher().num_docs(), 12);
// Drop the job sender before joining; presumably the workers exit once the job
// channel is disconnected — confirm against the worker loop.
drop(merge_work_tx);
for handle in merge_worker_handles {
handle.join().unwrap();
}
}
/// Queues one more shard than the eager fan-in on the merge coordinator, lets a worker
/// complete the resulting eager merge, and checks that the final merge then sees just
/// two inputs and still produces every document.
#[test]
fn lexical_rebuild_eager_merge_coordinator_reduces_full_groups_before_final_merge() {
let tmp = TempDir::new().unwrap();
let eager_merge_stage_root = tmp.path().join("eager-merge-stage");
let final_merge_stage_root = tmp.path().join("final-merge-stage");
let merged_path = tmp.path().join("merged");
// Job channel (capacity 1) and result channel (capacity 8) shared with a single merge worker.
let (merge_work_tx, merge_work_rx) = bounded::<LexicalRebuildShardMergeJob>(1);
let (merge_result_tx, merge_result_rx) = bounded::<LexicalRebuildShardMergeMessage>(8);
let merge_worker_handles =
spawn_lexical_rebuild_shard_merge_workers(1, merge_work_rx, merge_result_tx);
// Builds a two-message conversation packet; timestamps are offset by the conversation id.
let make_packet =
|conversation_id: i64, external_id: &str, message_a: &str, message_b: &str| {
LexicalRebuildConversationPacket::from_canonical_replay(
crate::storage::sqlite::LexicalRebuildConversationRow {
id: Some(conversation_id),
agent_slug: "codex".into(),
workspace: Some(PathBuf::from("/tmp/workspace")),
external_id: Some(external_id.into()),
title: Some(format!("Shard {external_id}")),
source_path: PathBuf::from(format!("/tmp/{external_id}.jsonl")),
started_at: Some(1_700_000_920_000 + conversation_id),
ended_at: Some(1_700_000_920_100 + conversation_id),
source_id: LOCAL_SOURCE_ID.into(),
origin_host: None,
},
vec![
crate::storage::sqlite::LexicalRebuildGroupedMessageRow {
idx: 0,
is_tool_role: false,
created_at: Some(1_700_000_920_010 + conversation_id),
content: message_a.into(),
},
crate::storage::sqlite::LexicalRebuildGroupedMessageRow {
idx: 1,
is_tool_role: false,
created_at: Some(1_700_000_920_020 + conversation_id),
content: message_b.into(),
},
]
.into_iter()
.collect::<crate::storage::sqlite::LexicalRebuildGroupedMessageRows>(),
Some(conversation_id * 10),
&HashMap::new(),
)
};
// One full eager group plus a single leftover shard.
let shard_count = LexicalRebuildShardMergeCoordinator::EAGER_MERGE_FAN_IN + 1;
let shard_paths = (0..shard_count)
.map(|idx| {
let shard_path = tmp.path().join(format!("eager-shard-{idx}"));
let packet = make_packet(
i64::try_from(idx + 1).unwrap(),
&format!("eager-segment-{idx}"),
&format!("alpha-{idx}"),
&format!("beta-{idx}"),
);
assert_eq!(
build_lexical_rebuild_shard_index(&shard_path, &[packet], None).unwrap(),
2
);
shard_path
})
.collect::<Vec<_>>();
let mut merge_coordinator =
LexicalRebuildShardMergeCoordinator::new(eager_merge_stage_root);
// Raise the pending-job allowance before queueing so the coordinator may dispatch work.
merge_coordinator
.set_allowed_pending_merge_jobs(8, &merge_work_tx)
.unwrap();
// Register every shard as a single-shard base artifact, in shard order.
for (idx, shard_path) in shard_paths.iter().enumerate() {
merge_coordinator
.queue_base_artifact(
LexicalRebuildShardMergeArtifact {
first_shard_index: idx,
last_shard_index: idx,
index_path: shard_path.clone(),
docs: 2,
segments: 1,
},
&merge_work_tx,
)
.unwrap();
}
assert_eq!(
merge_coordinator.pending_merge_jobs(),
1,
"one more than the eager fan-in should trigger one eager merge with one shard left unmerged"
);
// Wait (up to 30 seconds) for the worker to report the eager merge; an error message
// from the worker fails the test with its coordinates.
let eager_merge_result = match merge_result_rx
.recv_timeout(Duration::from_secs(30))
.unwrap()
{
LexicalRebuildShardMergeMessage::Built(result) => result,
LexicalRebuildShardMergeMessage::Error {
output_level,
first_shard_index,
last_shard_index,
error,
} => panic!(
"unexpected eager merge failure at level {output_level} for {first_shard_index}..={last_shard_index}: {error}"
),
};
// Hand the finished merge back to the coordinator; no further jobs should be pending.
merge_coordinator
.complete_merge(eager_merge_result, &merge_work_tx)
.unwrap();
assert_eq!(merge_coordinator.pending_merge_jobs(), 0);
let final_merge_inputs = merge_coordinator
.final_merge_input_artifacts()
.into_iter()
.map(|artifact| artifact.index_path)
.collect::<Vec<_>>();
assert_eq!(
final_merge_inputs.len(),
2,
"eager reduction should shrink one fan-in group down to one merged artifact plus one tail shard"
);
// Drop the job sender before joining; presumably the worker exits once the job
// channel is disconnected — confirm against the worker loop.
drop(merge_work_tx);
for handle in merge_worker_handles {
handle.join().unwrap();
}
// Run the final merge over the two remaining inputs and confirm no documents were lost.
let merged_index = merge_lexical_rebuild_shard_index_tree(
&merged_path,
&final_merge_inputs,
&final_merge_stage_root,
1,
)
.unwrap();
let reader = merged_index.reader().unwrap();
reader.reload().unwrap();
assert_eq!(
reader.searcher().num_docs(),
u64::try_from(shard_count * 2).unwrap()
);
}
/// With one ready fan-in group and a pipeline showing builder handoff pressure, a
/// three-worker staged merge controller is expected to allow a single merge job.
#[test]
fn lexical_rebuild_staged_merge_controller_trickles_under_builder_pressure() {
    let controller = LexicalRebuildStagedMergeController::new(3, Some(7_000));
    let fan_in = LexicalRebuildShardMergeCoordinator::EAGER_MERGE_FAN_IN;

    // Exactly one full fan-in group is ready at the first level; nothing is in flight.
    let level_zero = (0..fan_in)
        .map(|idx| LexicalRebuildShardMergeArtifact {
            first_shard_index: idx,
            last_shard_index: idx,
            index_path: PathBuf::from(format!("/tmp/shard-{idx}")),
            docs: 0,
            segments: 0,
        })
        .collect();
    let coordinator = LexicalRebuildShardMergeCoordinator {
        stage_root: PathBuf::from("/tmp/eager-merge"),
        ready_levels: vec![level_zero],
        next_output_seq_by_level: vec![0, 0],
        pending_merge_jobs: 0,
        allowed_pending_merge_jobs: 0,
    };

    let runtime = LexicalRebuildPipelineRuntimeSnapshot {
        queue_depth: 2,
        pending_batch_conversations: 4,
        pending_batch_message_bytes: 32_768,
        page_prep_workers: 6,
        active_page_prep_jobs: 6,
        ordered_buffered_pages: 1,
        producer_handoff_wait_count: 1,
        ..LexicalRebuildPipelineRuntimeSnapshot::default()
    };

    let decision = controller.decide(false, &runtime, &coordinator);

    let expected_reason = "builder_handoff_pressure_scaling_staged_merge_budget_1_active_jobs_0_ready_groups_1_debt_budget_1_buffered_pages_1_queue_depth_2";
    assert_eq!(decision.workers_max, 3);
    assert_eq!(decision.allowed_jobs, 1);
    assert_eq!(decision.ready_artifacts, fan_in);
    assert_eq!(decision.ready_groups, 1);
    assert_eq!(decision.controller_reason, expected_reason);
}
/// With three ready fan-in groups and every page-prep worker busy, an eight-worker
/// staged merge controller is expected to allow three merge jobs.
#[test]
fn lexical_rebuild_staged_merge_controller_spends_debt_budget_under_page_prep_saturation() {
    let controller = LexicalRebuildStagedMergeController::new(8, Some(7_000));
    let ready_artifact_count = LexicalRebuildShardMergeCoordinator::EAGER_MERGE_FAN_IN * 3;

    // Three full fan-in groups are ready at the first level; nothing is in flight.
    let level_zero = (0..ready_artifact_count)
        .map(|idx| LexicalRebuildShardMergeArtifact {
            first_shard_index: idx,
            last_shard_index: idx,
            index_path: PathBuf::from(format!("/tmp/shard-{idx}")),
            docs: 0,
            segments: 0,
        })
        .collect();
    let coordinator = LexicalRebuildShardMergeCoordinator {
        stage_root: PathBuf::from("/tmp/eager-merge"),
        ready_levels: vec![level_zero],
        next_output_seq_by_level: vec![0, 0],
        pending_merge_jobs: 0,
        allowed_pending_merge_jobs: 0,
    };

    // All six page-prep workers are active; every other signal stays at its default.
    let runtime = LexicalRebuildPipelineRuntimeSnapshot {
        page_prep_workers: 6,
        active_page_prep_jobs: 6,
        ..LexicalRebuildPipelineRuntimeSnapshot::default()
    };

    let decision = controller.decide(false, &runtime, &coordinator);

    let expected_reason =
        "page_prep_workers_saturated_6_of_6_merge_budget_3_active_jobs_0_ready_groups_3";
    assert_eq!(decision.workers_max, 8);
    assert_eq!(decision.allowed_jobs, 3);
    assert_eq!(decision.ready_artifacts, ready_artifact_count);
    assert_eq!(decision.ready_groups, 3);
    assert_eq!(decision.controller_reason, expected_reason);
}
/// With sixteen ready fan-in groups, one job already active and 150 buffered pages, a
/// four-worker staged merge controller is expected to allow its full four-job budget.
#[test]
fn lexical_rebuild_staged_merge_controller_scales_under_large_merge_debt() {
    let controller = LexicalRebuildStagedMergeController::new(4, Some(7_000));
    let fan_in = LexicalRebuildShardMergeCoordinator::EAGER_MERGE_FAN_IN;
    let ready_groups = 16;
    let ready_artifact_count = fan_in * ready_groups;

    // Sixteen full groups wait at the first level while one merge job is in flight.
    let level_zero = (0..ready_artifact_count)
        .map(|idx| LexicalRebuildShardMergeArtifact {
            first_shard_index: idx,
            last_shard_index: idx,
            index_path: PathBuf::from(format!("/tmp/shard-{idx}")),
            docs: 0,
            segments: 0,
        })
        .collect();
    let coordinator = LexicalRebuildShardMergeCoordinator {
        stage_root: PathBuf::from("/tmp/eager-merge"),
        ready_levels: vec![level_zero],
        next_output_seq_by_level: vec![0, 0],
        pending_merge_jobs: 1,
        allowed_pending_merge_jobs: 1,
    };

    let runtime = LexicalRebuildPipelineRuntimeSnapshot {
        ordered_buffered_pages: 150,
        ..LexicalRebuildPipelineRuntimeSnapshot::default()
    };

    let decision = controller.decide(false, &runtime, &coordinator);

    let expected_reason = format!(
        "builder_handoff_pressure_scaling_staged_merge_budget_4_active_jobs_1_ready_groups_{ready_groups}_debt_budget_4_buffered_pages_150_queue_depth_0"
    );
    assert_eq!(decision.workers_max, 4);
    assert_eq!(decision.allowed_jobs, 4);
    assert_eq!(decision.active_jobs, 1);
    assert_eq!(decision.ready_artifacts, ready_artifact_count);
    assert_eq!(decision.ready_groups, ready_groups);
    assert_eq!(decision.controller_reason, expected_reason);
}
/// With twelve ready fan-in groups, one job already active and 120 buffered pages, an
/// eight-worker staged merge controller is expected to allow all eight jobs.
#[test]
fn lexical_rebuild_staged_merge_controller_keeps_debt_budget_monotonic_with_more_workers() {
    let controller = LexicalRebuildStagedMergeController::new(8, Some(7_000));
    let fan_in = LexicalRebuildShardMergeCoordinator::EAGER_MERGE_FAN_IN;
    let ready_groups = 12;
    let ready_artifact_count = fan_in * ready_groups;

    // Twelve full groups wait at the first level while one merge job is in flight.
    let level_zero = (0..ready_artifact_count)
        .map(|idx| LexicalRebuildShardMergeArtifact {
            first_shard_index: idx,
            last_shard_index: idx,
            index_path: PathBuf::from(format!("/tmp/shard-{idx}")),
            docs: 0,
            segments: 0,
        })
        .collect();
    let coordinator = LexicalRebuildShardMergeCoordinator {
        stage_root: PathBuf::from("/tmp/eager-merge"),
        ready_levels: vec![level_zero],
        next_output_seq_by_level: vec![0, 0],
        pending_merge_jobs: 1,
        allowed_pending_merge_jobs: 1,
    };

    let runtime = LexicalRebuildPipelineRuntimeSnapshot {
        ordered_buffered_pages: 120,
        ..LexicalRebuildPipelineRuntimeSnapshot::default()
    };

    let decision = controller.decide(false, &runtime, &coordinator);

    let expected_reason = format!(
        "builder_handoff_pressure_scaling_staged_merge_budget_8_active_jobs_1_ready_groups_{ready_groups}_debt_budget_8_buffered_pages_120_queue_depth_0"
    );
    assert_eq!(decision.workers_max, 8);
    assert_eq!(decision.allowed_jobs, 8);
    assert_eq!(decision.active_jobs, 1);
    assert_eq!(decision.ready_artifacts, ready_artifact_count);
    assert_eq!(decision.ready_groups, ready_groups);
    assert_eq!(decision.controller_reason, expected_reason);
}
/// Once the producer is reported as finished and the ready frontier exceeds the
/// federated shard limit, a three-worker controller is expected to allow three jobs.
#[test]
fn lexical_rebuild_staged_merge_controller_restores_parallelism_after_producer_finishes() {
    let controller = LexicalRebuildStagedMergeController::new(3, Some(7_000));
    let fan_in = LexicalRebuildShardMergeCoordinator::EAGER_MERGE_FAN_IN;
    // One group more than fits under the federated shard limit.
    let ready_groups = (LEXICAL_REBUILD_FINAL_FRONTIER_FEDERATED_SHARD_LIMIT / fan_in) + 1;
    let ready_artifact_count = fan_in * ready_groups;

    let level_zero = (0..ready_artifact_count)
        .map(|idx| LexicalRebuildShardMergeArtifact {
            first_shard_index: idx,
            last_shard_index: idx,
            index_path: PathBuf::from(format!("/tmp/shard-{idx}")),
            docs: 0,
            segments: 0,
        })
        .collect();
    let coordinator = LexicalRebuildShardMergeCoordinator {
        stage_root: PathBuf::from("/tmp/eager-merge"),
        ready_levels: vec![level_zero],
        next_output_seq_by_level: vec![0, 0],
        pending_merge_jobs: 1,
        allowed_pending_merge_jobs: 1,
    };

    let runtime = LexicalRebuildPipelineRuntimeSnapshot {
        page_prep_workers: 6,
        host_loadavg_1m_milli: Some(4_500),
        ..LexicalRebuildPipelineRuntimeSnapshot::default()
    };

    // `true` marks the producer as finished for this decision.
    let decision = controller.decide(true, &runtime, &coordinator);

    let expected_reason = "producer_finished_allowing_max_staged_merge_parallelism";
    assert_eq!(decision.allowed_jobs, 3);
    assert_eq!(decision.active_jobs, 1);
    assert_eq!(decision.ready_artifacts, ready_artifact_count);
    assert_eq!(decision.ready_groups, ready_groups);
    assert_eq!(decision.controller_reason, expected_reason);
}
/// When the staged merge side reports no backlog, a six-worker shard build controller
/// with two active and three pending jobs is expected to allow five jobs.
#[test]
fn lexical_rebuild_staged_shard_build_controller_borrows_idle_merge_budget() {
    // Merge side is idle: no allowed, active, or ready work.
    let idle_merge_runtime = LexicalRebuildStagedMergeRuntimeSnapshot {
        workers_max: 3,
        allowed_jobs: 0,
        active_jobs: 0,
        ready_artifacts: 0,
        ready_groups: 0,
        controller_reason: String::from("no_staged_merge_backlog"),
    };
    let pipeline_runtime = LexicalRebuildPipelineRuntimeSnapshot::default();
    let controller = LexicalRebuildStagedShardBuildController::new(6, Some(7_000));

    let decision = controller.decide(&pipeline_runtime, &idle_merge_runtime, 2, 3, None);

    let expected_reason = "staged_merge_idle_lending_full_budget_to_shard_builders";
    assert_eq!(decision.workers_max, 6);
    assert_eq!(decision.allowed_jobs, 5);
    assert_eq!(decision.active_jobs, 2);
    assert_eq!(decision.pending_jobs, 3);
    assert_eq!(decision.controller_reason, expected_reason);
}
/// When the staged merge side has two allowed jobs, a six-worker shard build
/// controller is expected to hold two slots back and allow four shard builds.
#[test]
fn lexical_rebuild_staged_shard_build_controller_reserves_slots_for_merge_backlog() {
    // Merge side has a backlog: two allowed jobs, one active, one ready group.
    let busy_merge_runtime = LexicalRebuildStagedMergeRuntimeSnapshot {
        workers_max: 3,
        allowed_jobs: 2,
        active_jobs: 1,
        ready_artifacts: 5,
        ready_groups: 1,
        controller_reason: String::from("pipeline_active"),
    };
    let pipeline_runtime = LexicalRebuildPipelineRuntimeSnapshot::default();
    let controller = LexicalRebuildStagedShardBuildController::new(6, Some(7_000));

    let decision = controller.decide(&pipeline_runtime, &busy_merge_runtime, 3, 3, None);

    let expected_reason = "reserving_2_slots_for_staged_merge_active_jobs_1_ready_groups_1";
    assert_eq!(decision.workers_max, 6);
    assert_eq!(decision.allowed_jobs, 4);
    assert_eq!(decision.active_jobs, 3);
    assert_eq!(decision.pending_jobs, 3);
    assert_eq!(decision.controller_reason, expected_reason);
}
/// With 2 GiB available against a 4 GiB emergency reserve and two builds already
/// active, the controller is expected to allow no more than the two active jobs.
#[test]
fn lexical_rebuild_staged_shard_build_controller_pauses_new_jobs_at_emergency_memory_reserve() {
    const GIB: usize = 1 << 30;
    let reserve_bytes = 16 * GIB;
    let emergency_reserve_bytes = 4 * GIB;
    let available_bytes = 2 * GIB;

    let controller = LexicalRebuildStagedShardBuildController::new_with_memory_reserves(
        6,
        None,
        reserve_bytes,
        emergency_reserve_bytes,
    );
    let pipeline_runtime = LexicalRebuildPipelineRuntimeSnapshot {
        host_available_memory_bytes: Some(available_bytes as u64),
        ..LexicalRebuildPipelineRuntimeSnapshot::default()
    };
    let merge_runtime = LexicalRebuildStagedMergeRuntimeSnapshot::default();

    let decision = controller.decide(&pipeline_runtime, &merge_runtime, 2, 4, Some(64 << 20));

    let expected_reason = format!(
        "host_available_memory_bytes_{}_below_emergency_reserve_{}_pausing_new_staged_shard_builds",
        available_bytes, emergency_reserve_bytes
    );
    assert_eq!(decision.allowed_jobs, 2);
    assert_eq!(decision.active_jobs, 2);
    assert_eq!(decision.pending_jobs, 4);
    assert_eq!(decision.controller_reason, expected_reason);
}
/// With 8 GiB available against a 16 GiB reserve and no active builds, the controller
/// is expected to allow exactly one staged shard build.
#[test]
fn lexical_rebuild_staged_shard_build_controller_allows_one_idle_job_below_memory_reserve() {
    const GIB: usize = 1 << 30;
    let reserve_bytes = 16 * GIB;
    let emergency_reserve_bytes = 4 * GIB;
    let available_bytes = 8 * GIB;

    let controller = LexicalRebuildStagedShardBuildController::new_with_memory_reserves(
        6,
        None,
        reserve_bytes,
        emergency_reserve_bytes,
    );
    let pipeline_runtime = LexicalRebuildPipelineRuntimeSnapshot {
        host_available_memory_bytes: Some(available_bytes as u64),
        ..LexicalRebuildPipelineRuntimeSnapshot::default()
    };
    let merge_runtime = LexicalRebuildStagedMergeRuntimeSnapshot::default();

    let decision = controller.decide(&pipeline_runtime, &merge_runtime, 0, 4, Some(64 << 20));

    let expected_reason = format!(
        "host_available_memory_bytes_{}_below_reserve_{}_limiting_staged_shard_builds_to_1",
        available_bytes, reserve_bytes
    );
    assert_eq!(decision.allowed_jobs, 1);
    assert_eq!(decision.controller_reason, expected_reason);
}
/// With 768 MiB available against a 1 GiB emergency reserve, no active builds, and a
/// 1 MiB job hint, the controller is expected to admit a single small build.
#[test]
fn lexical_rebuild_staged_shard_build_controller_allows_small_idle_probe_below_emergency() {
    const MIB: usize = 1 << 20;
    const GIB: usize = 1 << 30;
    let reserve_bytes = 4 * GIB;
    let emergency_reserve_bytes = GIB;
    let available_bytes = 768 * MIB;

    let controller = LexicalRebuildStagedShardBuildController::new_with_memory_reserves(
        6,
        None,
        reserve_bytes,
        emergency_reserve_bytes,
    );
    let pipeline_runtime = LexicalRebuildPipelineRuntimeSnapshot {
        host_available_memory_bytes: Some(available_bytes as u64),
        ..LexicalRebuildPipelineRuntimeSnapshot::default()
    };
    let merge_runtime = LexicalRebuildStagedMergeRuntimeSnapshot::default();

    let decision = controller.decide(&pipeline_runtime, &merge_runtime, 0, 4, Some(MIB));

    // The reason string is expected to report a 512 MiB builder estimate.
    let expected_reason = format!(
        "host_available_memory_bytes_{}_below_emergency_reserve_{}_admitting_single_small_staged_shard_build_estimated_builder_bytes_{}",
        available_bytes,
        emergency_reserve_bytes,
        512 * MIB
    );
    assert_eq!(decision.allowed_jobs, 1);
    assert_eq!(decision.controller_reason, expected_reason);
}
/// With 8 GiB available, a 4 GiB reserve, and an observed amplification of 24_000
/// milli, an eight-worker controller is expected to limit builds to one.
#[test]
fn lexical_rebuild_staged_shard_build_controller_caps_override_fanout_by_memory_headroom() {
    const GIB: usize = 1 << 30;
    let reserve_bytes = 4 * GIB;
    let emergency_reserve_bytes = GIB;
    let available_bytes = 8 * GIB;

    let controller = LexicalRebuildStagedShardBuildController::new_with_memory_reserves(
        8,
        None,
        reserve_bytes,
        emergency_reserve_bytes,
    );
    let pipeline_runtime = LexicalRebuildPipelineRuntimeSnapshot {
        host_available_memory_bytes: Some(available_bytes as u64),
        staged_shard_build_observed_amplification_milli: Some(24_000),
        ..LexicalRebuildPipelineRuntimeSnapshot::default()
    };
    let merge_runtime = LexicalRebuildStagedMergeRuntimeSnapshot::default();

    let decision = controller.decide(&pipeline_runtime, &merge_runtime, 0, 8, Some(64 << 20));

    // The reason string is expected to report 36x the 64 MiB job hint as the estimate.
    let expected_builder_bytes = 36usize * (64usize << 20);
    let expected_reason = format!(
        "host_available_memory_bytes_{}_reserve_{}_estimated_builder_bytes_{}_limiting_staged_shard_builds_to_1",
        available_bytes, reserve_bytes, expected_builder_bytes
    );
    assert_eq!(decision.allowed_jobs, 1);
    assert_eq!(decision.controller_reason, expected_reason);
}
/// Recording one shard build result must surface its raw figures in the snapshot,
/// while the observed amplification is reported as the configured floor value.
#[test]
fn lexical_rebuild_shard_build_telemetry_records_conservative_amplification() {
    let telemetry = LexicalRebuildShardBuildTelemetry::default();

    let planned_shard = LexicalShardPlanShard {
        shard_index: 7,
        first_conversation_id: 10,
        last_conversation_id: 11,
        conversation_count: 2,
        message_count: 4,
        message_bytes: 1_000,
        conversation_id_fingerprint: String::from("fingerprint"),
        oversized_single_conversation: false,
    };
    // 2_500 index bytes for 1_000 message bytes; asserted below to be 2_500 milli.
    let measured_amplification = lexical_rebuild_amplification_milli(2_500, 1_000);
    let build_result = LexicalRebuildShardBuildResult {
        shard: planned_shard,
        indexed_docs: 4,
        segments: 1,
        shard_index_path: PathBuf::from("/tmp/shard-7"),
        message_bytes: 1_000,
        index_size_bytes: 2_500,
        build_duration_ms: 123,
        amplification_milli: measured_amplification,
    };

    telemetry.record(&build_result);
    let observed = telemetry.snapshot();

    assert_eq!(observed.completed_jobs, 1);
    assert_eq!(observed.last_shard_index, Some(7));
    assert_eq!(observed.last_message_bytes, 1_000);
    assert_eq!(observed.last_index_size_bytes, 2_500);
    assert_eq!(observed.last_duration_ms, 123);
    assert_eq!(observed.last_amplification_milli, Some(2_500));
    assert_eq!(
        observed.observed_amplification_milli,
        Some(LEXICAL_REBUILD_STAGED_SHARD_BUILD_AMPLIFICATION_FLOOR_MILLI),
        "admission should use a conservative floor until observed amplification exceeds it"
    );
}
/// Sets every `CASS_TANTIVY_REBUILD_*` override used below (plus the writer-thread and
/// responsiveness switches) and checks that the settings snapshot reflects each value.
///
/// Marked `#[serial]` — presumably because it mutates process-wide environment
/// variables; confirm against the `serial` attribute's provider.
#[test]
#[serial]
fn lexical_rebuild_pipeline_settings_snapshot_honors_env_overrides() {
// Each binding is kept alive until the end of the test. NOTE(review): `set_env`
// presumably returns a guard that restores the previous value on drop — confirm.
let _responsiveness = set_env("CASS_RESPONSIVENESS_DISABLE", "1");
let _workers = set_env("CASS_TANTIVY_REBUILD_WORKERS", "7");
let _reserved_cores = set_env("CASS_TANTIVY_REBUILD_RESERVED_CORES", "4");
let _controller_mode = set_env("CASS_TANTIVY_REBUILD_CONTROLLER_MODE", "steady");
let _controller_clear_samples =
set_env("CASS_TANTIVY_REBUILD_CONTROLLER_RESTORE_CLEAR_SAMPLES", "5");
let _controller_hold_ms =
set_env("CASS_TANTIVY_REBUILD_CONTROLLER_RESTORE_HOLD_MS", "2345");
// Load-average watermarks are supplied as decimals and asserted below in milli units.
let _controller_loadavg_high = set_env(
"CASS_TANTIVY_REBUILD_CONTROLLER_LOADAVG_HIGH_WATERMARK_1M",
"7.5",
);
let _controller_loadavg_low = set_env(
"CASS_TANTIVY_REBUILD_CONTROLLER_LOADAVG_LOW_WATERMARK_1M",
"6.25",
);
// Steady-state and startup ("INITIAL") variants of the fetch and commit thresholds.
let _steady_fetch = set_env("CASS_TANTIVY_REBUILD_BATCH_FETCH_CONVERSATIONS", "321");
let _startup_fetch = set_env(
"CASS_TANTIVY_REBUILD_INITIAL_BATCH_FETCH_CONVERSATIONS",
"33",
);
let _steady_conversations =
set_env("CASS_TANTIVY_REBUILD_COMMIT_EVERY_CONVERSATIONS", "654");
let _startup_conversations = set_env(
"CASS_TANTIVY_REBUILD_INITIAL_COMMIT_EVERY_CONVERSATIONS",
"65",
);
let _steady_messages = set_env("CASS_TANTIVY_REBUILD_COMMIT_EVERY_MESSAGES", "987");
let _startup_messages = set_env("CASS_TANTIVY_REBUILD_INITIAL_COMMIT_EVERY_MESSAGES", "98");
let _steady_message_bytes =
set_env("CASS_TANTIVY_REBUILD_COMMIT_EVERY_MESSAGE_BYTES", "123456");
let _startup_message_bytes = set_env(
"CASS_TANTIVY_REBUILD_INITIAL_COMMIT_EVERY_MESSAGE_BYTES",
"12345",
);
// Pipeline sizing and worker-count overrides.
let _pipeline_channel = set_env("CASS_TANTIVY_REBUILD_PIPELINE_CHANNEL_SIZE", "5");
let _page_prep_workers = set_env("CASS_TANTIVY_REBUILD_PAGE_PREP_WORKERS", "3");
let _pipeline_bytes = set_env(
"CASS_TANTIVY_REBUILD_PIPELINE_MAX_MESSAGE_BYTES_IN_FLIGHT",
"777777",
);
let _writer_threads = set_env("CASS_TANTIVY_MAX_WRITER_THREADS", "2");
let _shard_builders = set_env("CASS_TANTIVY_REBUILD_STAGED_SHARD_BUILDERS", "4");
let _merge_workers = set_env("CASS_TANTIVY_REBUILD_STAGED_MERGE_WORKERS", "2");
let snapshot = lexical_rebuild_pipeline_settings_snapshot();
assert_eq!(snapshot.workers, 7);
assert!(snapshot.available_parallelism >= 1);
// Reserved cores are expected to be capped at one less than the available parallelism.
assert_eq!(
snapshot.reserved_cores,
4.min(snapshot.available_parallelism.saturating_sub(1))
);
// Writer threads are expected to be capped by the available parallelism.
assert_eq!(
snapshot.tantivy_writer_threads,
snapshot.available_parallelism.min(2)
);
// Worker-count overrides are compared after `effective_worker_count`, floored at one.
assert_eq!(
snapshot.staged_shard_builders,
responsiveness::effective_worker_count(4).max(1)
);
assert_eq!(
snapshot.staged_merge_workers,
responsiveness::effective_worker_count(2).max(1)
);
assert_eq!(snapshot.controller_mode, "steady");
assert_eq!(snapshot.controller_restore_clear_samples, 5);
assert_eq!(snapshot.controller_restore_hold_ms, 2345);
assert_eq!(
snapshot.controller_loadavg_high_watermark_1m_milli,
Some(7_500)
);
assert_eq!(
snapshot.controller_loadavg_low_watermark_1m_milli,
Some(6_250)
);
// No override is set for the page size, so it is compared against the constant.
assert_eq!(snapshot.page_size, LEXICAL_REBUILD_PAGE_SIZE);
assert_eq!(snapshot.steady_batch_fetch_conversations, 321);
assert_eq!(snapshot.startup_batch_fetch_conversations, 33);
assert_eq!(snapshot.steady_commit_every_conversations, 654);
assert_eq!(snapshot.startup_commit_every_conversations, 65);
assert_eq!(snapshot.steady_commit_every_messages, 987);
assert_eq!(snapshot.startup_commit_every_messages, 98);
assert_eq!(snapshot.steady_commit_every_message_bytes, 123456);
assert_eq!(snapshot.startup_commit_every_message_bytes, 12345);
assert_eq!(snapshot.pipeline_channel_size, 5);
assert_eq!(
snapshot.page_prep_workers,
responsiveness::effective_worker_count(3).max(1)
);
assert_eq!(snapshot.pipeline_max_message_bytes_in_flight, 777777);
}
/// With the controller mode and both load-average watermark variables unset, the
/// snapshot must report the "steady" mode and no watermarks.
#[test]
#[serial]
fn lexical_rebuild_pipeline_settings_snapshot_disables_global_controller() {
    // Bindings are held until the end of the test so the environment stays pinned.
    let _responsiveness_guard = set_env("CASS_RESPONSIVENESS_DISABLE", "1");
    let _mode_guard = unset_env_var("CASS_TANTIVY_REBUILD_CONTROLLER_MODE");
    let _high_watermark_guard =
        unset_env_var("CASS_TANTIVY_REBUILD_CONTROLLER_LOADAVG_HIGH_WATERMARK_1M");
    let _low_watermark_guard =
        unset_env_var("CASS_TANTIVY_REBUILD_CONTROLLER_LOADAVG_LOW_WATERMARK_1M");

    let settings = lexical_rebuild_pipeline_settings_snapshot();

    assert_eq!(settings.controller_mode, "steady");
    let high_watermark = settings.controller_loadavg_high_watermark_1m_milli;
    assert_eq!(high_watermark, None);
    let low_watermark = settings.controller_loadavg_low_watermark_1m_milli;
    assert_eq!(low_watermark, None);
}
/// Pins the default (reserved cores, worker parallelism) pair for a range of
/// available core counts.
#[test]
fn lexical_rebuild_default_worker_parallelism_reserves_machine_headroom() {
    // Evaluates both defaults for one core count, reserved cores first.
    let defaults_for = |available| {
        (
            lexical_rebuild_default_reserved_cores_for_available(available),
            lexical_rebuild_default_worker_parallelism_for_available(available),
        )
    };

    assert_eq!(defaults_for(1), (0, 1));
    assert_eq!(defaults_for(4), (1, 3));
    assert_eq!(defaults_for(8), (2, 6));
    assert_eq!(defaults_for(32), (4, 28));
    assert_eq!(defaults_for(128), (8, 64));
}
/// Pins the default batch-fetch conversation limit for several combinations of the
/// two inputs; the expected values are listed in the same order as the calls.
#[test]
fn lexical_rebuild_default_batch_fetch_conversation_limit_scales_with_writer_parallelism() {
    let observed_limits = [
        lexical_rebuild_default_batch_fetch_conversation_limit(1024, 1),
        lexical_rebuild_default_batch_fetch_conversation_limit(1024, 4),
        lexical_rebuild_default_batch_fetch_conversation_limit(1024, 8),
        lexical_rebuild_default_batch_fetch_conversation_limit(1024, 16),
        lexical_rebuild_default_batch_fetch_conversation_limit(256, 16),
    ];
    assert_eq!(observed_limits, [512, 512, 1024, 1024, 256]);
}
/// Pins the default staged merge worker count for worker budgets of 1, 4, 8, 12, 32
/// and 64; the expected values are listed in the same order as the calls.
#[test]
fn lexical_rebuild_default_staged_merge_worker_parallelism_tracks_quarter_budget() {
    let observed_parallelism = [
        lexical_rebuild_default_staged_merge_worker_parallelism_for_workers(1),
        lexical_rebuild_default_staged_merge_worker_parallelism_for_workers(4),
        lexical_rebuild_default_staged_merge_worker_parallelism_for_workers(8),
        lexical_rebuild_default_staged_merge_worker_parallelism_for_workers(12),
        lexical_rebuild_default_staged_merge_worker_parallelism_for_workers(32),
        lexical_rebuild_default_staged_merge_worker_parallelism_for_workers(64),
    ];
    assert_eq!(observed_parallelism, [1, 1, 2, 3, 8, 8]);
}
/// Frontier reduction must be declined for sizes up to and including the federated
/// shard limit, and requested for the first size above it.
#[test]
fn lexical_rebuild_final_frontier_reduction_only_runs_above_federated_publish_cap() {
    let fan_in = LexicalRebuildShardMergeCoordinator::EAGER_MERGE_FAN_IN;
    let federated_cap = LEXICAL_REBUILD_FINAL_FRONTIER_FEDERATED_SHARD_LIMIT;

    // Sizes that must not trigger a reduction, checked in ascending source order.
    for frontier_len in [0, fan_in + 1, federated_cap] {
        assert!(!should_reduce_staged_lexical_final_frontier(frontier_len));
    }

    // One past the cap is the first size that must trigger a reduction.
    assert!(should_reduce_staged_lexical_final_frontier(
        federated_cap + 1
    ));
}
/// After the producer has finished, a ready frontier of exactly one eager fan-in group
/// must not be given any merge budget: the controller is expected to report zero
/// allowed jobs with a reason mentioning `final_frontier_within_federated_cap`.
///
/// The expected ready-artifact count is derived from
/// `LexicalRebuildShardMergeCoordinator::EAGER_MERGE_FAN_IN` (the number of artifacts
/// queued below) rather than a hard-coded literal, so the check stays aligned with the
/// setup and with the sibling controller tests if the fan-in constant changes.
#[test]
fn staged_merge_controller_skips_finished_tail_merges_within_federated_publish_cap() {
    let fan_in = LexicalRebuildShardMergeCoordinator::EAGER_MERGE_FAN_IN;
    let tmp = TempDir::new().unwrap();
    // The receiver is never read; it stays bound for the whole test, presumably so the
    // job channel is not disconnected while artifacts are queued.
    let (merge_work_tx, _merge_work_rx) = bounded::<LexicalRebuildShardMergeJob>(1);
    let mut merge_coordinator =
        LexicalRebuildShardMergeCoordinator::new(tmp.path().join("eager-merge-stage"));

    // Queue exactly one fan-in group of single-shard base artifacts. The index paths
    // are never created on disk; only the coordinator's bookkeeping is exercised here.
    for shard_index in 0..fan_in {
        merge_coordinator
            .queue_base_artifact(
                LexicalRebuildShardMergeArtifact {
                    first_shard_index: shard_index,
                    last_shard_index: shard_index,
                    index_path: tmp.path().join(format!("shard-{shard_index:05}")),
                    docs: 1,
                    segments: 1,
                },
                &merge_work_tx,
            )
            .unwrap();
    }

    let controller = LexicalRebuildStagedMergeController::new(8, None);
    // `true` marks the producer as finished for this decision.
    let decision = controller.decide(
        true,
        &LexicalRebuildPipelineRuntimeSnapshot::default(),
        &merge_coordinator,
    );

    assert_eq!(decision.ready_artifacts, fan_in);
    assert_eq!(decision.ready_groups, 1);
    assert_eq!(
        decision.allowed_jobs, 0,
        "a bounded final frontier should publish federated instead of paying another eager merge"
    );
    assert!(
        decision
            .controller_reason
            .contains("final_frontier_within_federated_cap"),
        "unexpected controller reason: {}",
        decision.controller_reason
    );
}
/// Pins the default staged shard-builder count for sampled (worker budget, available memory)
/// pairs, including the case where available memory is unknown (`None`).
#[test]
fn lexical_rebuild_default_staged_shard_builder_parallelism_uses_bounded_builder_farm() {
    const GIB: u64 = 1024 * 1024 * 1024;
    let builders_for =
        lexical_rebuild_default_staged_shard_builder_parallelism_for_workers_and_memory;

    // Without a memory reading the sampled budgets pass straight through.
    assert_eq!(builders_for(1, None), 1);
    assert_eq!(builders_for(4, None), 4);

    // With a memory reading the builder count is held below the worker budget.
    assert_eq!(
        builders_for(8, Some(128 * GIB)),
        2,
        "128 GiB hosts should not default to an 8-shard Tantivy build storm"
    );
    assert_eq!(builders_for(32, Some(512 * GIB)), 8);
}
/// Pins the default per-shard message byte cap for sampled available-memory readings:
/// unknown memory and 128 GiB give 64 MiB, 32 GiB gives 16 MiB, 512 GiB gives 128 MiB.
#[test]
fn lexical_rebuild_default_staged_shard_max_message_bytes_scales_with_available_memory() {
    const GIB: u64 = 1024 * 1024 * 1024;
    let shard_cap_for =
        lexical_rebuild_default_staged_shard_max_message_bytes_for_available_memory;

    assert_eq!(shard_cap_for(None), 64 * 1024 * 1024);
    assert_eq!(shard_cap_for(Some(32 * GIB)), 16 * 1024 * 1024);
    assert_eq!(shard_cap_for(Some(128 * GIB)), 64 * 1024 * 1024);
    assert_eq!(shard_cap_for(Some(512 * GIB)), 128 * 1024 * 1024);
}
/// With `CASS_TANTIVY_REBUILD_STAGED_SHARD_MAX_MESSAGE_BYTES` set to `65536`, the default
/// shard planner budgets must report exactly that value as the per-shard byte cap.
#[test]
#[serial]
fn lexical_rebuild_shard_planner_respects_staged_shard_byte_cap() {
    const TOTAL_MESSAGE_BYTES: usize = 4 * 1024 * 1024 * 1024;

    // Guard is held until the end of the test so the override covers the budget computation.
    let _byte_cap_guard = set_env(
        "CASS_TANTIVY_REBUILD_STAGED_SHARD_MAX_MESSAGE_BYTES",
        "65536",
    );

    let snapshot = LexicalRebuildPipelineSettingsSnapshot {
        // Parallelism.
        workers: 12,
        available_parallelism: 12,
        reserved_cores: 2,
        tantivy_writer_threads: 8,
        staged_shard_builders: 8,
        staged_merge_workers: 3,
        page_prep_workers: 6,
        // Responsiveness controller.
        controller_mode: "steady".into(),
        controller_restore_clear_samples: 3,
        controller_restore_hold_ms: 0,
        controller_loadavg_high_watermark_1m_milli: None,
        controller_loadavg_low_watermark_1m_milli: None,
        // Paging and batch fetch.
        page_size: LEXICAL_REBUILD_PAGE_SIZE,
        startup_batch_fetch_conversations: 256,
        steady_batch_fetch_conversations: 1024,
        // Commit cadence.
        startup_commit_every_conversations: 256,
        steady_commit_every_conversations: 1024,
        startup_commit_every_messages: 512,
        steady_commit_every_messages: 2048,
        startup_commit_every_message_bytes: 128 * 1024 * 1024,
        steady_commit_every_message_bytes: 512 * 1024 * 1024,
        // Pipeline hand-off.
        pipeline_channel_size: 2,
        pipeline_max_message_bytes_in_flight: 4 * 1024 * 1024,
    };

    let planner_budgets = lexical_rebuild_default_shard_planner_budgets_for_totals(
        &snapshot,
        10_000,
        1_000_000,
        TOTAL_MESSAGE_BYTES,
    );

    assert_eq!(planner_budgets.max_message_bytes_per_shard, 65_536);
}
/// Exercises the pending shard-build job cap in three steps:
///
/// 1. With the env override set to `"0"`, 2 staged shard builders yield a cap of 64.
/// 2. Still under that override, 16 builders yield a cap of 256.
/// 3. With the override set to `"17"`, the cap is 17.
#[test]
#[serial]
fn lexical_rebuild_pending_shard_build_job_cap_scales_with_builders_and_env() {
    const MAX_JOBS_ENV: &str = "CASS_TANTIVY_REBUILD_PENDING_SHARD_BUILD_MAX_JOBS";

    let zero_override_guard = set_env(MAX_JOBS_ENV, "0");
    let mut snapshot = LexicalRebuildPipelineSettingsSnapshot {
        // Parallelism.
        workers: 8,
        available_parallelism: 8,
        reserved_cores: 2,
        tantivy_writer_threads: 8,
        staged_shard_builders: 2,
        staged_merge_workers: 1,
        page_prep_workers: 6,
        // Responsiveness controller.
        controller_mode: "steady".into(),
        controller_restore_clear_samples: 3,
        controller_restore_hold_ms: 0,
        controller_loadavg_high_watermark_1m_milli: None,
        controller_loadavg_low_watermark_1m_milli: None,
        // Paging and batch fetch.
        page_size: LEXICAL_REBUILD_PAGE_SIZE,
        startup_batch_fetch_conversations: 256,
        steady_batch_fetch_conversations: 1024,
        // Commit cadence.
        startup_commit_every_conversations: 256,
        steady_commit_every_conversations: 1024,
        startup_commit_every_messages: 512,
        steady_commit_every_messages: 2048,
        startup_commit_every_message_bytes: 128 * 1024 * 1024,
        steady_commit_every_message_bytes: 512 * 1024 * 1024,
        // Pipeline hand-off.
        pipeline_channel_size: 2,
        pipeline_max_message_bytes_in_flight: 4 * 1024 * 1024,
    };

    let cap_with_two_builders = lexical_rebuild_pending_shard_build_max_jobs(&snapshot);
    assert_eq!(cap_with_two_builders, 64);

    snapshot.staged_shard_builders = 16;
    let cap_with_sixteen_builders = lexical_rebuild_pending_shard_build_max_jobs(&snapshot);
    assert_eq!(cap_with_sixteen_builders, 256);

    // Release the first guard before installing the explicit override.
    drop(zero_override_guard);
    let _explicit_override_guard = set_env(MAX_JOBS_ENV, "17");
    assert_eq!(lexical_rebuild_pending_shard_build_max_jobs(&snapshot), 17);
}
/// Plans shards for a synthetic corpus of 56,512 conversations, 5,347,311 messages and 8 GiB
/// of message bytes spread as evenly as possible, with the staged shard byte cap forced to
/// 64 MiB through the environment.
///
/// The plan must use the forced cap, contain at least `ceil(total bytes / cap)` shards, and
/// keep every shard at or below the cap.
#[test]
#[serial]
fn lexical_rebuild_large_corpus_planner_keeps_5m_message_shards_byte_bounded() {
    const CONVERSATIONS: usize = 56_512;
    const MESSAGES: usize = 5_347_311;
    const MESSAGE_BYTES: usize = 8 * 1024 * 1024 * 1024;
    const SHARD_CAP: usize = 64 * 1024 * 1024;

    let _shard_cap_guard = set_env(
        "CASS_TANTIVY_REBUILD_STAGED_SHARD_MAX_MESSAGE_BYTES",
        &SHARD_CAP.to_string(),
    );

    let snapshot = LexicalRebuildPipelineSettingsSnapshot {
        // Parallelism.
        workers: 16,
        available_parallelism: 16,
        reserved_cores: 2,
        tantivy_writer_threads: 8,
        staged_shard_builders: 2,
        staged_merge_workers: 2,
        page_prep_workers: 8,
        // Responsiveness controller.
        controller_mode: "steady".into(),
        controller_restore_clear_samples: 3,
        controller_restore_hold_ms: 0,
        controller_loadavg_high_watermark_1m_milli: None,
        controller_loadavg_low_watermark_1m_milli: None,
        // Paging and batch fetch.
        page_size: LEXICAL_REBUILD_PAGE_SIZE,
        startup_batch_fetch_conversations: 256,
        steady_batch_fetch_conversations: 1024,
        // Commit cadence.
        startup_commit_every_conversations: 2_048,
        steady_commit_every_conversations: 10_000,
        startup_commit_every_messages: 800_000,
        steady_commit_every_messages: 800_000,
        startup_commit_every_message_bytes: 128 * 1024 * 1024,
        steady_commit_every_message_bytes: 512 * 1024 * 1024,
        // Pipeline hand-off.
        pipeline_channel_size: 4,
        pipeline_max_message_bytes_in_flight: 512 * 1024 * 1024,
    };

    let budgets = lexical_rebuild_default_shard_planner_budgets_for_totals(
        &snapshot,
        CONVERSATIONS,
        MESSAGES,
        MESSAGE_BYTES,
    );

    // Even split: the first `total % CONVERSATIONS` conversations absorb one extra unit each.
    let even_share = |total: usize, idx: usize| {
        total / CONVERSATIONS + usize::from(idx < total % CONVERSATIONS)
    };
    let corpus: Vec<_> = (0..CONVERSATIONS)
        .map(|idx| LexicalShardPlannerConversation {
            conversation_id: i64::try_from(idx + 1).unwrap(),
            message_count: even_share(MESSAGES, idx),
            message_bytes: even_share(MESSAGE_BYTES, idx),
        })
        .collect();

    let plan = plan_lexical_rebuild_shards(&corpus, budgets);

    assert_eq!(budgets.max_message_bytes_per_shard, SHARD_CAP);

    let minimum_shards = MESSAGE_BYTES.div_ceil(SHARD_CAP);
    assert!(
        plan.shards.len() >= minimum_shards,
        "large corpus should be split into enough shards to keep Tantivy builder heaps bounded"
    );

    let any_shard_over_cap = plan
        .shards
        .iter()
        .any(|shard| shard.message_bytes > SHARD_CAP);
    assert!(
        !any_shard_over_cap,
        "no synthetic 5M-message shard should exceed the staged shard byte cap"
    );
}
/// Checks how staged shard-builder settings are derived from a settings snapshot and the
/// numeric second argument:
///
/// * 3 → 3 builders, writer budget 8;
/// * 32 → 8 builders, writer budget 8;
/// * 32 with `tantivy_writer_threads` lowered to 4 → 4 builders, writer budget 4.
#[test]
fn lexical_rebuild_staged_shard_builder_settings_preserve_total_writer_budget() {
    let snapshot = LexicalRebuildPipelineSettingsSnapshot {
        // Parallelism.
        workers: 12,
        available_parallelism: 12,
        reserved_cores: 2,
        tantivy_writer_threads: 8,
        staged_shard_builders: 8,
        staged_merge_workers: 3,
        page_prep_workers: 6,
        // Responsiveness controller.
        controller_mode: "steady".into(),
        controller_restore_clear_samples: 3,
        controller_restore_hold_ms: 0,
        controller_loadavg_high_watermark_1m_milli: None,
        controller_loadavg_low_watermark_1m_milli: None,
        // Paging and batch fetch.
        page_size: LEXICAL_REBUILD_PAGE_SIZE,
        startup_batch_fetch_conversations: 256,
        steady_batch_fetch_conversations: 1024,
        // Commit cadence.
        startup_commit_every_conversations: 256,
        steady_commit_every_conversations: 1024,
        startup_commit_every_messages: 512,
        steady_commit_every_messages: 2048,
        startup_commit_every_message_bytes: 250_000,
        steady_commit_every_message_bytes: 1_000_000,
        // Pipeline hand-off.
        pipeline_channel_size: 2,
        pipeline_max_message_bytes_in_flight: 4 * 1024 * 1024,
    };
    let builder_settings = |max_builders, writer_parallelism_budget| {
        LexicalRebuildShardBuilderSettings {
            max_builders,
            writer_parallelism_budget,
        }
    };

    assert_eq!(
        lexical_rebuild_staged_shard_builder_settings(&snapshot, 3),
        builder_settings(3, 8)
    );
    assert_eq!(
        lexical_rebuild_staged_shard_builder_settings(&snapshot, 32),
        builder_settings(8, 8)
    );

    // Same snapshot with only the writer thread budget lowered.
    let narrow_writer_budget = LexicalRebuildPipelineSettingsSnapshot {
        tantivy_writer_threads: 4,
        ..snapshot
    };
    assert_eq!(
        lexical_rebuild_staged_shard_builder_settings(&narrow_writer_budget, 32),
        builder_settings(4, 4)
    );
}
/// Checks the per-slot writer parallelism handed out at dispatch time for a budget of 8:
/// six slots get `[2, 2, 1, 1, 1, 1]` (summing to the budget) and two slots get `[4, 4]`.
#[test]
fn lexical_rebuild_staged_shard_builder_dispatch_writer_parallelism_rebalances_budget() {
    let writer_share = lexical_rebuild_staged_shard_builder_writer_parallelism_for_dispatch;

    let six_way: Vec<_> = (0..6).map(|slot| writer_share(8, 6, slot)).collect();
    assert_eq!(six_way, vec![2, 2, 1, 1, 1, 1]);
    assert_eq!(six_way.iter().sum::<usize>(), 8);

    let two_way: Vec<_> = (0..2).map(|slot| writer_share(8, 2, slot)).collect();
    assert_eq!(two_way, vec![4, 4]);
}
/// Planning the same three conversations in sorted and in permuted order must produce equal
/// plans: two shards covering conversation ids 10..=20 and 30..=30 under a
/// two-conversations-per-shard budget.
#[test]
fn lexical_shard_plan_is_deterministic_across_input_order() {
    let conversation = |conversation_id, message_count, message_bytes| {
        LexicalShardPlannerConversation {
            conversation_id,
            message_count,
            message_bytes,
        }
    };
    let budgets = LexicalShardPlannerBudgets {
        max_conversations_per_shard: 2,
        max_messages_per_shard: 20,
        max_message_bytes_per_shard: 2_000,
    };

    let sorted_input = vec![
        conversation(10, 4, 400),
        conversation(20, 5, 500),
        conversation(30, 6, 600),
    ];
    let shuffled_input = vec![sorted_input[2], sorted_input[0], sorted_input[1]];

    let sorted_plan = plan_lexical_rebuild_shards(&sorted_input, budgets);
    let shuffled_plan = plan_lexical_rebuild_shards(&shuffled_input, budgets);
    assert_eq!(sorted_plan, shuffled_plan);

    // Exactly two shards, with these (first, last) conversation id ranges.
    let shard_ranges: Vec<_> = sorted_plan
        .shards
        .iter()
        .map(|shard| (shard.first_conversation_id, shard.last_conversation_id))
        .collect();
    assert_eq!(shard_ranges, vec![(10, 20), (30, 30)]);
}
/// Two single-shard plans that share the same id range (1..=4), conversation count and
/// per-conversation totals but differ in one interior id (2 vs 3) must still differ in both
/// the shard's conversation-id fingerprint and the plan id.
#[test]
fn lexical_shard_plan_id_changes_when_sparse_assignments_change() {
    let budgets = LexicalShardPlannerBudgets {
        max_conversations_per_shard: 3,
        max_messages_per_shard: 20,
        max_message_bytes_per_shard: 2_000,
    };
    // Every conversation carries identical totals; only the ids vary between plans.
    let plan_for = |conversation_ids: [i64; 3]| {
        let conversations =
            conversation_ids.map(|conversation_id| LexicalShardPlannerConversation {
                conversation_id,
                message_count: 2,
                message_bytes: 200,
            });
        plan_lexical_rebuild_shards(&conversations, budgets)
    };

    let plan_a = plan_for([1, 2, 4]);
    let plan_b = plan_for([1, 3, 4]);

    // Outwardly the two plans look the same.
    for plan in [&plan_a, &plan_b] {
        assert_eq!(plan.shards.len(), 1);
        let only_shard = &plan.shards[0];
        assert_eq!(only_shard.first_conversation_id, 1);
        assert_eq!(only_shard.last_conversation_id, 4);
        assert_eq!(only_shard.conversation_count, 3);
    }

    assert_ne!(
        plan_a.shards[0].conversation_id_fingerprint,
        plan_b.shards[0].conversation_id_fingerprint,
        "sparse shard assignments with the same range and totals need distinct shard evidence"
    );
    assert_ne!(
        plan_a.plan_id, plan_b.plan_id,
        "plan identity must include interior conversation IDs, not only shard ranges"
    );
}
/// A conversation that alone exceeds the per-shard message and byte budgets (id 2) must be
/// reported as oversized and placed in its own shard, with its neighbours in separate shards
/// before and after it.
#[test]
fn lexical_shard_plan_isolates_oversized_single_conversation() {
    let conversation = |conversation_id, message_count, message_bytes| {
        LexicalShardPlannerConversation {
            conversation_id,
            message_count,
            message_bytes,
        }
    };
    let budgets = LexicalShardPlannerBudgets {
        max_conversations_per_shard: 3,
        max_messages_per_shard: 10,
        max_message_bytes_per_shard: 1_000,
    };
    let input = [
        conversation(1, 3, 300),
        // 50 messages / 5,000 bytes against budgets of 10 / 1,000.
        conversation(2, 50, 5_000),
        conversation(3, 2, 200),
    ];

    let plan = plan_lexical_rebuild_shards(&input, budgets);

    assert_eq!(plan.oversized_conversation_ids, vec![2]);
    assert_eq!(plan.shards.len(), 3);

    let (leading, isolated, trailing) = (&plan.shards[0], &plan.shards[1], &plan.shards[2]);
    assert!(!leading.oversized_single_conversation);
    assert!(isolated.oversized_single_conversation);
    assert_eq!(isolated.first_conversation_id, 2);
    assert_eq!(isolated.last_conversation_id, 2);
    assert_eq!(trailing.first_conversation_id, 3);
    assert_eq!(trailing.last_conversation_id, 3);
}
/// Pins the target shard count for sampled argument pairs: (1, 1) → 4, (8, 4) → 32,
/// (4, 12) → 48 and (128, 64) → 256.
#[test]
fn lexical_rebuild_target_shard_count_scales_with_parallelism() {
    let samples = [((1, 1), 4), ((8, 4), 32), ((4, 12), 48), ((128, 64), 256)];
    for ((first, second), expected_shards) in samples {
        assert_eq!(
            lexical_rebuild_target_shard_count(first, second),
            expected_shards
        );
    }
}
/// Pins the default shard budget for three sampled argument sets: one where the result lands
/// strictly between the third and fourth arguments, one where it equals the third, and one
/// where it equals the fourth.
#[test]
fn lexical_rebuild_default_shard_budget_respects_parallel_targets_and_caps() {
    let shard_budget = lexical_rebuild_default_shard_budget;

    // 48_000 / 48 = 1_000, which sits between 512 and 10_000.
    assert_eq!(shard_budget(48_000, 48, 512, 10_000), 1_000);
    // 4_000 / 48 is below 512; the result is 512.
    assert_eq!(shard_budget(4_000, 48, 512, 10_000), 512);
    // 5_000_000 / 16 is above 250_000; the result is 250_000.
    assert_eq!(shard_budget(5_000_000, 16, 32_000, 250_000), 250_000);
}
/// Stores three conversations (two ASCII messages, no messages, one multi-byte UTF-8 message)
/// and checks the planner rows read back from storage.
///
/// Each row must carry the conversation's message count, and `message_bytes` must equal that
/// count multiplied by `LEXICAL_REBUILD_PLANNER_ESTIMATED_BYTES_PER_MESSAGE`.
#[test]
fn lexical_rebuild_shard_planner_conversations_from_storage_uses_estimated_byte_footprints() {
    let scratch = TempDir::new().unwrap();
    let db_path = scratch.path().join("db.sqlite");
    let storage = FrankenStorage::open(&db_path).unwrap();
    ensure_fts_schema(&storage);

    let codex = Agent {
        id: None,
        slug: "codex".into(),
        name: "Codex".into(),
        version: Some("0.2.3".into()),
        kind: AgentKind::Cli,
    };
    let agent_id = storage.ensure_agent(&codex).unwrap();

    // Builds a bare message; only index, role, timestamp and content vary per fixture.
    let message = |idx, role, created_at, content: &str| Message {
        id: None,
        idx,
        role,
        author: None,
        created_at: Some(created_at),
        content: content.into(),
        extra_json: serde_json::Value::Null,
        snippets: Vec::new(),
    };
    // Persists one conversation and returns the id assigned by storage.
    let store_conversation = |external_id: &str, base_ts: i64, messages: Vec<Message>| {
        let conversation = Conversation {
            id: None,
            agent_slug: "codex".into(),
            workspace: Some(PathBuf::from("/tmp/workspace")),
            external_id: Some(external_id.to_string()),
            title: Some(external_id.to_string()),
            source_path: PathBuf::from(format!("/tmp/{external_id}.jsonl")),
            started_at: Some(base_ts),
            ended_at: Some(base_ts + 100),
            approx_tokens: Some(64),
            metadata_json: serde_json::Value::Null,
            messages,
            source_id: LOCAL_SOURCE_ID.into(),
            origin_host: None,
        };
        storage
            .insert_conversation_tree(agent_id, None, &conversation)
            .unwrap()
            .conversation_id
    };

    let ascii_id = store_conversation(
        "planner-ascii",
        1_700_000_000_000,
        vec![
            message(0, MessageRole::User, 1_700_000_000_010, "abc"),
            message(1, MessageRole::Agent, 1_700_000_000_020, "defg"),
        ],
    );
    let empty_id = store_conversation("planner-empty", 1_700_000_001_000, Vec::new());
    let utf8_id = store_conversation(
        "planner-utf8",
        1_700_000_002_000,
        vec![message(0, MessageRole::Tool, 1_700_000_002_010, "hé🙂")],
    );

    let planner_rows =
        lexical_rebuild_shard_planner_conversations_from_storage(&storage).unwrap();

    let expected_rows = vec![
        LexicalShardPlannerConversation {
            conversation_id: ascii_id,
            message_count: 2,
            message_bytes: 2
                * crate::storage::sqlite::LEXICAL_REBUILD_PLANNER_ESTIMATED_BYTES_PER_MESSAGE,
        },
        LexicalShardPlannerConversation {
            conversation_id: empty_id,
            message_count: 0,
            message_bytes: 0,
        },
        LexicalShardPlannerConversation {
            conversation_id: utf8_id,
            message_count: 1,
            message_bytes:
                crate::storage::sqlite::LEXICAL_REBUILD_PLANNER_ESTIMATED_BYTES_PER_MESSAGE,
        },
    ];
    assert_eq!(planner_rows, expected_rows);
}
/// Stores three conversations of four messages each and plans shards from storage with a
/// single-worker settings snapshot whose commit cadence is 5 messages.
///
/// The plan must total 3 conversations / 12 messages and split into three one-conversation
/// shards whose `message_count` is `LEXICAL_SHARD_UNKNOWN_MESSAGE_COUNT`.
#[test]
fn lexical_rebuild_shard_plan_from_storage_uses_message_footprints() {
    let scratch = TempDir::new().unwrap();
    let db_path = scratch.path().join("db.sqlite");
    let storage = FrankenStorage::open(&db_path).unwrap();
    ensure_fts_schema(&storage);

    let codex = Agent {
        id: None,
        slug: "codex".into(),
        name: "Codex".into(),
        version: Some("0.2.3".into()),
        kind: AgentKind::Cli,
    };
    let agent_id = storage.ensure_agent(&codex).unwrap();

    for conversation_idx in 0..3 {
        let external_id = format!("footprint-plan-{conversation_idx}");
        let messages: Vec<_> = (0..4)
            .map(|message_idx| Message {
                id: None,
                idx: message_idx,
                role: MessageRole::User,
                author: None,
                created_at: Some(1_700_000_000_000 + message_idx),
                content: format!("{external_id}-{message_idx}"),
                extra_json: serde_json::Value::Null,
                snippets: Vec::new(),
            })
            .collect();
        let conversation = Conversation {
            id: None,
            agent_slug: "codex".into(),
            workspace: Some(PathBuf::from("/tmp/workspace")),
            external_id: Some(external_id.clone()),
            title: Some(external_id.clone()),
            source_path: PathBuf::from(format!("/tmp/{external_id}.jsonl")),
            started_at: Some(1_700_000_000_000),
            ended_at: Some(1_700_000_000_100),
            approx_tokens: Some(64),
            metadata_json: serde_json::Value::Null,
            messages,
            source_id: LOCAL_SOURCE_ID.into(),
            origin_host: None,
        };
        storage
            .insert_conversation_tree(agent_id, None, &conversation)
            .unwrap();
    }

    let snapshot = LexicalRebuildPipelineSettingsSnapshot {
        // Parallelism: everything pinned to a single worker.
        workers: 1,
        available_parallelism: 1,
        reserved_cores: 0,
        tantivy_writer_threads: 1,
        staged_shard_builders: 1,
        staged_merge_workers: 1,
        page_prep_workers: 1,
        // Responsiveness controller.
        controller_mode: "steady".into(),
        controller_restore_clear_samples: 1,
        controller_restore_hold_ms: 0,
        controller_loadavg_high_watermark_1m_milli: None,
        controller_loadavg_low_watermark_1m_milli: None,
        // Paging and batch fetch.
        page_size: LEXICAL_REBUILD_PAGE_SIZE,
        startup_batch_fetch_conversations: 10,
        steady_batch_fetch_conversations: 10,
        // Commit cadence.
        startup_commit_every_conversations: 10,
        steady_commit_every_conversations: 10,
        startup_commit_every_messages: 5,
        steady_commit_every_messages: 5,
        startup_commit_every_message_bytes: 1024 * 1024,
        steady_commit_every_message_bytes: 1024 * 1024,
        // Pipeline hand-off.
        pipeline_channel_size: 2,
        pipeline_max_message_bytes_in_flight: 2 * 1024 * 1024,
    };

    let plan =
        plan_lexical_rebuild_shards_from_storage_with_settings(&storage, &snapshot, 3).unwrap();

    assert_eq!(plan.total_conversations, 3);
    assert_eq!(plan.total_messages, 12);
    assert_eq!(plan.shards.len(), 3);

    let shard_message_counts: Vec<_> = plan
        .shards
        .iter()
        .map(|shard| shard.message_count)
        .collect();
    assert_eq!(
        shard_message_counts,
        vec![LEXICAL_SHARD_UNKNOWN_MESSAGE_COUNT; 3]
    );

    let shard_conversation_counts: Vec<_> = plan
        .shards
        .iter()
        .map(|shard| shard.conversation_count)
        .collect();
    assert_eq!(shard_conversation_counts, vec![1, 1, 1]);
}
/// Builds a hand-rolled "legacy" database whose `conversations` table has only `id` and
/// `source_path` and which has no `conversation_tail_state` table, fills it with 130
/// conversations of one message each, and plans shards from it.
///
/// The storage must report that tail footprint metadata is absent, and the resulting plan
/// must split the 130 conversations into shards of 64, 64 and 2 with every shard's
/// `message_count` left at `LEXICAL_SHARD_UNKNOWN_MESSAGE_COUNT`.
#[test]
fn lexical_rebuild_shard_plan_without_tail_metadata_uses_conservative_id_only_plan() {
let temp = TempDir::new().unwrap();
let db_path = temp.path().join("legacy-canonical.db");
let db_path_str = db_path.to_string_lossy().into_owned();
// Raw connection: the schema is created by hand instead of through `FrankenStorage`.
let conn = frankensqlite::Connection::open(db_path_str).unwrap();
conn.execute_compat(
"CREATE TABLE conversations (id INTEGER PRIMARY KEY, source_path TEXT NOT NULL)",
&[] as &[ParamValue],
)
.unwrap();
conn.execute_compat(
"CREATE TABLE messages (
id INTEGER PRIMARY KEY,
conversation_id INTEGER NOT NULL,
idx INTEGER NOT NULL,
role TEXT NOT NULL,
author TEXT,
created_at INTEGER,
content TEXT NOT NULL,
UNIQUE(conversation_id, idx)
)",
&[] as &[ParamValue],
)
.unwrap();
conn.execute_compat(
"CREATE INDEX idx_messages_conv_idx ON messages(conversation_id, idx)",
&[] as &[ParamValue],
)
.unwrap();
// 130 conversations, each with exactly one message at idx 0.
for conversation_id in 1..=130 {
conn.execute_compat(
"INSERT INTO conversations(id, source_path) VALUES (?1, ?2)",
&[
ParamValue::from(i64::from(conversation_id)),
ParamValue::from(format!("/tmp/legacy-{conversation_id}.jsonl")),
],
)
.unwrap();
conn.execute_compat(
"INSERT INTO messages(conversation_id, idx, role, content)
VALUES (?1, 0, 'user', ?2)",
&[
ParamValue::from(i64::from(conversation_id)),
ParamValue::from(format!("message {conversation_id}")),
],
)
.unwrap();
}
// Close the writer connection before reopening the same file read-only.
drop(conn);
let storage = FrankenStorage::open_readonly(&db_path).unwrap();
assert!(
!storage
.lexical_rebuild_has_tail_footprint_metadata()
.unwrap(),
"legacy canonical DB fixture intentionally has no tail metadata"
);
let settings = LexicalRebuildPipelineSettingsSnapshot {
workers: 8,
available_parallelism: 8,
reserved_cores: 0,
tantivy_writer_threads: 8,
staged_shard_builders: 2,
staged_merge_workers: 1,
controller_mode: "steady".into(),
controller_restore_clear_samples: 1,
controller_restore_hold_ms: 0,
controller_loadavg_high_watermark_1m_milli: None,
controller_loadavg_low_watermark_1m_milli: None,
page_size: LEXICAL_REBUILD_PAGE_SIZE,
steady_batch_fetch_conversations: 512,
startup_batch_fetch_conversations: 512,
steady_commit_every_conversations: 512,
startup_commit_every_conversations: 512,
steady_commit_every_messages: 10_000,
startup_commit_every_messages: 10_000,
steady_commit_every_message_bytes: 64 * 1024 * 1024,
startup_commit_every_message_bytes: 64 * 1024 * 1024,
pipeline_channel_size: 2,
page_prep_workers: 1,
pipeline_max_message_bytes_in_flight: 256 * 1024,
};
let plan = plan_lexical_rebuild_shards_from_storage_with_settings(&storage, &settings, 130)
.unwrap();
assert_eq!(plan.total_conversations, 130);
assert_eq!(plan.total_messages, 130);
// The commit cadence above allows 512 conversations per commit, yet the expected shards
// hold at most 64 conversations each.
assert_eq!(
plan.shards
.iter()
.map(|shard| shard.conversation_count)
.collect::<Vec<_>>(),
vec![64, 64, 2]
);
assert!(
plan.shards
.iter()
.all(|shard| shard.message_count == LEXICAL_SHARD_UNKNOWN_MESSAGE_COUNT),
"id-only plans must leave doc-count validation to observed rebuild accounting"
);
}
/// Builds a hand-rolled database that has the tail-metadata schema (a `last_message_idx`
/// column on `conversations` and a `conversation_tail_state` table) but almost no usable
/// tail data, then plans shards from it.
///
/// Only conversation 1 has a non-NULL `last_message_idx`, and every `conversation_tail_state`
/// row is keyed by `conversation_id + 1000`, so none of them matches one of the 130
/// conversations. The storage must report tail footprint metadata as absent, and the plan
/// must split the conversations into shards of 64, 64 and 2 with every shard's
/// `message_count` left at `LEXICAL_SHARD_UNKNOWN_MESSAGE_COUNT`.
#[test]
fn lexical_rebuild_shard_plan_with_sparse_tail_metadata_uses_conservative_id_only_plan() {
let temp = TempDir::new().unwrap();
let db_path = temp.path().join("sparse-tail-canonical.db");
let db_path_str = db_path.to_string_lossy().into_owned();
// Raw connection: the schema is created by hand instead of through `FrankenStorage`.
let conn = frankensqlite::Connection::open(db_path_str).unwrap();
conn.execute_compat(
"CREATE TABLE conversations (
id INTEGER PRIMARY KEY,
source_path TEXT NOT NULL,
last_message_idx INTEGER
)",
&[] as &[ParamValue],
)
.unwrap();
conn.execute_compat(
"CREATE TABLE messages (
id INTEGER PRIMARY KEY,
conversation_id INTEGER NOT NULL,
idx INTEGER NOT NULL,
role TEXT NOT NULL,
author TEXT,
created_at INTEGER,
content TEXT NOT NULL,
UNIQUE(conversation_id, idx)
)",
&[] as &[ParamValue],
)
.unwrap();
conn.execute_compat(
"CREATE INDEX idx_messages_conv_idx ON messages(conversation_id, idx)",
&[] as &[ParamValue],
)
.unwrap();
conn.execute_compat(
"CREATE TABLE conversation_tail_state (
conversation_id INTEGER PRIMARY KEY,
ended_at INTEGER,
last_message_idx INTEGER,
last_message_created_at INTEGER
)",
&[] as &[ParamValue],
)
.unwrap();
for conversation_id in 1..=130 {
// Only the first conversation gets a populated `last_message_idx`; the rest are NULL.
if conversation_id == 1 {
conn.execute_compat(
"INSERT INTO conversations(id, source_path, last_message_idx)
VALUES (?1, ?2, 0)",
&[
ParamValue::from(i64::from(conversation_id)),
ParamValue::from(format!("/tmp/sparse-tail-{conversation_id}.jsonl")),
],
)
.unwrap();
} else {
conn.execute_compat(
"INSERT INTO conversations(id, source_path, last_message_idx)
VALUES (?1, ?2, NULL)",
&[
ParamValue::from(i64::from(conversation_id)),
ParamValue::from(format!("/tmp/sparse-tail-{conversation_id}.jsonl")),
],
)
.unwrap();
}
// Every conversation has exactly one message at idx 0.
conn.execute_compat(
"INSERT INTO messages(conversation_id, idx, role, content)
VALUES (?1, 0, 'user', ?2)",
&[
ParamValue::from(i64::from(conversation_id)),
ParamValue::from(format!("message {conversation_id}")),
],
)
.unwrap();
// Tail-state rows use ids 1001..=1130, which do not correspond to any conversation row.
conn.execute_compat(
"INSERT INTO conversation_tail_state(conversation_id, last_message_idx)
VALUES (?1, 0)",
&[ParamValue::from(i64::from(conversation_id + 1_000))],
)
.unwrap();
}
// Close the writer connection before reopening the same file read-only.
drop(conn);
let storage = FrankenStorage::open_readonly(&db_path).unwrap();
assert!(
!storage
.lexical_rebuild_has_tail_footprint_metadata()
.unwrap(),
"one populated conversation tail plus stale tail-state rows must not send a large legacy DB through full footprint aggregation"
);
let settings = LexicalRebuildPipelineSettingsSnapshot {
workers: 8,
available_parallelism: 8,
reserved_cores: 0,
tantivy_writer_threads: 8,
staged_shard_builders: 2,
staged_merge_workers: 1,
controller_mode: "steady".into(),
controller_restore_clear_samples: 1,
controller_restore_hold_ms: 0,
controller_loadavg_high_watermark_1m_milli: None,
controller_loadavg_low_watermark_1m_milli: None,
page_size: LEXICAL_REBUILD_PAGE_SIZE,
steady_batch_fetch_conversations: 512,
startup_batch_fetch_conversations: 512,
steady_commit_every_conversations: 512,
startup_commit_every_conversations: 512,
steady_commit_every_messages: 10_000,
startup_commit_every_messages: 10_000,
steady_commit_every_message_bytes: 64 * 1024 * 1024,
startup_commit_every_message_bytes: 64 * 1024 * 1024,
pipeline_channel_size: 2,
page_prep_workers: 1,
pipeline_max_message_bytes_in_flight: 256 * 1024,
};
let plan = plan_lexical_rebuild_shards_from_storage_with_settings(&storage, &settings, 130)
.unwrap();
// Same shard shape as the fixture without any tail metadata: 64, 64, 2.
assert_eq!(
plan.shards
.iter()
.map(|shard| shard.conversation_count)
.collect::<Vec<_>>(),
vec![64, 64, 2]
);
assert!(
plan.shards
.iter()
.all(|shard| shard.message_count == LEXICAL_SHARD_UNKNOWN_MESSAGE_COUNT),
"sparse tail metadata should use the same validation-safe id-only plan as absent metadata"
);
}
/// Pins the default page-prep worker count for sampled worker budgets: 1 → 1, 2 → 2, then
/// half the budget for 4, 6, 8 and 16, and a flat 8 for 32 and 128.
#[test]
fn lexical_rebuild_default_page_prep_worker_parallelism_stays_bounded_without_channel_cap() {
    let page_prep_for = lexical_rebuild_default_page_prep_worker_parallelism_for_workers;

    assert_eq!(page_prep_for(1), 1);
    assert_eq!(page_prep_for(2), 2);

    // Half of the worker budget.
    assert_eq!(page_prep_for(4), 2);
    assert_eq!(page_prep_for(6), 3);
    assert_eq!(page_prep_for(8), 4);
    assert_eq!(page_prep_for(16), 8);

    // Beyond 16 workers the result stays at 8.
    assert_eq!(
        page_prep_for(32),
        8,
        "measured ceiling moves to 8 after the shard-planning tail-state fix exposed page-prep overlap"
    );
    assert_eq!(
        page_prep_for(128),
        8,
        "128-core budget is still clamped at the measured 8-worker ceiling"
    );
}
/// With the worker count, channel size and writer-thread cap forced through the environment
/// (12, 2 and 2), the settings snapshot must echo the overrides and derive its batch-fetch
/// limit, page-prep workers and staged merge workers from them.
#[test]
#[serial]
fn lexical_rebuild_pipeline_settings_snapshot_defaults_page_prep_workers_from_worker_budget() {
    // All guards live until the end of the test so the overrides cover the snapshot call.
    let _responsiveness_guard = set_env("CASS_RESPONSIVENESS_DISABLE", "1");
    let _workers_guard = set_env("CASS_TANTIVY_REBUILD_WORKERS", "12");
    let _channel_guard = set_env("CASS_TANTIVY_REBUILD_PIPELINE_CHANNEL_SIZE", "2");
    let _writer_threads_guard = set_env("CASS_TANTIVY_MAX_WRITER_THREADS", "2");

    let snapshot = lexical_rebuild_pipeline_settings_snapshot();

    // Direct echoes of the overrides.
    assert_eq!(snapshot.workers, 12);
    assert_eq!(snapshot.pipeline_channel_size, 2);

    // Writer threads: the override of 2, bounded by the host's available parallelism.
    let expected_writer_threads = snapshot.available_parallelism.min(2);
    assert_eq!(snapshot.tantivy_writer_threads, expected_writer_threads);

    let expected_batch_fetch = lexical_rebuild_default_batch_fetch_conversation_limit(
        LEXICAL_REBUILD_PAGE_SIZE,
        snapshot.tantivy_writer_threads,
    );
    assert_eq!(
        snapshot.steady_batch_fetch_conversations,
        expected_batch_fetch
    );

    // 12 workers → 6 page-prep workers and 3 merge workers before responsiveness scaling.
    let expected_page_prep = responsiveness::effective_worker_count(6).max(1);
    assert_eq!(snapshot.page_prep_workers, expected_page_prep);

    let expected_merge_workers = responsiveness::effective_worker_count(3).max(1);
    assert_eq!(snapshot.staged_merge_workers, expected_merge_workers);
}
/// With the channel-size and in-flight-bytes overrides removed from the environment, the
/// settings snapshot must default the pipeline channel size to 4 and set the in-flight byte
/// limit to the responsiveness-adjusted value of
/// `startup_commit_every_message_bytes * (channel size + 1)`, never above that product.
#[test]
#[serial]
fn lexical_rebuild_pipeline_settings_snapshot_defaults_channel_to_measured_handoff_depth() {
    let _responsiveness_guard = set_env("CASS_RESPONSIVENESS_DISABLE", "1");
    let _channel_guard = unset_env_var("CASS_TANTIVY_REBUILD_PIPELINE_CHANNEL_SIZE");
    let _in_flight_guard =
        unset_env_var("CASS_TANTIVY_REBUILD_PIPELINE_MAX_MESSAGE_BYTES_IN_FLIGHT");

    let snapshot = lexical_rebuild_pipeline_settings_snapshot();

    // One batch per channel slot plus one more, each sized by the startup byte cadence.
    let batches_in_flight = snapshot.pipeline_channel_size.saturating_add(1).max(1);
    let bytes_per_batch = snapshot.startup_commit_every_message_bytes.max(1);
    let uncapped_in_flight_bytes = bytes_per_batch.saturating_mul(batches_in_flight);

    assert_eq!(snapshot.pipeline_channel_size, 4);

    let expected_in_flight_bytes =
        responsiveness::effective_inflight_byte_limit(uncapped_in_flight_bytes).max(1);
    assert_eq!(
        snapshot.pipeline_max_message_bytes_in_flight,
        expected_in_flight_bytes
    );
    assert!(snapshot.pipeline_max_message_bytes_in_flight <= uncapped_in_flight_bytes);
}
/// Checks the staged shard-merge worker count derived from a snapshot configured with
/// 3 merge workers, for several values of the numeric second argument: 3 and 5 give one
/// worker, while `EAGER_MERGE_FAN_IN * 3` and `EAGER_MERGE_FAN_IN * 8` both give three.
#[test]
fn lexical_rebuild_staged_shard_merge_settings_scales_with_eager_group_count() {
    let snapshot = LexicalRebuildPipelineSettingsSnapshot {
        // Parallelism.
        workers: 12,
        available_parallelism: 12,
        reserved_cores: 2,
        tantivy_writer_threads: 8,
        staged_shard_builders: 8,
        staged_merge_workers: 3,
        page_prep_workers: 6,
        // Responsiveness controller.
        controller_mode: "steady".into(),
        controller_restore_clear_samples: 3,
        controller_restore_hold_ms: 0,
        controller_loadavg_high_watermark_1m_milli: None,
        controller_loadavg_low_watermark_1m_milli: None,
        // Paging and batch fetch.
        page_size: LEXICAL_REBUILD_PAGE_SIZE,
        startup_batch_fetch_conversations: 256,
        steady_batch_fetch_conversations: 1024,
        // Commit cadence.
        startup_commit_every_conversations: 256,
        steady_commit_every_conversations: 1024,
        startup_commit_every_messages: 512,
        steady_commit_every_messages: 2048,
        startup_commit_every_message_bytes: 250_000,
        steady_commit_every_message_bytes: 1_000_000,
        // Pipeline hand-off.
        pipeline_channel_size: 2,
        pipeline_max_message_bytes_in_flight: 4 * 1024 * 1024,
    };
    let fan_in = LexicalRebuildShardMergeCoordinator::EAGER_MERGE_FAN_IN;

    // (second argument, expected merge workers)
    let samples = [(3, 1), (5, 1), (fan_in * 3, 3), (fan_in * 8, 3)];
    for (shard_count, workers) in samples {
        assert_eq!(
            lexical_rebuild_staged_shard_merge_settings(&snapshot, shard_count),
            LexicalRebuildShardMergeSettings { workers }
        );
    }
}
/// With `CASS_TANTIVY_REBUILD_FIRST_BUDGET_PROMOTION_WAIT_MS` removed from the environment,
/// the first-budget promotion wait must be five seconds (5,000 ms).
#[test]
#[serial]
fn lexical_rebuild_first_budget_promotion_wait_defaults_to_short_bounded_wait() {
    let _unset_wait_override =
        unset_env_var("CASS_TANTIVY_REBUILD_FIRST_BUDGET_PROMOTION_WAIT_MS");

    let default_wait = lexical_rebuild_first_budget_promotion_wait();

    assert_eq!(default_wait, Duration::from_secs(5));
}
/// Walks an `Auto`-policy responsiveness controller through a full demote/restore cycle.
///
/// The controller starts in `"steady"` on the steady budget, drops to the startup budget
/// (`"pressure_limited"`) when it observes a runtime snapshot with `queue_depth: 2`, ignores
/// the first clear sample after the hold window, and returns to the steady budget on the
/// second clear sample.
#[test]
#[serial]
fn lexical_rebuild_responsiveness_controller_demotes_and_restores_with_hysteresis() {
    let _clear_samples_guard =
        set_env("CASS_TANTIVY_REBUILD_CONTROLLER_RESTORE_CLEAR_SAMPLES", "2");
    let _restore_hold_guard = set_env("CASS_TANTIVY_REBUILD_CONTROLLER_RESTORE_HOLD_MS", "1");

    // The last three constructor arguments are the commit intervals asserted below
    // (conversations, messages, message bytes).
    let limited_budget =
        LexicalRebuildPipelineBudgetSnapshot::new(32, 64, 1024, 2_048, 16, 128, 4_096);
    let full_budget =
        LexicalRebuildPipelineBudgetSnapshot::new(256, 512, 4096, 8_192, 1_024, 8_192, 65_536);
    let mut controller = LexicalRebuildResponsivenessController::new(
        LexicalRebuildResponsivenessPolicy::Auto,
        limited_budget,
        full_budget,
        2,
        false,
        None,
        None,
    );

    // Initial state.
    assert_eq!(controller.mode(), "steady");
    assert_eq!(controller.current_budget(), full_budget);

    // Pressure: a non-empty queue demotes to the startup budget.
    let backlogged = LexicalRebuildPipelineRuntimeSnapshot {
        queue_depth: 2,
        ..Default::default()
    };
    let demotion = controller
        .observe_runtime(&backlogged)
        .expect("pressure should demote");
    assert_eq!(demotion.old_budget, full_budget);
    assert_eq!(demotion.new_budget, limited_budget);
    assert_eq!(demotion.mode, "pressure_limited");
    assert_eq!(demotion.new_budget.commit_interval_conversations, 16);
    assert_eq!(demotion.new_budget.commit_interval_messages, 128);
    assert_eq!(demotion.new_budget.commit_interval_message_bytes, 4_096);
    assert_eq!(controller.current_budget(), limited_budget);

    // Backdate the transition so the restore hold window has already elapsed.
    controller.last_transition_at = Instant::now() - controller.restore_hold;

    // First clear sample: not enough on its own to restore.
    let idle = LexicalRebuildPipelineRuntimeSnapshot::default();
    assert!(controller.observe_runtime(&idle).is_none());
    assert_eq!(controller.mode(), "pressure_limited");

    // Second clear sample: the steady budget comes back.
    let promotion = controller
        .observe_runtime(&idle)
        .expect("clear streak should restore steady budget");
    assert_eq!(promotion.old_budget, limited_budget);
    assert_eq!(promotion.new_budget, full_budget);
    assert_eq!(promotion.mode, "steady");
    assert_eq!(promotion.new_budget.commit_interval_conversations, 1_024);
    assert_eq!(promotion.new_budget.commit_interval_messages, 8_192);
    assert_eq!(promotion.new_budget.commit_interval_message_bytes, 65_536);
    assert_eq!(controller.current_budget(), full_budget);
}
/// Verifies that the responsiveness controller reacts to a change in the producer hand-off
/// wait counters rather than to their absolute values.
///
/// A snapshot whose wait count rises from 0 to 1 demotes the controller. Re-observing the
/// very same snapshot afterwards (no further increase) is not treated as pressure: the first
/// repeat yields no transition and the second restores the steady budget.
#[test]
#[serial]
fn lexical_rebuild_responsiveness_controller_demotes_on_new_handoff_wait_delta() {
    let _clear_samples_guard =
        set_env("CASS_TANTIVY_REBUILD_CONTROLLER_RESTORE_CLEAR_SAMPLES", "2");
    let _restore_hold_guard = set_env("CASS_TANTIVY_REBUILD_CONTROLLER_RESTORE_HOLD_MS", "1");

    let limited_budget =
        LexicalRebuildPipelineBudgetSnapshot::new(32, 64, 1024, 2_048, 16, 128, 4_096);
    let full_budget =
        LexicalRebuildPipelineBudgetSnapshot::new(256, 512, 4096, 8_192, 1_024, 8_192, 65_536);
    let mut controller = LexicalRebuildResponsivenessController::new(
        LexicalRebuildResponsivenessPolicy::Auto,
        limited_budget,
        full_budget,
        2,
        false,
        None,
        None,
    );

    // Baseline observation with all counters at their defaults: nothing changes.
    let quiet = LexicalRebuildPipelineRuntimeSnapshot::default();
    assert!(controller.observe_runtime(&quiet).is_none());
    assert_eq!(controller.mode(), "steady");

    // One new hand-off wait since the baseline.
    let stalled_once = LexicalRebuildPipelineRuntimeSnapshot {
        producer_handoff_wait_count: 1,
        producer_handoff_wait_ms: 9,
        ..Default::default()
    };
    let demotion = controller
        .observe_runtime(&stalled_once)
        .expect("a new producer handoff stall should demote the runtime budget");
    assert_eq!(demotion.old_budget, full_budget);
    assert_eq!(demotion.new_budget, limited_budget);
    assert_eq!(demotion.mode, "pressure_limited");
    let reason_names_handoff_wait = demotion
        .reason
        .contains("producer_handoff_wait_count_1_observed_consumer_backpressure");
    assert!(
        reason_names_handoff_wait,
        "unexpected transition reason: {}",
        demotion.reason
    );

    // Backdate the transition so the restore hold window has already elapsed.
    controller.last_transition_at = Instant::now() - controller.restore_hold;

    // Same counters again: no new stall, so this is the first sample of the clear streak.
    assert!(
        controller.observe_runtime(&stalled_once).is_none(),
        "cumulative handoff counters alone must not keep the controller permanently pressured"
    );

    // Second unchanged sample completes the clear streak.
    let promotion = controller
        .observe_runtime(&stalled_once)
        .expect("steady state should restore after the clear-window hold expires");
    assert_eq!(promotion.old_budget, limited_budget);
    assert_eq!(promotion.new_budget, full_budget);
    assert_eq!(promotion.mode, "steady");
}
/// Checks that a controller built with the `Conservative` policy reports
/// `pinned_conservative`, stays on the startup budget, and produces no transition from
/// either a first durable commit or a default runtime snapshot.
#[test]
fn lexical_rebuild_responsiveness_controller_honors_pinned_conservative_mode() {
    let pinned_budget =
        LexicalRebuildPipelineBudgetSnapshot::new(32, 64, 1024, 2_048, 16, 128, 4_096);
    let steady_budget =
        LexicalRebuildPipelineBudgetSnapshot::new(256, 512, 4096, 8_192, 1_024, 8_192, 65_536);
    let mut controller = LexicalRebuildResponsivenessController::new(
        LexicalRebuildResponsivenessPolicy::Conservative,
        pinned_budget,
        steady_budget,
        2,
        true,
        None,
        None,
    );

    // Initial state: pinned mode, startup budget, no wait for a durable commit.
    assert_eq!(controller.mode(), "pinned_conservative");
    assert_eq!(controller.current_budget(), pinned_budget);
    assert!(!controller.waits_for_first_durable_commit());

    // Neither event may yield a transition.
    assert!(controller.record_first_durable_commit().is_none());
    let idle_snapshot = LexicalRebuildPipelineRuntimeSnapshot::default();
    assert!(controller.observe_runtime(&idle_snapshot).is_none());

    assert_eq!(controller.current_budget(), pinned_budget);
}
/// Applies a steady -> startup budget transition and checks that the page limit, the
/// flow limiter ceiling, the shared budget controller snapshot, and the three commit
/// interval variables all take the demoted budget's values.
#[test]
fn lexical_rebuild_budget_transition_updates_active_commit_cadence() {
    let demoted_budget =
        LexicalRebuildPipelineBudgetSnapshot::new(32, 64, 1024, 2_048, 16, 128, 4_096);
    let steady_budget =
        LexicalRebuildPipelineBudgetSnapshot::new(256, 512, 4096, 8_192, 1_024, 8_192, 65_536);

    // Everything starts out configured from the steady budget.
    let flow_limiter = StreamingByteLimiter::new(steady_budget.max_message_bytes_in_flight);
    let pipeline_budget_controller = LexicalRebuildPipelineBudgetController::new(steady_budget);
    let mut page_limit = steady_budget.page_conversation_limit;
    let mut commit_conversations = steady_budget.commit_interval_conversations;
    let mut commit_messages = steady_budget.commit_interval_messages;
    let mut commit_message_bytes = steady_budget.commit_interval_message_bytes;

    let demotion = LexicalRebuildBudgetTransition {
        old_budget: steady_budget,
        new_budget: demoted_budget,
        mode: "pressure_limited",
        reason: "test_pressure".into(),
    };
    apply_lexical_rebuild_budget_transition(
        demotion,
        &flow_limiter,
        &pipeline_budget_controller,
        &mut page_limit,
        Some((
            &mut commit_conversations,
            &mut commit_messages,
            &mut commit_message_bytes,
        )),
    );

    assert_eq!(page_limit, demoted_budget.page_conversation_limit);
    assert_eq!(
        flow_limiter.max_bytes_in_flight(),
        demoted_budget.max_message_bytes_in_flight
    );
    assert_eq!(
        pipeline_budget_controller.snapshot(),
        demoted_budget,
        "producer-side budget snapshots should include the demoted commit cadence"
    );
    let observed_cadence = (
        commit_conversations,
        commit_messages,
        commit_message_bytes,
    );
    let expected_cadence = (
        demoted_budget.commit_interval_conversations,
        demoted_budget.commit_interval_messages,
        demoted_budget.commit_interval_message_bytes,
    );
    assert_eq!(observed_cadence, expected_cadence);
}
/// Checks that the first field of a loadavg-formatted line (`1.50`) parses to `1_500`.
#[test]
fn parse_lexical_rebuild_loadavg_1m_milli_reads_first_field() {
    let loadavg_line = "1.50 1.21 0.80 2/199 1234\n";
    let parsed = parse_lexical_rebuild_loadavg_1m_milli(loadavg_line);
    assert_eq!(parsed, Some(1_500));
}
/// Checks that a 1-minute load average at or above the configured high watermark
/// demotes the controller, and that a later sample below the low watermark restores
/// the steady budget.
#[test]
#[serial]
fn lexical_rebuild_responsiveness_controller_demotes_and_restores_on_host_loadavg_pressure() {
    // Both env overrides stay bound until the end of the test.
    let _restore_clear_samples_env =
        set_env("CASS_TANTIVY_REBUILD_CONTROLLER_RESTORE_CLEAR_SAMPLES", "1");
    let _restore_hold_env = set_env("CASS_TANTIVY_REBUILD_CONTROLLER_RESTORE_HOLD_MS", "1");

    let startup_budget =
        LexicalRebuildPipelineBudgetSnapshot::new(32, 64, 1024, 2_048, 16, 128, 4_096);
    let steady_budget =
        LexicalRebuildPipelineBudgetSnapshot::new(256, 512, 4096, 8_192, 1_024, 8_192, 65_536);
    // Watermarks passed as the last two arguments: 7.000 and 6.000 (in milli-units).
    let mut controller = LexicalRebuildResponsivenessController::new(
        LexicalRebuildResponsivenessPolicy::Auto,
        startup_budget,
        steady_budget,
        2,
        false,
        Some(7_000),
        Some(6_000),
    );

    let loaded_snapshot = LexicalRebuildPipelineRuntimeSnapshot {
        host_loadavg_1m_milli: Some(7_250),
        ..LexicalRebuildPipelineRuntimeSnapshot::default()
    };
    let demotion = controller
        .observe_runtime(&loaded_snapshot)
        .expect("host loadavg should demote");
    assert_eq!(
        (demotion.old_budget, demotion.new_budget, demotion.mode),
        (steady_budget, startup_budget, "pressure_limited")
    );
    let expected_reason = "host_loadavg_1m_7.250_reached_high_watermark_7.000";
    assert!(
        demotion.reason.contains(expected_reason),
        "unexpected reason: {}",
        demotion.reason
    );

    // Backdate the last transition by one full `restore_hold`.
    controller.last_transition_at = Instant::now() - controller.restore_hold;
    let relaxed_snapshot = LexicalRebuildPipelineRuntimeSnapshot {
        host_loadavg_1m_milli: Some(5_750),
        ..LexicalRebuildPipelineRuntimeSnapshot::default()
    };
    let restoration = controller
        .observe_runtime(&relaxed_snapshot)
        .expect("sub-threshold loadavg should restore steady budget");
    assert_eq!(
        (
            restoration.old_budget,
            restoration.new_budget,
            restoration.mode
        ),
        (startup_budget, steady_budget, "steady")
    );
}
/// Seeds the fixture database plus one conversation from a remote source, runs the
/// packet producer, and checks that the single emitted batch carries the remote
/// conversation's identity and provenance while its byte reservation is still held.
#[test]
fn lexical_rebuild_packet_producer_builds_lookup_and_source_context_internally() {
    let temp_dir = TempDir::new().unwrap();
    let db_path = temp_dir.path().join("producer-lookups.db");
    let storage = FrankenStorage::open(&db_path).unwrap();
    seed_lexical_rebuild_fixture(&storage);

    let codex_agent = Agent {
        id: None,
        slug: "codex".into(),
        name: "Codex".into(),
        version: Some("0.2.3".into()),
        kind: AgentKind::Cli,
    };
    let agent_id = storage.ensure_agent(&codex_agent).unwrap();

    // One extra conversation attributed to a non-local source and host.
    let remote_messages = vec![
        Message {
            id: None,
            idx: 0,
            role: MessageRole::User,
            author: Some("user".into()),
            created_at: Some(1_700_000_010_010),
            content: "remote-first".into(),
            extra_json: serde_json::json!({"opaque": true}),
            snippets: Vec::new(),
        },
        Message {
            id: None,
            idx: 1,
            role: MessageRole::Agent,
            author: Some("assistant".into()),
            created_at: Some(1_700_000_010_020),
            content: "remote-second".into(),
            extra_json: serde_json::json!({"opaque": true}),
            snippets: Vec::new(),
        },
    ];
    let remote_conversation = Conversation {
        id: None,
        agent_slug: "codex".into(),
        workspace: Some(PathBuf::from("/workspace/alpha")),
        external_id: Some("remote-lexical-fixture".into()),
        title: Some("Remote lexical rebuild fixture".into()),
        source_path: PathBuf::from("/tmp/remote-lexical-fixture.jsonl"),
        started_at: Some(1_700_000_010_000),
        ended_at: Some(1_700_000_010_100),
        approx_tokens: Some(64),
        metadata_json: serde_json::Value::Null,
        messages: remote_messages,
        source_id: "remote-builder".into(),
        origin_host: Some("builder-host".into()),
    };
    storage
        .insert_conversation_tree(agent_id, None, &remote_conversation)
        .unwrap();
    drop(storage);

    let (batch_tx, batch_rx) = bounded::<LexicalRebuildPipelineMessage>(4);
    let flow_limiter = Arc::new(StreamingByteLimiter::new(8 * 1024));
    let budget_controller = Arc::new(LexicalRebuildPipelineBudgetController::new(
        lexical_rebuild_runtime_pipeline_budget_snapshot(3, 32, 1024, 4, 3, 32, 1024),
    ));
    let producer = spawn_lexical_rebuild_packet_producer(
        db_path,
        None,
        None,
        LEXICAL_REBUILD_PAGE_SIZE,
        4,
        None,
        budget_controller,
        batch_tx,
        Arc::clone(&flow_limiter),
        None,
        Arc::new(LexicalRebuildProducerTelemetry::default()),
    );

    // Exactly one batch followed by the completion marker is expected.
    let batch = match batch_rx.recv().unwrap() {
        LexicalRebuildPipelineMessage::Batch(prepared) => prepared,
        unexpected => panic!("expected prepared batch, got {unexpected:?}"),
    };
    match batch_rx.recv().unwrap() {
        LexicalRebuildPipelineMessage::Done => {}
        unexpected => panic!("expected pipeline completion, got {unexpected:?}"),
    }
    producer.join().unwrap();

    assert_eq!(batch.packets.len(), 3);
    let remote_packet = batch
        .packets
        .iter()
        .find(|candidate| {
            candidate.identity.external_id.as_deref() == Some("remote-lexical-fixture")
        })
        .expect("remote fixture packet should be present");
    assert_eq!(remote_packet.identity.agent, "codex");
    assert_eq!(
        remote_packet.identity.external_id.as_deref(),
        Some("remote-lexical-fixture")
    );
    assert_eq!(remote_packet.provenance.source_id, "remote-builder");
    assert_eq!(remote_packet.provenance.origin_kind, "remote");
    assert_eq!(
        remote_packet.provenance.origin_host.as_deref(),
        Some("builder-host")
    );
    assert!(
        batch.packets.iter().all(|packet| packet.message_count > 0),
        "fixture pages should still carry grouped messages after producer-owned lookup warmup"
    );

    // The test never released the batch, so its bytes must still be reserved.
    assert!(
        flow_limiter.bytes_in_flight() > 0,
        "consumer-owned release should keep byte reservations visible until the sink drains them"
    );
    let outstanding_bytes = flow_limiter.bytes_in_flight();
    flow_limiter.release(outstanding_bytes);
    assert_eq!(flow_limiter.bytes_in_flight(), 0);
}
/// Seeds the fixture database plus 50 single-message conversations, starts the packet
/// producer, and checks that the first non-empty batch arrives in under five seconds.
///
/// After the timing check the remaining batches are drained and released until the
/// producer reports `Done`. A receive timeout or disconnect during the drain fails the
/// test immediately instead of falling through to an unbounded `join`.
#[test]
fn lexical_rebuild_producer_startup_delivers_first_batch_quickly() {
    let dir = TempDir::new().unwrap();
    let db_path = dir.path().join("startup-timing.db");
    let storage = FrankenStorage::open(&db_path).unwrap();
    seed_lexical_rebuild_fixture(&storage);
    let agent_id = storage
        .ensure_agent(&Agent {
            id: None,
            slug: "codex".into(),
            name: "Codex".into(),
            version: Some("0.2.3".into()),
            kind: AgentKind::Cli,
        })
        .unwrap();
    // Bulk fixture: 50 conversations, one user message each, timestamps 1s apart.
    for i in 0..50 {
        storage
            .insert_conversation_tree(
                agent_id,
                None,
                &Conversation {
                    id: None,
                    agent_slug: "codex".into(),
                    workspace: Some(PathBuf::from("/workspace/bulk")),
                    external_id: Some(format!("bulk-conv-{i}")),
                    title: Some(format!("Bulk conversation {i}")),
                    source_path: PathBuf::from(format!("/tmp/bulk-{i}.jsonl")),
                    started_at: Some(1_700_000_100_000 + i * 1000),
                    ended_at: Some(1_700_000_100_100 + i * 1000),
                    approx_tokens: Some(32),
                    metadata_json: serde_json::Value::Null,
                    messages: vec![Message {
                        id: None,
                        idx: 0,
                        role: MessageRole::User,
                        author: None,
                        created_at: Some(1_700_000_100_010 + i * 1000),
                        content: format!("bulk message {i}"),
                        extra_json: serde_json::Value::Null,
                        snippets: Vec::new(),
                    }],
                    source_id: LOCAL_SOURCE_ID.into(),
                    origin_host: None,
                },
            )
            .unwrap();
    }
    drop(storage);

    // The clock starts before the channel and producer are created.
    let started = Instant::now();
    let (tx, rx) = bounded::<LexicalRebuildPipelineMessage>(4);
    let flow_limiter = Arc::new(StreamingByteLimiter::new(64 * 1024));
    let handle = spawn_lexical_rebuild_packet_producer(
        db_path,
        None,
        None,
        LEXICAL_REBUILD_PAGE_SIZE,
        4,
        None,
        Arc::new(LexicalRebuildPipelineBudgetController::new(
            lexical_rebuild_runtime_pipeline_budget_snapshot(16, 128, 8192, 4, 16, 128, 8192),
        )),
        tx,
        flow_limiter.clone(),
        None,
        Arc::new(LexicalRebuildProducerTelemetry::default()),
    );
    let first_batch = match rx.recv_timeout(Duration::from_secs(10)) {
        Ok(LexicalRebuildPipelineMessage::Batch(batch)) => batch,
        Ok(other) => panic!("expected batch as first message, got {other:?}"),
        Err(_) => panic!("timed out waiting for first batch from producer"),
    };
    let first_batch_ms = started.elapsed().as_millis();
    assert!(
        !first_batch.packets.is_empty(),
        "first batch should contain at least one conversation packet"
    );
    assert!(
        first_batch_ms < 5_000,
        "producer should deliver first batch within 5s, took {first_batch_ms}ms"
    );

    // Release the first page like every later page, so the rest of the run does not
    // depend on the byte budget having room for a page that is never handed back.
    release_lexical_rebuild_prepared_page_reservation(&first_batch, flow_limiter.as_ref());

    // Drain until `Done`. A timeout or disconnect here means the producer stalled or
    // exited without signalling completion; panic rather than block in `join`.
    loop {
        match rx.recv_timeout(Duration::from_secs(5)) {
            Ok(LexicalRebuildPipelineMessage::Done) => break,
            Ok(LexicalRebuildPipelineMessage::Batch(batch)) => {
                release_lexical_rebuild_prepared_page_reservation(
                    &batch,
                    flow_limiter.as_ref(),
                );
            }
            Ok(LexicalRebuildPipelineMessage::Error(err)) => panic!("producer error: {err}"),
            Err(err) => panic!("producer stopped before signalling completion: {err}"),
        }
    }
    handle.join().unwrap();
}
/// Drops the receiver before the producer starts and checks that the producer thread
/// still joins cleanly with no bytes left reserved in the flow limiter.
#[test]
fn lexical_rebuild_packet_producer_releases_budget_when_consumer_disconnects() {
    let temp_dir = TempDir::new().unwrap();
    let db_path = temp_dir.path().join("producer-disconnect.db");
    {
        // Scope the storage handle so it is dropped before the producer opens the file.
        let storage = FrankenStorage::open(&db_path).unwrap();
        seed_lexical_rebuild_fixture(&storage);
    }

    // The consumer side is gone before the producer sends anything.
    let (batch_tx, batch_rx) = bounded::<LexicalRebuildPipelineMessage>(1);
    drop(batch_rx);

    let flow_limiter = Arc::new(StreamingByteLimiter::new(1));
    let budget_controller = Arc::new(LexicalRebuildPipelineBudgetController::new(
        lexical_rebuild_runtime_pipeline_budget_snapshot(2, 32, 1024, 4, 2, 32, 1024),
    ));
    let producer = spawn_lexical_rebuild_packet_producer(
        db_path,
        None,
        None,
        LEXICAL_REBUILD_PAGE_SIZE,
        1,
        None,
        budget_controller,
        batch_tx,
        Arc::clone(&flow_limiter),
        None,
        Arc::new(LexicalRebuildProducerTelemetry::default()),
    );
    producer
        .join()
        .expect("lexical rebuild producer should stop cleanly after consumer disconnect");

    let leftover_bytes = flow_limiter.bytes_in_flight();
    assert_eq!(
        leftover_bytes, 0,
        "disconnect path must release any reserved pipeline byte budget"
    );
}
/// Runs the packet producer against a capacity-1 handoff channel while the consumer
/// deliberately holds on to received pages, and checks that reserved bytes never exceed
/// the limiter's ceiling and drop back to zero once every page has been released.
#[test]
#[serial]
fn lexical_rebuild_packet_producer_preserves_flow_budget_under_sustained_burst() {
// Env override stays bound until the end of the test.
// NOTE(review): presumably selects two page-prep workers; confirm against the producer.
let _page_prep_workers = set_env("CASS_TANTIVY_REBUILD_PAGE_PREP_WORKERS", "2");
let dir = TempDir::new().unwrap();
let db_path = dir.path().join("producer-handoff-backpressure.db");
let storage = FrankenStorage::open(&db_path).unwrap();
seed_lexical_rebuild_fixture(&storage);
let agent_id = storage
.ensure_agent(&Agent {
id: None,
slug: "codex".into(),
name: "Codex".into(),
version: Some("0.2.3".into()),
kind: AgentKind::Cli,
})
.unwrap();
// Six extra two-message conversations (suffixes 3..=8) on top of the seeded fixture.
for suffix in 3..=8 {
let external_id = format!("handoff-burst-{suffix}");
let base_ts = 1_700_000_300_000_i64 + i64::from(suffix) * 1_000;
storage
.insert_conversation_tree(
agent_id,
None,
&Conversation {
id: None,
agent_slug: "codex".into(),
workspace: Some(PathBuf::from("/tmp/workspace")),
external_id: Some(external_id.clone()),
title: Some("Handoff burst fixture".into()),
source_path: PathBuf::from(format!("/tmp/{external_id}.jsonl")),
started_at: Some(base_ts),
ended_at: Some(base_ts + 100),
approx_tokens: Some(64),
metadata_json: serde_json::Value::Null,
messages: vec![
Message {
id: None,
idx: 0,
role: MessageRole::User,
author: Some("user".into()),
created_at: Some(base_ts + 10),
content: format!("{external_id}-first"),
extra_json: serde_json::json!({"opaque": true}),
snippets: Vec::new(),
},
Message {
id: None,
idx: 1,
role: MessageRole::Agent,
author: Some("assistant".into()),
created_at: Some(base_ts + 20),
content: format!("{external_id}-second"),
extra_json: serde_json::json!({"opaque": true}),
snippets: Vec::new(),
},
],
source_id: LOCAL_SOURCE_ID.into(),
origin_host: None,
},
)
.unwrap();
}
drop(storage);
// Capacity-1 channel: a second send has to wait until the test receives.
let (tx, rx) = bounded::<LexicalRebuildPipelineMessage>(1);
let flow_limiter = Arc::new(StreamingByteLimiter::new(256 * 1024));
let producer_telemetry = Arc::new(LexicalRebuildProducerTelemetry::default());
// NOTE(review): the two literal `1` arguments sit where sibling tests pass
// LEXICAL_REBUILD_PAGE_SIZE and the channel capacity; confirm against the signature.
let handle = spawn_lexical_rebuild_packet_producer(
db_path,
None,
None,
1,
1,
None,
Arc::new(LexicalRebuildPipelineBudgetController::new(
lexical_rebuild_runtime_pipeline_budget_snapshot(
1,
32,
64 * 1024,
1,
1,
32,
64 * 1024,
),
)),
tx,
flow_limiter.clone(),
None,
producer_telemetry.clone(),
);
// Poll for up to 5s until the channel holds a message, failing early if the
// producer thread has already exited.
let saturation_deadline = Instant::now() + Duration::from_secs(5);
while rx.is_empty() && Instant::now() < saturation_deadline {
assert!(
!handle.is_finished(),
"producer finished before bounded handoff queue filled"
);
thread::sleep(Duration::from_millis(10));
}
assert_eq!(
rx.len(),
1,
"slow consumer should leave the bounded handoff queue saturated"
);
assert!(
!handle.is_finished(),
"producer should still be running while the handoff queue is saturated"
);
// Stay idle a little longer before consuming anything.
thread::sleep(Duration::from_millis(50));
let first_batch = match rx.recv_timeout(Duration::from_secs(10)).unwrap() {
LexicalRebuildPipelineMessage::Batch(batch) => batch,
other => panic!("expected first burst batch, got {other:?}"),
};
assert_eq!(first_batch.packets.len(), 1);
// Received batches are kept here without releasing their reservations.
let mut held_batches = vec![first_batch];
assert!(
flow_limiter.bytes_in_flight() > 0,
"holding a prepared page should keep its byte reservation visible until the consumer releases it"
);
// Keep taking batches while they arrive within 10ms; the loop ends on the first
// timeout or the first non-`Batch` message.
while let Ok(LexicalRebuildPipelineMessage::Batch(batch)) =
rx.recv_timeout(Duration::from_millis(10))
{
held_batches.push(batch);
assert!(
flow_limiter.bytes_in_flight() <= flow_limiter.max_bytes_in_flight(),
"producer must keep prepared-page reservations inside the configured byte budget"
);
}
// Hand back every reservation that was being held.
for batch in &held_batches {
release_lexical_rebuild_prepared_page_reservation(batch, flow_limiter.as_ref());
}
// Drain the rest, releasing each page as it arrives, until `Done`.
loop {
match rx.recv_timeout(Duration::from_secs(10)).unwrap() {
LexicalRebuildPipelineMessage::Batch(batch) => {
release_lexical_rebuild_prepared_page_reservation(
&batch,
flow_limiter.as_ref(),
);
}
LexicalRebuildPipelineMessage::Done => break,
LexicalRebuildPipelineMessage::Error(error) => {
panic!("producer returned error: {error}")
}
}
}
handle.join().unwrap();
assert_eq!(
flow_limiter.bytes_in_flight(),
0,
"consumer-owned releases should drain all reserved bytes after the burst completes"
);
}
/// Plans shards for a six-conversation fixture, runs the packet producer with that
/// plan, and checks that each emitted page carries its shard index, finishes its
/// shard, and contains the expected three conversations in order.
#[test]
fn lexical_rebuild_packet_producer_respects_planned_shard_boundaries() {
    let temp_dir = TempDir::new().unwrap();
    let db_path = temp_dir.path().join("producer-planned-shards.db");
    let storage = FrankenStorage::open(&db_path).unwrap();
    seed_lexical_rebuild_fixture(&storage);

    let codex_agent = Agent {
        id: None,
        slug: "codex".into(),
        name: "Codex".into(),
        version: Some("0.2.3".into()),
        kind: AgentKind::Cli,
    };
    let agent_id = storage.ensure_agent(&codex_agent).unwrap();

    // Add fixtures 3..=6 so the expected pages cover `lexical-fixture-1` through `-6`.
    for suffix in 3..=6 {
        let external_id = format!("lexical-fixture-{suffix}");
        let base_ts = 1_700_000_000_000_i64 + i64::from(suffix) * 1_000;
        let fixture_messages = vec![
            Message {
                id: None,
                idx: 0,
                role: MessageRole::User,
                author: Some("user".into()),
                created_at: Some(base_ts + 10),
                content: format!("{external_id}-first"),
                extra_json: serde_json::json!({"opaque": true}),
                snippets: Vec::new(),
            },
            Message {
                id: None,
                idx: 1,
                role: MessageRole::Agent,
                author: Some("assistant".into()),
                created_at: Some(base_ts + 20),
                content: format!("{external_id}-second"),
                extra_json: serde_json::json!({"opaque": true}),
                snippets: Vec::new(),
            },
        ];
        let fixture_conversation = Conversation {
            id: None,
            agent_slug: "codex".into(),
            workspace: Some(PathBuf::from("/tmp/workspace")),
            external_id: Some(external_id.clone()),
            title: Some("Lexical rebuild fixture".into()),
            source_path: PathBuf::from(format!("/tmp/{external_id}.jsonl")),
            started_at: Some(base_ts),
            ended_at: Some(base_ts + 100),
            approx_tokens: Some(64),
            metadata_json: serde_json::Value::Null,
            messages: fixture_messages,
            source_id: LOCAL_SOURCE_ID.into(),
            origin_host: None,
        };
        storage
            .insert_conversation_tree(agent_id, None, &fixture_conversation)
            .unwrap();
    }
    drop(storage);

    // Settings used only for planning; the commit cadence is three conversations.
    let planned_settings = LexicalRebuildPipelineSettingsSnapshot {
        workers: 1,
        available_parallelism: 1,
        reserved_cores: 0,
        tantivy_writer_threads: 1,
        staged_shard_builders: 1,
        staged_merge_workers: 1,
        controller_mode: "steady".into(),
        controller_restore_clear_samples: 1,
        controller_restore_hold_ms: 0,
        controller_loadavg_high_watermark_1m_milli: None,
        controller_loadavg_low_watermark_1m_milli: None,
        page_size: LEXICAL_REBUILD_PAGE_SIZE,
        steady_batch_fetch_conversations: 3,
        startup_batch_fetch_conversations: 3,
        steady_commit_every_conversations: 3,
        startup_commit_every_conversations: 3,
        steady_commit_every_messages: 6,
        startup_commit_every_messages: 6,
        steady_commit_every_message_bytes: 65_536,
        startup_commit_every_message_bytes: 65_536,
        pipeline_channel_size: 2,
        page_prep_workers: 1,
        pipeline_max_message_bytes_in_flight: 256 * 1024,
    };
    let readonly_storage = FrankenStorage::open_readonly(&db_path).unwrap();
    let planned_shard_plan = plan_lexical_rebuild_shards_from_storage_with_settings(
        &readonly_storage,
        &planned_settings,
        6,
    )
    .unwrap();
    readonly_storage.close_without_checkpoint().unwrap();

    let (batch_tx, batch_rx) = bounded::<LexicalRebuildPipelineMessage>(2);
    let flow_limiter = Arc::new(StreamingByteLimiter::new(256 * 1024));
    let budget_controller = Arc::new(LexicalRebuildPipelineBudgetController::new(
        lexical_rebuild_runtime_pipeline_budget_snapshot(
            64,
            256,
            256 * 1024,
            2,
            64,
            256,
            256 * 1024,
        ),
    ));
    let producer = spawn_lexical_rebuild_packet_producer(
        db_path,
        None,
        Some(planned_shard_plan),
        LEXICAL_REBUILD_PAGE_SIZE,
        2,
        None,
        budget_controller,
        batch_tx,
        Arc::clone(&flow_limiter),
        None,
        Arc::new(LexicalRebuildProducerTelemetry::default()),
    );

    // Record each page's external ids; a page's position doubles as its shard index.
    let mut observed_pages: Vec<Vec<String>> = Vec::new();
    loop {
        match batch_rx.recv_timeout(Duration::from_secs(10)).unwrap() {
            LexicalRebuildPipelineMessage::Batch(batch) => {
                let shard_position = observed_pages.len();
                assert_eq!(
                    batch.planned_shard_index,
                    Some(shard_position),
                    "each emitted page should stay tagged with its deterministic shard index"
                );
                assert!(
                    batch.finishes_planned_shard,
                    "each page in this fixture should close exactly one planned shard"
                );
                let page_external_ids: Vec<String> = batch
                    .packets
                    .iter()
                    .map(|packet| {
                        packet
                            .identity
                            .external_id
                            .clone()
                            .expect("fixture conversation has external id")
                    })
                    .collect();
                observed_pages.push(page_external_ids);
                release_lexical_rebuild_prepared_page_reservation(
                    &batch,
                    flow_limiter.as_ref(),
                );
            }
            LexicalRebuildPipelineMessage::Done => break,
            LexicalRebuildPipelineMessage::Error(error) => {
                panic!("producer returned error: {error}")
            }
        }
    }
    producer.join().unwrap();

    let expected_pages: Vec<Vec<String>> = [
        [
            "lexical-fixture-1",
            "lexical-fixture-2",
            "lexical-fixture-3",
        ],
        [
            "lexical-fixture-4",
            "lexical-fixture-5",
            "lexical-fixture-6",
        ],
    ]
    .iter()
    .map(|page| page.iter().map(|id| String::from(*id)).collect())
    .collect();
    assert_eq!(observed_pages, expected_pages);
    assert_eq!(flow_limiter.bytes_in_flight(), 0);
}
/// Runs the packet producer with the page-prep worker override set to "2" and a
/// one-conversation page budget, then checks that single-packet batches arrive in
/// fixture order, that all byte reservations are released, and that telemetry reports
/// two workers with no active jobs or buffered pages.
#[test]
#[serial]
fn lexical_rebuild_packet_producer_preserves_order_across_parallel_page_prep_workers() {
// Env override stays bound until the end of the test.
let _responsiveness = set_env("CASS_RESPONSIVENESS_DISABLE", "1");
let dir = TempDir::new().unwrap();
let db_path = dir.path().join("producer-ordered-workers.db");
let storage = FrankenStorage::open(&db_path).unwrap();
seed_lexical_rebuild_fixture(&storage);
let agent_id = storage
.ensure_agent(&Agent {
id: None,
slug: "codex".into(),
name: "Codex".into(),
version: Some("0.2.3".into()),
kind: AgentKind::Cli,
})
.unwrap();
// Add fixtures 3..=6 so the expected sequence covers `lexical-fixture-1` through `-6`.
for suffix in 3..=6 {
let external_id = format!("lexical-fixture-{suffix}");
let base_ts = 1_700_000_000_000_i64 + i64::from(suffix) * 1_000;
storage
.insert_conversation_tree(
agent_id,
None,
&Conversation {
id: None,
agent_slug: "codex".into(),
workspace: Some(PathBuf::from("/tmp/workspace")),
external_id: Some(external_id.clone()),
title: Some("Lexical rebuild fixture".into()),
source_path: PathBuf::from(format!("/tmp/{external_id}.jsonl")),
started_at: Some(base_ts),
ended_at: Some(base_ts + 100),
approx_tokens: Some(64),
metadata_json: serde_json::Value::Null,
messages: vec![
Message {
id: None,
idx: 0,
role: MessageRole::User,
author: Some("user".into()),
created_at: Some(base_ts + 10),
content: format!("{external_id}-first"),
extra_json: serde_json::json!({"opaque": true}),
snippets: Vec::new(),
},
Message {
id: None,
idx: 1,
role: MessageRole::Agent,
author: Some("assistant".into()),
created_at: Some(base_ts + 20),
content: format!("{external_id}-second"),
extra_json: serde_json::json!({"opaque": true}),
snippets: Vec::new(),
},
],
source_id: LOCAL_SOURCE_ID.into(),
origin_host: None,
},
)
.unwrap();
}
drop(storage);
// Worker-count override is set after seeding and before the producer is spawned.
let _page_prep_workers = set_env("CASS_TANTIVY_REBUILD_PAGE_PREP_WORKERS", "2");
let (tx, rx) = bounded::<LexicalRebuildPipelineMessage>(2);
let flow_limiter = Arc::new(StreamingByteLimiter::new(256 * 1024));
let producer_telemetry = Arc::new(LexicalRebuildProducerTelemetry::default());
let handle = spawn_lexical_rebuild_packet_producer(
db_path,
None,
None,
LEXICAL_REBUILD_PAGE_SIZE,
2,
None,
Arc::new(LexicalRebuildPipelineBudgetController::new(
lexical_rebuild_runtime_pipeline_budget_snapshot(
1,
32,
64 * 1024,
2,
1,
32,
64 * 1024,
),
)),
tx,
flow_limiter.clone(),
None,
producer_telemetry.clone(),
);
// Collect the external id of every single-packet batch in arrival order,
// releasing each page's reservation as soon as it has been recorded.
let mut observed_external_ids = Vec::new();
loop {
match rx.recv_timeout(Duration::from_secs(10)).unwrap() {
LexicalRebuildPipelineMessage::Batch(batch) => {
assert_eq!(batch.packets.len(), 1);
observed_external_ids.push(
batch.packets[0]
.identity
.external_id
.clone()
.expect("fixture conversation has external id"),
);
release_lexical_rebuild_prepared_page_reservation(
&batch,
flow_limiter.as_ref(),
);
}
LexicalRebuildPipelineMessage::Done => break,
LexicalRebuildPipelineMessage::Error(error) => {
panic!("producer returned error: {error}")
}
}
}
handle.join().unwrap();
assert_eq!(
observed_external_ids,
vec![
"lexical-fixture-1",
"lexical-fixture-2",
"lexical-fixture-3",
"lexical-fixture-4",
"lexical-fixture-5",
"lexical-fixture-6",
]
);
assert_eq!(
flow_limiter.bytes_in_flight(),
0,
"ordered worker handoff must leave byte reservations owned and released by the sink"
);
// Telemetry is read after the producer thread has been joined.
let telemetry = producer_telemetry.snapshot();
assert_eq!(telemetry.page_prep_workers, 2);
assert_eq!(telemetry.active_page_prep_jobs, 0);
assert_eq!(telemetry.ordered_buffered_pages, 0);
}
/// Returns the `docs` count from the searchable index summary for `data_dir`.
///
/// Panics if the index directory cannot be resolved, the summary lookup fails, or no
/// summary is available.
fn tantivy_doc_count_for_data_dir(data_dir: &Path) -> u64 {
    let lexical_index_path = index_dir(data_dir).unwrap();
    let summary = crate::search::tantivy::searchable_index_summary(&lexical_index_path)
        .unwrap()
        .expect("searchable lexical index summary");
    summary.docs as u64
}
/// Builds a JSON payload with a `message` object holding a fixed model name and a
/// `usage` object derived from the given token counts.
///
/// The cache-read and cache-creation counts are `input_tokens / 2` and
/// `input_tokens / 4` (integer division).
fn token_usage_extra(input_tokens: i64, output_tokens: i64) -> serde_json::Value {
    let cache_read_input_tokens = input_tokens / 2;
    let cache_creation_input_tokens = input_tokens / 4;
    let usage = serde_json::json!({
        "input_tokens": input_tokens,
        "output_tokens": output_tokens,
        "cache_read_input_tokens": cache_read_input_tokens,
        "cache_creation_input_tokens": cache_creation_input_tokens,
        "service_tier": "standard"
    });
    serde_json::json!({
        "message": {
            "model": "claude-opus-4-6",
            "usage": usage
        }
    })
}
fn large_startup_conv(
agent_slug: &str,
prefix: &str,
conv_idx: usize,
message_count: usize,
body_bytes: usize,
base_ts: i64,
) -> NormalizedConversation {
let mut messages = Vec::with_capacity(message_count);
for msg_idx in 0..message_count {
let is_assistant = msg_idx % 2 == 1;
let ts = base_ts
.saturating_add((conv_idx as i64).saturating_mul(10_000))
.saturating_add(msg_idx as i64);
messages.push(NormalizedMessage {
idx: msg_idx as i64,
role: if is_assistant { "assistant" } else { "user" }.to_string(),
author: Some(if is_assistant {
format!("{agent_slug}-model")
} else {
"user".to_string()
}),
created_at: Some(ts),
content: format!("{prefix}-{conv_idx}-{msg_idx}-{}", "x".repeat(body_bytes)),
extra: if is_assistant {
token_usage_extra(1_000 + msg_idx as i64, 500 + msg_idx as i64)
} else {
serde_json::json!({})
},
snippets: Vec::new(),
invocations: Vec::new(),
});
}
NormalizedConversation {
agent_slug: agent_slug.to_string(),
external_id: Some(format!("{prefix}-{conv_idx}")),
title: Some(format!("{agent_slug} startup {conv_idx}")),
workspace: Some(PathBuf::from(format!("/workspace/{agent_slug}/{prefix}"))),
source_path: PathBuf::from(format!("/logs/{agent_slug}/{prefix}-{conv_idx}.jsonl")),
started_at: messages.first().and_then(|msg| msg.created_at),
ended_at: messages.last().and_then(|msg| msg.created_at),
metadata: serde_json::json!({
"agent_slug": agent_slug,
"fixture": "large_startup"
}),
messages,
}
}
/// Sends an `IndexMessage::Done` for `connector_name` with a fixed `scan_ms` of 1.
///
/// Panics if the send fails.
fn send_done(tx: &Sender<IndexMessage>, connector_name: &'static str, is_discovered: bool) {
    let done_message = IndexMessage::Done {
        connector_name,
        scan_ms: 1,
        is_discovered,
    };
    tx.send(done_message).expect("done message should send");
}
/// Fixture connector that always reports itself as detected and yields one fixed
/// two-message conversation with external id `deferred-batch`.
struct DeferredBatchConnector;
impl Connector for DeferredBatchConnector {
    /// Reports detection with a single `fixture` evidence entry and no root paths.
    fn detect(&self) -> DetectionResult {
        let evidence = vec![String::from("fixture")];
        DetectionResult {
            detected: true,
            evidence,
            root_paths: Vec::new(),
        }
    }
    /// Returns the single fixture conversation; the scan context is ignored.
    fn scan(
        &self,
        _ctx: &crate::connectors::ScanContext,
    ) -> anyhow::Result<Vec<NormalizedConversation>> {
        let messages = vec![
            norm_msg(0, 1_700_000_000_000),
            norm_msg(1, 1_700_000_000_100),
        ];
        let conversation = norm_conv(Some("deferred-batch"), messages);
        Ok(vec![conversation])
    }
    /// Feeds every conversation from `scan` to the callback, stopping at the first error.
    fn scan_with_callback(
        &self,
        ctx: &crate::connectors::ScanContext,
        on_conversation: &mut dyn FnMut(NormalizedConversation) -> anyhow::Result<()>,
    ) -> anyhow::Result<()> {
        self.scan(ctx)?
            .into_iter()
            .try_for_each(|conversation| on_conversation(conversation))
    }
}
/// Factory returning a boxed `DeferredBatchConnector`.
fn deferred_batch_connector_factory() -> Box<dyn Connector + Send> {
    Box::new(DeferredBatchConnector) as Box<dyn Connector + Send>
}
/// Root path reported by `FailingExplicitFileRootConnector::detect`; starts as `None`.
/// NOTE(review): presumably populated by the test that installs this connector.
static FAILING_EXPLICIT_FILE_ROOT: Mutex<Option<PathBuf>> = Mutex::new(None);
/// Fixture connector that is detected at the configured root but always fails to scan.
struct FailingExplicitFileRootConnector;
impl Connector for FailingExplicitFileRootConnector {
    /// Reports detection with the configured root as the only root path.
    ///
    /// Panics if no root has been stored; a poisoned lock is recovered instead of
    /// treated as an error.
    fn detect(&self) -> DetectionResult {
        let root_paths = vec![
            FAILING_EXPLICIT_FILE_ROOT
                .lock()
                .unwrap_or_else(|poisoned| poisoned.into_inner())
                .clone()
                .expect("explicit file root should be configured"),
        ];
        DetectionResult {
            detected: true,
            evidence: vec![String::from("explicit-file-root")],
            root_paths,
        }
    }
    /// Always fails with a fixed parse error.
    fn scan(
        &self,
        _ctx: &crate::connectors::ScanContext,
    ) -> anyhow::Result<Vec<NormalizedConversation>> {
        let failure = anyhow::anyhow!("connector parse failed after source discovery");
        Err(failure)
    }
}
/// Factory returning a boxed `FailingExplicitFileRootConnector`.
fn failing_explicit_file_root_connector_factory() -> Box<dyn Connector + Send> {
    Box::new(FailingExplicitFileRootConnector) as Box<dyn Connector + Send>
}
/// Fixture connector whose callback scan succeeds when the context has no scan roots
/// and fails with `remote exploded` otherwise.
struct DetectedRemoteFailureConnector;
impl Connector for DetectedRemoteFailureConnector {
    /// Reports detection with a single `fixture` evidence entry and no root paths.
    fn detect(&self) -> DetectionResult {
        let evidence = vec![String::from("fixture")];
        DetectionResult {
            detected: true,
            evidence,
            root_paths: Vec::new(),
        }
    }
    /// Always returns an empty conversation list.
    fn scan(
        &self,
        _ctx: &crate::connectors::ScanContext,
    ) -> anyhow::Result<Vec<NormalizedConversation>> {
        let no_conversations = Vec::new();
        Ok(no_conversations)
    }
    /// Never invokes the callback; errors only when scan roots are present.
    fn scan_with_callback(
        &self,
        ctx: &crate::connectors::ScanContext,
        _on_conversation: &mut dyn FnMut(NormalizedConversation) -> anyhow::Result<()>,
    ) -> anyhow::Result<()> {
        let has_scan_roots = !ctx.scan_roots.is_empty();
        if has_scan_roots {
            return Err(anyhow::anyhow!("remote exploded"));
        }
        Ok(())
    }
}
/// Factory returning a boxed `DetectedRemoteFailureConnector`.
fn detected_remote_failure_connector_factory() -> Box<dyn Connector + Send> {
    Box::new(DetectedRemoteFailureConnector) as Box<dyn Connector + Send>
}
/// Fixture connector whose callback scan always panics.
struct PanicConnector;
impl Connector for PanicConnector {
    /// Reports detection with a single `fixture` evidence entry and no root paths.
    fn detect(&self) -> DetectionResult {
        let evidence = vec![String::from("fixture")];
        DetectionResult {
            detected: true,
            evidence,
            root_paths: Vec::new(),
        }
    }
    /// Always returns an empty conversation list.
    fn scan(
        &self,
        _ctx: &crate::connectors::ScanContext,
    ) -> anyhow::Result<Vec<NormalizedConversation>> {
        let no_conversations = Vec::new();
        Ok(no_conversations)
    }
    /// Panics unconditionally; neither the context nor the callback is used.
    fn scan_with_callback(
        &self,
        _ctx: &crate::connectors::ScanContext,
        _on_conversation: &mut dyn FnMut(NormalizedConversation) -> anyhow::Result<()>,
    ) -> anyhow::Result<()> {
        panic!("connector panic during local scan")
    }
}
/// Factory returning a boxed `PanicConnector`.
fn panic_connector_factory() -> Box<dyn Connector + Send> {
    Box::new(PanicConnector) as Box<dyn Connector + Send>
}
/// Shared counter that `DisconnectAwareConnector` increments once per conversation it
/// attempts to emit; starts as `None`.
/// NOTE(review): presumably installed by the test that uses this connector.
static DISCONNECT_TEST_COUNTER: Mutex<Option<Arc<AtomicUsize>>> = Mutex::new(None);
/// Fixture connector that tries to emit three oversized conversations through the
/// callback and counts each attempt.
struct DisconnectAwareConnector;
impl Connector for DisconnectAwareConnector {
    /// Reports detection with a single `fixture` evidence entry and no root paths.
    fn detect(&self) -> DetectionResult {
        let evidence = vec![String::from("fixture")];
        DetectionResult {
            detected: true,
            evidence,
            root_paths: Vec::new(),
        }
    }
    /// Always returns an empty conversation list.
    fn scan(
        &self,
        _ctx: &crate::connectors::ScanContext,
    ) -> anyhow::Result<Vec<NormalizedConversation>> {
        let no_conversations = Vec::new();
        Ok(no_conversations)
    }
    /// Emits up to three single-message conversations, each one character over
    /// `DEFAULT_STREAMING_BATCH_LIMITS.max_chars`, and stops at the first callback error.
    ///
    /// The counter is bumped before each callback invocation. The conversation's
    /// external id is `local` when the context has no scan roots and `remote` otherwise.
    /// Panics if the counter has not been configured.
    fn scan_with_callback(
        &self,
        ctx: &crate::connectors::ScanContext,
        on_conversation: &mut dyn FnMut(NormalizedConversation) -> anyhow::Result<()>,
    ) -> anyhow::Result<()> {
        let attempts = DISCONNECT_TEST_COUNTER
            .lock()
            .unwrap_or_else(|poisoned| poisoned.into_inner())
            .clone()
            .expect("disconnect test counter should be configured");
        let has_scan_roots = !ctx.scan_roots.is_empty();
        let scope = if has_scan_roots { "remote" } else { "local" };
        let oversized_len = DEFAULT_STREAMING_BATCH_LIMITS.max_chars + 1;
        for idx in 0..3 {
            attempts.fetch_add(1, Ordering::Relaxed);
            let oversized = NormalizedMessage {
                content: "x".repeat(oversized_len),
                ..norm_msg(idx, 2_000 + idx)
            };
            let conversation = norm_conv(Some(scope), vec![oversized]);
            on_conversation(conversation)?;
        }
        Ok(())
    }
}
/// Factory returning a boxed `DisconnectAwareConnector`.
fn disconnect_aware_connector_factory() -> Box<dyn Connector + Send> {
    Box::new(DisconnectAwareConnector) as Box<dyn Connector + Send>
}
/// With a 1_000-message cap, conversations of 700, 400 and 300 messages must split
/// into a first batch of `a` alone and a second batch of `b` plus `c`.
#[test]
fn next_streaming_batch_splits_large_message_batches() {
    let limits = StreamingBatchLimits {
        max_conversations: 8,
        max_messages: 1_000,
        max_chars: usize::MAX,
    };
    let conv_a = norm_conv(
        Some("a"),
        (0..700).map(|i| norm_msg(i, 1_000 + i)).collect(),
    );
    let conv_b = norm_conv(
        Some("b"),
        (0..400).map(|i| norm_msg(i, 2_000 + i)).collect(),
    );
    let conv_c = norm_conv(
        Some("c"),
        (0..300).map(|i| norm_msg(i, 3_000 + i)).collect(),
    );
    let mut pending = vec![conv_a, conv_b, conv_c].into_iter().peekable();

    let (first_batch, first_message_count) = next_streaming_batch(&mut pending, limits).unwrap();
    let (second_batch, second_message_count) =
        next_streaming_batch(&mut pending, limits).unwrap();

    let first_ids: Vec<_> = first_batch
        .iter()
        .map(|conv| conv.external_id.as_deref().unwrap())
        .collect();
    assert_eq!(first_ids, vec!["a"]);
    assert_eq!(first_message_count, 700);

    let second_ids: Vec<_> = second_batch
        .iter()
        .map(|conv| conv.external_id.as_deref().unwrap())
        .collect();
    assert_eq!(second_ids, vec!["b", "c"]);
    assert_eq!(second_message_count, 700);

    // Nothing is left after the two batches.
    assert!(next_streaming_batch(&mut pending, limits).is_none());
}
/// A conversation whose single message exceeds `max_chars` must still be emitted, in a
/// batch of its own, with the following small conversation in the next batch.
#[test]
fn next_streaming_batch_keeps_single_oversized_conversation_isolated() {
    let limits = StreamingBatchLimits {
        max_conversations: 8,
        max_messages: 8,
        max_chars: 64,
    };
    // 256 characters against a 64-character cap.
    let oversized_message = NormalizedMessage {
        content: "x".repeat(256),
        ..norm_msg(0, 1_000)
    };
    let huge_conversation = norm_conv(Some("huge"), vec![oversized_message]);
    let small_conversation = norm_conv(Some("small"), vec![norm_msg(0, 2_000)]);
    let mut pending = vec![huge_conversation, small_conversation]
        .into_iter()
        .peekable();

    let (first_batch, first_message_count) = next_streaming_batch(&mut pending, limits).unwrap();
    let (second_batch, second_message_count) =
        next_streaming_batch(&mut pending, limits).unwrap();

    assert_eq!(
        first_batch[0].external_id.as_deref(),
        Some("huge"),
        "oversized conversations should still index, but alone"
    );
    assert_eq!(first_message_count, 1);
    assert_eq!(second_batch[0].external_id.as_deref(), Some("small"));
    assert_eq!(second_message_count, 1);

    // Nothing is left after the two batches.
    assert!(next_streaming_batch(&mut pending, limits).is_none());
}
/// Pushing one conversation larger than the default `max_chars` limit must
/// publish a batch right away, leaving nothing for a later explicit `flush`.
#[test]
fn streaming_batch_sender_flushes_single_oversized_conversation_immediately() {
    let oversized_len = DEFAULT_STREAMING_BATCH_LIMITS.max_chars + 1;
    let (tx, rx) = bounded(2);
    let limiter = Arc::new(StreamingByteLimiter::new(STREAMING_MAX_BYTES_IN_FLIGHT));
    let mut sender = StreamingBatchSender::new(&tx, limiter, "gemini", false);

    let mut huge_message = norm_msg(0, 1_000);
    huge_message.content = "x".repeat(oversized_len);
    sender
        .push(norm_conv(Some("huge"), vec![huge_message]))
        .expect("oversized conversation should still flush even in tests");

    let published = rx
        .try_recv()
        .expect("oversized conversation should flush immediately");
    match published {
        IndexMessage::Batch {
            connector_name,
            conversations,
            message_count,
            byte_reservation,
            ..
        } => {
            assert_eq!(connector_name, "gemini");
            assert_eq!(conversations.len(), 1);
            assert_eq!(conversations[0].external_id.as_deref(), Some("huge"));
            assert_eq!(message_count, 1);
            assert_eq!(byte_reservation, oversized_len);
        }
        other => panic!(
            "expected batch for oversized conversation flush, got {:?}",
            std::mem::discriminant(&other)
        ),
    }

    assert!(
        rx.try_recv().is_err(),
        "sender buffer should be empty after auto-flush"
    );
    sender.flush().unwrap();
    assert!(rx.try_recv().is_err(), "explicit flush should be a no-op");
}
/// A buffered (not yet flushed) conversation must already count against the byte
/// limiter; the reservation travels with the batch and is released by the receiver.
#[test]
fn streaming_batch_sender_accounts_pending_conversation_bytes_before_flush() {
    let limiter = Arc::new(StreamingByteLimiter::new(STREAMING_MAX_BYTES_IN_FLIGHT));
    let (tx, rx) = bounded(2);
    let mut sender = StreamingBatchSender::new(&tx, Arc::clone(&limiter), "codex", true);

    let payload = "x".repeat(512);
    let payload_len = payload.len();
    let mut message = norm_msg(0, 1_000);
    message.content = payload;

    sender
        .push(norm_conv(Some("pending"), vec![message]))
        .expect("pending conversation should fit in the streaming budget");
    assert_eq!(
        limiter.bytes_in_flight(),
        payload_len,
        "producer-side pending batches must be counted before flush"
    );
    assert!(
        rx.try_recv().is_err(),
        "non-oversized conversation should remain pending until flush"
    );

    sender.flush().unwrap();
    let flushed = rx.try_recv().expect("flush should publish pending batch");
    match flushed {
        IndexMessage::Batch {
            connector_name,
            conversations,
            message_count,
            byte_reservation,
            ..
        } => {
            assert_eq!(connector_name, "codex");
            assert_eq!(conversations.len(), 1);
            assert_eq!(conversations[0].external_id.as_deref(), Some("pending"));
            assert_eq!(message_count, 1);
            assert_eq!(byte_reservation, payload_len);
            // Still reserved until the consumer side hands the bytes back.
            assert_eq!(limiter.bytes_in_flight(), payload_len);
            limiter.release(byte_reservation);
        }
        other => panic!(
            "expected batch for pending conversation flush, got {:?}",
            std::mem::discriminant(&other)
        ),
    }
    assert_eq!(limiter.bytes_in_flight(), 0);
}
/// Dropping a sender that still buffers a conversation must return its reserved
/// bytes to the limiter and must not publish anything on the channel.
#[test]
fn streaming_batch_sender_drop_releases_unflushed_reservation() {
    let limiter = Arc::new(StreamingByteLimiter::new(STREAMING_MAX_BYTES_IN_FLIGHT));
    let (tx, rx) = bounded(2);

    let payload = "x".repeat(384);
    let payload_len = payload.len();
    let mut message = norm_msg(0, 1_000);
    message.content = payload;

    let mut sender = StreamingBatchSender::new(&tx, Arc::clone(&limiter), "cursor", false);
    sender
        .push(norm_conv(Some("unflushed"), vec![message]))
        .expect("pending conversation should reserve bytes");
    assert_eq!(limiter.bytes_in_flight(), payload_len);
    // Discard the producer without flushing.
    drop(sender);

    assert_eq!(
        limiter.bytes_in_flight(),
        0,
        "dropping an unflushed producer must not leak byte budget"
    );
    assert!(
        rx.try_recv().is_err(),
        "drop must not publish a partial batch"
    );
}
/// When the receiver is already gone, `flush` must fail with a
/// consumer-disconnected error and give the reserved bytes back to the limiter.
#[test]
fn streaming_batch_sender_send_failure_releases_preacquired_reservation() {
    let limiter = Arc::new(StreamingByteLimiter::new(STREAMING_MAX_BYTES_IN_FLIGHT));
    let (tx, rx) = bounded(1);
    // Simulate a consumer that went away before anything was sent.
    drop(rx);
    let mut sender = StreamingBatchSender::new(&tx, Arc::clone(&limiter), "amp", false);

    let payload = "x".repeat(256);
    let payload_len = payload.len();
    let mut message = norm_msg(0, 1_000);
    message.content = payload;

    sender
        .push(norm_conv(Some("disconnected"), vec![message]))
        .expect("push should only reserve bytes before flush");
    assert_eq!(limiter.bytes_in_flight(), payload_len);

    let flush_error = sender
        .flush()
        .expect_err("closed receiver should fail flush");
    assert!(is_streaming_consumer_disconnected(&flush_error));
    assert_eq!(
        limiter.bytes_in_flight(),
        0,
        "flush failure must release the reservation exactly once"
    );
}
/// A second `acquire` must stay blocked while the first reservation is held and
/// complete once it is released.
#[test]
fn streaming_byte_limiter_blocks_until_capacity_is_released() {
    let wait = Duration::from_secs(1);
    let limiter = Arc::new(StreamingByteLimiter::new(64));
    // NOTE(review): this asks for 128 bytes against a 64-byte cap; presumably the
    // limiter admits an oversized reservation when nothing is in flight — confirm.
    let held = limiter.acquire(128).unwrap();
    let (ready_tx, ready_rx) = bounded(1);
    let (result_tx, result_rx) = bounded(1);

    let worker_limiter = Arc::clone(&limiter);
    let waiter = thread::spawn(move || {
        ready_tx.send(()).unwrap();
        let granted = worker_limiter.acquire(32).unwrap();
        result_tx.send(granted).unwrap();
        worker_limiter.release(granted);
    });

    ready_rx.recv_timeout(wait).unwrap();
    assert!(
        result_rx.try_recv().is_err(),
        "waiter should remain blocked while the limiter is full"
    );
    limiter.release(held);
    assert_eq!(result_rx.recv_timeout(wait).unwrap(), 32);
    waiter.join().unwrap();
}
/// Closing the limiter must wake a blocked `acquire` and make it return an error
/// whose message mentions "closed".
#[test]
fn streaming_byte_limiter_close_wakes_waiters() {
    let wait = Duration::from_secs(1);
    let limiter = Arc::new(StreamingByteLimiter::new(64));
    let held = limiter.acquire(64).unwrap();
    let (ready_tx, ready_rx) = bounded(1);
    let (result_tx, result_rx) = bounded(1);

    let worker_limiter = Arc::clone(&limiter);
    let waiter = thread::spawn(move || {
        ready_tx.send(()).unwrap();
        // Ship the error as a string so the main thread can inspect the message.
        let outcome = worker_limiter
            .acquire(1)
            .map_err(|error| error.to_string());
        result_tx.send(outcome).unwrap();
    });

    ready_rx.recv_timeout(wait).unwrap();
    assert!(
        result_rx.try_recv().is_err(),
        "waiter should remain blocked until the limiter is closed"
    );
    limiter.close();
    let message = result_rx
        .recv_timeout(wait)
        .unwrap()
        .expect_err("closing the limiter should wake blocked waiters with an error");
    assert!(message.contains("closed"));
    limiter.release(held);
    waiter.join().unwrap();
}
/// Raising the cap from 64 to 128 while 64 bytes are held must let a blocked
/// 64-byte `acquire` proceed without any release happening first.
#[test]
fn streaming_byte_limiter_update_max_bytes_in_flight_wakes_waiters() {
    let wait = Duration::from_secs(1);
    let limiter = Arc::new(StreamingByteLimiter::new(64));
    let held = limiter.acquire(64).unwrap();
    let (ready_tx, ready_rx) = bounded(1);
    let (result_tx, result_rx) = bounded(1);

    let worker_limiter = Arc::clone(&limiter);
    let waiter = thread::spawn(move || {
        ready_tx.send(()).unwrap();
        let granted = worker_limiter.acquire(64).unwrap();
        result_tx.send(granted).unwrap();
        worker_limiter.release(granted);
    });

    ready_rx.recv_timeout(wait).unwrap();
    assert!(
        result_rx.try_recv().is_err(),
        "waiter should remain blocked while the limiter is full at the startup cap"
    );
    limiter.update_max_bytes_in_flight(128);
    assert_eq!(limiter.max_bytes_in_flight(), 128);
    assert_eq!(result_rx.recv_timeout(wait).unwrap(), 64);
    limiter.release(held);
    waiter.join().unwrap();
}
/// Sequence 1 must not reserve any bytes until sequence 0 has reserved, and after
/// that it still has to wait for ordinary byte capacity.
#[test]
fn lexical_rebuild_reservation_order_keeps_later_pages_from_spending_budget_first() {
    let short_wait = Duration::from_millis(50);
    let long_wait = Duration::from_secs(1);
    let order = Arc::new(LexicalRebuildReservationOrder::new());
    let limiter = Arc::new(StreamingByteLimiter::new(64));
    let (ready_tx, ready_rx) = bounded(1);
    let (result_tx, result_rx) = bounded(1);

    let later_order = Arc::clone(&order);
    let later_limiter = Arc::clone(&limiter);
    let waiter = thread::spawn(move || {
        ready_tx.send(()).unwrap();
        let (reserved, _, _) =
            acquire_ordered_lexical_rebuild_page_budget(&later_order, &later_limiter, 1, 64)
                .unwrap();
        result_tx.send(reserved).unwrap();
    });

    ready_rx.recv_timeout(long_wait).unwrap();
    assert!(
        result_rx.recv_timeout(short_wait).is_err(),
        "sequence 1 must wait even though the byte limiter is empty"
    );
    assert_eq!(
        limiter.bytes_in_flight(),
        0,
        "later pages must not reserve bytes before earlier sequences"
    );

    let (first, _, _) =
        acquire_ordered_lexical_rebuild_page_budget(&order, &limiter, 0, 1).unwrap();
    assert_eq!(first, 1);
    assert_eq!(limiter.bytes_in_flight(), 1);
    // 1 byte held + 64 requested exceeds the 64-byte cap, so sequence 1 keeps waiting.
    assert!(
        result_rx.recv_timeout(short_wait).is_err(),
        "sequence 1 should still respect normal byte capacity after order opens"
    );

    limiter.release(first);
    assert_eq!(result_rx.recv_timeout(long_wait).unwrap(), 64);
    limiter.release(64);
    waiter.join().unwrap();
}
/// Repeats the "raise the cap while a waiter is parked" scenario 50 times and
/// fails if any iteration does not wake the waiter within two seconds.
#[test]
fn streaming_byte_limiter_update_does_not_lose_wakeup_under_repeated_shrink_grow() {
    const ITERATIONS: usize = 50;
    for iteration in 0..ITERATIONS {
        let limiter = Arc::new(StreamingByteLimiter::new(16));
        let held = limiter.acquire(16).unwrap();
        let (ready_tx, ready_rx) = bounded(1);
        let (result_tx, result_rx) = bounded(1);

        let worker_limiter = Arc::clone(&limiter);
        let waiter = thread::spawn(move || {
            ready_tx.send(()).unwrap();
            let granted = worker_limiter.acquire(16).unwrap();
            result_tx.send(granted).unwrap();
            worker_limiter.release(granted);
        });

        ready_rx.recv_timeout(Duration::from_secs(1)).unwrap();
        // Give the waiter a chance to reach `acquire` before the cap changes.
        thread::yield_now();
        limiter.update_max_bytes_in_flight(64);

        let woken = match result_rx.recv_timeout(Duration::from_secs(2)) {
            Ok(granted) => granted,
            Err(err) => panic!(
                "iteration {iteration}: update_max_bytes_in_flight failed to wake parked \
                 waiter within 2s — lost-wakeup race regressed (wxsy8): {err}"
            ),
        };
        assert_eq!(woken, 16);
        limiter.release(held);
        waiter.join().unwrap();
    }
}
/// `acquire_with_wait` must report `waited == true` when the request only
/// succeeds after another thread releases capacity.
#[test]
fn streaming_byte_limiter_acquire_with_wait_reports_capacity_stall() {
    let limiter = Arc::new(StreamingByteLimiter::new(64));
    let held = limiter.acquire(64).unwrap();

    let releaser_limiter = Arc::clone(&limiter);
    let releaser = thread::spawn(move || {
        // Hold the full budget briefly so the main thread has to stall.
        thread::sleep(Duration::from_millis(25));
        releaser_limiter.release(held);
    });

    let (granted, _elapsed, stalled) = limiter.acquire_with_wait(32).unwrap();
    assert_eq!(granted, 32);
    assert!(stalled, "capacity stall should be reported to telemetry");
    limiter.release(granted);
    releaser.join().unwrap();
}
/// A snapshot that carries nothing but producer wait counters must still be
/// reported as observed.
#[test]
fn lexical_rebuild_pipeline_runtime_snapshot_observes_stall_only_telemetry() {
    let stall_only = LexicalRebuildPipelineRuntimeSnapshot {
        producer_budget_wait_count: 1,
        producer_budget_wait_ms: 7,
        producer_handoff_wait_count: 1,
        producer_handoff_wait_ms: 3,
        ..LexicalRebuildPipelineRuntimeSnapshot::default()
    };
    let observed = stall_only.is_observed();
    assert!(
        observed,
        "stall counters alone should keep attach/status runtime visible"
    );
}
/// Two 1,200-message conversations are sent as two batches; only the first batch
/// may carry `is_discovered == true`.
#[test]
fn send_conversation_batches_marks_only_first_batch_as_discovered() {
    let (tx, rx) = bounded(4);
    let conversations = vec![
        norm_conv(
            Some("a"),
            (0..1_200).map(|i| norm_msg(i, 1_000 + i)).collect(),
        ),
        norm_conv(
            Some("b"),
            (0..1_200).map(|i| norm_msg(i, 2_000 + i)).collect(),
        ),
    ];
    send_conversation_batches(&tx, "claude", conversations, true);
    drop(tx);

    let batches: Vec<_> = rx.try_iter().collect();
    assert_eq!(batches.len(), 2);

    // (expected discovered flag, panic text when the message is not a batch)
    let expectations = [
        (true, "expected first message to be a batch"),
        (false, "expected second message to be a batch"),
    ];
    for (message, &(want_discovered, not_a_batch)) in batches.iter().zip(expectations.iter()) {
        match message {
            IndexMessage::Batch {
                connector_name,
                is_discovered,
                message_count,
                conversations,
                ..
            } => {
                assert_eq!(*connector_name, "claude");
                assert_eq!(*is_discovered, want_discovered);
                assert_eq!(*message_count, 1_200);
                assert_eq!(conversations.len(), 1);
            }
            _ => panic!("{}", not_a_batch),
        }
    }
}
/// Phase code 2 must be rendered as both `phase_code: 2` and `phase: "indexing"`.
#[test]
fn snapshot_json_phase_label_matches_phase_code_sample() {
    let progress = IndexingProgress::default();
    progress.phase.store(2, Ordering::Relaxed);

    let rendered = progress.snapshot_json(2500);
    let phase_code = &rendered["phase_code"];
    let phase_label = &rendered["phase"];

    assert_eq!(*phase_code, serde_json::json!(2));
    assert_eq!(*phase_label, serde_json::json!("indexing"));
}
/// Populates every rebuild-pipeline field on `IndexingProgress` and checks that
/// `snapshot_json` exposes each one under the `rebuild_pipeline` object.
#[test]
fn snapshot_json_includes_rebuild_pipeline_runtime_metrics() {
    use serde_json::json;

    let relaxed = Ordering::Relaxed;
    let progress = IndexingProgress::default();
    progress.phase.store(2, relaxed);
    progress.is_rebuilding.store(true, relaxed);

    // Queue / page-prep counters.
    progress.rebuild_pipeline_queue_depth.store(3, relaxed);
    progress
        .rebuild_pipeline_inflight_message_bytes
        .store(65_536, relaxed);
    progress
        .rebuild_pipeline_pending_batch_conversations
        .store(9, relaxed);
    progress
        .rebuild_pipeline_pending_batch_message_bytes
        .store(131_072, relaxed);
    progress.rebuild_pipeline_page_prep_workers.store(6, relaxed);
    progress
        .rebuild_pipeline_active_page_prep_jobs
        .store(2, relaxed);
    progress
        .rebuild_pipeline_ordered_buffered_pages
        .store(4, relaxed);
    progress.rebuild_pipeline_budget_generation.store(1, relaxed);

    // Producer stall telemetry.
    progress
        .rebuild_pipeline_producer_budget_wait_count
        .store(2, relaxed);
    progress
        .rebuild_pipeline_producer_budget_wait_ms
        .store(17, relaxed);
    progress
        .rebuild_pipeline_producer_handoff_wait_count
        .store(1, relaxed);
    progress
        .rebuild_pipeline_producer_handoff_wait_ms
        .store(9, relaxed);

    // Host / process readings and controller state (mutex-guarded fields).
    *progress
        .rebuild_pipeline_host_loadavg_1m_milli
        .lock()
        .expect("lock host loadavg") = Some(7_250);
    *progress
        .rebuild_pipeline_host_available_memory_bytes
        .lock()
        .expect("lock host available memory") = Some(123_456_789);
    *progress
        .rebuild_pipeline_process_rss_bytes
        .lock()
        .expect("lock process rss") = Some(98_765_432);
    *progress
        .rebuild_pipeline_controller_mode
        .lock()
        .expect("lock controller mode") = String::from("pressure_limited");
    *progress
        .rebuild_pipeline_controller_reason
        .lock()
        .expect("lock controller reason") =
        String::from("queue_depth_2_reached_pipeline_capacity_2");

    // Staged merge.
    progress
        .rebuild_pipeline_staged_merge_workers_max
        .store(3, relaxed);
    progress
        .rebuild_pipeline_staged_merge_allowed_jobs
        .store(1, relaxed);
    progress
        .rebuild_pipeline_staged_merge_active_jobs
        .store(1, relaxed);
    progress
        .rebuild_pipeline_staged_merge_ready_artifacts
        .store(5, relaxed);
    progress
        .rebuild_pipeline_staged_merge_ready_groups
        .store(1, relaxed);
    *progress
        .rebuild_pipeline_staged_merge_controller_reason
        .lock()
        .expect("lock staged merge reason") = String::from("page_prep_workers_saturated_6_of_6");

    // Staged shard build.
    progress
        .rebuild_pipeline_staged_shard_build_workers_max
        .store(6, relaxed);
    progress
        .rebuild_pipeline_staged_shard_build_allowed_jobs
        .store(5, relaxed);
    progress
        .rebuild_pipeline_staged_shard_build_active_jobs
        .store(4, relaxed);
    progress
        .rebuild_pipeline_staged_shard_build_pending_jobs
        .store(2, relaxed);
    *progress
        .rebuild_pipeline_staged_shard_build_controller_reason
        .lock()
        .expect("lock staged shard-build reason") =
        String::from("reserving_1_slots_for_staged_merge_active_jobs_1_ready_groups_1");
    progress
        .rebuild_pipeline_staged_shard_build_memory_reserve_bytes
        .store(16_000, relaxed);
    progress
        .rebuild_pipeline_staged_shard_build_emergency_memory_reserve_bytes
        .store(4_000, relaxed);
    progress
        .rebuild_pipeline_staged_shard_build_completed_jobs
        .store(7, relaxed);
    *progress
        .rebuild_pipeline_staged_shard_build_last_shard_index
        .lock()
        .expect("lock last shard index") = Some(6);
    progress
        .rebuild_pipeline_staged_shard_build_last_message_bytes
        .store(65_000_000, relaxed);
    progress
        .rebuild_pipeline_staged_shard_build_last_index_size_bytes
        .store(130_000_000, relaxed);
    progress
        .rebuild_pipeline_staged_shard_build_last_duration_ms
        .store(2_500, relaxed);
    *progress
        .rebuild_pipeline_staged_shard_build_last_amplification_milli
        .lock()
        .expect("lock last amplification") = Some(2_000);
    *progress
        .rebuild_pipeline_staged_shard_build_observed_amplification_milli
        .lock()
        .expect("lock observed amplification") = Some(8_000);

    let snapshot = progress.snapshot_json(2500);
    let pipeline = &snapshot["rebuild_pipeline"];

    // JSON key under `rebuild_pipeline` paired with the value it must hold.
    let expected = vec![
        ("queue_depth", json!(3)),
        ("inflight_message_bytes", json!(65_536)),
        ("pending_batch_conversations", json!(9)),
        ("pending_batch_message_bytes", json!(131_072)),
        ("page_prep_workers", json!(6)),
        ("active_page_prep_jobs", json!(2)),
        ("ordered_buffered_pages", json!(4)),
        ("budget_generation", json!(1)),
        ("producer_budget_wait_count", json!(2)),
        ("producer_budget_wait_ms", json!(17)),
        ("producer_handoff_wait_count", json!(1)),
        ("producer_handoff_wait_ms", json!(9)),
        // The milli-load 7_250 is surfaced as 7.25.
        ("host_loadavg_1m", json!(7.25)),
        ("host_available_memory_bytes", json!(123_456_789)),
        ("process_rss_bytes", json!(98_765_432)),
        ("controller_mode", json!("pressure_limited")),
        (
            "controller_reason",
            json!("queue_depth_2_reached_pipeline_capacity_2"),
        ),
        ("staged_merge_workers_max", json!(3)),
        ("staged_merge_allowed_jobs", json!(1)),
        ("staged_merge_active_jobs", json!(1)),
        ("staged_merge_ready_artifacts", json!(5)),
        ("staged_merge_ready_groups", json!(1)),
        (
            "staged_merge_controller_reason",
            json!("page_prep_workers_saturated_6_of_6"),
        ),
        ("staged_shard_build_workers_max", json!(6)),
        ("staged_shard_build_allowed_jobs", json!(5)),
        ("staged_shard_build_active_jobs", json!(4)),
        ("staged_shard_build_pending_jobs", json!(2)),
        (
            "staged_shard_build_controller_reason",
            json!("reserving_1_slots_for_staged_merge_active_jobs_1_ready_groups_1"),
        ),
        ("staged_shard_build_memory_reserve_bytes", json!(16_000)),
        (
            "staged_shard_build_emergency_memory_reserve_bytes",
            json!(4_000),
        ),
        ("staged_shard_build_completed_jobs", json!(7)),
        ("staged_shard_build_last_shard_index", json!(6)),
        ("staged_shard_build_last_message_bytes", json!(65_000_000)),
        (
            "staged_shard_build_last_index_size_bytes",
            json!(130_000_000u64),
        ),
        ("staged_shard_build_last_duration_ms", json!(2_500u64)),
        (
            "staged_shard_build_last_amplification_milli",
            json!(2_000u64),
        ),
        (
            "staged_shard_build_observed_amplification_milli",
            json!(8_000u64),
        ),
    ];
    for (key, want) in &expected {
        assert_eq!(&pipeline[*key], want, "rebuild_pipeline.{key} mismatch");
    }
}
/// A lone `Done` message flagged `is_discovered` (no batches at all) must still
/// surface the connector as discovered while all counters stay at zero.
#[test]
fn streaming_consumer_preserves_discovered_connector_with_no_batches() {
    let tmp = TempDir::new().unwrap();
    let data_dir = tmp.path().join("data");
    std::fs::create_dir_all(&data_dir).unwrap();
    let db_path = data_dir.join("db.sqlite");
    let storage = FrankenStorage::open(&db_path).unwrap();
    ensure_fts_schema(&storage);
    let lexical_dir = index_dir(&data_dir).unwrap();
    let mut index = TantivyIndex::open_or_create(&lexical_dir).unwrap();
    let progress = Arc::new(IndexingProgress::default());

    let (tx, rx) = bounded(2);
    let done = IndexMessage::Done {
        connector_name: "claude",
        scan_ms: 42,
        is_discovered: true,
    };
    tx.send(done).unwrap();
    drop(tx);

    let limiter = Arc::new(StreamingByteLimiter::new(STREAMING_MAX_BYTES_IN_FLIGHT));
    let progress_slot = Some(Arc::clone(&progress));
    let (discovered, mutations) = run_streaming_consumer(
        rx,
        1,
        &storage,
        &data_dir,
        Some(&mut index),
        limiter,
        &progress_slot,
        LexicalPopulationStrategy::IncrementalInline,
        None,
    )
    .unwrap();

    assert_eq!(discovered, vec!["claude".to_string()]);
    assert_eq!(mutations, CanonicalMutationCounts::default());

    // Tolerate a poisoned stats lock; only the recorded values matter here.
    let stats = progress
        .stats
        .lock()
        .unwrap_or_else(|poisoned| poisoned.into_inner());
    assert_eq!(stats.agents_discovered, vec!["claude".to_string()]);
    assert_eq!(stats.total_conversations, 0);
    assert_eq!(stats.total_messages, 0);
}
/// With `DeferredAuthoritativeDbRebuild` and no Tantivy writer, the consumer must
/// still persist the conversation and its two messages to SQLite and must not
/// create a live Tantivy index (`meta.json`) on disk.
#[test]
fn streaming_consumer_can_defer_authoritative_lexical_updates_without_tantivy_writer() {
    let tmp = TempDir::new().unwrap();
    let data_dir = tmp.path().join("data");
    std::fs::create_dir_all(&data_dir).unwrap();
    let db_path = data_dir.join("db.sqlite");
    let storage = FrankenStorage::open(&db_path).unwrap();
    ensure_fts_schema(&storage);
    let progress = Arc::new(IndexingProgress::default());
    let (tx, rx) = bounded(STREAMING_CHANNEL_SIZE);
    send_conversation_batches(
        &tx,
        "codex",
        vec![norm_conv(
            Some("stream-deferred"),
            vec![
                norm_msg(0, 1_700_000_000_000),
                norm_msg(1, 1_700_000_000_100),
            ],
        )],
        true,
    );
    send_done(&tx, "codex", true);
    drop(tx);
    let (discovered, mutations) = run_streaming_consumer(
        rx,
        1,
        &storage,
        &data_dir,
        // No Tantivy writer is supplied on purpose.
        None,
        Arc::new(StreamingByteLimiter::new(STREAMING_MAX_BYTES_IN_FLIGHT)),
        &Some(progress.clone()),
        LexicalPopulationStrategy::DeferredAuthoritativeDbRebuild,
        Some(FrankenStorage::now_millis()),
    )
    .expect("deferred streaming ingest should not require a Tantivy writer");
    let conversation_count: i64 = storage
        .raw()
        .query_row_map("SELECT COUNT(*) FROM conversations", &[], |row| {
            row.get_typed(0)
        })
        .unwrap();
    let message_count: i64 = storage
        .raw()
        .query_row_map("SELECT COUNT(*) FROM messages", &[], |row| row.get_typed(0))
        .unwrap();
    assert_eq!(discovered, vec!["codex".to_string()]);
    // Sibling tests read `lexical_update_deferred` and `quarantined_conversations`
    // from this same return value, so the literal must fill the remaining fields.
    // NOTE(review): assumes those fields stay at their defaults on this path — confirm.
    assert_eq!(
        mutations,
        CanonicalMutationCounts {
            inserted_conversations: 1,
            inserted_messages: 2,
            ..CanonicalMutationCounts::default()
        }
    );
    assert_eq!(conversation_count, 1);
    assert_eq!(message_count, 2);
    assert!(
        !index_dir(&data_dir).unwrap().join("meta.json").exists(),
        "deferred streaming ingest should not materialize a live Tantivy index before the authoritative rebuild"
    );
}
/// With `CASS_TEST_INCREMENTAL_LEXICAL_UPDATE_OOM=1`, the consumer must keep the
/// conversation it already wrote to SQLite, flag the lexical update as deferred,
/// and write no quarantine record.
#[test]
#[serial]
fn streaming_consumer_defers_lexical_oom_without_quarantining_persisted_conversation() {
    let _oom_guard = set_env("CASS_TEST_INCREMENTAL_LEXICAL_UPDATE_OOM", "1");
    let tmp = TempDir::new().unwrap();
    let data_dir = tmp.path().join("data");
    std::fs::create_dir_all(&data_dir).unwrap();
    let db_path = data_dir.join("db.sqlite");
    let storage = FrankenStorage::open(&db_path).unwrap();
    ensure_fts_schema(&storage);
    let lexical_dir = index_dir(&data_dir).unwrap();
    let mut index = TantivyIndex::open_or_create(&lexical_dir).unwrap();
    let progress = Arc::new(IndexingProgress::default());

    let (tx, rx) = bounded(STREAMING_CHANNEL_SIZE);
    let conversation = norm_conv(
        Some("lexical-oom-deferred"),
        vec![norm_msg(0, 1_700_000_000_000)],
    );
    send_conversation_batches(&tx, "codex", vec![conversation], true);
    send_done(&tx, "codex", true);
    drop(tx);

    let limiter = Arc::new(StreamingByteLimiter::new(STREAMING_MAX_BYTES_IN_FLIGHT));
    let progress_slot = Some(Arc::clone(&progress));
    let (_discovered, outcome) = run_streaming_consumer(
        rx,
        1,
        &storage,
        &data_dir,
        Some(&mut index),
        limiter,
        &progress_slot,
        LexicalPopulationStrategy::IncrementalInline,
        Some(FrankenStorage::now_millis()),
    )
    .expect("lexical OOM after SQLite ingest should defer repair, not fail the scan");

    assert_eq!(outcome.inserted_conversations, 1);
    assert_eq!(outcome.inserted_messages, 1);
    assert!(outcome.lexical_update_deferred);
    assert_eq!(outcome.quarantined_conversations, 0);

    let conversation_rows: i64 = storage
        .raw()
        .query_row_map("SELECT COUNT(*) FROM conversations", &[], |row| {
            row.get_typed(0)
        })
        .unwrap();
    assert_eq!(conversation_rows, 1);

    let poison_file = data_dir.join("quarantine/index_ingest_poison.jsonl");
    assert!(
        !poison_file.exists(),
        "lexical-only OOM must not quarantine a conversation already preserved in SQLite"
    );

    let stats = progress
        .stats
        .lock()
        .unwrap_or_else(|poisoned| poisoned.into_inner());
    assert!(stats.lexical_update_deferred);
    assert_eq!(stats.quarantined_conversations, 0);
}
/// Ingests the same poison conversation twice with
/// `CASS_TEST_NON_WATCH_INGEST_OOM_MIN_CONVS=1`. Each run must quarantine it and
/// keep exactly one JSONL record whose `attempt_count` goes 1, then 2; the
/// structured quarantine state and the summary must agree.
#[test]
#[serial]
fn streaming_consumer_quarantines_single_non_watch_oom_and_dedupes_record() {
    let _oom_guard = set_env("CASS_TEST_NON_WATCH_INGEST_OOM_MIN_CONVS", "1");
    let tmp = TempDir::new().unwrap();
    let data_dir = tmp.path().join("data");
    std::fs::create_dir_all(&data_dir).unwrap();
    let db_path = data_dir.join("db.sqlite");
    let storage = FrankenStorage::open(&db_path).unwrap();
    ensure_fts_schema(&storage);
    let progress = Arc::new(IndexingProgress::default());
    let poison = norm_conv(Some("poison-single"), vec![norm_msg(0, 1_700_000_000_000)]);

    for expected_attempts in [1_u64, 2] {
        let lexical_dir = index_dir(&data_dir).unwrap();
        let mut index = TantivyIndex::open_or_create(&lexical_dir).unwrap();
        let (tx, rx) = bounded(STREAMING_CHANNEL_SIZE);
        send_conversation_batches(&tx, "codex", vec![poison.clone()], true);
        send_done(&tx, "codex", true);
        drop(tx);

        let limiter = Arc::new(StreamingByteLimiter::new(STREAMING_MAX_BYTES_IN_FLIGHT));
        let progress_slot = Some(Arc::clone(&progress));
        let (_discovered, outcome) = run_streaming_consumer(
            rx,
            1,
            &storage,
            &data_dir,
            Some(&mut index),
            limiter,
            &progress_slot,
            LexicalPopulationStrategy::IncrementalInline,
            Some(FrankenStorage::now_millis()),
        )
        .expect("single deferred ingest OOM should quarantine and continue");

        assert_eq!(outcome.inserted_conversations, 0);
        assert_eq!(outcome.inserted_messages, 0);
        assert!(outcome.lexical_update_deferred);
        assert_eq!(outcome.quarantined_conversations, 1);

        // JSONL surface: one non-blank line, rewritten rather than appended.
        let quarantine_path = data_dir.join("quarantine/index_ingest_poison.jsonl");
        let contents = std::fs::read_to_string(&quarantine_path).unwrap();
        let records: Vec<&str> = contents
            .lines()
            .filter(|line| !line.trim().is_empty())
            .collect();
        assert_eq!(
            records.len(),
            1,
            "poison record must be upserted, not appended"
        );
        let record: serde_json::Value = serde_json::from_str(records[0]).unwrap();
        assert_eq!(record["reason"], "index-ingest-out-of-memory");
        assert_eq!(
            record["attempt_count"],
            serde_json::json!(expected_attempts)
        );
        assert_eq!(record["external_id"], serde_json::json!("poison-single"));

        // Structured surface: a single entry with the same attempt count.
        let structured_state = QuarantineState::load(&data_dir);
        assert_eq!(structured_state.len(), 1);
        let structured_attempt_count = structured_state
            .iter()
            .next()
            .map(|(_, entry)| entry.attempt_count);
        assert_eq!(structured_attempt_count, Some(expected_attempts));

        let summary = conversation_ingest_quarantine_summary(&data_dir);
        assert_eq!(summary.quarantined_conversations, 1);
        assert_eq!(summary.status, "degraded");
    }
}
/// Writes two JSONL poison records: one with no `cass_version_at_quarantine`
/// (legacy) and one stamped with the current version. Only the legacy record
/// should be reported as stale, with its previous version shown as "unknown".
#[test]
fn stale_index_ingest_quarantine_version_retry_detects_legacy_records() -> Result<()> {
    let tmp = TempDir::new()?;
    let data_dir = tmp.path().join("data");
    let quarantine_dir = data_dir.join("quarantine");
    std::fs::create_dir_all(&quarantine_dir)?;
    let legacy = norm_conv(
        Some("legacy-index-poison"),
        vec![norm_msg(0, 1_700_000_000_000)],
    );
    let current = norm_conv(
        Some("current-index-poison"),
        vec![norm_msg(0, 1_700_000_000_001)],
    );
    let legacy_id = poison_conversation_id(&legacy);
    let current_id = poison_conversation_id(&current);
    std::fs::write(
        quarantine_dir.join(INDEX_INGEST_POISON_FILE),
        format!(
            "{}\n{}\n",
            // Legacy record: deliberately has no cass version field.
            serde_json::json!({
                "conversation_id": legacy_id,
                "schema_version_at_quarantine": crate::storage::sqlite::CURRENT_SCHEMA_VERSION,
                "reason": "index-ingest-out-of-memory"
            }),
            // Current record: stamped with the running cass version.
            serde_json::json!({
                "conversation_id": current_id,
                "schema_version_at_quarantine": crate::storage::sqlite::CURRENT_SCHEMA_VERSION,
                "cass_version_at_quarantine": current_cass_version(),
                "reason": "index-ingest-out-of-memory"
            })
        ),
    )
    .context("write index-ingest poison fixture")?;
    let retry = stale_index_ingest_quarantine_version_retry(&data_dir)
        .context("load index-ingest poison retry state")?
        .context("legacy missing-version record should request one retry")?;
    assert_eq!(retry.stale_records, 1);
    assert_eq!(retry.legacy_records, 1);
    assert_eq!(retry.previous_versions, vec!["unknown".to_string()]);
    Ok(())
}
/// A structured quarantine record whose `cass_version_at_quarantine` has been
/// cleared must be reported as one stale, legacy record with version "unknown".
#[test]
fn stale_index_ingest_quarantine_version_retry_detects_structured_legacy_records() -> Result<()>
{
    let tmp = TempDir::new()?;
    let data_dir = tmp.path().join("data");
    std::fs::create_dir_all(&data_dir)?;

    let schema_version = u32::try_from(crate::storage::sqlite::CURRENT_SCHEMA_VERSION)
        .context("current schema version fits in quarantine key")?;
    let key = QuarantineKey::new("structured-legacy-index-poison", schema_version);
    let quarantined_at = chrono::DateTime::<chrono::Utc>::from_timestamp(1_700_000_000, 0)
        .context("valid quarantine timestamp")?;

    let mut state = QuarantineState::default();
    state.record_attempt(
        &key,
        "index-ingest-out-of-memory: out of memory",
        quarantined_at,
    );
    // Clear the version stamp so the record looks like a legacy one.
    let entry = state
        .entries
        .values_mut()
        .next()
        .context("structured quarantine record exists")?;
    entry.cass_version_at_quarantine = None;
    state.save(&data_dir)?;

    let retry = stale_index_ingest_quarantine_version_retry(&data_dir)
        .context("load structured index-ingest poison retry state")?
        .context("structured legacy record should request one retry")?;
    assert_eq!(retry.stale_records, 1);
    assert_eq!(retry.legacy_records, 1);
    assert_eq!(retry.previous_versions, vec!["unknown".to_string()]);
    Ok(())
}
/// Seeds the same conversation in both quarantine surfaces (JSONL stamped
/// "0.0.0", structured record with its version cleared). After marking the retry
/// as attempted, no further retry may be requested, both records must remain,
/// and both must carry the current cass version.
#[test]
fn marking_stale_index_ingest_quarantine_retry_attempted_prevents_repeat() -> Result<()> {
    let tmp = TempDir::new()?;
    let data_dir = tmp.path().join("data");
    let quarantine_dir = data_dir.join("quarantine");
    std::fs::create_dir_all(&quarantine_dir)?;

    let conv = norm_conv(
        Some("stale-index-poison-missing-source"),
        vec![norm_msg(0, 1_700_000_000_000)],
    );
    let conversation_id = poison_conversation_id(&conv);

    // JSONL surface.
    let jsonl_path = quarantine_dir.join(INDEX_INGEST_POISON_FILE);
    let jsonl_fixture = serde_json::json!({
        "conversation_id": conversation_id,
        "schema_version_at_quarantine": crate::storage::sqlite::CURRENT_SCHEMA_VERSION,
        "reason": "index-ingest-out-of-memory",
        "cass_version_at_quarantine": "0.0.0"
    });
    std::fs::write(&jsonl_path, format!("{}\n", jsonl_fixture))
        .context("write stale index-ingest poison fixture")?;

    // Structured surface, with the version stamp removed.
    let schema_version = u32::try_from(crate::storage::sqlite::CURRENT_SCHEMA_VERSION)
        .context("current schema version fits in quarantine key")?;
    let key = QuarantineKey::new(conversation_id, schema_version);
    let quarantined_at = chrono::DateTime::<chrono::Utc>::from_timestamp(1_700_000_000, 0)
        .context("valid quarantine timestamp")?;
    let mut state = QuarantineState::default();
    state.record_attempt(
        &key,
        "index-ingest-out-of-memory: out of memory",
        quarantined_at,
    );
    let entry = state
        .entries
        .values_mut()
        .next()
        .context("structured quarantine record exists")?;
    entry.cass_version_at_quarantine = None;
    state.save(&data_dir)?;

    let retry = stale_index_ingest_quarantine_version_retry(&data_dir)
        .context("load index-ingest poison retry state")?
        .context("stale records should request a retry before marking")?;
    assert_eq!(retry.stale_records, 1);
    assert_eq!(retry.legacy_records, 1);

    let marked = mark_stale_index_ingest_quarantine_retry_attempted(&data_dir)
        .context("mark stale retry attempted")?;
    assert_eq!(
        marked, 2,
        "both JSONL and structured quarantine surfaces should be marked"
    );
    let retry_after_mark = stale_index_ingest_quarantine_version_retry(&data_dir)?;
    assert!(
        retry_after_mark.is_none(),
        "marking an attempted retry must prevent repeat full scans"
    );
    assert!(
        jsonl_path.exists(),
        "marking retry attempted must rewrite the quarantine file in place, not delete it"
    );

    let jsonl = std::fs::read_to_string(&jsonl_path)?;
    let first_line = jsonl
        .lines()
        .next()
        .context("rewritten JSONL record remains")?;
    let record: serde_json::Value = serde_json::from_str(first_line)?;
    assert_eq!(
        poison_record_cass_version(&record),
        Some(current_cass_version())
    );

    let structured_state = QuarantineState::load(&data_dir);
    assert_eq!(structured_state.len(), 1);
    let structured_record = structured_state
        .entries
        .values()
        .next()
        .context("structured quarantine record remains")?;
    assert_eq!(
        structured_record.cass_version_at_quarantine.as_deref(),
        Some(current_cass_version())
    );
    Ok(())
}
/// After a successful ingest of a quarantined conversation, the matching
/// record must be gone from both the JSONL poison file and the structured
/// quarantine state, while the JSONL file itself stays on disk.
#[test]
fn successful_ingest_clears_matching_index_poison_records_without_deleting_file() -> Result<()>
{
    let tmp = TempDir::new()?;
    let data_dir = tmp.path().join("data");
    let quarantine_dir = data_dir.join("quarantine");
    std::fs::create_dir_all(&quarantine_dir)?;
    let conv = norm_conv(
        Some("stale-index-poison-cleared"),
        vec![norm_msg(0, 1_700_000_000_000)],
    );
    let conversation_id = poison_conversation_id(&conv);

    // Seed the JSONL file with a single record for this conversation.
    let jsonl_path = quarantine_dir.join(INDEX_INGEST_POISON_FILE);
    std::fs::write(
        &jsonl_path,
        format!(
            "{}\n",
            serde_json::json!({
                "conversation_id": conversation_id,
                "schema_version_at_quarantine": crate::storage::sqlite::CURRENT_SCHEMA_VERSION,
                "reason": "index-ingest-out-of-memory",
                "cass_version_at_quarantine": "0.0.0"
            })
        ),
    )
    .context("write index-ingest poison fixture")?;

    // Seed the structured quarantine state with an attempt for the same id.
    let schema_version = u32::try_from(crate::storage::sqlite::CURRENT_SCHEMA_VERSION)
        .context("current schema version fits in quarantine key")?;
    let mut state = QuarantineState::default();
    state.record_attempt(
        // Last use of `conversation_id`, so it is moved instead of cloned.
        &QuarantineKey::new(conversation_id, schema_version),
        "index-ingest-out-of-memory: out of memory",
        chrono::DateTime::<chrono::Utc>::from_timestamp(1_700_000_000, 0)
            .context("valid quarantine timestamp")?,
    );
    state.save(&data_dir)?;

    clear_poison_conversations_after_successful_ingest(
        &data_dir,
        INDEX_INGEST_POISON_FILE,
        "index-ingest-out-of-memory",
        &[conv],
    );

    assert!(
        jsonl_path.exists(),
        "cleanup must truncate the quarantine file in place, not delete it"
    );
    assert!(
        std::fs::read_to_string(&jsonl_path)?.trim().is_empty(),
        "successful retry should remove the matching JSONL poison record"
    );
    assert!(
        QuarantineState::load(&data_dir).is_empty(),
        "successful retry should clear the matching structured quarantine record"
    );
    Ok(())
}
/// With no flag set, the supplied timestamp comes back reduced by one
/// (1234 -> 1233); setting any single flag yields `None`.
#[test]
fn stale_index_ingest_quarantine_retry_forces_scan_from_start() {
    // Pin the timestamp argument so each case varies only the three flags.
    // NOTE(review): the third flag is presumably the stale-quarantine retry
    // switch named by this test; confirm against the helper's signature.
    let since_for = |first: bool, second: bool, third: bool| {
        non_watch_scan_since_ts(first, second, third, Some(1234))
    };

    assert_eq!(since_for(false, false, false), Some(1233));
    assert_eq!(since_for(false, false, true), None);
    assert_eq!(since_for(true, false, false), None);
    assert_eq!(since_for(false, true, false), None);
}
/// Feeds startup batches from two connectors ("amp" and "opencode") through
/// the streaming consumer, then checks the stored row counts, the reported
/// mutation counts, and the `wal_autocheckpoint` pragma value.
#[test]
#[serial]
fn streaming_consumer_handles_mixed_startup_batches_with_watch_checkpoint_policy() {
    let _wal_guard = set_env("CASS_INDEX_WRITER_WAL_AUTOCHECKPOINT_PAGES", "-1");
    let scratch = TempDir::new().unwrap();
    let data_dir = scratch.path().join("data");
    std::fs::create_dir_all(&data_dir).unwrap();
    let storage = FrankenStorage::open(&data_dir.join("db.sqlite")).unwrap();
    ensure_fts_schema(&storage);
    persist::apply_index_writer_busy_timeout(&storage);
    persist::apply_index_writer_checkpoint_policy(&storage, false);

    let progress = Arc::new(IndexingProgress::default());
    let flow_limiter = Arc::new(StreamingByteLimiter::new(STREAMING_MAX_BYTES_IN_FLIGHT));
    let (tx, rx) = bounded(STREAMING_CHANNEL_SIZE);

    // Four amp conversations with six messages each.
    let amp_convs: Vec<_> = (0..4)
        .map(|conv_idx| {
            large_startup_conv("amp", "amp-startup", conv_idx, 6, 256, 1_700_000_000_000)
        })
        .collect();
    // Two opencode conversations with four messages each.
    let opencode_convs: Vec<_> = (0..2)
        .map(|conv_idx| {
            large_startup_conv(
                "opencode",
                "opencode-startup",
                conv_idx,
                4,
                256,
                1_700_100_000_000,
            )
        })
        .collect();
    let expected_conversations = (amp_convs.len() + opencode_convs.len()) as i64;
    let expected_messages = (4 * 6 + 2 * 4) as i64;

    send_conversation_batches(&tx, "amp", amp_convs, true);
    send_done(&tx, "amp", true);
    send_conversation_batches(&tx, "opencode", opencode_convs, true);
    send_done(&tx, "opencode", true);
    // Dropping the only sender lets the consumer observe the end of the stream.
    drop(tx);

    let (discovered, mutations) = run_streaming_consumer(
        rx,
        2,
        &storage,
        &data_dir,
        None,
        flow_limiter,
        &Some(progress.clone()),
        LexicalPopulationStrategy::DeferredAuthoritativeDbRebuild,
        Some(FrankenStorage::now_millis()),
    )
    .expect("mixed startup ingest should not violate foreign keys");

    assert!(
        discovered.iter().any(|name| name == "amp"),
        "amp should remain marked as discovered"
    );
    assert!(
        discovered.iter().any(|name| name == "opencode"),
        "opencode should remain marked as discovered"
    );

    // Every probe below is a single-row, single-integer query.
    let single_i64 = |sql: &str| -> i64 {
        storage
            .raw()
            .query_row_map(sql, &[], |row| row.get_typed(0))
            .unwrap()
    };
    let conversation_count = single_i64("SELECT COUNT(*) FROM conversations");
    let message_count = single_i64("SELECT COUNT(*) FROM messages");
    let wal_autocheckpoint = single_i64("PRAGMA wal_autocheckpoint;");

    assert_eq!(conversation_count, expected_conversations);
    assert_eq!(message_count, expected_messages);
    assert_eq!(
        mutations,
        CanonicalMutationCounts {
            inserted_conversations: expected_conversations as usize,
            inserted_messages: expected_messages as usize,
        }
    );
    assert_eq!(
        wal_autocheckpoint, 0,
        "startup watch ingest should defer WAL auto-checkpoints"
    );
}
/// Runs `ingest_batch` twice with opposite values of its final flag and checks
/// the `wal_autocheckpoint` pragma after each call: 0 after the first call,
/// 1000 after the second.
#[test]
#[serial]
fn ingest_batch_applies_checkpoint_policy_for_serial_writer_path() {
    let _guard = set_env("CASS_INDEX_WRITER_WAL_AUTOCHECKPOINT_PAGES", "-1");
    let scratch = TempDir::new().unwrap();
    let data_dir = scratch.path().join("data");
    std::fs::create_dir_all(&data_dir).unwrap();
    let storage = FrankenStorage::open(&data_dir.join("checkpoint-policy.db")).unwrap();
    ensure_fts_schema(&storage);
    let mut index = TantivyIndex::open_or_create(&index_dir(&data_dir).unwrap()).unwrap();
    persist::apply_index_writer_checkpoint_policy(&storage, false);

    // (external id, message timestamp, final `ingest_batch` flag, expected pragma value)
    let phases = [
        ("checkpoint-a", 1_000, true, 0),
        ("checkpoint-b", 2_000, false, 1000),
    ];
    for (external_id, created_at, last_flag, expected_pages) in phases {
        let batch = vec![norm_conv(Some(external_id), vec![norm_msg(0, created_at)])];
        ingest_batch(
            &storage,
            Some(&mut index),
            &data_dir,
            &batch,
            &None,
            LexicalPopulationStrategy::IncrementalInline,
            last_flag,
        )
        .unwrap();

        let rows = storage.raw().query("PRAGMA wal_autocheckpoint;").unwrap();
        assert_eq!(rows.len(), 1);
        assert_eq!(
            rows[0].get(0).unwrap(),
            &SqliteValue::Integer(expected_pages)
        );
    }
}
/// After the writer checkpoint policy is applied with `true`, restoring the
/// steady-state policy with `true` leaves `wal_autocheckpoint` at 1000, while
/// restoring with `false` leaves it at 0.
#[test]
fn restore_watch_steady_state_checkpoint_policy_only_reenables_autocheckpoint_for_live_watch() {
    let scratch = TempDir::new().unwrap();
    let data_dir = scratch.path().join("data");
    std::fs::create_dir_all(&data_dir).unwrap();
    let storage = FrankenStorage::open(&data_dir.join("watch-steady-state.db")).unwrap();
    ensure_fts_schema(&storage);

    // (flag passed to the restore helper, expected pragma value afterwards)
    for (restore_flag, expected_pages) in [(true, 1000), (false, 0)] {
        persist::apply_index_writer_checkpoint_policy(&storage, true);
        restore_watch_steady_state_checkpoint_policy(&storage, restore_flag);

        let rows = storage.raw().query("PRAGMA wal_autocheckpoint;").unwrap();
        assert_eq!(rows.len(), 1);
        assert_eq!(
            rows[0].get(0).unwrap(),
            &SqliteValue::Integer(expected_pages)
        );
    }
}
/// `prepare_storage_for_final_checkpoint` must move `wal_autocheckpoint` from
/// 0 back to 1000 and leave the storage reporting `Some(1000)` checkpoint pages.
#[test]
#[serial]
fn final_index_close_restores_checkpoint_policy_after_deferred_bulk_ingest() {
    let _guard = set_env("CASS_INDEX_WRITER_WAL_AUTOCHECKPOINT_PAGES", "-1");
    let scratch = TempDir::new().unwrap();
    let db_path = scratch.path().join("final-checkpoint-policy.db");
    let storage = FrankenStorage::open(&db_path).unwrap();
    ensure_fts_schema(&storage);
    let autocheckpoint_rows = || storage.raw().query("PRAGMA wal_autocheckpoint;").unwrap();

    // Precondition: the policy applied here leaves the pragma at 0.
    persist::apply_index_writer_checkpoint_policy(&storage, true);
    let before = autocheckpoint_rows();
    assert_eq!(before[0].get(0).unwrap(), &SqliteValue::Integer(0));

    prepare_storage_for_final_checkpoint(&storage, &db_path, "test index close");

    let after = autocheckpoint_rows();
    assert_eq!(
        after[0].get(0).unwrap(),
        &SqliteValue::Integer(1000),
        "final close should restore bounded auto-checkpoint policy after deferred bulk ingest"
    );
    assert_eq!(storage.index_writer_checkpoint_pages(), Some(1000));
}
/// Writes one row in WAL mode, closes the storage through
/// `close_storage_after_index`, then reopens the file and checks that a FULL
/// checkpoint reports 0 in its third result column.
#[test]
#[serial]
fn close_storage_after_index_checkpointing_close_does_not_leave_backfillable_wal_frames() {
    let _guard = set_env("CASS_INDEX_WRITER_WAL_AUTOCHECKPOINT_PAGES", "-1");
    let scratch = TempDir::new().unwrap();
    let db_path = scratch.path().join("final-close-checkpoints.db");

    let storage = FrankenStorage::open(&db_path).unwrap();
    storage.raw().execute("PRAGMA journal_mode = WAL;").unwrap();
    storage
        .raw()
        .execute("CREATE TABLE checkpoint_probe (x INTEGER);")
        .unwrap();
    persist::apply_index_writer_checkpoint_policy(&storage, true);
    storage
        .raw()
        .execute("INSERT INTO checkpoint_probe VALUES (42);")
        .unwrap();
    close_storage_after_index(storage, &db_path, "test index run").unwrap();

    // Reopen on a fresh raw connection to inspect what the close left behind.
    let reopened_path = db_path.to_string_lossy().into_owned();
    let conn = frankensqlite::Connection::open(reopened_path).unwrap();
    let checkpoint = conn.query("PRAGMA wal_checkpoint(FULL);").unwrap();
    assert_eq!(checkpoint.len(), 1);
    // NOTE(review): in SQLite the third column is the checkpointed-frame
    // count; presumably frankensqlite mirrors that layout -- confirm.
    assert_eq!(
        checkpoint[0].get(2).unwrap(),
        &SqliteValue::Integer(0),
        "normal index close should already have checkpointed deferred WAL frames"
    );
    conn.close().unwrap();
}
/// With combining enabled, three single-conversation batches from two
/// connectors must be stored in send order and counted per connector.
#[test]
#[serial]
fn flat_combine_preserves_order_and_counts() {
    use frankensqlite::compat::{ConnectionExt, RowExt};

    let _guard = set_env("CASS_STREAMING_CONSUMER_COMBINE", "1");
    let scratch = TempDir::new().unwrap();
    let data_dir = scratch.path().join("data");
    std::fs::create_dir_all(&data_dir).unwrap();
    let storage = FrankenStorage::open(&data_dir.join("combine.db")).unwrap();
    ensure_fts_schema(&storage);
    let mut index = TantivyIndex::open_or_create(&index_dir(&data_dir).unwrap()).unwrap();
    let progress = Arc::new(IndexingProgress::default());
    let flow_limiter = Arc::new(StreamingByteLimiter::new(STREAMING_MAX_BYTES_IN_FLIGHT));
    let (tx, rx) = bounded(STREAMING_CHANNEL_SIZE);

    // (connector, external id, message timestamp, final send flag), in send order.
    let batches = [
        ("codex", "combine-a", 1_700_000_000_000, true),
        ("claude", "combine-b", 1_700_000_000_100, true),
        ("codex", "combine-c", 1_700_000_000_200, false),
    ];
    for (connector, external_id, created_at, last_flag) in batches {
        send_conversation_batches(
            &tx,
            connector,
            vec![norm_conv(Some(external_id), vec![norm_msg(0, created_at)])],
            last_flag,
        );
    }
    send_done(&tx, "codex", true);
    send_done(&tx, "claude", true);
    drop(tx);

    let (_discovered, mutations) = run_streaming_consumer(
        rx,
        2,
        &storage,
        &data_dir,
        Some(&mut index),
        flow_limiter,
        &Some(progress.clone()),
        LexicalPopulationStrategy::IncrementalInline,
        None,
    )
    .unwrap();
    assert_eq!(mutations.inserted_conversations, 3);

    let conversation_count: i64 = storage
        .raw()
        .query_row_map("SELECT COUNT(*) FROM conversations", &[], |row| {
            row.get_typed(0)
        })
        .unwrap();
    assert_eq!(conversation_count, 3);

    // Row ids are assigned on insert, so ordering by id reflects insertion order.
    let external_ids: Vec<String> = storage
        .raw()
        .query_map_collect(
            "SELECT external_id FROM conversations WHERE external_id IS NOT NULL ORDER BY id",
            &[],
            |row| row.get_typed(0),
        )
        .unwrap();
    assert_eq!(
        external_ids,
        vec![
            "combine-a".to_string(),
            "combine-b".to_string(),
            "combine-c".to_string(),
        ],
    );

    // A connector with no stats entry counts as zero conversations.
    let stats = progress.stats.lock().unwrap_or_else(|e| e.into_inner());
    let conversations_for = |name: &str| {
        stats
            .connectors
            .iter()
            .find(|c| c.name == name)
            .map(|c| c.conversations)
            .unwrap_or(0)
    };
    let codex_convs = conversations_for("codex");
    let claude_convs = conversations_for("claude");
    assert_eq!(codex_convs, 2);
    assert_eq!(claude_convs, 1);
}
/// Runs the same six-conversation ingest once with combining off and once with
/// it on, and requires identical counts and per-row data from both runs.
#[test]
#[serial]
fn flat_combine_disabled_produces_identical_db_state_as_combine_enabled() {
    /// Ingests the fixture with `CASS_STREAMING_CONSUMER_COMBINE` set to
    /// `combine` and returns (conversation count, message count, rows by id).
    fn run_once(combine: &str) -> (i64, i64, Vec<(String, i64)>) {
        use frankensqlite::compat::{ConnectionExt, RowExt};

        let _guard = set_env("CASS_STREAMING_CONSUMER_COMBINE", combine);
        let scratch = TempDir::new().unwrap();
        let data_dir = scratch.path().join("data");
        std::fs::create_dir_all(&data_dir).unwrap();
        let storage = FrankenStorage::open(&data_dir.join("parity.db")).unwrap();
        ensure_fts_schema(&storage);
        let mut index = TantivyIndex::open_or_create(&index_dir(&data_dir).unwrap()).unwrap();
        let progress = Arc::new(IndexingProgress::default());
        let flow_limiter = Arc::new(StreamingByteLimiter::new(STREAMING_MAX_BYTES_IN_FLIGHT));
        let (tx, rx) = bounded(STREAMING_CHANNEL_SIZE);

        // Alternate connectors; only the first two sends pass `true` as the last flag.
        for i in 0..6 {
            let connector = match i % 2 {
                0 => "codex",
                _ => "claude",
            };
            let conversation = norm_conv(
                Some(&format!("parity-{i}")),
                vec![norm_msg(0, 1_700_000_000_000 + i as i64)],
            );
            send_conversation_batches(&tx, connector, vec![conversation], i < 2);
        }
        send_done(&tx, "codex", true);
        send_done(&tx, "claude", true);
        drop(tx);

        run_streaming_consumer(
            rx,
            2,
            &storage,
            &data_dir,
            Some(&mut index),
            flow_limiter,
            &Some(progress),
            LexicalPopulationStrategy::IncrementalInline,
            None,
        )
        .unwrap();

        let single_i64 = |sql: &str| -> i64 {
            storage
                .raw()
                .query_row_map(sql, &[], |row| row.get_typed(0))
                .unwrap()
        };
        let conv_count = single_i64("SELECT COUNT(*) FROM conversations");
        let msg_count = single_i64("SELECT COUNT(*) FROM messages");

        // NULL columns collapse to "" / 0 so both runs compare on plain values.
        let rows: Vec<(String, i64)> = storage
            .raw()
            .query_map_collect(
                "SELECT external_id, started_at FROM conversations ORDER BY id",
                &[],
                |row| {
                    let ext: Option<String> = row.get_typed(0)?;
                    let started: Option<i64> = row.get_typed(1)?;
                    Ok((ext.unwrap_or_default(), started.unwrap_or(0)))
                },
            )
            .unwrap();
        (conv_count, msg_count, rows)
    }

    let (off_convs, off_msgs, off_rows) = run_once("0");
    let (on_convs, on_msgs, on_rows) = run_once("1");
    assert_eq!(off_convs, on_convs, "conversation count must match");
    assert_eq!(off_msgs, on_msgs, "message count must match");
    assert_eq!(
        off_rows, on_rows,
        "per-conversation external_id + started_at must appear in the same order"
    );
}
/// With combining switched off, a single batch from one connector is still
/// ingested: one discovered connector and one inserted conversation.
#[test]
#[serial]
fn flat_combine_disabled_leaves_per_message_path_untouched() {
    let _guard = set_env("CASS_STREAMING_CONSUMER_COMBINE", "0");
    let scratch = TempDir::new().unwrap();
    let data_dir = scratch.path().join("data");
    std::fs::create_dir_all(&data_dir).unwrap();
    let storage = FrankenStorage::open(&data_dir.join("disabled.db")).unwrap();
    ensure_fts_schema(&storage);
    let mut index = TantivyIndex::open_or_create(&index_dir(&data_dir).unwrap()).unwrap();
    let progress = Arc::new(IndexingProgress::default());
    let flow_limiter = Arc::new(StreamingByteLimiter::new(STREAMING_MAX_BYTES_IN_FLIGHT));
    let (tx, rx) = bounded(STREAMING_CHANNEL_SIZE);

    let only_conversation = norm_conv(Some("single"), vec![norm_msg(0, 1_700_000_000_000)]);
    send_conversation_batches(&tx, "codex", vec![only_conversation], true);
    send_done(&tx, "codex", true);
    drop(tx);

    let (discovered, mutations) = run_streaming_consumer(
        rx,
        1,
        &storage,
        &data_dir,
        Some(&mut index),
        flow_limiter,
        &Some(progress.clone()),
        LexicalPopulationStrategy::IncrementalInline,
        None,
    )
    .unwrap();

    assert_eq!(discovered, vec![String::from("codex")]);
    assert_eq!(mutations.inserted_conversations, 1);
}
/// With combining enabled and a single producer, three one-conversation
/// batches must all be inserted.
#[test]
#[serial]
fn flat_combine_single_producer_skips_drain() {
    let _guard = set_env("CASS_STREAMING_CONSUMER_COMBINE", "1");
    let scratch = TempDir::new().unwrap();
    let data_dir = scratch.path().join("data");
    std::fs::create_dir_all(&data_dir).unwrap();
    let storage = FrankenStorage::open(&data_dir.join("single-prod.db")).unwrap();
    ensure_fts_schema(&storage);
    let mut index = TantivyIndex::open_or_create(&index_dir(&data_dir).unwrap()).unwrap();
    let progress = Arc::new(IndexingProgress::default());
    let flow_limiter = Arc::new(StreamingByteLimiter::new(STREAMING_MAX_BYTES_IN_FLIGHT));
    let (tx, rx) = bounded(STREAMING_CHANNEL_SIZE);

    // Only the first send passes `true` as the final flag.
    for i in 0..3 {
        let conversation = norm_conv(
            Some(&format!("solo-{i}")),
            vec![norm_msg(0, 1_700_000_000_000 + i)],
        );
        send_conversation_batches(&tx, "codex", vec![conversation], i == 0);
    }
    send_done(&tx, "codex", true);
    drop(tx);

    let (_discovered, mutations) = run_streaming_consumer(
        rx,
        1,
        &storage,
        &data_dir,
        Some(&mut index),
        flow_limiter,
        &Some(progress.clone()),
        LexicalPopulationStrategy::IncrementalInline,
        None,
    )
    .unwrap();

    assert_eq!(mutations.inserted_conversations, 3);
}
/// `streaming_combine_enabled` must default to enabled when the variable is
/// unset, honor the listed truthy and falsy spellings, and treat any other
/// value as enabled.
///
/// The variable's pre-test value is restored by a drop guard, so a failing
/// assertion cannot leak a mutated environment into later tests.
#[test]
#[serial]
fn streaming_combine_env_respects_truthy_parsing() {
    const COMBINE_ENV: &str = "CASS_STREAMING_CONSUMER_COMBINE";

    /// Puts the variable back to its captured value (or removes it) on drop,
    /// including while unwinding from a failed assertion.
    struct RestoreEnv(Option<String>);
    impl Drop for RestoreEnv {
        fn drop(&mut self) {
            // SAFETY: environment mutation is only sound while no other thread
            // touches the environment; `#[serial]` serializes this test against
            // the other `#[serial]` tests.
            // NOTE(review): confirm no non-serial test reads this variable.
            unsafe {
                match self.0.take() {
                    Some(previous) => std::env::set_var(COMBINE_ENV, previous),
                    None => std::env::remove_var(COMBINE_ENV),
                }
            }
        }
    }
    let _restore = RestoreEnv(std::env::var(COMBINE_ENV).ok());

    let set_combine = |value: &str| {
        // SAFETY: same single-writer argument as in `RestoreEnv::drop`.
        unsafe {
            std::env::set_var(COMBINE_ENV, value);
        }
    };

    // SAFETY: same single-writer argument as in `RestoreEnv::drop`.
    unsafe {
        std::env::remove_var(COMBINE_ENV);
    }
    assert!(
        streaming_combine_enabled(),
        "unset env must default to enabled"
    );

    for truthy in ["1", "true", "yes", "on", "TRUE", "Yes"] {
        set_combine(truthy);
        assert!(
            streaming_combine_enabled(),
            "expected `{truthy}` to enable combine"
        );
    }
    for falsy in ["0", "false", "no", "off", "OFF"] {
        set_combine(falsy);
        assert!(
            !streaming_combine_enabled(),
            "expected `{falsy}` to disable combine"
        );
    }
    for pass_through in ["", "maybe", "idk"] {
        set_combine(pass_through);
        assert!(
            streaming_combine_enabled(),
            "non-off values fall through to default-enabled; `{pass_through}` did not"
        );
    }
}
/// `streaming_combine_max_messages` must fall back to 64 for "0" and for an
/// unset variable, cap "1000000" at 1024, and pass "16" through unchanged.
///
/// The variable's pre-test value is restored by a drop guard, so a failing
/// assertion cannot leak a mutated environment into later tests.
#[test]
#[serial]
fn streaming_combine_max_messages_clamps_to_valid_range() {
    const MAX_ENV: &str = "CASS_STREAMING_COMBINE_MAX";

    /// Puts the variable back to its captured value (or removes it) on drop,
    /// including while unwinding from a failed assertion.
    struct RestoreEnv(Option<String>);
    impl Drop for RestoreEnv {
        fn drop(&mut self) {
            // SAFETY: environment mutation is only sound while no other thread
            // touches the environment; `#[serial]` serializes this test against
            // the other `#[serial]` tests.
            // NOTE(review): confirm no non-serial test reads this variable.
            unsafe {
                match self.0.take() {
                    Some(previous) => std::env::set_var(MAX_ENV, previous),
                    None => std::env::remove_var(MAX_ENV),
                }
            }
        }
    }
    let _restore = RestoreEnv(std::env::var(MAX_ENV).ok());

    let set_max = |value: &str| {
        // SAFETY: same single-writer argument as in `RestoreEnv::drop`.
        unsafe {
            std::env::set_var(MAX_ENV, value);
        }
    };

    set_max("0");
    assert_eq!(
        streaming_combine_max_messages(),
        64,
        "zero must not be honored; default"
    );

    set_max("1000000");
    assert_eq!(
        streaming_combine_max_messages(),
        1024,
        "upper clamp is 1024"
    );

    set_max("16");
    assert_eq!(streaming_combine_max_messages(), 16);

    // SAFETY: same single-writer argument as in `RestoreEnv::drop`.
    unsafe {
        std::env::remove_var(MAX_ENV);
    }
    assert_eq!(streaming_combine_max_messages(), 64);
}
/// A producer whose remote scan fails must still be reported as discovered,
/// produce no mutations, and record the failure text in its connector stats.
#[test]
fn streaming_producer_records_remote_scan_errors_in_connector_stats() {
    let tmp = TempDir::new().unwrap();
    let data_dir = tmp.path().join("data");
    std::fs::create_dir_all(&data_dir).unwrap();
    let db_path = data_dir.join("db.sqlite");
    let storage = FrankenStorage::open(&db_path).unwrap();
    ensure_fts_schema(&storage);
    let mut index = TantivyIndex::open_or_create(&index_dir(&data_dir).unwrap()).unwrap();
    let progress = Arc::new(IndexingProgress::default());
    let (tx, rx) = bounded(STREAMING_CHANNEL_SIZE);
    let flow_limiter = Arc::new(StreamingByteLimiter::new(STREAMING_MAX_BYTES_IN_FLIGHT));

    let handle = spawn_connector_producer(
        "claude",
        detected_remote_failure_connector_factory,
        tx,
        StreamingProducerConfig {
            flow_limiter: flow_limiter.clone(),
            data_dir: data_dir.clone(),
            // The root path is used only here, so it is built in place rather
            // than cloned from a single-use local.
            additional_scan_roots: vec![ScanRoot::remote(
                PathBuf::from("/remote/fixture/claude"),
                Origin::remote("fixture-host"),
                Some(crate::sources::config::Platform::Linux),
            )],
            since_ts: None,
            progress: Some(progress.clone()),
            active_source_filter: Arc::new(ActiveSessionSourceFilter::default()),
        },
    );

    let (discovered, mutations) = run_streaming_consumer(
        rx,
        1,
        &storage,
        &data_dir,
        Some(&mut index),
        flow_limiter,
        &Some(progress.clone()),
        LexicalPopulationStrategy::IncrementalInline,
        None,
    )
    .unwrap();
    handle.join().unwrap();

    assert_eq!(discovered, vec!["claude".to_string()]);
    assert_eq!(mutations, CanonicalMutationCounts::default());

    let stats = progress.stats.lock().unwrap_or_else(|e| e.into_inner());
    let connector = stats
        .connectors
        .iter()
        .find(|connector| connector.name == "claude")
        .expect("claude connector stats should exist");
    assert_eq!(
        connector.error.as_deref(),
        Some("remote scan failed for /remote/fixture/claude: remote exploded")
    );
}
/// A panicking producer must turn into an error from the streaming index run,
/// and that same error text must be stored in the progress tracker.
#[test]
fn streaming_index_fails_closed_when_producer_panics() {
    let scratch = TempDir::new().unwrap();
    let data_dir = scratch.path().join("data");
    std::fs::create_dir_all(&data_dir).unwrap();
    let db_path = data_dir.join("db.sqlite");
    let storage = FrankenStorage::open(&db_path).unwrap();
    ensure_fts_schema(&storage);
    let mut index = TantivyIndex::open_or_create(&index_dir(&data_dir).unwrap()).unwrap();
    let progress = Arc::new(IndexingProgress::default());
    let options = IndexOptions {
        full: false,
        force_rebuild: false,
        watch: false,
        watch_once_paths: None,
        db_path,
        data_dir,
        semantic: false,
        build_hnsw: false,
        embedder: "fastembed".to_string(),
        progress: Some(progress.clone()),
        watch_interval_secs: 30,
    };

    let outcome = run_streaming_index_with_connector_factories(
        &storage,
        Some(&mut index),
        &options,
        None,
        LexicalPopulationStrategy::IncrementalInline,
        Vec::new(),
        vec![("claude", panic_connector_factory)],
        FrankenStorage::now_millis(),
    );
    let message = outcome
        .expect_err("producer panic should abort streaming indexing")
        .to_string();

    assert!(
        message.contains("streaming producer thread panicked"),
        "panic should surface in the returned error: {message}"
    );
    assert!(
        message.contains("claude: connector panic during local scan"),
        "returned error should name the failing connector and panic: {message}"
    );

    let recorded = progress
        .last_error
        .lock()
        .unwrap_or_else(|e| e.into_inner());
    assert_eq!(
        recorded.as_deref(),
        Some(message.as_str()),
        "progress tracker should expose the real panic instead of pretending indexing succeeded"
    );
}
/// A batch index run with the deferred lexical strategy and no Tantivy writer
/// must still store the fixture (1 conversation, 2 messages) and must not
/// create `meta.json` in the index directory.
#[test]
fn batch_index_can_defer_authoritative_lexical_updates_without_tantivy_writer() {
    let scratch = TempDir::new().unwrap();
    let data_dir = scratch.path().join("data");
    std::fs::create_dir_all(&data_dir).unwrap();
    let db_path = data_dir.join("db.sqlite");
    let storage = FrankenStorage::open(&db_path).unwrap();
    ensure_fts_schema(&storage);
    let progress = Arc::new(IndexingProgress::default());
    let options = IndexOptions {
        full: true,
        force_rebuild: false,
        watch: false,
        watch_once_paths: None,
        db_path,
        data_dir: data_dir.clone(),
        semantic: false,
        build_hnsw: false,
        embedder: "fastembed".to_string(),
        progress: Some(progress.clone()),
        watch_interval_secs: 30,
    };

    // `None` in place of the index handle: no Tantivy writer is supplied.
    let mutations = run_batch_index_with_connector_factories(
        &storage,
        None,
        &options,
        None,
        LexicalPopulationStrategy::DeferredAuthoritativeDbRebuild,
        Vec::new(),
        vec![("codex", deferred_batch_connector_factory)],
        FrankenStorage::now_millis(),
    )
    .expect("deferred batch ingest should not require a Tantivy writer");

    let single_i64 = |sql: &str| -> i64 {
        storage
            .raw()
            .query_row_map(sql, &[], |row| row.get_typed(0))
            .unwrap()
    };
    let conversation_count = single_i64("SELECT COUNT(*) FROM conversations");
    let message_count = single_i64("SELECT COUNT(*) FROM messages");
    let stats = progress.stats.lock().unwrap_or_else(|e| e.into_inner());

    assert_eq!(
        mutations,
        CanonicalMutationCounts {
            inserted_conversations: 1,
            inserted_messages: 2,
        }
    );
    assert_eq!(conversation_count, 1);
    assert_eq!(message_count, 2);
    assert_eq!(stats.total_conversations, 1);
    assert_eq!(stats.total_messages, 2);

    let tantivy_meta = index_dir(&data_dir).unwrap().join("meta.json");
    assert!(
        !tantivy_meta.exists(),
        "deferred batch ingest should not materialize a live Tantivy index before the authoritative rebuild"
    );
}
/// When the receiving end is already gone, the producer thread must exit
/// cleanly and the shared counter must read exactly 1.
///
/// The global counter slot is cleared by a drop guard, so a failing assertion
/// cannot leave it populated for later tests.
#[test]
fn streaming_producer_stops_after_consumer_disconnect() {
    /// Empties `DISCONNECT_TEST_COUNTER` on drop, including while unwinding
    /// from a failed assertion.
    struct ClearCounterSlot;
    impl Drop for ClearCounterSlot {
        fn drop(&mut self) {
            *DISCONNECT_TEST_COUNTER
                .lock()
                .unwrap_or_else(|e| e.into_inner()) = None;
        }
    }

    let tmp = TempDir::new().unwrap();
    let data_dir = tmp.path().join("data");
    std::fs::create_dir_all(&data_dir).unwrap();

    let counter = Arc::new(AtomicUsize::new(0));
    *DISCONNECT_TEST_COUNTER
        .lock()
        .unwrap_or_else(|e| e.into_inner()) = Some(counter.clone());
    let _clear_slot = ClearCounterSlot;

    // Drop the receiver up front so the producer's first send fails.
    let (tx, rx) = bounded(STREAMING_CHANNEL_SIZE);
    drop(rx);

    let handle = spawn_connector_producer(
        "claude",
        disconnect_aware_connector_factory,
        tx,
        StreamingProducerConfig {
            flow_limiter: Arc::new(StreamingByteLimiter::new(STREAMING_MAX_BYTES_IN_FLIGHT)),
            data_dir,
            additional_scan_roots: vec![ScanRoot::remote(
                PathBuf::from("/remote/fixture/claude"),
                Origin::remote("fixture-host"),
                Some(crate::sources::config::Platform::Linux),
            )],
            since_ts: None,
            progress: None,
            active_source_filter: Arc::new(ActiveSessionSourceFilter::default()),
        },
    );
    handle
        .join()
        .expect("producer should stop cleanly after consumer disconnect");

    assert_eq!(
        counter.load(Ordering::Relaxed),
        1,
        "producer should stop after the first failed batch send instead of chewing through local and remote scans"
    );
}
/// `message_id_from_db` rejects negative ids and passes non-negative ones
/// through; `saturating_u32_from_i64` clamps into the `u32` range.
#[test]
fn db_id_conversion_helpers_handle_invalid_ranges() {
    // (raw database id, expected conversion)
    for (raw, expected) in [(-1, None), (0, Some(0)), (42, Some(42))] {
        assert_eq!(message_id_from_db(raw), expected);
    }

    // (input, expected clamped value): below zero, in range, above `u32::MAX`.
    let clamp_cases = [
        (-9, 0),
        (17, 17),
        (i64::from(u32::MAX) + 1234, u32::MAX),
    ];
    for (raw, expected) in clamp_cases {
        assert_eq!(saturating_u32_from_i64(raw), expected);
    }
}
/// Opening a database whose recorded schema version is one above the current
/// version must fail with a "newer than supported" diagnostic, leave the file
/// in place, and create no `future-schema.db.backup.*` sibling.
#[test]
fn open_storage_for_index_refuses_newer_schema_without_replacing_db() {
    let tmp = TempDir::new().unwrap();
    let db_path = tmp.path().join("future-schema.db");
    {
        // Scope the handle so it is closed before the open under test.
        let storage = FrankenStorage::open(&db_path).unwrap();
        let future_version = (crate::storage::sqlite::CURRENT_SCHEMA_VERSION + 1).to_string();
        storage
            .raw()
            .execute_compat(
                "INSERT OR REPLACE INTO meta(key, value) VALUES('schema_version', ?1)",
                &[ParamValue::from(future_version)],
            )
            .unwrap();
    }

    // `match` rather than `expect_err`: the success value need not be `Debug`.
    let err = match open_storage_for_index(&db_path, false) {
        Ok(_) => panic!("newer schema must fail closed before indexing"),
        Err(err) => err.to_string(),
    };
    assert!(
        err.contains("newer than supported"),
        "diagnostic should name the schema mismatch: {err}"
    );
    assert!(db_path.exists(), "canonical DB must remain in place");

    // Count siblings that look like a backup of the database file.
    let backup_count = std::fs::read_dir(tmp.path())
        .unwrap()
        .flatten()
        .filter(|entry| {
            entry
                .file_name()
                .to_str()
                .is_some_and(|name| name.starts_with("future-schema.db.backup."))
        })
        .count();
    // `assert_eq!` so a failure reports how many backups were found.
    assert_eq!(
        backup_count, 0,
        "index open must not backup-and-replace a future-schema archive"
    );
}
/// A "database is locked" cause must be classified as retryable even when it
/// sits underneath an added context layer.
#[test]
fn open_storage_retryable_classifier_reads_anyhow_error_chain() {
    let root_cause = anyhow::anyhow!("database is locked by another indexer");
    let err = root_cause.context("opening canonical archive read-only before index");

    let retryable = anyhow_chain_indicates_retryable_storage_contention(&err);
    assert!(
        retryable,
        "retryable storage contention can be hidden below contextual anyhow wrappers: {err:#}"
    );
}
/// A "disk image is malformed" cause must not be classified as retryable
/// contention, even with the same context layer on top.
#[test]
fn open_storage_retryable_classifier_rejects_corruption() {
    let root_cause = anyhow::anyhow!("database disk image is malformed");
    let err = root_cause.context("opening canonical archive read-only before index");

    let retryable = anyhow_chain_indicates_retryable_storage_contention(&err);
    assert!(
        !retryable,
        "corruption must stay on the fail-closed archive-health path: {err:#}"
    );
}
/// When neither the data dir nor its parent exists, the probe list must be
/// just the nearest existing ancestor (the temp dir root).
#[test]
fn headroom_probe_uses_existing_ancestor_for_missing_data_dir() {
    let scratch = TempDir::new().unwrap();
    let missing_data_dir = scratch.path().join("missing").join("cass-data");
    let missing_db_path = missing_data_dir.join("agent_search.db");

    let probed = existing_headroom_probe_paths(&missing_data_dir, &missing_db_path);

    assert_eq!(probed, vec![scratch.path().to_path_buf()]);
}
/// When the database lives outside the data dir, both existing directories
/// must be probed: the data dir first, then the database's parent.
#[test]
fn headroom_probe_checks_data_dir_and_custom_db_parent_when_distinct() {
    let scratch = TempDir::new().unwrap();
    let data_dir = scratch.path().join("cass-data");
    let db_parent = scratch.path().join("custom-db");
    for dir in [&data_dir, &db_parent] {
        std::fs::create_dir_all(dir).unwrap();
    }

    let db_path = db_parent.join("agent_search.db");
    let probed = existing_headroom_probe_paths(&data_dir, &db_path);

    assert_eq!(probed, vec![data_dir, db_parent]);
}
/// The two sidecar paths for `/tmp/cass.db` are the same path with `-wal` and
/// `-shm` appended, in that order.
#[test]
fn database_sidecar_paths_preserve_filename_bytes_without_display_roundtrip() {
    let expected_wal = PathBuf::from("/tmp/cass.db-wal");
    let expected_shm = PathBuf::from("/tmp/cass.db-shm");

    let [wal_path, shm_path] = database_sidecar_paths(&PathBuf::from("/tmp/cass.db"));

    assert_eq!(wal_path, expected_wal);
    assert_eq!(shm_path, expected_shm);
}
/// `env_value_truthy` must accept only the listed truthy spellings; explicit
/// falsy values, the empty string, and unknown words all count as false.
#[test]
fn disk_headroom_skip_env_value_parser_is_truthy_not_presence_based() {
    let accepted = ["1", "true", "YES", "on"];
    let rejected = ["0", "false", "No", "off", "", "maybe"];

    for &truthy in &accepted {
        assert!(
            env_value_truthy(truthy),
            "expected {truthy:?} to disable the headroom check"
        );
    }
    for &falsy in &rejected {
        assert!(
            !env_value_truthy(falsy),
            "expected {falsy:?} to keep the headroom check enabled"
        );
    }
}
/// A freshly opened database must pass the full-rebuild integrity preflight
/// with no problem reported.
#[test]
fn full_rebuild_integrity_preflight_accepts_healthy_current_schema() {
    let scratch = TempDir::new().unwrap();
    let storage = FrankenStorage::open(&scratch.path().join("healthy-current-schema.db")).unwrap();

    let problem = full_rebuild_existing_storage_integrity_problem(&storage).unwrap();

    assert_eq!(problem.as_deref(), None);
}
/// Dropping the `messages` table must make the integrity preflight report a
/// problem whose text mentions that table.
#[test]
fn full_rebuild_integrity_preflight_flags_missing_core_tables() {
    let scratch = TempDir::new().unwrap();
    let storage = FrankenStorage::open(&scratch.path().join("broken-current-schema.db")).unwrap();
    storage
        .raw()
        .execute("DROP TABLE messages")
        .expect("drop messages to simulate current-schema archive damage");

    let preflight = full_rebuild_existing_storage_integrity_problem(&storage).unwrap();
    let problem = preflight.expect("missing core table should trigger fresh full-rebuild archive");

    assert!(
        problem.contains("messages"),
        "diagnostic should identify the failing canary table: {problem}"
    );
}
/// A database created by the current code reports the current schema version,
/// and the fast probe returns `true` for it once the handle is dropped.
#[test]
fn current_schema_fast_probe_accepts_current_schema() {
    let scratch = TempDir::new().unwrap();
    let db_path = scratch.path().join("current-schema.db");
    {
        // Scoped so the handle is released before the probe opens the file.
        let storage = FrankenStorage::open(&db_path).unwrap();
        assert_eq!(
            storage.schema_version().unwrap(),
            crate::storage::sqlite::CURRENT_SCHEMA_VERSION
        );
    }

    let is_current = current_schema_fast_probe(&db_path).unwrap();
    assert!(is_current);
}
/// Once the `schema_version` meta row is overwritten with the current version
/// plus one, the fast probe must return `false`.
#[test]
fn current_schema_fast_probe_rejects_future_schema_marker() {
    let scratch = TempDir::new().unwrap();
    let db_path = scratch.path().join("future-marker.db");
    {
        // Scoped so the handle is released before the probe opens the file.
        let storage = FrankenStorage::open(&db_path).unwrap();
        let future_version = (crate::storage::sqlite::CURRENT_SCHEMA_VERSION + 1).to_string();
        storage
            .raw()
            .execute_compat(
                "INSERT OR REPLACE INTO meta(key, value) VALUES('schema_version', ?1)",
                &[ParamValue::from(future_version)],
            )
            .unwrap();
    }

    let is_current = current_schema_fast_probe(&db_path).unwrap();
    assert!(!is_current);
}
/// With `_schema_migrations` dropped, the fast probe still reports a current
/// schema; `open_storage_for_index` must then open without rebuilding and
/// leave `_schema_migrations` queryable again.
#[test]
fn open_storage_for_index_fast_current_schema_path_preserves_transition_state() {
    let scratch = TempDir::new().unwrap();
    let db_path = scratch.path().join("current-schema-transition.db");
    {
        // Scoped so the handle is released before the probe and reopen below.
        let seeded = FrankenStorage::open(&db_path).unwrap();
        seeded
            .raw()
            .execute("DROP TABLE _schema_migrations")
            .unwrap();
    }
    assert!(current_schema_fast_probe(&db_path).unwrap());

    let (storage, rebuilt, opened_fresh_for_full) =
        open_storage_for_index(&db_path, false).unwrap();
    assert!(!rebuilt);
    assert!(!opened_fresh_for_full);
    assert_eq!(
        storage.schema_version().unwrap(),
        crate::storage::sqlite::CURRENT_SCHEMA_VERSION
    );

    let migrations_probe = storage
        .raw()
        .query("SELECT version FROM _schema_migrations LIMIT 1;");
    assert!(migrations_probe.is_ok());
}
/// `reset_storage` must empty `messages`, `daily_stats`, `usage_daily` and
/// `fts_messages`, while a reopened handle still reports the current schema
/// version.
#[test]
fn reset_storage_clears_data_but_leaves_meta() {
    let scratch = TempDir::new().unwrap();
    let db_path = scratch.path().join("db.sqlite");
    let storage = FrankenStorage::open(&db_path).unwrap();
    ensure_fts_schema(&storage);

    // Every probe in this test is a single-row `COUNT(*)` with no parameters.
    let row_count = |db: &FrankenStorage, sql: &str| -> i64 {
        db.raw()
            .query_row_map(sql, &[] as &[ParamValue], |r| r.get_typed(0))
            .unwrap()
    };

    // Seed one agent and one single-message conversation.
    let agent = crate::model::types::Agent {
        id: None,
        slug: "tester".into(),
        name: "Tester".into(),
        version: None,
        kind: crate::model::types::AgentKind::Cli,
    };
    let agent_id = storage.ensure_agent(&agent).unwrap();
    let conv = norm_conv(Some("c1"), vec![norm_msg(0, 10)]);
    let messages = conv
        .messages
        .iter()
        .map(|m| crate::model::types::Message {
            id: None,
            idx: m.idx,
            role: crate::model::types::MessageRole::User,
            author: m.author.clone(),
            created_at: m.created_at,
            content: m.content.clone(),
            extra_json: m.extra.clone(),
            snippets: Vec::new(),
        })
        .collect();
    let conversation = crate::model::types::Conversation {
        id: None,
        agent_slug: conv.agent_slug.clone(),
        workspace: conv.workspace.clone(),
        external_id: conv.external_id.clone(),
        title: conv.title.clone(),
        source_path: conv.source_path.clone(),
        started_at: conv.started_at,
        ended_at: conv.ended_at,
        approx_tokens: None,
        metadata_json: conv.metadata.clone(),
        messages,
        source_id: "local".to_string(),
        origin_host: None,
    };
    storage
        .insert_conversation_tree(agent_id, None, &conversation)
        .unwrap();
    assert_eq!(row_count(&storage, "SELECT COUNT(*) FROM messages"), 1);

    // Seed one row in each materialized stats table.
    storage
        .raw()
        .execute_compat(
            "INSERT INTO daily_stats(day_id, agent_slug, source_id, session_count, message_count, total_chars, last_updated)
VALUES(?1, ?2, ?3, 1, 1, 10, ?4)",
            &[
                ParamValue::from(1_i64),
                ParamValue::from("tester"),
                ParamValue::from("local"),
                ParamValue::from(123_i64),
            ],
        )
        .unwrap();
    storage
        .raw()
        .execute_compat(
            "INSERT INTO usage_daily(day_id, agent_slug, workspace_id, source_id, message_count, last_updated)
VALUES(?1, ?2, ?3, ?4, 1, ?5)",
            &[
                ParamValue::from(1_i64),
                ParamValue::from("tester"),
                ParamValue::from(0_i64),
                ParamValue::from("local"),
                ParamValue::from(123_i64),
            ],
        )
        .unwrap();

    reset_storage(&storage).unwrap();

    // Inspect the result through a second handle on the same file.
    let reopened = FrankenStorage::open(&db_path).unwrap();
    let msg_count = row_count(&reopened, "SELECT COUNT(*) FROM messages");
    assert_eq!(msg_count, 0);
    let daily_count = row_count(&reopened, "SELECT COUNT(*) FROM daily_stats");
    assert_eq!(daily_count, 0);
    let usage_daily_count = row_count(&reopened, "SELECT COUNT(*) FROM usage_daily");
    assert_eq!(usage_daily_count, 0);
    let fts_count = row_count(&reopened, "SELECT COUNT(*) FROM fts_messages");
    assert_eq!(fts_count, 0, "reset should recreate an empty FTS table");
    assert_eq!(
        reopened.schema_version().unwrap(),
        crate::storage::sqlite::CURRENT_SCHEMA_VERSION
    );
}
/// Replaces `daily_stats` with a single bogus aggregate row and checks that
/// `repair_daily_stats_if_drifted` reports a rebuild and leaves zero drift.
#[test]
fn repair_daily_stats_if_drifted_rebuilds_materialized_totals() {
    use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};

    let tmp = TempDir::new().unwrap();
    let db_path = tmp.path().join("db.sqlite");
    let storage = FrankenStorage::open(&db_path).unwrap();
    storage.run_migrations().unwrap();

    let agent_id = storage
        .ensure_agent(&Agent {
            id: None,
            slug: "tester".into(),
            name: "Tester".into(),
            version: None,
            kind: AgentKind::Cli,
        })
        .unwrap();

    let only_message = Message {
        id: None,
        idx: 0,
        role: MessageRole::User,
        author: None,
        created_at: Some(1_700_000_000_000),
        content: "hello".into(),
        extra_json: serde_json::Value::Null,
        snippets: Vec::new(),
    };
    let fixture = Conversation {
        id: None,
        agent_slug: "tester".into(),
        workspace: Some(std::path::PathBuf::from("/tmp/workspace")),
        external_id: Some("daily-stats-repair".into()),
        title: Some("repair".into()),
        source_path: std::path::PathBuf::from("/tmp/repair.jsonl"),
        started_at: Some(1_700_000_000_000),
        ended_at: Some(1_700_000_000_100),
        approx_tokens: None,
        metadata_json: serde_json::Value::Null,
        messages: vec![only_message],
        source_id: "local".into(),
        origin_host: None,
    };
    storage
        .insert_conversations_batched(&[(agent_id, None, &fixture)])
        .unwrap();

    // Swap the materialized rows for one row whose totals cannot match the archive.
    let raw = storage.raw();
    raw.execute("DELETE FROM daily_stats").unwrap();
    raw.execute(
        "INSERT INTO daily_stats(day_id, agent_slug, source_id, session_count, message_count, total_chars, last_updated)
VALUES(0, 'all', 'all', 99, 99, 99, 0)",
    )
    .unwrap();

    let health_before = storage.daily_stats_health().unwrap();
    assert_eq!(health_before.materialized_total, 99);
    assert!(health_before.drift > 0);

    let outcome = repair_daily_stats_if_drifted(&storage, &db_path, None).unwrap();
    assert_eq!(
        outcome,
        DailyStatsRepairOutcome::Rebuilt {
            rows_created: 4,
            total_sessions: 1,
        }
    );

    let health_after = storage.daily_stats_health().unwrap();
    assert_eq!(health_after.conversation_count, 1);
    assert_eq!(health_after.materialized_total, 1);
    assert_eq!(health_after.drift, 0);
}
/// Same drift scenario as the plain repair test, but run with
/// `CASS_TEST_PACKET_DAILY_STATS_REBUILD_OOM=1` set; the repair must still
/// report a rebuild and end with zero drift.
#[test]
#[serial]
fn repair_daily_stats_if_drifted_falls_back_after_packet_rebuild_oom() {
    use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};

    let tmp = TempDir::new().unwrap();
    let db_path = tmp.path().join("db.sqlite");
    let storage = FrankenStorage::open(&db_path).unwrap();
    storage.run_migrations().unwrap();

    let agent_id = storage
        .ensure_agent(&Agent {
            id: None,
            slug: "tester".into(),
            name: "Tester".into(),
            version: None,
            kind: AgentKind::Cli,
        })
        .unwrap();

    let only_message = Message {
        id: None,
        idx: 0,
        role: MessageRole::User,
        author: None,
        created_at: Some(1_700_000_000_000),
        content: "hello".into(),
        extra_json: serde_json::Value::Null,
        snippets: Vec::new(),
    };
    let fixture = Conversation {
        id: None,
        agent_slug: "tester".into(),
        workspace: Some(std::path::PathBuf::from("/tmp/workspace")),
        external_id: Some("daily-stats-fallback".into()),
        title: Some("fallback".into()),
        source_path: std::path::PathBuf::from("/tmp/fallback.jsonl"),
        started_at: Some(1_700_000_000_000),
        ended_at: Some(1_700_000_000_100),
        approx_tokens: None,
        metadata_json: serde_json::Value::Null,
        messages: vec![only_message],
        source_id: "local".into(),
        origin_host: None,
    };
    storage
        .insert_conversations_batched(&[(agent_id, None, &fixture)])
        .unwrap();

    // Swap the materialized rows for one row whose totals cannot match the archive.
    let raw = storage.raw();
    raw.execute("DELETE FROM daily_stats").unwrap();
    raw.execute(
        "INSERT INTO daily_stats(day_id, agent_slug, source_id, session_count, message_count, total_chars, last_updated)
VALUES(0, 'all', 'all', 99, 99, 99, 0)",
    )
    .unwrap();

    // The guard must stay bound until the end of the test so the variable
    // remains set while the repair runs.
    let _oom_guard = set_env("CASS_TEST_PACKET_DAILY_STATS_REBUILD_OOM", "1");
    let outcome = repair_daily_stats_if_drifted(&storage, &db_path, None).unwrap();
    assert_eq!(
        outcome,
        DailyStatsRepairOutcome::Rebuilt {
            rows_created: 4,
            total_sessions: 1,
        }
    );

    let health_after = storage.daily_stats_health().unwrap();
    assert_eq!(health_after.materialized_total, 1);
    assert_eq!(health_after.drift, 0);
}
/// Checks that `repair_daily_stats_if_drifted` produces the same `daily_stats`
/// rows and the same counts as `FrankenStorage::rebuild_daily_stats` for a
/// two-agent, two-source fixture.
#[test]
fn repair_daily_stats_if_drifted_packet_rebuild_matches_legacy_storage_rebuild() {
    use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};

    /// Snapshot of every `daily_stats` row in a stable order.
    fn load_daily_stats_rows(
        storage: &FrankenStorage,
    ) -> Vec<(i64, String, String, i64, i64, i64)> {
        let rows = storage.raw().query_map_collect(
            "SELECT day_id, agent_slug, source_id, session_count, message_count, total_chars
FROM daily_stats
ORDER BY day_id, agent_slug, source_id",
            &[] as &[ParamValue],
            |row| {
                Ok((
                    row.get_typed(0)?,
                    row.get_typed(1)?,
                    row.get_typed(2)?,
                    row.get_typed(3)?,
                    row.get_typed(4)?,
                    row.get_typed(5)?,
                ))
            },
        );
        rows.unwrap()
    }

    // Fixture builders: only the fields that vary between instances are parameters.
    let cli_agent = |slug: &str, name: &str| Agent {
        id: None,
        slug: slug.into(),
        name: name.into(),
        version: None,
        kind: AgentKind::Cli,
    };
    let message = |idx, role, created_at, content: &str| Message {
        id: None,
        idx,
        role,
        author: None,
        created_at: Some(created_at),
        content: content.into(),
        extra_json: serde_json::Value::Null,
        snippets: Vec::new(),
    };

    let tmp = TempDir::new().unwrap();
    let db_path = tmp.path().join("db.sqlite");
    let storage = FrankenStorage::open(&db_path).unwrap();
    storage.run_migrations().unwrap();

    let tester_id = storage.ensure_agent(&cli_agent("tester", "Tester")).unwrap();
    let reviewer_id = storage
        .ensure_agent(&cli_agent("reviewer", "Reviewer"))
        .unwrap();

    let conv_local = Conversation {
        id: None,
        agent_slug: "tester".into(),
        workspace: Some(std::path::PathBuf::from("/tmp/workspace-local")),
        external_id: Some("daily-stats-local".into()),
        title: Some("local".into()),
        source_path: std::path::PathBuf::from("/tmp/local.jsonl"),
        started_at: Some(1_700_000_000_000),
        ended_at: Some(1_700_000_000_500),
        approx_tokens: None,
        metadata_json: serde_json::Value::Null,
        messages: vec![
            message(0, MessageRole::User, 1_700_000_000_000, "hello"),
            // Deliberately empty content for the tool message.
            message(1, MessageRole::Tool, 1_700_000_000_100, ""),
            message(2, MessageRole::Agent, 1_700_000_000_200, "done"),
        ],
        source_id: "local".into(),
        origin_host: None,
    };
    let conv_remote = Conversation {
        id: None,
        agent_slug: "reviewer".into(),
        workspace: Some(std::path::PathBuf::from("/tmp/workspace-remote")),
        external_id: Some("daily-stats-remote".into()),
        title: Some("remote".into()),
        source_path: std::path::PathBuf::from("/tmp/remote.jsonl"),
        started_at: Some(1_700_086_400_000),
        ended_at: Some(1_700_086_400_800),
        approx_tokens: None,
        metadata_json: serde_json::Value::Null,
        messages: vec![
            message(0, MessageRole::System, 1_700_086_400_000, "prep"),
            message(
                1,
                MessageRole::Other("narrator".into()),
                1_700_086_400_050,
                "note",
            ),
            message(2, MessageRole::User, 1_700_086_400_100, "go"),
        ],
        source_id: "builder.example.com".into(),
        origin_host: Some("builder.example.com".into()),
    };
    storage
        .insert_conversations_batched(&[
            (tester_id, None, &conv_local),
            (reviewer_id, None, &conv_remote),
        ])
        .unwrap();

    // Reference result: clear the table and let the storage-level rebuild fill it.
    storage.raw().execute("DELETE FROM daily_stats").unwrap();
    let expected_rebuild = storage.rebuild_daily_stats().unwrap();
    let expected_rows = load_daily_stats_rows(&storage);

    // Result under test: clear again and go through the repair entry point.
    storage.raw().execute("DELETE FROM daily_stats").unwrap();
    let outcome = repair_daily_stats_if_drifted(&storage, &db_path, None).unwrap();
    assert_eq!(
        outcome,
        DailyStatsRepairOutcome::Rebuilt {
            rows_created: expected_rebuild.rows_created,
            total_sessions: expected_rebuild.total_sessions,
        }
    );
    assert_eq!(load_daily_stats_rows(&storage), expected_rows);
}
/// After the current archive fingerprint has been recorded for daily stats,
/// a repair call given that same fingerprint must return
/// `SkippedKnownHealthyForFingerprint`.
#[test]
fn repair_daily_stats_if_drifted_skips_known_healthy_archive_fingerprint() {
    use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};

    let tmp = TempDir::new().unwrap();
    let db_path = tmp.path().join("db.sqlite");
    let storage = FrankenStorage::open(&db_path).unwrap();
    storage.run_migrations().unwrap();

    let agent_id = storage
        .ensure_agent(&Agent {
            id: None,
            slug: "tester".into(),
            name: "Tester".into(),
            version: None,
            kind: AgentKind::Cli,
        })
        .unwrap();

    let only_message = Message {
        id: None,
        idx: 0,
        role: MessageRole::User,
        author: None,
        created_at: Some(1_700_000_000_000),
        content: "hello".into(),
        extra_json: serde_json::Value::Null,
        snippets: Vec::new(),
    };
    let fixture = Conversation {
        id: None,
        agent_slug: "tester".into(),
        workspace: Some(std::path::PathBuf::from("/tmp/workspace")),
        external_id: Some("daily-stats-known-healthy".into()),
        title: Some("healthy".into()),
        source_path: std::path::PathBuf::from("/tmp/healthy.jsonl"),
        started_at: Some(1_700_000_000_000),
        ended_at: Some(1_700_000_000_100),
        approx_tokens: None,
        metadata_json: serde_json::Value::Null,
        messages: vec![only_message],
        source_id: "local".into(),
        origin_host: None,
    };
    storage
        .insert_conversations_batched(&[(agent_id, None, &fixture)])
        .unwrap();

    // Fingerprint is taken after the insert and recorded before the repair call.
    let archive_fingerprint = lexical_storage_fingerprint_for_db(&db_path).unwrap();
    storage
        .record_daily_stats_archive_fingerprint(&archive_fingerprint)
        .unwrap();

    let outcome =
        repair_daily_stats_if_drifted(&storage, &db_path, Some(&archive_fingerprint)).unwrap();
    assert_eq!(
        outcome,
        DailyStatsRepairOutcome::SkippedKnownHealthyForFingerprint {
            archive_fingerprint
        }
    );
}
/// Truth-table check for `should_repair_daily_stats_after_historical_salvage`.
///
/// NOTE(review): the meaning of each positional argument is not visible in
/// this test; confirm against the function's signature before adding cases.
#[test]
fn daily_stats_repair_after_historical_salvage_avoids_duplicate_plain_full_probe() {
    let needs_repair = should_repair_daily_stats_after_historical_salvage;

    assert!(
        !needs_repair(true, true, false, 0),
        "plain full runs should not re-check daily_stats after a pre-scan probe when no salvage changed canonical data"
    );
    // Cases expected to request a repair.
    assert!(needs_repair(false, true, false, 0));
    assert!(needs_repair(true, true, false, 7));
    // Cases expected to skip it.
    assert!(!needs_repair(false, true, true, 0));
    assert!(!needs_repair(true, false, false, 0));
}
/// `should_salvage_historical_databases` must return `false` for both
/// non-zero counts when every flag is `false`.
#[test]
fn historical_salvage_decision_skips_populated_canonical_db() {
    for populated_count in [1, 43_678] {
        let salvage = should_salvage_historical_databases(false, populated_count, false, false);
        assert!(!salvage);
    }
}
/// `should_salvage_historical_databases` must return `true` for a zero count,
/// and for any count when the first flag is `true`.
#[test]
fn historical_salvage_decision_keeps_empty_or_rebuilt_storage() {
    for (first_flag, count) in [(false, 0), (true, 0), (true, 43_678)] {
        let salvage = should_salvage_historical_databases(first_flag, count, false, false);
        assert!(salvage);
    }
}
/// With a non-zero count and only the third flag set,
/// `should_salvage_historical_databases` must return `true`.
#[test]
fn historical_salvage_decision_keeps_populated_canonical_when_more_bundles_are_pending() {
    let salvage = should_salvage_historical_databases(false, 43_678, true, false);
    assert!(salvage);
}
/// With a non-zero count and both the third and fourth flags set,
/// `should_salvage_historical_databases` must return `false`.
#[test]
fn historical_salvage_decision_skips_pending_bundles_during_canonical_only_full_rebuild() {
    let salvage = should_salvage_historical_databases(false, 43_678, true, true);
    assert!(!salvage);
}
/// Truth-table check for `should_run_targeted_watch_once_only`.
///
/// Each argument combination is asserted exactly once. An earlier revision
/// repeated the `(true, false, false, true, 43_678)` case a second time without
/// a message; only the labeled assertion is kept.
///
/// NOTE(review): the meaning of each positional argument is not visible in
/// this test; confirm against the function's signature before adding cases.
#[test]
fn targeted_watch_once_only_allows_empty_or_populated_incremental_run() {
    // Combinations for which the targeted run is allowed.
    assert!(should_run_targeted_watch_once_only(
        true, false, false, false, 43_678
    ));
    assert!(should_run_targeted_watch_once_only(
        true, false, false, false, 0
    ));
    assert!(
        should_run_targeted_watch_once_only(true, false, false, true, 0),
        "fresh explicit watch-once imports should not broaden into every detected connector"
    );

    // Combinations for which it is refused.
    assert!(
        !should_run_targeted_watch_once_only(true, false, false, true, 43_678),
        "populated archives with a missing or invalid index still need authoritative repair"
    );
    assert!(!should_run_targeted_watch_once_only(
        true, true, false, false, 43_678
    ));
    assert!(!should_run_targeted_watch_once_only(
        true, false, true, false, 43_678
    ));
    assert!(!should_run_targeted_watch_once_only(
        false, false, false, false, 43_678
    ));
}
/// Builds the baseline `IndexOptions` used by the watch-once skip tests.
///
/// Every mode flag is off, the embedder is `"fastembed"`, the watch interval
/// is 30 seconds, and the database lives at `<data_dir>/agent_search.db`.
/// Callers flip individual fields on the returned value as needed.
fn watch_once_skip_test_options(
    data_dir: std::path::PathBuf,
    watch_once_paths: Option<Vec<std::path::PathBuf>>,
) -> IndexOptions {
    // Derive the database path first; `data_dir` is moved into the struct below.
    let db_path = data_dir.join("agent_search.db");
    IndexOptions {
        db_path,
        data_dir,
        watch_once_paths,
        full: false,
        force_rebuild: false,
        watch: false,
        semantic: false,
        build_hnsw: false,
        embedder: "fastembed".to_string(),
        progress: None,
        watch_interval_secs: 30,
    }
}
/// Walks `should_try_readonly_canonical_force_rebuild` through one option
/// change at a time: missing DB file, existing DB file, `full`, `semantic`,
/// and explicit watch-once paths. Only the plain force rebuild against an
/// existing DB file is expected to return `true`.
#[test]
fn readonly_canonical_force_rebuild_fast_path_is_narrow() {
    let tmp = TempDir::new().unwrap();
    let data_dir = tmp.path().join("data");
    std::fs::create_dir_all(&data_dir).unwrap();
    // `data_dir` is not needed after this call, so it is moved rather than cloned.
    let mut opts = watch_once_skip_test_options(data_dir, None);
    opts.force_rebuild = true;
    assert!(
        !should_try_readonly_canonical_force_rebuild(&opts),
        "missing databases should continue through normal index creation"
    );

    // Any file at the DB path is enough for this check; its contents are a placeholder.
    std::fs::write(&opts.db_path, b"placeholder").unwrap();
    assert!(
        should_try_readonly_canonical_force_rebuild(&opts),
        "plain force rebuild against an existing DB can use the read-only canonical path"
    );

    opts.full = true;
    assert!(
        !should_try_readonly_canonical_force_rebuild(&opts),
        "--full still owns source rescan and historical salvage behavior"
    );

    // Each flag is reset before the next one is enabled so they are tested in isolation.
    opts.full = false;
    opts.semantic = true;
    assert!(
        !should_try_readonly_canonical_force_rebuild(&opts),
        "semantic indexing still requires the normal post-lexical command flow"
    );

    opts.semantic = false;
    opts.watch_once_paths = Some(vec![tmp.path().join("session.jsonl")]);
    assert!(
        !should_try_readonly_canonical_force_rebuild(&opts),
        "targeted watch-once force rebuild should not ignore explicit paths"
    );
}
/// With only non-existent explicit paths, the skip predicate is true from the
/// start, but the whole index run may be skipped only after a database and a
/// Tantivy index have been created under the data directory.
#[test]
fn absent_explicit_watch_once_paths_skip_heavy_index_setup() {
    let tmp = TempDir::new().unwrap();
    let data_dir = tmp.path().join("data");
    let absent_paths = vec![
        tmp.path().join("missing-a.jsonl"),
        tmp.path().join("missing-b.jsonl"),
    ];
    let opts = watch_once_skip_test_options(data_dir.clone(), Some(absent_paths));

    assert!(should_skip_absent_explicit_watch_once_paths(&opts));
    assert!(
        !can_skip_absent_explicit_watch_once_index_run(&opts),
        "missing canonical/index assets must keep the normal repair/create path"
    );

    // Create the database and the lexical index, then re-check the same options.
    let storage = FrankenStorage::open(&opts.db_path).unwrap();
    ensure_fts_schema(&storage);
    let lexical_dir = index_dir(&data_dir).unwrap();
    let _index = TantivyIndex::open_or_create(&lexical_dir).unwrap();
    assert!(
        can_skip_absent_explicit_watch_once_index_run(&opts),
        "populated data dirs with a current lexical index can skip absent explicit paths"
    );
}
/// `should_skip_absent_explicit_watch_once_paths` must return `false` whenever
/// any explicit path exists, or when `watch`, `full`, `force_rebuild`,
/// `semantic`, or `build_hnsw` is enabled for an otherwise absent path.
#[test]
fn absent_explicit_watch_once_paths_preserve_non_noop_modes() {
    let tmp = TempDir::new().unwrap();
    let data_dir = tmp.path().join("data");
    std::fs::create_dir_all(&data_dir).unwrap();
    let existing = data_dir.join("session.jsonl");
    std::fs::write(&existing, "{}\n").unwrap();
    let missing = data_dir.join("missing.jsonl");

    // Baseline options for an arbitrary list of explicit paths.
    let options_for =
        |paths: Vec<std::path::PathBuf>| watch_once_skip_test_options(data_dir.clone(), Some(paths));
    // Baseline options whose only explicit path does not exist.
    let options_for_missing = || options_for(vec![missing.clone()]);

    let existing_opts = options_for(vec![existing.clone()]);
    assert!(
        !should_skip_absent_explicit_watch_once_paths(&existing_opts),
        "existing explicit paths must flow through normal watch-once reindexing"
    );

    let mixed_opts = options_for(vec![missing.clone(), existing.clone()]);
    assert!(
        !should_skip_absent_explicit_watch_once_paths(&mixed_opts),
        "mixed existing/missing batches must still index the existing paths"
    );

    // From here on, one mode flag at a time is enabled on top of the missing-path baseline.
    let mut watch_opts = options_for_missing();
    watch_opts.watch = true;
    assert!(!should_skip_absent_explicit_watch_once_paths(&watch_opts));

    let mut full_opts = options_for_missing();
    full_opts.full = true;
    assert!(!should_skip_absent_explicit_watch_once_paths(&full_opts));

    let mut force_opts = options_for_missing();
    force_opts.force_rebuild = true;
    assert!(!should_skip_absent_explicit_watch_once_paths(&force_opts));

    let mut semantic_opts = options_for_missing();
    semantic_opts.semantic = true;
    assert!(!should_skip_absent_explicit_watch_once_paths(
        &semantic_opts
    ));

    let mut hnsw_opts = options_for_missing();
    hnsw_opts.build_hnsw = true;
    assert!(!should_skip_absent_explicit_watch_once_paths(&hnsw_opts));
}
/// Truth-table check for
/// `should_skip_broad_scan_after_watch_once_authoritative_repair`: only
/// `(true, false, false, true)` is expected to return `true`; flipping any
/// single argument must return `false`.
#[test]
fn watch_once_authoritative_repair_skips_broad_followup_scan() {
    let skips_broad_scan = should_skip_broad_scan_after_watch_once_authoritative_repair;

    assert!(skips_broad_scan(true, false, false, true));
    // Each remaining case differs from the accepted one in exactly one position.
    assert!(!skips_broad_scan(true, true, false, true));
    assert!(!skips_broad_scan(true, false, true, true));
    assert!(!skips_broad_scan(true, false, false, false));
    assert!(!skips_broad_scan(false, false, false, true));
}
/// Full truth table for `should_repair_fallback_fts_after_full_index_run`:
/// only `(true, false)` is expected to return `true`.
#[test]
fn fallback_fts_repair_is_skipped_for_canonical_only_full_rebuild() {
    let repairs_fts = should_repair_fallback_fts_after_full_index_run;

    assert!(!repairs_fts(true, true));
    assert!(repairs_fts(true, false));
    assert!(!repairs_fts(false, false));
    assert!(!repairs_fts(false, true));
}
/// With the FTS schema created before the fixture is seeded, the repair must
/// report `AlreadyHealthy` with 4 rows.
#[test]
fn full_run_fallback_fts_repair_skips_rebuild_when_fts_is_already_healthy() {
    let tmp = TempDir::new().unwrap();
    let db_path = tmp.path().join("fts-healthy.db");
    let storage = FrankenStorage::open(&db_path).unwrap();
    ensure_fts_schema(&storage);
    seed_lexical_rebuild_fixture(&storage);

    let expected =
        FallbackFtsRepairOutcome::Repaired(FtsConsistencyRepair::AlreadyHealthy { rows: 4 });
    let outcome =
        repair_fallback_fts_after_full_index_run(&storage, &db_path, true, false, None).unwrap();
    assert_eq!(outcome, Some(expected));
}
/// Without a prior `ensure_fts_schema` call, the repair must report `Rebuilt`
/// with 4 inserted rows.
#[test]
fn full_run_fallback_fts_repair_rebuilds_missing_schema_when_needed() {
    let tmp = TempDir::new().unwrap();
    let db_path = tmp.path().join("fts-missing.db");
    let storage = FrankenStorage::open(&db_path).unwrap();
    // Unlike the "already healthy" variant, the FTS schema is not created here.
    seed_lexical_rebuild_fixture(&storage);

    let expected =
        FallbackFtsRepairOutcome::Repaired(FtsConsistencyRepair::Rebuilt { inserted_rows: 4 });
    let outcome =
        repair_fallback_fts_after_full_index_run(&storage, &db_path, true, false, None).unwrap();
    assert_eq!(outcome, Some(expected));
}
/// After the current archive fingerprint has been recorded for the fallback
/// FTS, a repair call given that same fingerprint must return
/// `SkippedKnownHealthyForFingerprint`.
#[test]
fn full_run_fallback_fts_repair_skips_known_healthy_archive_fingerprint() {
    let tmp = TempDir::new().unwrap();
    let db_path = tmp.path().join("fts-known-healthy.db");
    let storage = FrankenStorage::open(&db_path).unwrap();
    ensure_fts_schema(&storage);
    seed_lexical_rebuild_fixture(&storage);

    // The fingerprint is captured before the consistency pass and recorded after it.
    let archive_fingerprint = lexical_storage_fingerprint_for_db(&db_path).unwrap();
    storage.ensure_search_fallback_fts_consistency().unwrap();
    storage
        .record_search_fallback_fts_archive_fingerprint(&archive_fingerprint)
        .unwrap();

    let known_fingerprint = Some(&archive_fingerprint);
    let outcome = repair_fallback_fts_after_full_index_run(
        &storage,
        &db_path,
        true,
        false,
        known_fingerprint,
    )
    .unwrap();
    let expected = FallbackFtsRepairOutcome::SkippedKnownHealthyForFingerprint {
        archive_fingerprint,
    };
    assert_eq!(outcome, Some(expected));
}
/// Builds a canonical DB with one conversation and a backup with two, adds a
/// `historical_bundle_salvaged:test` meta row to the canonical DB, and checks
/// that `full_rebuild_requires_historical_restart` returns `false`.
#[test]
fn full_rebuild_does_not_restart_based_on_historical_local_rowids() {
    /// Inserts a single-message conversation into the database at `db_path`
    /// and then rebuilds that database's FTS via rusqlite.
    fn insert_demo_conversation(db_path: &Path, external_id: &str, msg_idx: i64, ts: i64) {
        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};

        let storage = crate::storage::sqlite::SqliteStorage::open(db_path).unwrap();
        let agent_id = storage
            .ensure_agent(&Agent {
                id: None,
                slug: "tester".into(),
                name: "Tester".into(),
                version: None,
                kind: AgentKind::Cli,
            })
            .unwrap();

        let normalized = norm_conv(Some(external_id), vec![norm_msg(msg_idx, ts)]);
        let conversation = Conversation {
            id: None,
            agent_slug: normalized.agent_slug.clone(),
            workspace: normalized.workspace.clone(),
            external_id: normalized.external_id.clone(),
            title: normalized.title.clone(),
            source_path: normalized.source_path.clone(),
            started_at: normalized.started_at,
            ended_at: normalized.ended_at,
            approx_tokens: None,
            metadata_json: normalized.metadata.clone(),
            messages: normalized
                .messages
                .iter()
                .map(|msg| Message {
                    id: None,
                    idx: msg.idx,
                    role: MessageRole::User,
                    author: msg.author.clone(),
                    created_at: msg.created_at,
                    content: msg.content.clone(),
                    extra_json: msg.extra.clone(),
                    snippets: Vec::new(),
                })
                .collect(),
            source_id: "local".to_string(),
            origin_host: None,
        };
        storage
            .insert_conversation_tree(agent_id, None, &conversation)
            .unwrap();

        // The handle is released before the FTS rebuild touches the same file.
        drop(storage);
        crate::storage::sqlite::rebuild_fts_via_rusqlite(db_path).unwrap();
    }

    let tmp = TempDir::new().unwrap();
    let canonical_db = tmp.path().join("agent_search.db");
    let backups_dir = tmp.path().join("backups");
    std::fs::create_dir_all(&backups_dir).unwrap();
    let healthy_backup = backups_dir.join("agent_search.db.20260322T020200.bak");

    insert_demo_conversation(&canonical_db, "canonical-only", 0, 1_700_000_000_000);
    insert_demo_conversation(&healthy_backup, "backup-1", 0, 1_700_000_000_100);
    insert_demo_conversation(&healthy_backup, "backup-2", 1, 1_700_000_000_200);

    // Write the salvage marker through a separate connection that is closed
    // before the storage handle is opened.
    {
        let canonical_db_path = canonical_db.to_string_lossy().to_string();
        let conn = frankensqlite::Connection::open(canonical_db_path).unwrap();
        let marker = [
            ParamValue::from("historical_bundle_salvaged:test"),
            ParamValue::from("{\"salvage_version\":2,\"method\":\"baseline-bulk-sql-copy\"}"),
        ];
        conn.execute_compat("INSERT INTO meta(key, value) VALUES(?1, ?2)", &marker)
            .unwrap();
    }

    let storage = FrankenStorage::open(&canonical_db).unwrap();
    let canonical_sessions = count_total_conversations_exact(&storage).unwrap();
    assert_eq!(canonical_sessions, 1);

    let restart_required =
        full_rebuild_requires_historical_restart(&storage, &canonical_db, canonical_sessions)
            .unwrap();
    assert!(
        !restart_required,
        "full rebuild must not compare local message rowids across different sqlite files"
    );
}
/// Builds a canonical DB and a backup with one conversation each, adds a
/// `historical_bundle_progress:test` meta row to the canonical DB, and checks
/// that `full_rebuild_requires_historical_restart` returns `false`.
#[test]
fn full_rebuild_restart_ignores_stale_progress_when_canonical_is_healthy() {
    /// Inserts a single-message conversation into the database at `db_path`
    /// and then rebuilds that database's FTS via rusqlite.
    fn insert_demo_conversation(db_path: &Path, external_id: &str, msg_idx: i64, ts: i64) {
        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};

        let storage = crate::storage::sqlite::SqliteStorage::open(db_path).unwrap();
        let agent_id = storage
            .ensure_agent(&Agent {
                id: None,
                slug: "tester".into(),
                name: "Tester".into(),
                version: None,
                kind: AgentKind::Cli,
            })
            .unwrap();

        let normalized = norm_conv(Some(external_id), vec![norm_msg(msg_idx, ts)]);
        let conversation = Conversation {
            id: None,
            agent_slug: normalized.agent_slug.clone(),
            workspace: normalized.workspace.clone(),
            external_id: normalized.external_id.clone(),
            title: normalized.title.clone(),
            source_path: normalized.source_path.clone(),
            started_at: normalized.started_at,
            ended_at: normalized.ended_at,
            approx_tokens: None,
            metadata_json: normalized.metadata.clone(),
            messages: normalized
                .messages
                .iter()
                .map(|msg| Message {
                    id: None,
                    idx: msg.idx,
                    role: MessageRole::User,
                    author: msg.author.clone(),
                    created_at: msg.created_at,
                    content: msg.content.clone(),
                    extra_json: msg.extra.clone(),
                    snippets: Vec::new(),
                })
                .collect(),
            source_id: "local".to_string(),
            origin_host: None,
        };
        storage
            .insert_conversation_tree(agent_id, None, &conversation)
            .unwrap();

        // The handle is released before the FTS rebuild touches the same file.
        drop(storage);
        crate::storage::sqlite::rebuild_fts_via_rusqlite(db_path).unwrap();
    }

    let tmp = TempDir::new().unwrap();
    let canonical_db = tmp.path().join("agent_search.db");
    let backups_dir = tmp.path().join("backups");
    std::fs::create_dir_all(&backups_dir).unwrap();
    let healthy_backup = backups_dir.join("agent_search.db.20260322T020200.bak");

    insert_demo_conversation(&canonical_db, "canonical-only", 0, 1_700_000_000_000);
    insert_demo_conversation(&healthy_backup, "backup-only", 0, 1_700_000_000_100);

    let storage = FrankenStorage::open(&canonical_db).unwrap();
    // Leftover progress marker written directly into the canonical meta table.
    let progress_marker = [
        ParamValue::from("historical_bundle_progress:test"),
        ParamValue::from("{\"progress_version\":1,\"last_completed_source_row_id\":78}"),
    ];
    storage
        .raw()
        .execute_compat(
            "INSERT INTO meta(key, value) VALUES(?1, ?2)",
            &progress_marker,
        )
        .unwrap();

    let canonical_sessions = count_total_conversations_exact(&storage).unwrap();
    assert_eq!(canonical_sessions, 1);

    let restart_required =
        full_rebuild_requires_historical_restart(&storage, &canonical_db, canonical_sessions)
            .unwrap();
    assert!(
        !restart_required,
        "stale salvage progress alone must not force a fresh canonical restart when the canonical db is healthy"
    );
}
/// Writes non-SQLite bytes to the database path and checks that
/// `open_storage_for_index` fails, that the error text mentions archive
/// preservation, that the file bytes are unchanged, and that no
/// `db.sqlite.backup.*` sibling file was created.
#[test]
fn open_storage_for_index_refuses_corrupt_archive_without_replacing_db() {
    let tmp = TempDir::new().unwrap();
    let db_path = tmp.path().join("db.sqlite");
    let original = b"not a sqlite database";
    std::fs::write(&db_path, original).unwrap();

    let Err(err) = open_storage_for_index(&db_path, true) else {
        panic!("corrupt existing archive must fail closed");
    };
    let err = err.to_string();
    assert!(
        err.contains("will not replace or truncate"),
        "diagnostic should explain archive preservation: {err}"
    );
    assert_eq!(
        std::fs::read(&db_path).unwrap(),
        original,
        "corrupt archive bytes must be preserved for recovery"
    );

    // Fail loudly on an unreadable directory entry instead of skipping it, so a
    // backup artifact can never be missed by the count below.
    let backup_count = std::fs::read_dir(tmp.path())
        .unwrap()
        .map(|entry| entry.expect("temp dir entry should be readable"))
        .filter(|entry| {
            entry
                .file_name()
                .to_str()
                .is_some_and(|name| name.starts_with("db.sqlite.backup."))
        })
        .count();
    // `assert_eq!` reports the actual number of artifacts when the check fails.
    assert_eq!(
        backup_count, 0,
        "index open must not create backup artifacts for corrupt archive"
    );
}
/// Persists a two-message conversation, then the same external id with a
/// third message appended, and checks the index document count after each
/// commit (2, then 3).
#[test]
fn persist_append_only_adds_new_messages_to_index() {
    let tmp = TempDir::new().unwrap();
    let data_dir = tmp.path().join("data");
    std::fs::create_dir_all(&data_dir).unwrap();
    let storage = FrankenStorage::open(&data_dir.join("db.sqlite")).unwrap();
    ensure_fts_schema(&storage);
    let lexical_dir = index_dir(&data_dir).unwrap();
    let mut index = TantivyIndex::open_or_create(&lexical_dir).unwrap();

    // First pass: two messages.
    let initial = norm_conv(Some("ext"), vec![norm_msg(0, 100), norm_msg(1, 200)]);
    persist::persist_conversation(&storage, &mut index, &initial).unwrap();
    index.commit().unwrap();
    let reader_after_initial = index.reader().unwrap();
    reader_after_initial.reload().unwrap();
    assert_eq!(reader_after_initial.searcher().num_docs(), 2);

    // Second pass: same conversation with one extra trailing message.
    let appended_messages = vec![norm_msg(0, 100), norm_msg(1, 200), norm_msg(2, 300)];
    let appended = norm_conv(Some("ext"), appended_messages);
    persist::persist_conversation(&storage, &mut index, &appended).unwrap();
    index.commit().unwrap();
    let reader_after_append = index.reader().unwrap();
    reader_after_append.reload().unwrap();
    assert_eq!(reader_after_append.searcher().num_docs(), 3);
}
/// Runs `rebuild_tantivy_from_db` on the seeded fixture with
/// `CASS_PREP_PROFILE=1` and checks the captured log text: every listed
/// prep-profile fragment must be present, and the `plan_lexical_shards`,
/// `start_packet_producer` and `persist_initial_checkpoint` steps must first
/// appear in that relative order.
#[test]
#[serial]
fn rebuild_tantivy_from_db_emits_phase_exact_prep_profile_logs() {
let tmp = TempDir::new().unwrap();
let data_dir = tmp.path().join("data");
std::fs::create_dir_all(&data_dir).unwrap();
let db_path = data_dir.join("db.sqlite");
let storage = FrankenStorage::open(&db_path).unwrap();
ensure_fts_schema(&storage);
seed_lexical_rebuild_fixture(&storage);
// Both guards are bound to named locals so they stay alive until the end of
// the test, covering the rebuild below.
let _prep_profile = set_env("CASS_PREP_PROFILE", "1");
let _conversation_limit = set_env("CASS_TANTIVY_REBUILD_BATCH_FETCH_CONVERSATIONS", "2");
let logs = capture_logs(|| {
let rebuild = rebuild_tantivy_from_db(&db_path, &data_dir, 2, None).unwrap();
assert_eq!(rebuild.indexed_docs, 4);
});
// Presence check: the banner, then the `main` component steps, then the
// `producer` component steps.
for needle in [
"lexical rebuild prep profile",
r#"component="main""#,
r#"step="open_readonly""#,
r#"step="compute_db_state_fingerprint""#,
r#"step="load_checkpoint_state""#,
r#"step="start_packet_producer""#,
r#"step="plan_lexical_shards""#,
r#"step="persist_initial_checkpoint""#,
r#"component="producer""#,
r#"step="load_sources""#,
r#"step="build_lookups""#,
r#"step="load_resume_cursor""#,
r#"step="first_batch_handoff""#,
] {
assert!(
logs.contains(needle),
"expected prep-profile log fragment `{needle}`, got:
{logs}"
);
}
// Ordering check: `find` yields the byte offset of the first occurrence of
// each step fragment in the captured text.
let start_packet_producer = logs
.find(r#"step="start_packet_producer""#)
.expect("start_packet_producer log position");
let plan_lexical_shards = logs
.find(r#"step="plan_lexical_shards""#)
.expect("plan_lexical_shards log position");
let persist_initial_checkpoint = logs
.find(r#"step="persist_initial_checkpoint""#)
.expect("persist_initial_checkpoint log position");
assert!(
plan_lexical_shards < start_packet_producer,
"fresh rebuild should finish deterministic shard planning before producer startup overlap: {logs}"
);
assert!(
start_packet_producer < persist_initial_checkpoint,
"fresh staged rebuild should start the packet producer before \
persisting the initial checkpoint (producer-handoff overlap): {logs}"
);
}
/// Pre-writes a rebuild state file marked `"completed": true` for the seeded
/// database, runs `rebuild_tantivy_from_db`, and checks that the result
/// reports the checkpoint's counts while the captured logs contain no
/// Tantivy-open or producer steps and no `meta.json` appears in the index
/// directory.
#[test]
#[serial]
fn rebuild_tantivy_from_db_skips_tantivy_open_when_checkpoint_is_complete() {
let tmp = TempDir::new().unwrap();
let data_dir = tmp.path().join("data");
std::fs::create_dir_all(&data_dir).unwrap();
let db_path = data_dir.join("db.sqlite");
let storage = FrankenStorage::open(&db_path).unwrap();
ensure_fts_schema(&storage);
seed_lexical_rebuild_fixture(&storage);
// The storage handle is released before the fingerprint is computed and the
// rebuild runs.
drop(storage);
let index_path = index_dir(&data_dir).unwrap();
std::fs::create_dir_all(&index_path).unwrap();
let fingerprint = lexical_storage_fingerprint_for_db(&db_path).unwrap();
// Hand-written checkpoint: 2 conversations / 4 messages fully processed,
// nothing pending, tied to the current schema hash and storage fingerprint.
std::fs::write(
lexical_rebuild_state_path(&index_path),
serde_json::to_vec_pretty(&serde_json::json!({
"version": LEXICAL_REBUILD_STATE_VERSION,
"schema_hash": crate::search::tantivy::SCHEMA_HASH,
"db": {
"db_path": db_path.display().to_string(),
"total_conversations": 2,
"total_messages": 4,
"storage_fingerprint": fingerprint,
},
"page_size": LEXICAL_REBUILD_PAGE_SIZE,
"committed_offset": 2,
"processed_conversations": 2,
"indexed_docs": 4,
"committed_meta_fingerprint": null,
"pending": null,
"completed": true,
"updated_at_ms": FrankenStorage::now_millis(),
"runtime": {
"queue_depth": 0,
"inflight_message_bytes": 0,
"pending_batch_conversations": 0,
"pending_batch_message_bytes": 0,
"page_prep_workers": 0,
"active_page_prep_jobs": 0,
"ordered_buffered_pages": 0,
"budget_generation": 0,
"updated_at_ms": FrankenStorage::now_millis(),
}
}))
.unwrap(),
)
.unwrap();
// Profiling is switched on so that any prep step that did run would show up
// in the captured logs checked below.
let _prep_profile = set_env("CASS_PREP_PROFILE", "1");
let logs = capture_logs(|| {
let rebuild = rebuild_tantivy_from_db(&db_path, &data_dir, 2, None).unwrap();
assert_eq!(rebuild.indexed_docs, 4);
assert_eq!(rebuild.observed_messages, Some(4));
assert!(!rebuild.exact_checkpoint_persisted);
});
assert!(
!logs.contains(r#"step="open_tantivy""#),
"completed checkpoint should not reopen Tantivy: {logs}"
);
assert!(
!logs.contains(r#"step="start_packet_producer""#),
"completed checkpoint should not start producer warmup: {logs}"
);
assert!(
!logs.contains(r#"component="producer""#),
"completed checkpoint should not emit producer prep logs: {logs}"
);
// Only the state file was written into the index directory by this test, so
// a `meta.json` there would have to come from the rebuild call.
assert!(
!index_path.join("meta.json").exists(),
"completed checkpoint fast-path should not create Tantivy metadata"
);
}
/// Runs `rebuild_tantivy_from_db` on the seeded fixture with
/// `CASS_TANTIVY_REBUILD_BATCH_FETCH_CONVERSATIONS=2` and checks the captured
/// log text for the rebuild banner, the bounded-page diagnostics, the
/// per-page and per-shard counts, and finally the document count reported by
/// `tantivy_doc_count_for_data_dir`.
#[test]
#[serial]
fn rebuild_tantivy_from_db_logs_streamed_batch_stats() {
let tmp = TempDir::new().unwrap();
let data_dir = tmp.path().join("data");
std::fs::create_dir_all(&data_dir).unwrap();
let db_path = data_dir.join("db.sqlite");
let storage = FrankenStorage::open(&db_path).unwrap();
ensure_fts_schema(&storage);
seed_lexical_rebuild_fixture(&storage);
// The guard is bound to a named local so it stays alive until the end of the
// test, covering the rebuild below.
let _conversation_limit = set_env("CASS_TANTIVY_REBUILD_BATCH_FETCH_CONVERSATIONS", "2");
let logs = capture_logs(|| {
let rebuild = rebuild_tantivy_from_db(&db_path, &data_dir, 2, None).unwrap();
assert_eq!(rebuild.indexed_docs, 4);
assert_eq!(rebuild.observed_messages, Some(4));
});
// Banner naming the rebuild path that was taken.
assert!(
logs.contains(
"running fresh authoritative lexical rebuild via staged shard-build path"
),
"expected staged-shards rebuild banner after 9ct8r routing, got:
{logs}"
);
// Page-level diagnostics: the summary line plus its three budget fields.
assert!(
logs.contains("lexical rebuild prepared bounded page"),
"expected page-level bounded-prep diagnostics, got:
{logs}"
);
assert!(
logs.contains("budget_shrink_decision=")
&& logs.contains("batch_fetch_message_bytes_limit=")
&& logs.contains("max_message_bytes_in_flight="),
"expected bounded page budget diagnostics, got:
{logs}"
);
assert!(
logs.contains("page_messages=4") && logs.contains("page_conversations=2"),
"expected per-page conversation + message counts matching the fixture (2 convs / 4 msgs), got:
{logs}"
);
// Shard-level diagnostics: the completion line plus its counts.
assert!(
logs.contains("built lexical rebuild shard index"),
"expected staged-shards commit-phase diagnostic, got:
{logs}"
);
assert!(
logs.contains("indexed_docs=4") && logs.contains("shard_conversations=2"),
"expected shard-build completion counts (4 docs / 2 convs) in staged output, got:
{logs}"
);
assert_eq!(tantivy_doc_count_for_data_dir(&data_dir), 4);
}
/// An authoritative rebuild must return equivalence evidence, persist the
/// same evidence as a JSON artifact under the index directory, and log the
/// evidence fields.
#[test]
#[serial]
fn rebuild_tantivy_from_db_emits_equivalence_evidence() {
    let tmp = TempDir::new().unwrap();
    let data_dir = tmp.path().join("data");
    std::fs::create_dir_all(&data_dir).unwrap();
    let db_path = data_dir.join("db.sqlite");
    let storage = FrankenStorage::open(&db_path).unwrap();
    ensure_fts_schema(&storage);
    seed_lexical_rebuild_fixture(&storage);

    let logs = capture_logs(|| {
        let outcome = rebuild_tantivy_from_db(&db_path, &data_dir, 2, None).unwrap();
        assert_eq!(outcome.indexed_docs, 4);

        let evidence = outcome
            .equivalence
            .as_ref()
            .expect("authoritative rebuild must emit equivalence evidence");
        assert_eq!(evidence.document_count, 4);
        assert_eq!(
            evidence.manifest_fingerprint.len(),
            64,
            "manifest fingerprint should be a 32-byte blake3 hex digest"
        );
        assert_eq!(evidence.golden_query_digest.len(), 64);

        // The recorded probes must be the default list, in the same order.
        let expected_probes: Vec<_> = LEXICAL_REBUILD_EQUIVALENCE_DEFAULT_PROBES
            .iter()
            .copied()
            .collect();
        let recorded_probes: Vec<_> = evidence
            .golden_query_hit_counts
            .iter()
            .map(|hit| hit.probe.as_str())
            .collect();
        assert_eq!(
            recorded_probes, expected_probes,
            "evidence should record the default probe list verbatim"
        );

        // The on-disk artifact must round-trip to the in-memory evidence.
        let index_path = index_dir(&data_dir).unwrap();
        let artifact_path = lexical_rebuild_equivalence_evidence_path(&index_path);
        let persisted = std::fs::read_to_string(&artifact_path)
            .expect("equivalence evidence artifact should be persisted on disk");
        let parsed =
            serde_json::from_str::<super::LexicalRebuildEquivalenceEvidence>(&persisted)
                .expect("persisted evidence is valid JSON");
        assert_eq!(&parsed, evidence);
    });

    let saw_evidence_log = logs.contains("lexical rebuild authoritative equivalence evidence");
    assert!(
        saw_evidence_log,
        "expected authoritative equivalence evidence log, got:
{logs}"
    );

    let saw_manifest_field = logs.contains("manifest_fingerprint=");
    assert!(
        saw_manifest_field,
        "expected manifest_fingerprint field in evidence log, got:
{logs}"
    );

    let saw_digest_field = logs.contains("golden_query_digest=");
    assert!(
        saw_digest_field,
        "expected golden_query_digest field in evidence log, got:
{logs}"
    );
}
/// After an authoritative rebuild the index directory must contain a
/// generation manifest that reports itself serveable and whose document
/// count and fingerprint agree with the rebuild outcome and its equivalence
/// evidence; the publish must also be logged.
#[test]
#[serial]
fn rebuild_tantivy_from_db_persists_serveable_generation_manifest() {
    let tmp = TempDir::new().unwrap();
    let data_dir = tmp.path().join("data");
    std::fs::create_dir_all(&data_dir).unwrap();
    let db_path = data_dir.join("db.sqlite");
    let storage = FrankenStorage::open(&db_path).unwrap();
    ensure_fts_schema(&storage);
    seed_lexical_rebuild_fixture(&storage);

    let logs = capture_logs(|| {
        let outcome = rebuild_tantivy_from_db(&db_path, &data_dir, 2, None).unwrap();
        let evidence = outcome
            .equivalence
            .as_ref()
            .expect("authoritative rebuild must emit equivalence evidence");
        assert_eq!(outcome.indexed_docs, 4);

        let index_path = index_dir(&data_dir).unwrap();
        // First `expect` covers the load error, second covers "no manifest".
        let manifest = lexical_generation::load_manifest(&index_path)
            .expect("loading the generation manifest must not error")
            .expect(
                "authoritative rebuild must persist a generation manifest \
                 alongside the equivalence ledger",
            );

        // Lifecycle state.
        let serveable = manifest.is_serveable();
        assert!(
            serveable,
            "freshly committed generation must be Validated+Published; got \
             build_state={:?} publish_state={:?}",
            manifest.build_state,
            manifest.publish_state
        );
        assert_eq!(
            manifest.build_state,
            lexical_generation::LexicalGenerationBuildState::Validated
        );
        assert_eq!(
            manifest.publish_state,
            lexical_generation::LexicalGenerationPublishState::Published
        );

        // Counts and fingerprint linkage.
        assert_eq!(
            manifest.indexed_doc_count, evidence.document_count,
            "manifest indexed_doc_count must match equivalence accumulator"
        );
        assert_eq!(
            manifest.indexed_doc_count, outcome.indexed_docs as u64,
            "manifest indexed_doc_count must match outcome.indexed_docs"
        );
        assert_eq!(
            manifest.equivalence_manifest_fingerprint.as_deref(),
            Some(evidence.manifest_fingerprint.as_str()),
            "generation manifest must link to the equivalence accumulator fingerprint"
        );

        // Identity and bookkeeping fields.
        let has_generation_id = !manifest.generation_id.is_empty();
        assert!(has_generation_id, "generation_id must be non-empty");
        let has_attempt_id = !manifest.attempt_id.is_empty();
        assert!(has_attempt_id, "attempt_id must be non-empty");
        let has_source_fingerprint = !manifest.source_db_fingerprint.is_empty();
        assert!(
            has_source_fingerprint,
            "source_db_fingerprint must be propagated from rebuild_state"
        );
        assert!(
            manifest.failure_history.is_empty(),
            "successful rebuild should have an empty failure history"
        );
        assert!(
            manifest.updated_at_ms >= manifest.created_at_ms,
            "updated_at_ms must be non-decreasing relative to created_at_ms"
        );
    });

    let saw_publish_log = logs.contains("lexical generation manifest published");
    assert!(
        saw_publish_log,
        "expected generation manifest publish log, got:
{logs}"
    );

    let saw_id_fields = logs.contains("generation_id=") && logs.contains("attempt_id=");
    assert!(
        saw_id_fields,
        "expected generation_id + attempt_id fields in publish log, got:
{logs}"
    );
}
/// After an authoritative rebuild, a refresh ledger JSON file must exist under
/// the index directory. This test parses it and checks its readiness
/// milestones, its `dataflow` / `publish_mode` tags, the `indexed_docs`
/// counter on the `LexicalRebuild` phase, and that its search-hit digest
/// matches the rebuild's equivalence evidence.
#[test]
#[serial]
fn rebuild_tantivy_from_db_persists_packet_refresh_ledger() {
let tmp = TempDir::new().unwrap();
let data_dir = tmp.path().join("data");
std::fs::create_dir_all(&data_dir).unwrap();
let db_path = data_dir.join("db.sqlite");
let storage = FrankenStorage::open(&db_path).unwrap();
ensure_fts_schema(&storage);
seed_lexical_rebuild_fixture(&storage);
let logs = capture_logs(|| {
let rebuild = rebuild_tantivy_from_db(&db_path, &data_dir, 2, None).unwrap();
let evidence = rebuild
.equivalence
.as_ref()
.expect("authoritative rebuild must emit equivalence evidence");
let index_path = index_dir(&data_dir).unwrap();
// Read the ledger straight from disk so the assertions cover what was
// actually persisted rather than any in-memory copy.
let raw = std::fs::read_to_string(lexical_refresh_ledger_path(&index_path))
.expect("refresh ledger should be persisted for the packet-driven rebuild");
let ledger: refresh_ledger::RefreshLedger =
serde_json::from_str(&raw).expect("refresh ledger must be valid JSON");
let milestones = ledger.readiness_milestones();
assert_eq!(
milestones.search_readiness_state,
refresh_ledger::RefreshSearchReadinessState::Published
);
assert!(
milestones.time_to_lexical_ready_ms.is_some(),
"packet rebuild should record a lexical-ready milestone"
);
assert!(
milestones.time_to_search_ready_ms.is_some(),
"packet rebuild should record a search-ready milestone"
);
assert_eq!(
ledger.tags.get("dataflow").map(String::as_str),
Some("conversation_packet")
);
assert_eq!(
ledger.tags.get("publish_mode").map(String::as_str),
Some("atomic_staged_swap")
);
assert!(
ledger.full_rebuild,
"authoritative canonical packet replay is a full lexical rebuild"
);
// The phase counter is compared as u64; the conversion saturates to
// u64::MAX instead of panicking if `indexed_docs` ever failed to fit.
assert_eq!(
ledger
.phase(refresh_ledger::RefreshPhase::LexicalRebuild)
.and_then(|phase| phase.counters.get("indexed_docs"))
.copied(),
Some(u64::try_from(rebuild.indexed_docs).unwrap_or(u64::MAX))
);
assert_eq!(
ledger.equivalence.search_hit_digest.as_deref(),
Some(evidence.golden_query_digest.as_str())
);
});
assert!(
logs.contains("lexical refresh ledger published"),
"expected refresh ledger publish log, got:\n{logs}"
);
}
/// Verifies that an authoritative publish writes the lexical refresh evidence
/// sidecar next to the refresh ledger, that the sidecar carries every
/// required top-level field, and that it equals `evidence_summary()` computed
/// from the persisted ledger.
///
/// Marked `#[serial]` like the other tests in this module that drive
/// `rebuild_tantivy_from_db`: several of them install `CASS_*` environment
/// overrides via `set_env`, and an unserialized rebuild could run while those
/// process-wide overrides are in effect.
#[test]
#[serial]
fn authoritative_publish_emits_lexical_refresh_evidence_sidecar() {
    let tmp = TempDir::new().unwrap();
    let db_path = tmp.path().join("evidence-sidecar.db");
    let data_dir = tmp.path().join("data");
    fs::create_dir_all(&data_dir).unwrap();
    let storage = FrankenStorage::open(&db_path).unwrap();
    ensure_fts_schema(&storage);
    seed_lexical_rebuild_fixture(&storage);
    rebuild_tantivy_from_db(&db_path, &data_dir, 2, None).unwrap();

    let index_path = index_dir(&data_dir).unwrap();
    let ledger_path = lexical_refresh_ledger_path(&index_path);
    let evidence_path = lexical_refresh_evidence_path(&index_path);
    assert!(
        evidence_path.exists(),
        "evidence sidecar must be written at {} after publish",
        evidence_path.display()
    );

    // Parse the sidecar as untyped JSON so the field-presence checks do not
    // depend on the evidence struct's deserialization defaults.
    let evidence_raw = std::fs::read_to_string(&evidence_path)
        .expect("evidence sidecar must be readable after publish");
    let parsed: serde_json::Value =
        serde_json::from_str(&evidence_raw).expect("evidence sidecar must be valid JSON");
    for required in [
        "throughput",
        "phase_share",
        "dominant_phase",
        "aggregate_items_processed",
        "aggregate_duration_ms",
        "aggregate_items_per_second",
    ] {
        assert!(
            parsed.get(required).is_some(),
            "evidence sidecar missing required field {required}; got: {parsed}"
        );
    }

    // Recompute the summary from the persisted ledger and compare it with the
    // sidecar after a JSON round-trip, so both sides are `serde_json::Value`.
    let ledger_raw =
        std::fs::read_to_string(&ledger_path).expect("raw ledger must also be persisted");
    let ledger: refresh_ledger::RefreshLedger =
        serde_json::from_str(&ledger_raw).expect("raw ledger must be valid JSON");
    let computed_evidence = ledger.evidence_summary();
    let serialized_computed: serde_json::Value =
        serde_json::from_str(&serde_json::to_string(&computed_evidence).unwrap()).unwrap();
    assert_eq!(
        parsed, serialized_computed,
        "persisted evidence sidecar must equal evidence_summary() of the persisted ledger; \
         a divergence here means the sidecar was computed from a different ledger snapshot"
    );
    assert_eq!(
        parsed["aggregate_duration_ms"].as_u64(),
        Some(ledger.total_duration_ms),
        "aggregate_duration_ms must equal ledger.total_duration_ms"
    );
    assert_eq!(
        parsed["aggregate_items_processed"].as_u64(),
        Some(ledger.total_items_processed()),
        "aggregate_items_processed must equal ledger.total_items_processed()"
    );
}
/// Persists two refresh ledgers into the same index directory and checks the
/// tracing output of each publish: the first (no prior sidecar) must emit no
/// "lexical refresh evidence" event, and the second (duration doubled from
/// 100 ms to 200 ms) must emit exactly one WARN event mentioning
/// "significant slowdown".
#[test]
fn persist_lexical_refresh_ledger_emits_cross_run_tracing_when_prior_sidecar_exists() {
use std::collections::BTreeMap;
use std::sync::{Arc, Mutex};
use tracing::field::{Field, Visit};
use tracing::{Event, Subscriber};
use tracing_subscriber::Registry;
use tracing_subscriber::layer::{Context, Layer, SubscriberExt};
// One captured tracing event: its level rendered as a string plus its message.
#[derive(Debug, Clone, Default)]
struct CapturedEvent {
level: String,
message: String,
}
// Layer that records matching events into a shared vector; clones share the
// same underlying storage through the `Arc`.
#[derive(Clone, Default)]
struct ComparisonCollector {
events: Arc<Mutex<Vec<CapturedEvent>>>,
}
impl<S: Subscriber> Layer<S> for ComparisonCollector {
fn on_event(&self, event: &Event<'_>, _ctx: Context<'_, S>) {
// Only events from the lexical-refresh target are of interest.
if event.metadata().target() != "cass::indexer::lexical_refresh" {
return;
}
let mut visitor = MessageVisitor::default();
event.record(&mut visitor);
// Further narrow to the evidence-comparison messages.
if !visitor.message.contains("lexical refresh evidence") {
return;
}
self.events
.lock()
.expect("collector lock")
.push(CapturedEvent {
level: event.metadata().level().to_string(),
message: visitor.message,
});
}
}
// Field visitor that keeps only the event's `message` field.
#[derive(Default)]
struct MessageVisitor {
message: String,
}
impl Visit for MessageVisitor {
// String-typed fields are deliberately ignored; the message is picked up
// through `record_debug` below.
fn record_str(&mut self, _field: &Field, _value: &str) {}
fn record_debug(&mut self, field: &Field, value: &dyn std::fmt::Debug) {
if field.name() == "message" {
// Debug formatting may wrap the text in quotes; strip them.
self.message = format!("{:?}", value).trim_matches('"').to_string();
}
}
}
let tmp = TempDir::new().expect("temp dir");
let data_dir = tmp.path().join("data");
let index_path = index_dir(&data_dir).unwrap();
std::fs::create_dir_all(&index_path).expect("create index path");
// Builds a minimal full-rebuild ledger with a single successful Scan phase
// of 100 items whose duration (and the ledger's total duration) is
// `scan_duration_ms`.
fn make_ledger(scan_duration_ms: u64) -> RefreshLedger {
RefreshLedger {
version: 1,
started_at_ms: 1_700_000_000_000,
completed_at_ms: 1_700_000_000_000 + scan_duration_ms as i64,
total_duration_ms: scan_duration_ms,
full_rebuild: true,
corpus_family: "ibuuh.24-wiring-test".to_string(),
phases: vec![PhaseRecord {
phase: RefreshPhase::Scan,
duration_ms: scan_duration_ms,
items_processed: 100,
items_skipped: 0,
errors: 0,
counters: BTreeMap::new(),
success: true,
error_message: None,
}],
equivalence: refresh_ledger::EquivalenceArtifacts::default(),
tags: BTreeMap::new(),
}
}
// First publish: the collector is installed only for the duration of the
// closure, so it sees exactly the events emitted by this persist call.
let collector = ComparisonCollector::default();
let subscriber = Registry::default().with(collector.clone());
tracing::subscriber::with_default(subscriber, || {
persist_lexical_refresh_ledger(&index_path, &make_ledger(100))
.expect("first persist must succeed");
});
let first_events = collector.events.lock().expect("lock").clone();
assert!(
first_events.is_empty(),
"first publish has no prior sidecar to compare against; no cross-run \
tracing event should fire. Got: {first_events:?}"
);
assert!(
lexical_refresh_evidence_path(&index_path).exists(),
"first publish must persist a sidecar so the second publish can compare"
);
// Second publish: a fresh collector so events from the first run cannot
// leak into the count.
let collector = ComparisonCollector::default();
let subscriber = Registry::default().with(collector.clone());
tracing::subscriber::with_default(subscriber, || {
persist_lexical_refresh_ledger(&index_path, &make_ledger(200))
.expect("second persist must succeed");
});
let second_events = collector.events.lock().expect("lock").clone();
assert_eq!(
second_events.len(),
1,
"second publish (with prior sidecar) must emit EXACTLY one cross-run \
comparison event. Got: {second_events:?}"
);
let event = &second_events[0];
assert_eq!(
event.level, "WARN",
"+100% duration delta vs prior must route through the warn tier \
(≥+25% slowdown threshold from emit_tracing_summary). Got: {event:?}"
);
assert!(
event.message.contains("significant slowdown"),
"warn-tier message must name the slowdown signal; got: {:?}",
event.message
);
}
/// Feeds the same seeded fixture through the legacy OFFSET replay and the
/// keyset-batched replay and checks that the equivalence accumulator yields
/// identical evidence for both.
#[test]
fn lexical_rebuild_equivalence_accumulator_matches_legacy_and_keyset_replays() {
    let tmp = TempDir::new().unwrap();
    let db_path = tmp.path().join("lexical-equivalence-accumulator.db");
    let storage = FrankenStorage::open(&db_path).unwrap();
    ensure_fts_schema(&storage);
    seed_lexical_rebuild_fixture(&storage);

    let legacy_packets = legacy_offset_lexical_rebuild_packets(&storage, 1).unwrap();
    let keyset_packets = keyset_batched_lexical_rebuild_packets(&storage, 2, 2).unwrap();

    // Absorb every packet, in order, into a fresh accumulator and finalize it.
    let accumulate = |packets: &[LexicalRebuildConversationPacket]| {
        let mut accumulator = LexicalRebuildEquivalenceAccumulator::new();
        packets
            .iter()
            .for_each(|packet| accumulator.absorb_packet(packet));
        accumulator.finalize()
    };

    let legacy_evidence = accumulate(&legacy_packets);
    let keyset_evidence = accumulate(&keyset_packets);

    assert_eq!(
        legacy_evidence, keyset_evidence,
        "streaming accumulator must agree across legacy OFFSET and keyset replays"
    );
    assert_eq!(legacy_evidence.document_count, 4);
    assert_eq!(legacy_evidence.manifest_fingerprint.len(), 64);
    assert_eq!(legacy_evidence.golden_query_digest.len(), 64);
}
/// Checks three properties of the equivalence accumulator on the seeded
/// fixture: per-probe hit counts are the expected values, the evidence is
/// identical across repeated runs with the same probes, and reordering the
/// probes changes the golden-query digest.
#[test]
fn lexical_rebuild_equivalence_accumulator_counts_probe_hits_and_hashes_are_stable() {
    let tmp = TempDir::new().unwrap();
    let db_path = tmp.path().join("lexical-equivalence-probes.db");
    let storage = FrankenStorage::open(&db_path).unwrap();
    ensure_fts_schema(&storage);
    seed_lexical_rebuild_fixture(&storage);
    let packets = keyset_batched_lexical_rebuild_packets(&storage, 10, 10).unwrap();

    // Runs the shared packet list through an accumulator configured with the
    // given probe list.
    let probe_evidence = |probes: &[&str]| {
        let owned_probes = probes.iter().map(|probe| String::from(*probe));
        let mut accumulator = LexicalRebuildEquivalenceAccumulator::with_probes(owned_probes);
        packets
            .iter()
            .for_each(|packet| accumulator.absorb_packet(packet));
        accumulator.finalize()
    };

    let baseline_probes = [
        "lexical-fixture-1",
        "lexical-fixture-2-second",
        "missing-golden-query",
    ];

    let targeted = probe_evidence(&baseline_probes);
    let hits = targeted
        .golden_query_hit_counts
        .iter()
        .map(|hit| (hit.probe.as_str(), hit.hit_count))
        .collect::<Vec<_>>();
    assert_eq!(
        hits,
        vec![
            ("lexical-fixture-1", 2),
            ("lexical-fixture-2-second", 1),
            ("missing-golden-query", 0),
        ],
        "fixture should produce deterministic per-probe hit counts"
    );

    let repeat = probe_evidence(&baseline_probes);
    assert_eq!(
        targeted, repeat,
        "equivalence evidence must be deterministic across invocations"
    );

    // Same probes, first two swapped.
    let shuffled = probe_evidence(&[
        "lexical-fixture-2-second",
        "lexical-fixture-1",
        "missing-golden-query",
    ]);
    assert_ne!(
        targeted.golden_query_digest, shuffled.golden_query_digest,
        "probe order must be part of the digest"
    );
}
/// Rebuilds a four-conversation corpus (the seeded fixture plus
/// `lexical-fixture-3` and `lexical-fixture-4`) with distinct "initial" and
/// steady-state budget overrides, then checks the captured logs for the
/// budget-promotion event and for the producer adopting both the startup and
/// the steady budgets.
#[test]
#[serial]
fn rebuild_tantivy_from_db_promotes_pipeline_budgets_after_first_commit() {
// These two overrides are installed before any storage work and stay in
// place for the whole test.
let _responsiveness = set_env("CASS_RESPONSIVENESS_DISABLE", "1");
let _controller_mode = set_env("CASS_TANTIVY_REBUILD_CONTROLLER_MODE", "auto");
let tmp = TempDir::new().unwrap();
let data_dir = tmp.path().join("data");
std::fs::create_dir_all(&data_dir).unwrap();
let db_path = data_dir.join("db.sqlite");
let storage = FrankenStorage::open(&db_path).unwrap();
ensure_fts_schema(&storage);
seed_lexical_rebuild_fixture(&storage);
let agent = Agent {
id: None,
slug: "codex".into(),
name: "Codex".into(),
version: Some("0.2.3".into()),
kind: AgentKind::Cli,
};
let agent_id = storage.ensure_agent(&agent).unwrap();
// Add two more two-message conversations so the corpus totals 8 messages
// (asserted below via `indexed_docs == 8`).
for (suffix, base_ts) in [("3", 1_700_000_002_000_i64), ("4", 1_700_000_003_000_i64)] {
let external_id = format!("lexical-fixture-{suffix}");
storage
.insert_conversation_tree(
agent_id,
None,
&Conversation {
id: None,
agent_slug: "codex".into(),
workspace: Some(PathBuf::from("/tmp/workspace")),
external_id: Some(external_id.clone()),
title: Some("Lexical rebuild fixture".into()),
source_path: PathBuf::from(format!("/tmp/{external_id}.jsonl")),
started_at: Some(base_ts),
ended_at: Some(base_ts + 100),
approx_tokens: Some(64),
metadata_json: serde_json::Value::Null,
messages: vec![
Message {
id: None,
idx: 0,
role: MessageRole::User,
author: Some("user".into()),
created_at: Some(base_ts + 10),
content: format!("{external_id}-first"),
extra_json: serde_json::json!({"opaque": true}),
snippets: Vec::new(),
},
Message {
id: None,
idx: 1,
role: MessageRole::Agent,
author: Some("assistant".into()),
created_at: Some(base_ts + 20),
content: format!("{external_id}-second"),
extra_json: serde_json::json!({"opaque": true}),
snippets: Vec::new(),
},
],
source_id: LOCAL_SOURCE_ID.into(),
origin_host: None,
},
)
.unwrap();
}
// Release the seeding handle before the rebuild runs.
drop(storage);
// Steady vs. initial budgets: page size 2 vs 1 conversations, commit every
// 4 vs 2 messages, and 4096 vs 64 message bytes. The log assertions below
// look for exactly these old/new values.
let _conversation_limit = set_env("CASS_TANTIVY_REBUILD_BATCH_FETCH_CONVERSATIONS", "2");
let _initial_conversation_limit = set_env(
"CASS_TANTIVY_REBUILD_INITIAL_BATCH_FETCH_CONVERSATIONS",
"1",
);
let _commit_messages = set_env("CASS_TANTIVY_REBUILD_COMMIT_EVERY_MESSAGES", "4");
let _initial_commit_messages =
set_env("CASS_TANTIVY_REBUILD_INITIAL_COMMIT_EVERY_MESSAGES", "2");
let _commit_message_bytes =
set_env("CASS_TANTIVY_REBUILD_COMMIT_EVERY_MESSAGE_BYTES", "4096");
let _initial_commit_message_bytes = set_env(
"CASS_TANTIVY_REBUILD_INITIAL_COMMIT_EVERY_MESSAGE_BYTES",
"64",
);
let _channel_size = set_env("CASS_TANTIVY_REBUILD_PIPELINE_CHANNEL_SIZE", "1");
// NOTE(review): the very large load-average watermarks presumably keep host
// load from influencing the controller during the test — confirm against
// the controller implementation.
let _controller_loadavg_high = set_env(
"CASS_TANTIVY_REBUILD_CONTROLLER_LOADAVG_HIGH_WATERMARK_1M",
"1000000",
);
let _controller_loadavg_low = set_env(
"CASS_TANTIVY_REBUILD_CONTROLLER_LOADAVG_LOW_WATERMARK_1M",
"999999",
);
let logs = capture_logs(|| {
let rebuild = rebuild_tantivy_from_db(&db_path, &data_dir, 4, None).unwrap();
assert_eq!(rebuild.indexed_docs, 8);
assert_eq!(rebuild.observed_messages, Some(8));
assert!(rebuild.exact_checkpoint_persisted);
});
// The promotion event must be present and attributed to the first durable
// commit.
assert!(
logs.contains("updated lexical rebuild pipeline budgets")
&& logs.contains("controller_reason=first_durable_commit_promoted_steady_budget"),
"expected budget-promotion log, got:
{logs}"
);
assert!(
logs.contains("old_page_conversation_limit=1"),
"expected startup page budget in logs, got:
{logs}"
);
assert!(
logs.contains("new_page_conversation_limit=2"),
"expected steady page budget in logs, got:
{logs}"
);
assert!(
logs.contains("old_batch_fetch_message_bytes_limit=64"),
"expected startup byte budget in logs, got:
{logs}"
);
assert!(
logs.contains("new_batch_fetch_message_bytes_limit=4096"),
"expected steady byte budget in logs, got:
{logs}"
);
// Producer-side adoption lines are checked per line so that the steady
// budget fields are required to appear together in a single log line.
let producer_budget_lines = logs
.lines()
.filter(|line| line.contains("lexical rebuild producer adopted pipeline budgets"))
.collect::<Vec<_>>();
assert!(
!producer_budget_lines.is_empty(),
"expected producer budget adoption logs, got:
{logs}"
);
assert!(
producer_budget_lines
.iter()
.any(|line| line.contains("page_conversation_limit=1")),
"expected startup producer page budget adoption log, got:
{logs}"
);
assert!(
producer_budget_lines.iter().any(|line| {
line.contains("page_conversation_limit=2")
&& line.contains("batch_fetch_message_limit=4")
&& line.contains("batch_fetch_message_bytes_limit=4096")
}),
"expected steady-state producer budget adoption log, got:
{logs}"
);
assert_eq!(tantivy_doc_count_for_data_dir(&data_dir), 8);
}
/// Hand-crafts a partially completed rebuild state (1 conversation / 2 docs
/// committed) and checks that the subsequent rebuild reports totals for the
/// whole corpus (4 docs, 4 observed messages) and leaves a completed
/// checkpoint covering both conversations.
#[test]
#[serial]
fn rebuild_tantivy_from_db_resume_reports_total_observed_messages() {
let tmp = TempDir::new().unwrap();
let data_dir = tmp.path().join("data");
std::fs::create_dir_all(&data_dir).unwrap();
let db_path = data_dir.join("db.sqlite");
let storage = FrankenStorage::open(&db_path).unwrap();
ensure_fts_schema(&storage);
let index_path = index_dir(&data_dir).unwrap();
let mut index = TantivyIndex::open_or_create(&index_path).unwrap();
// First conversation is persisted and committed to the index normally.
let first = norm_conv(Some("resume-1"), vec![norm_msg(0, 100), norm_msg(1, 200)]);
persist::persist_conversation(&storage, &mut index, &first).unwrap();
index.commit().unwrap();
// Second conversation is persisted with CASS_DEFER_LEXICAL_UPDATES=1 and no
// commit follows. NOTE(review): presumably this leaves it in the database
// but not in the index — confirm against persist_conversation. The guard
// stays in scope for the rest of the test, including the rebuild call.
let _defer_guard = set_env("CASS_DEFER_LEXICAL_UPDATES", "1");
let second = norm_conv(Some("resume-2"), vec![norm_msg(0, 300), norm_msg(1, 400)]);
persist::persist_conversation(&storage, &mut index, &second).unwrap();
// Persist a rebuild state that claims one conversation (two docs) is
// already committed, tied to the index's current meta fingerprint.
let db_state = lexical_rebuild_db_state(&storage, &db_path).unwrap();
let committed_meta_fingerprint = index_meta_fingerprint(&index_path).unwrap();
let mut state = LexicalRebuildState::new(db_state, LEXICAL_REBUILD_PAGE_SIZE);
state.committed_offset = 1;
state.processed_conversations = 1;
state.indexed_docs = 2;
state.committed_meta_fingerprint = committed_meta_fingerprint;
persist_lexical_rebuild_state(&index_path, &state).unwrap();
// Release the index handle before the rebuild opens the index itself.
drop(index);
let rebuild = rebuild_tantivy_from_db(&db_path, &data_dir, 2, None).unwrap();
assert_eq!(rebuild.indexed_docs, 4);
assert_eq!(rebuild.observed_messages, Some(4));
assert!(rebuild.exact_checkpoint_persisted);
let checkpoint = load_lexical_rebuild_checkpoint(&index_path)
.unwrap()
.expect("completed checkpoint after rebuild");
assert!(checkpoint.completed);
assert_eq!(checkpoint.processed_conversations, 2);
assert_eq!(checkpoint.committed_offset, 2);
assert_eq!(checkpoint.indexed_docs, 4);
let state = load_lexical_rebuild_state(&index_path)
.unwrap()
.expect("completed lexical rebuild state after rebuild");
assert_eq!(state.db.total_messages, 4);
assert_eq!(tantivy_doc_count_for_data_dir(&data_dir), 4);
}
/// Runs a rebuild on a background thread and polls, from the test thread,
/// for the rebuild's active pipeline runtime to become observable. Once it is
/// observed the test checks that it carries a timestamp and at least one sign
/// of pipeline activity, then joins the rebuild and checks that the runtime is
/// no longer advertised.
///
/// The poll loop gives up after 20 seconds. It also stops as soon as the
/// rebuild thread has exited without the runtime ever being observed, and
/// reports that thread's outcome, so a failed or panicked rebuild is not
/// hidden behind a generic timeout.
#[test]
#[serial]
fn rebuild_tantivy_from_db_exposes_active_pipeline_runtime_to_attachers() {
    let tmp = TempDir::new().unwrap();
    let data_dir = tmp.path().join("data");
    std::fs::create_dir_all(&data_dir).unwrap();
    let db_path = data_dir.join("db.sqlite");
    let storage = FrankenStorage::open(&db_path).unwrap();
    ensure_fts_schema(&storage);

    // 24 conversations x 24 messages; every message becomes one indexed doc.
    let conversation_count = 24usize;
    let messages_per_conversation = 24usize;
    let expected_docs = conversation_count * messages_per_conversation;
    let convs = (0..conversation_count)
        .map(|idx| {
            large_startup_conv(
                "codex",
                "attach-active-runtime",
                idx,
                messages_per_conversation,
                8 * 1024,
                1_700_000_000_000,
            )
        })
        .collect::<Vec<_>>();
    ingest_batch(
        &storage,
        None,
        &data_dir,
        &convs,
        &None,
        LexicalPopulationStrategy::DeferredAuthoritativeDbRebuild,
        false,
    )
    .unwrap();
    drop(storage);

    // Small pages, frequent heartbeats and very large commit thresholds.
    // NOTE(review): presumably chosen so the rebuild stays in flight long
    // enough for the poller to see it — confirm against the pipeline code.
    let _fetch_conversations = set_env("CASS_TANTIVY_REBUILD_BATCH_FETCH_CONVERSATIONS", "1");
    let _startup_fetch_conversations = set_env(
        "CASS_TANTIVY_REBUILD_INITIAL_BATCH_FETCH_CONVERSATIONS",
        "1",
    );
    let _progress_conversations = set_env(
        "CASS_TANTIVY_REBUILD_PROGRESS_HEARTBEAT_EVERY_CONVERSATIONS",
        "1",
    );
    let _progress_ms = set_env("CASS_TANTIVY_REBUILD_PROGRESS_HEARTBEAT_EVERY_MS", "1");
    let _commit_conversations =
        set_env("CASS_TANTIVY_REBUILD_COMMIT_EVERY_CONVERSATIONS", "4096");
    let _startup_commit_conversations = set_env(
        "CASS_TANTIVY_REBUILD_INITIAL_COMMIT_EVERY_CONVERSATIONS",
        "4096",
    );
    let _workers = set_env("CASS_TANTIVY_REBUILD_WORKERS", "6");
    let _writer_threads = set_env("CASS_TANTIVY_MAX_WRITER_THREADS", "2");
    let _pipeline_channel = set_env("CASS_TANTIVY_REBUILD_PIPELINE_CHANNEL_SIZE", "1");
    let _page_prep_workers = set_env("CASS_TANTIVY_REBUILD_PAGE_PREP_WORKERS", "1");

    let db_path_for_thread = db_path.clone();
    let data_dir_for_thread = data_dir.clone();
    let rebuild_handle = std::thread::spawn(move || {
        rebuild_tantivy_from_db_with_options(
            &db_path_for_thread,
            &data_dir_for_thread,
            2,
            None,
            LexicalRebuildStartupOptions {
                defer_initial_content_fingerprint: true,
            },
            None,
        )
    });

    let index_path = index_dir(&data_dir).unwrap();
    let deadline = Instant::now() + Duration::from_secs(20);
    let active_runtime = loop {
        if let Some(runtime) =
            load_active_lexical_rebuild_pipeline_runtime(&index_path, &db_path).unwrap()
            && runtime.is_observed()
        {
            break runtime;
        }
        // Once the rebuild thread has exited there is nothing left to observe.
        // Surface its outcome (error or panic payload) immediately instead of
        // polling until the deadline. The success value is discarded so the
        // message does not depend on the outcome type implementing `Debug`.
        if rebuild_handle.is_finished() {
            let outcome = rebuild_handle.join().map(|result| result.map(|_| ()));
            panic!(
                "rebuild thread exited before its pipeline runtime became attach-visible: \
                 {outcome:?}"
            );
        }
        assert!(
            Instant::now() < deadline,
            "timed out waiting for active rebuild pipeline runtime to become attach-visible"
        );
        std::thread::sleep(Duration::from_millis(25));
    };

    assert!(
        active_runtime.updated_at_ms > 0,
        "attach-visible active runtime should include an update timestamp"
    );
    assert!(
        active_runtime.page_prep_workers > 0
            || active_runtime.pending_batch_conversations > 0
            || active_runtime.queue_depth > 0
            || active_runtime.staged_shard_build_workers_max > 0,
        "attach-visible runtime should expose concrete pipeline activity: {active_runtime:?}"
    );

    // Outer unwrap: the thread did not panic; inner unwrap: the rebuild
    // returned Ok.
    let rebuild = rebuild_handle.join().unwrap().unwrap();
    assert_eq!(rebuild.indexed_docs, expected_docs);
    assert_eq!(rebuild.observed_messages, Some(expected_docs));
    assert!(
        load_active_lexical_rebuild_pipeline_runtime(&index_path, &db_path)
            .unwrap()
            .is_none(),
        "completed rebuild should stop advertising active pipeline runtime"
    );
}
/// With the initial content fingerprint deferred at startup, the completed
/// checkpoint must still carry the exact storage fingerprint of the database,
/// and the persisted state must keep the index's final meta fingerprint.
#[test]
#[serial]
fn rebuild_tantivy_from_db_deferred_startup_fingerprint_persists_exact_completed_fingerprint() {
    let tmp = TempDir::new().unwrap();
    let data_dir = tmp.path().join("data");
    std::fs::create_dir_all(&data_dir).unwrap();
    let db_path = data_dir.join("db.sqlite");
    let storage = FrankenStorage::open(&db_path).unwrap();
    ensure_fts_schema(&storage);
    seed_lexical_rebuild_fixture(&storage);

    let startup = LexicalRebuildStartupOptions {
        defer_initial_content_fingerprint: true,
    };
    let outcome =
        rebuild_tantivy_from_db_with_options(&db_path, &data_dir, 2, None, startup, None)
            .unwrap();
    assert_eq!(outcome.indexed_docs, 4);
    assert_eq!(outcome.observed_messages, Some(4));
    assert!(outcome.exact_checkpoint_persisted);

    let index_path = index_dir(&data_dir).unwrap();

    // Checkpoint: completed, and fingerprinted against the database as it is now.
    let checkpoint = load_lexical_rebuild_checkpoint(&index_path)
        .unwrap()
        .expect("completed checkpoint after deferred-fingerprint rebuild");
    assert!(checkpoint.completed);
    let expected_storage_fingerprint = lexical_rebuild_storage_fingerprint(&db_path).unwrap();
    assert_eq!(checkpoint.storage_fingerprint, expected_storage_fingerprint);

    // State: committed meta fingerprint equals the index's current one.
    let state = load_lexical_rebuild_state(&index_path)
        .unwrap()
        .expect("completed lexical rebuild state after deferred-fingerprint rebuild");
    let current_meta_fingerprint = index_meta_fingerprint(&index_path).unwrap();
    assert_eq!(
        state.committed_meta_fingerprint.as_deref(),
        current_meta_fingerprint.as_deref(),
        "shared-writer completion should retain the final committed meta fingerprint"
    );

    assert_eq!(tantivy_doc_count_for_data_dir(&data_dir), 4);
}
/// With prep profiling enabled, a deferred-startup rebuild must log the
/// deferred-fingerprint prep step, must not log the exact startup fingerprint
/// step, and must log the packet-producer start before the initial checkpoint
/// persist.
#[test]
#[serial]
fn rebuild_tantivy_from_db_deferred_startup_emits_deferred_prep_profile_logs() {
    let tmp = TempDir::new().unwrap();
    let data_dir = tmp.path().join("data");
    std::fs::create_dir_all(&data_dir).unwrap();
    let db_path = data_dir.join("db.sqlite");
    let storage = FrankenStorage::open(&db_path).unwrap();
    ensure_fts_schema(&storage);
    seed_lexical_rebuild_fixture(&storage);

    let _prep_profile = set_env("CASS_PREP_PROFILE", "1");
    let _conversation_limit = set_env("CASS_TANTIVY_REBUILD_BATCH_FETCH_CONVERSATIONS", "2");

    let logs = capture_logs(|| {
        let outcome =
            rebuild_tantivy_from_db_deferred_startup(&db_path, &data_dir, 2, None).unwrap();
        assert_eq!(outcome.indexed_docs, 4);
        assert_eq!(outcome.observed_messages, Some(4));
    });

    let saw_deferred_step = logs.contains(r#"step="prepare_db_state_deferred_fingerprint""#);
    assert!(
        saw_deferred_step,
        "expected deferred startup prep-profile log, got:\n{logs}"
    );

    let saw_exact_fingerprint_step = logs.contains(r#"step="compute_db_state_fingerprint""#);
    assert!(
        !saw_exact_fingerprint_step,
        "deferred startup should skip exact startup fingerprinting: {logs}"
    );

    // Byte offsets of the first occurrence of each step; their relative order
    // in the captured output is what is being asserted.
    let producer_start_at = logs
        .find(r#"step="start_packet_producer""#)
        .expect("start_packet_producer log position");
    let initial_checkpoint_at = logs
        .find(r#"step="persist_initial_checkpoint""#)
        .expect("persist_initial_checkpoint log position");
    assert!(
        producer_start_at < initial_checkpoint_at,
        "deferred fresh staged rebuild should start the packet producer before \
         persisting the initial checkpoint (producer-handoff overlap): {logs}"
    );
}
/// A fresh rebuild with one-conversation pages and commits must log the
/// staged shard-build path, must not log a first-budget-promotion timeout,
/// and must finish with a completed state in `SharedWriter` execution mode.
#[test]
#[serial]
fn rebuild_tantivy_from_db_fresh_run_uses_staged_shard_build_path() {
    let tmp = TempDir::new().unwrap();
    let data_dir = tmp.path().join("data");
    std::fs::create_dir_all(&data_dir).unwrap();
    let db_path = data_dir.join("db.sqlite");
    let storage = FrankenStorage::open(&db_path).unwrap();
    ensure_fts_schema(&storage);
    seed_lexical_rebuild_fixture(&storage);

    // Worker / writer sizing.
    let _workers = set_env("CASS_TANTIVY_REBUILD_WORKERS", "6");
    let _writer_threads = set_env("CASS_TANTIVY_MAX_WRITER_THREADS", "2");
    // One conversation per fetched page, both at startup and steady state.
    let _fetch_conversations = set_env("CASS_TANTIVY_REBUILD_BATCH_FETCH_CONVERSATIONS", "1");
    let _startup_fetch_conversations = set_env(
        "CASS_TANTIVY_REBUILD_INITIAL_BATCH_FETCH_CONVERSATIONS",
        "1",
    );
    let _first_budget_promotion_wait = set_env(
        "CASS_TANTIVY_REBUILD_FIRST_BUDGET_PROMOTION_WAIT_MS",
        "5000",
    );
    // Commit after every conversation, both at startup and steady state.
    let _commit_conversations = set_env("CASS_TANTIVY_REBUILD_COMMIT_EVERY_CONVERSATIONS", "1");
    let _startup_commit_conversations = set_env(
        "CASS_TANTIVY_REBUILD_INITIAL_COMMIT_EVERY_CONVERSATIONS",
        "1",
    );

    let logs = capture_logs(|| {
        let startup = LexicalRebuildStartupOptions {
            defer_initial_content_fingerprint: true,
        };
        let outcome =
            rebuild_tantivy_from_db_with_options(&db_path, &data_dir, 2, None, startup, None)
                .unwrap();
        assert_eq!(outcome.indexed_docs, 4);
        assert_eq!(outcome.observed_messages, Some(4));
        assert!(outcome.exact_checkpoint_persisted);
    });

    let saw_staged_path = logs.contains("staged shard-build path");
    assert!(
        saw_staged_path,
        "expected staged shard-build log, got:\n{logs}"
    );

    let saw_promotion_timeout =
        logs.contains("timed out waiting for first durable budget promotion");
    assert!(
        !saw_promotion_timeout,
        "staged shard-build path should advance the first durable shard without a producer timeout: {logs}"
    );

    let index_path = index_dir(&data_dir).unwrap();
    let state = load_lexical_rebuild_state(&index_path)
        .unwrap()
        .expect("completed lexical rebuild state after staged shard build");
    assert!(state.completed);
    assert_eq!(
        state.execution_mode,
        LexicalRebuildExecutionMode::SharedWriter
    );
    assert_eq!(tantivy_doc_count_for_data_dir(&data_dir), 4);
}
/// Rebuilds a four-conversation corpus (the seeded fixture plus
/// `lexical-fixture-3` and `lexical-fixture-4`) with one-conversation pages
/// and commits, then checks the logs: the staged shard-build path and the
/// federated-publish message must appear, and no staged merge round may be
/// logged.
#[test]
#[serial]
fn rebuild_tantivy_from_db_bounded_final_frontier_publishes_federated_without_remerge() {
let tmp = TempDir::new().unwrap();
let data_dir = tmp.path().join("data");
std::fs::create_dir_all(&data_dir).unwrap();
let db_path = data_dir.join("db.sqlite");
let storage = FrankenStorage::open(&db_path).unwrap();
ensure_fts_schema(&storage);
seed_lexical_rebuild_fixture(&storage);
let agent = Agent {
id: None,
slug: "codex".into(),
name: "Codex".into(),
version: Some("0.2.3".into()),
kind: AgentKind::Cli,
};
let agent_id = storage.ensure_agent(&agent).unwrap();
// Add two more two-message conversations so the corpus totals 8 messages
// (asserted below via `indexed_docs == 8`).
for (suffix, base_ts) in [("3", 1_700_000_002_000_i64), ("4", 1_700_000_003_000_i64)] {
let external_id = format!("lexical-fixture-{suffix}");
storage
.insert_conversation_tree(
agent_id,
None,
&Conversation {
id: None,
agent_slug: "codex".into(),
workspace: Some(PathBuf::from("/tmp/workspace")),
external_id: Some(external_id.clone()),
title: Some("Lexical rebuild fixture".into()),
source_path: PathBuf::from(format!("/tmp/{external_id}.jsonl")),
started_at: Some(base_ts),
ended_at: Some(base_ts + 100),
approx_tokens: Some(64),
metadata_json: serde_json::Value::Null,
messages: vec![
Message {
id: None,
idx: 0,
role: MessageRole::User,
author: Some("user".into()),
created_at: Some(base_ts + 10),
content: format!("{external_id}-first"),
extra_json: serde_json::json!({"opaque": true}),
snippets: Vec::new(),
},
Message {
id: None,
idx: 1,
role: MessageRole::Agent,
author: Some("assistant".into()),
created_at: Some(base_ts + 20),
content: format!("{external_id}-second"),
extra_json: serde_json::json!({"opaque": true}),
snippets: Vec::new(),
},
],
source_id: LOCAL_SOURCE_ID.into(),
origin_host: None,
},
)
.unwrap();
}
// One conversation per fetched page and one commit per conversation, for
// both the initial and the steady-state budgets.
let _workers = set_env("CASS_TANTIVY_REBUILD_WORKERS", "6");
let _writer_threads = set_env("CASS_TANTIVY_MAX_WRITER_THREADS", "2");
let _fetch_conversations = set_env("CASS_TANTIVY_REBUILD_BATCH_FETCH_CONVERSATIONS", "1");
let _startup_fetch_conversations = set_env(
"CASS_TANTIVY_REBUILD_INITIAL_BATCH_FETCH_CONVERSATIONS",
"1",
);
let _commit_conversations = set_env("CASS_TANTIVY_REBUILD_COMMIT_EVERY_CONVERSATIONS", "1");
let _startup_commit_conversations = set_env(
"CASS_TANTIVY_REBUILD_INITIAL_COMMIT_EVERY_CONVERSATIONS",
"1",
);
let logs = capture_logs(|| {
let rebuild = rebuild_tantivy_from_db_with_options(
&db_path,
&data_dir,
2,
None,
LexicalRebuildStartupOptions {
defer_initial_content_fingerprint: true,
},
None,
)
.unwrap();
assert_eq!(rebuild.indexed_docs, 8);
assert_eq!(rebuild.observed_messages, Some(8));
assert!(rebuild.exact_checkpoint_persisted);
});
assert!(
logs.contains("staged shard-build path"),
"expected staged shard-build log, got:\n{logs}"
);
assert!(
logs.contains(
"publishing staged lexical rebuild as federated lexical shard set without final assembly collapse"
),
"expected bounded final-frontier federated publish log, got:\n{logs}"
);
// Absence check: no merge round should have been logged for this run.
assert!(
!logs.contains("running staged lexical rebuild merge round"),
"a bounded final frontier should not trigger a redundant final merge round: {logs}"
);
assert_eq!(tantivy_doc_count_for_data_dir(&data_dir), 8);
}
/// Rebuilds the lexical index from the seeded fixture plus one additional
/// two-message conversation, with the rebuild batching knobs pinned to one
/// conversation. The run must report six indexed documents, log the federated
/// publish without a merge round, and the published index must expose three
/// federated readers and three segments.
#[test]
#[serial]
fn rebuild_tantivy_from_db_multi_artifact_final_frontier_assembles_publish_generation_without_doc_remerge()
{
    // Scratch layout: <tmp>/data/db.sqlite.
    let scratch = TempDir::new().unwrap();
    let data_dir = scratch.path().join("data");
    std::fs::create_dir_all(&data_dir).unwrap();
    let db_path = data_dir.join("db.sqlite");

    let storage = FrankenStorage::open(&db_path).unwrap();
    ensure_fts_schema(&storage);
    seed_lexical_rebuild_fixture(&storage);

    let codex = Agent {
        id: None,
        slug: "codex".into(),
        name: "Codex".into(),
        version: Some("0.2.3".into()),
        kind: AgentKind::Cli,
    };
    let agent_id = storage.ensure_agent(&codex).unwrap();

    // One extra conversation on top of the seeded fixture.
    let base_ts = 1_700_000_100_000_i64;
    let external_id = "lexical-fixture-3";
    let user_turn = Message {
        id: None,
        idx: 0,
        role: MessageRole::User,
        author: Some("user".into()),
        created_at: Some(base_ts + 10),
        content: format!("{external_id}-first"),
        extra_json: serde_json::json!({"opaque": true}),
        snippets: Vec::new(),
    };
    let agent_turn = Message {
        id: None,
        idx: 1,
        role: MessageRole::Agent,
        author: Some("assistant".into()),
        created_at: Some(base_ts + 20),
        content: format!("{external_id}-second"),
        extra_json: serde_json::json!({"opaque": true}),
        snippets: Vec::new(),
    };
    let conversation = Conversation {
        id: None,
        agent_slug: "codex".into(),
        workspace: Some(PathBuf::from("/tmp/workspace")),
        external_id: Some(external_id.into()),
        title: Some("Lexical rebuild fixture".into()),
        source_path: PathBuf::from(format!("/tmp/{external_id}.jsonl")),
        started_at: Some(base_ts),
        ended_at: Some(base_ts + 100),
        approx_tokens: Some(64),
        metadata_json: serde_json::Value::Null,
        messages: vec![user_turn, agent_turn],
        source_id: LOCAL_SOURCE_ID.into(),
        origin_host: None,
    };
    storage
        .insert_conversation_tree(agent_id, None, &conversation)
        .unwrap();

    // Environment overrides for the rebuild; the returned values are held
    // until the end of the test.
    let _workers_guard = set_env("CASS_TANTIVY_REBUILD_WORKERS", "6");
    let _writer_threads_guard = set_env("CASS_TANTIVY_MAX_WRITER_THREADS", "2");
    let _fetch_guard = set_env("CASS_TANTIVY_REBUILD_BATCH_FETCH_CONVERSATIONS", "1");
    let _initial_fetch_guard = set_env(
        "CASS_TANTIVY_REBUILD_INITIAL_BATCH_FETCH_CONVERSATIONS",
        "1",
    );
    let _commit_guard = set_env("CASS_TANTIVY_REBUILD_COMMIT_EVERY_CONVERSATIONS", "1");
    let _initial_commit_guard = set_env(
        "CASS_TANTIVY_REBUILD_INITIAL_COMMIT_EVERY_CONVERSATIONS",
        "1",
    );

    let logs = capture_logs(|| {
        let startup_options = LexicalRebuildStartupOptions {
            defer_initial_content_fingerprint: true,
        };
        let outcome = rebuild_tantivy_from_db_with_options(
            &db_path,
            &data_dir,
            2,
            None,
            startup_options,
            None,
        )
        .unwrap();
        assert_eq!(outcome.indexed_docs, 6);
        assert_eq!(outcome.observed_messages, Some(6));
        assert!(outcome.exact_checkpoint_persisted);
    });

    let used_staged_path = logs.contains("staged shard-build path");
    assert!(
        used_staged_path,
        "expected staged shard-build log, got:\n{logs}"
    );
    let published_federated = logs.contains(
        "publishing staged lexical rebuild as federated lexical shard set without final assembly collapse",
    );
    assert!(
        published_federated,
        "expected federated final publish log, got:\n{logs}"
    );
    let ran_merge_round = logs.contains("running staged lexical rebuild merge round");
    assert!(
        !ran_merge_round,
        "federated multi-artifact publish should avoid the fallback merge-tree tail: {logs}"
    );

    // Inspect the published index directly.
    let index_path = index_dir(&data_dir).unwrap();
    let federated_readers = crate::search::tantivy::open_federated_search_readers(
        &index_path,
        frankensearch::lexical::ReloadPolicy::Manual,
    )
    .unwrap()
    .expect("published federated readers");
    assert_eq!(
        federated_readers.len(),
        3,
        "published staged rebuild should preserve the three final shard artifacts as a federated lexical bundle"
    );
    let published_summary = crate::search::tantivy::searchable_index_summary(&index_path)
        .unwrap()
        .expect("searchable summary");
    assert_eq!(
        published_summary.segments,
        3,
        "published staged rebuild should report the three preserved final shard segments"
    );
    assert_eq!(tantivy_doc_count_for_data_dir(&data_dir), 6);
}
/// Publishing a two-document staged index over a one-document live index must
/// make the staged documents live, remove the staged path, and leave exactly
/// one retained backup that still holds the previous one-document index.
#[test]
#[serial]
fn publish_staged_lexical_index_replaces_live_index_and_retains_prior_backup() {
    /// Document count from the searchable summary of the index at `path`.
    fn docs_at(path: &std::path::Path) -> usize {
        crate::search::tantivy::searchable_index_summary(path)
            .unwrap()
            .unwrap()
            .docs
    }

    let scratch = TempDir::new().unwrap();
    let data_dir = scratch.path().join("data");
    fs::create_dir_all(&data_dir).unwrap();
    let index_path = index_dir(&data_dir).unwrap();

    // Live generation: one conversation with a single message.
    let previous_conv = norm_conv(Some("publish-old"), vec![norm_msg(0, 1_700_000_000_000)]);
    {
        let mut live_writer = TantivyIndex::open_or_create(&index_path).unwrap();
        live_writer
            .add_messages_with_conversation_id(&previous_conv, &previous_conv.messages, Some(1))
            .unwrap();
        live_writer.commit().unwrap();
    }

    // Staged generation: built next to the live index, two messages.
    let stage_root = TempDirBuilder::new()
        .prefix("cass-test-publish-stage.")
        .tempdir_in(index_path.parent().unwrap())
        .unwrap();
    let staged_index_path = stage_root.path().join("staged");
    let incoming_messages = vec![
        norm_msg(0, 1_700_000_001_000),
        norm_msg(1, 1_700_000_001_100),
    ];
    let incoming_conv = norm_conv(Some("publish-new"), incoming_messages);
    {
        let mut staged_writer = TantivyIndex::open_or_create(&staged_index_path).unwrap();
        staged_writer
            .add_messages_with_conversation_id(&incoming_conv, &incoming_conv.messages, Some(2))
            .unwrap();
        staged_writer.commit().unwrap();
    }

    // Sanity-check both generations before the swap.
    assert_eq!(docs_at(&index_path), 1);
    assert_eq!(docs_at(&staged_index_path), 2);

    publish_staged_lexical_index(&staged_index_path, &index_path).unwrap();

    assert_eq!(
        docs_at(&index_path),
        2,
        "the live lexical index should expose the new staged publish"
    );
    let staged_still_present = staged_index_path.exists();
    assert!(
        !staged_still_present,
        "the staged publish path should be consumed into the retained backup flow"
    );

    let mut retained_backups = Vec::new();
    for entry in fs::read_dir(lexical_publish_backups_dir(&index_path)).unwrap() {
        retained_backups.push(entry.unwrap().path());
    }
    assert_eq!(retained_backups.len(), 1);
    assert_eq!(
        docs_at(&retained_backups[0]),
        1,
        "the retained backup should preserve the previously published live index"
    );
}
/// Persists generation artifacts for both a live and a staged index, publishes
/// the staged index, and checks that the manifest and equivalence evidence
/// found at the live path are the staged ones, while the single retained
/// backup carries the manifest and evidence of the previous live generation.
#[test]
#[serial]
fn publish_staged_lexical_index_moves_generation_audit_files_with_the_staged_directory() {
    let scratch = TempDir::new().unwrap();
    let data_dir = scratch.path().join("data");
    fs::create_dir_all(&data_dir).unwrap();
    let index_path = index_dir(&data_dir).unwrap();

    // Previous live generation: one document plus its audit artifacts.
    let previous_conv = norm_conv(Some("audit-old"), vec![norm_msg(0, 1_700_000_030_000)]);
    {
        let mut live_writer = TantivyIndex::open_or_create(&index_path).unwrap();
        live_writer
            .add_messages_with_conversation_id(&previous_conv, &previous_conv.messages, Some(30))
            .unwrap();
        live_writer.commit().unwrap();
    }
    let old_evidence = super::LexicalRebuildEquivalenceEvidence {
        document_count: 1,
        manifest_fingerprint: String::from("old-manifest-fingerprint"),
        golden_query_digest: String::from("old-golden-digest"),
        golden_query_hit_counts: vec![super::LexicalRebuildEquivalenceGoldenHit {
            probe: String::from("audit-old"),
            hit_count: 1,
        }],
    };
    let old_manifest = persist_lexical_rebuild_generation_artifacts(
        &index_path,
        "content-v1:old-fingerprint",
        1,
        1,
        1,
        1,
        &old_evidence,
    )
    .unwrap();

    // Staged generation: two documents plus its own audit artifacts.
    let stage_root = TempDirBuilder::new()
        .prefix("cass-test-publish-audit-stage.")
        .tempdir_in(index_path.parent().unwrap())
        .unwrap();
    let staged_index_path = stage_root.path().join("staged");
    let incoming_messages = vec![
        norm_msg(0, 1_700_000_040_000),
        norm_msg(1, 1_700_000_040_100),
    ];
    let incoming_conv = norm_conv(Some("audit-new"), incoming_messages);
    {
        let mut staged_writer = TantivyIndex::open_or_create(&staged_index_path).unwrap();
        staged_writer
            .add_messages_with_conversation_id(&incoming_conv, &incoming_conv.messages, Some(40))
            .unwrap();
        staged_writer.commit().unwrap();
    }
    let new_evidence = super::LexicalRebuildEquivalenceEvidence {
        document_count: 2,
        manifest_fingerprint: String::from("new-manifest-fingerprint"),
        golden_query_digest: String::from("new-golden-digest"),
        golden_query_hit_counts: vec![super::LexicalRebuildEquivalenceGoldenHit {
            probe: String::from("audit-new"),
            hit_count: 2,
        }],
    };
    let new_manifest = persist_lexical_rebuild_generation_artifacts(
        &staged_index_path,
        "content-v1:new-fingerprint",
        2,
        1,
        2,
        2,
        &new_evidence,
    )
    .unwrap();

    publish_staged_lexical_index(&staged_index_path, &index_path).unwrap();

    // The live path must now carry the staged generation's audit files.
    let live_manifest = lexical_generation::load_manifest(&index_path)
        .unwrap()
        .expect("live manifest after staged publish");
    assert_eq!(live_manifest.generation_id, new_manifest.generation_id);
    let expected_live_fingerprint = Some(new_evidence.manifest_fingerprint.as_str());
    assert_eq!(
        live_manifest.equivalence_manifest_fingerprint.as_deref(),
        expected_live_fingerprint
    );
    let live_evidence_bytes =
        fs::read(lexical_rebuild_equivalence_evidence_path(&index_path)).unwrap();
    let live_evidence: super::LexicalRebuildEquivalenceEvidence =
        serde_json::from_slice(&live_evidence_bytes).unwrap();
    assert_eq!(live_evidence, new_evidence);

    // The retained backup must carry the previous generation's audit files.
    let mut retained_backups = Vec::new();
    for entry in fs::read_dir(lexical_publish_backups_dir(&index_path)).unwrap() {
        retained_backups.push(entry.unwrap().path());
    }
    assert_eq!(retained_backups.len(), 1);
    let backup_path = &retained_backups[0];
    let backup_manifest = lexical_generation::load_manifest(backup_path)
        .unwrap()
        .expect("retained backup manifest");
    assert_eq!(backup_manifest.generation_id, old_manifest.generation_id);
    let expected_backup_fingerprint = Some(old_evidence.manifest_fingerprint.as_str());
    assert_eq!(
        backup_manifest.equivalence_manifest_fingerprint.as_deref(),
        expected_backup_fingerprint
    );
    let backup_evidence_bytes =
        fs::read(lexical_rebuild_equivalence_evidence_path(backup_path)).unwrap();
    let backup_evidence: super::LexicalRebuildEquivalenceEvidence =
        serde_json::from_slice(&backup_evidence_bytes).unwrap();
    assert_eq!(backup_evidence, old_evidence);
}
/// Simulates an interrupted publish by renaming the two-document live index to
/// the in-progress backup path, then publishes a one-document staged index.
/// Afterwards the in-progress path must be gone, the live index must hold the
/// staged document, and the single retained backup must hold the two old ones.
#[test]
#[serial]
fn publish_staged_lexical_index_recovers_interrupted_backup_before_replacing_live_index() {
    /// Document count from the searchable summary of the index at `path`.
    fn docs_at(path: &std::path::Path) -> usize {
        crate::search::tantivy::searchable_index_summary(path)
            .unwrap()
            .unwrap()
            .docs
    }

    let scratch = TempDir::new().unwrap();
    let data_dir = scratch.path().join("data");
    fs::create_dir_all(&data_dir).unwrap();
    let index_path = index_dir(&data_dir).unwrap();

    // Previous live generation with two messages.
    let previous_messages = vec![
        norm_msg(0, 1_700_000_010_000),
        norm_msg(1, 1_700_000_010_100),
    ];
    let previous_conv = norm_conv(Some("interrupted-old"), previous_messages);
    {
        let mut live_writer = TantivyIndex::open_or_create(&index_path).unwrap();
        live_writer
            .add_messages_with_conversation_id(&previous_conv, &previous_conv.messages, Some(10))
            .unwrap();
        live_writer.commit().unwrap();
    }

    // Park the live index at the in-progress backup path, leaving no live index.
    let in_progress_backup_path = lexical_publish_in_progress_backup_path(&index_path);
    fs::rename(&index_path, &in_progress_backup_path).unwrap();
    let live_still_present = index_path.exists();
    assert!(
        !live_still_present,
        "the live path should be missing to simulate an interrupted publish after parking the old index"
    );

    // Staged generation with a single message, built beside the parked backup.
    let stage_root = TempDirBuilder::new()
        .prefix("cass-test-publish-recovery.")
        .tempdir_in(in_progress_backup_path.parent().unwrap())
        .unwrap();
    let staged_index_path = stage_root.path().join("staged");
    let incoming_conv = norm_conv(
        Some("interrupted-new"),
        vec![norm_msg(0, 1_700_000_020_000)],
    );
    {
        let mut staged_writer = TantivyIndex::open_or_create(&staged_index_path).unwrap();
        staged_writer
            .add_messages_with_conversation_id(&incoming_conv, &incoming_conv.messages, Some(11))
            .unwrap();
        staged_writer.commit().unwrap();
    }

    publish_staged_lexical_index(&staged_index_path, &index_path).unwrap();

    let sidecar_still_present = in_progress_backup_path.exists();
    assert!(
        !sidecar_still_present,
        "successful publish should consume the interrupted in-progress backup"
    );
    assert_eq!(
        docs_at(&index_path),
        1,
        "the new staged publish should become live after interrupted-backup recovery"
    );

    let mut retained_backups = Vec::new();
    for entry in fs::read_dir(lexical_publish_backups_dir(&index_path)).unwrap() {
        retained_backups.push(entry.unwrap().path());
    }
    assert_eq!(retained_backups.len(), 1);
    assert_eq!(
        docs_at(&retained_backups[0]),
        2,
        "the retained backup should still preserve the old live publish recovered from the interrupted swap"
    );
}
/// Removes the live index directory, publishes a one-document staged index,
/// and checks that the staged document is live, the staged path is gone, and
/// the retained-backup directory is either absent or empty.
#[test]
fn publish_staged_lexical_index_first_publish_with_no_prior_live_index_uses_plain_rename() {
    let scratch = TempDir::new().unwrap();
    let data_dir = scratch.path().join("data");
    fs::create_dir_all(&data_dir).unwrap();
    let index_path = index_dir(&data_dir).unwrap();

    // Start from a state with no live index directory at all.
    fs::remove_dir_all(&index_path).unwrap();
    assert!(!index_path.exists(), "precondition: no prior live index");

    let stage_root = TempDirBuilder::new()
        .prefix("cass-test-first-publish-stage.")
        .tempdir_in(data_dir.parent().unwrap())
        .unwrap();
    let staged_index_path = stage_root.path().join("staged");
    let first_conv = norm_conv(Some("first-publish"), vec![norm_msg(0, 1_700_000_100_000)]);
    {
        let mut staged_writer = TantivyIndex::open_or_create(&staged_index_path).unwrap();
        staged_writer
            .add_messages_with_conversation_id(&first_conv, &first_conv.messages, Some(1))
            .unwrap();
        staged_writer.commit().unwrap();
    }

    publish_staged_lexical_index(&staged_index_path, &index_path).unwrap();

    let live_docs = crate::search::tantivy::searchable_index_summary(&index_path)
        .unwrap()
        .unwrap()
        .docs;
    assert_eq!(
        live_docs, 1,
        "first publish must land the staged index exactly"
    );
    let staged_still_present = staged_index_path.exists();
    assert!(
        !staged_still_present,
        "staged path must be consumed by the rename"
    );

    // A missing backups directory is acceptable; a populated one is not.
    let backups_dir = lexical_publish_backups_dir(&index_path);
    if backups_dir.exists() {
        let mut entries = Vec::new();
        for entry in fs::read_dir(&backups_dir).unwrap() {
            entries.push(entry.unwrap().path());
        }
        assert!(
            entries.is_empty(),
            "first publish must not create retained backups; got {entries:?}"
        );
    }
}
/// With the retention override set to `2`, publishes a three-document staged
/// index while both a two-document live index and a one-document index at the
/// in-progress backup path exist. The staged index must become live, the
/// in-progress path must be consumed, and two retained backups (one document
/// and two documents) must remain.
///
/// The environment override is restored by a drop guard so that a failing
/// assertion cannot leak this test's retention value into later tests.
#[test]
#[serial]
fn publish_staged_lexical_index_retains_stale_in_progress_backup_when_live_present() {
    const RETENTION_ENV: &str = "CASS_LEXICAL_PUBLISH_BACKUP_RETENTION";

    /// Puts `RETENTION_ENV` back to the captured value (or removes it) on drop,
    /// including during unwinding from a failed assertion.
    struct RetentionEnvRestore(Option<String>);
    impl Drop for RetentionEnvRestore {
        fn drop(&mut self) {
            // SAFETY: this test is `#[serial]`, so no other serialized test
            // mutates the process environment while it runs.
            unsafe {
                match self.0.take() {
                    Some(previous) => std::env::set_var(RETENTION_ENV, previous),
                    None => std::env::remove_var(RETENTION_ENV),
                }
            }
        }
    }

    /// Document count from the searchable summary of the index at `path`.
    fn docs_at(path: &std::path::Path) -> usize {
        crate::search::tantivy::searchable_index_summary(path)
            .unwrap()
            .unwrap()
            .docs
    }

    let _restore_retention = RetentionEnvRestore(std::env::var(RETENTION_ENV).ok());
    // SAFETY: see `RetentionEnvRestore::drop`.
    unsafe {
        std::env::set_var(RETENTION_ENV, "2");
    }

    let tmp = TempDir::new().unwrap();
    let data_dir = tmp.path().join("data");
    fs::create_dir_all(&data_dir).unwrap();
    let index_path = index_dir(&data_dir).unwrap();

    // Live generation: two messages.
    let live_conv = norm_conv(
        Some("live-newer"),
        vec![
            norm_msg(0, 1_700_000_200_000),
            norm_msg(1, 1_700_000_200_100),
        ],
    );
    let mut live_index = TantivyIndex::open_or_create(&index_path).unwrap();
    live_index
        .add_messages_with_conversation_id(&live_conv, &live_conv.messages, Some(20))
        .unwrap();
    live_index.commit().unwrap();
    drop(live_index);

    // Stale index sitting at the in-progress backup path: one message.
    let stale_conv = norm_conv(Some("stale-older"), vec![norm_msg(0, 1_700_000_100_000)]);
    let in_progress_backup_path = lexical_publish_in_progress_backup_path(&index_path);
    let mut stale_index = TantivyIndex::open_or_create(&in_progress_backup_path).unwrap();
    stale_index
        .add_messages_with_conversation_id(&stale_conv, &stale_conv.messages, Some(10))
        .unwrap();
    stale_index.commit().unwrap();
    drop(stale_index);
    assert_eq!(
        docs_at(&in_progress_backup_path),
        1,
        "precondition: stale in-progress backup has exactly 1 doc"
    );

    // Freshly staged generation: three messages.
    let stage_root = TempDirBuilder::new()
        .prefix("cass-test-stale-recovery-stage.")
        .tempdir_in(data_dir.parent().unwrap())
        .unwrap();
    let staged_index_path = stage_root.path().join("staged");
    let fresh_conv = norm_conv(
        Some("freshly-staged"),
        vec![
            norm_msg(0, 1_700_000_300_000),
            norm_msg(1, 1_700_000_300_100),
            norm_msg(2, 1_700_000_300_200),
        ],
    );
    let mut staged_index = TantivyIndex::open_or_create(&staged_index_path).unwrap();
    staged_index
        .add_messages_with_conversation_id(&fresh_conv, &fresh_conv.messages, Some(30))
        .unwrap();
    staged_index.commit().unwrap();
    drop(staged_index);

    publish_staged_lexical_index(&staged_index_path, &index_path).unwrap();

    assert_eq!(
        docs_at(&index_path),
        3,
        "live index must reflect the freshly-staged publish"
    );
    assert!(
        !in_progress_backup_path.exists(),
        "stale in-progress backup must be consumed by recovery"
    );

    let backups_dir = lexical_publish_backups_dir(&index_path);
    let retained_backups: Vec<_> = fs::read_dir(&backups_dir)
        .unwrap()
        .map(|entry| entry.unwrap().path())
        .collect();
    assert_eq!(
        retained_backups.len(),
        2,
        "both the stale sidecar and the prior live index must be retained as separate backups; got {retained_backups:?}"
    );
    // Order of directory entries is unspecified, so compare sorted counts.
    let mut backup_doc_counts: Vec<usize> =
        retained_backups.iter().map(|path| docs_at(path)).collect();
    backup_doc_counts.sort_unstable();
    assert_eq!(
        backup_doc_counts,
        vec![1, 2],
        "retained backups must preserve both the stale-older (1 doc) \
         and the just-displaced-prior-live (2 docs) artifacts"
    );
}
/// With the retention override removed from the environment, performs four
/// consecutive publishes over a seeded live index and checks that exactly one
/// retained backup remains afterwards.
///
/// The environment variable is restored by a drop guard so that a failing
/// assertion cannot leave it unset for later tests.
#[test]
#[serial]
fn publish_staged_lexical_index_prunes_retained_backups_to_default_retention_cap() {
    const RETENTION_ENV: &str = "CASS_LEXICAL_PUBLISH_BACKUP_RETENTION";

    /// Puts `RETENTION_ENV` back to the captured value (or removes it) on drop,
    /// including during unwinding from a failed assertion.
    struct RetentionEnvRestore(Option<String>);
    impl Drop for RetentionEnvRestore {
        fn drop(&mut self) {
            // SAFETY: this test is `#[serial]`, so no other serialized test
            // mutates the process environment while it runs.
            unsafe {
                match self.0.take() {
                    Some(previous) => std::env::set_var(RETENTION_ENV, previous),
                    None => std::env::remove_var(RETENTION_ENV),
                }
            }
        }
    }

    let tmp = TempDir::new().unwrap();
    let data_dir = tmp.path().join("data");
    fs::create_dir_all(&data_dir).unwrap();
    let index_path = index_dir(&data_dir).unwrap();

    // Clear any override so the publish path falls back to its default cap.
    let _restore_retention = RetentionEnvRestore(std::env::var(RETENTION_ENV).ok());
    // SAFETY: see `RetentionEnvRestore::drop`.
    unsafe {
        std::env::remove_var(RETENTION_ENV);
    }

    // Seed the live index with a single document.
    let seed = norm_conv(Some("retention-seed"), vec![norm_msg(0, 1_700_000_500_000)]);
    let mut live = TantivyIndex::open_or_create(&index_path).unwrap();
    live.add_messages_with_conversation_id(&seed, &seed.messages, Some(100))
        .unwrap();
    live.commit().unwrap();
    drop(live);

    // Each iteration stages a fresh one-document index and publishes it.
    let publish_count = 4_usize;
    for iteration in 0..publish_count {
        let stage_root = TempDirBuilder::new()
            .prefix(&format!("cass-test-retention-stage-{iteration}."))
            .tempdir_in(data_dir.parent().unwrap())
            .unwrap();
        let staged_index_path = stage_root.path().join("staged");
        let conv = norm_conv(
            Some(&format!("retention-iter-{iteration}")),
            vec![norm_msg(0, 1_700_000_510_000 + iteration as i64)],
        );
        let mut staged = TantivyIndex::open_or_create(&staged_index_path).unwrap();
        staged
            .add_messages_with_conversation_id(
                &conv,
                &conv.messages,
                Some(200 + iteration as i64),
            )
            .unwrap();
        staged.commit().unwrap();
        drop(staged);
        publish_staged_lexical_index(&staged_index_path, &index_path).unwrap();
    }

    let backups_dir = lexical_publish_backups_dir(&index_path);
    let retained_backups: Vec<_> = fs::read_dir(&backups_dir)
        .unwrap()
        .map(|entry| entry.unwrap().path())
        .collect();
    assert_eq!(
        retained_backups.len(),
        1,
        "default retention cap is 1, got {} retained after {publish_count} publishes: {retained_backups:?}",
        retained_backups.len()
    );
}
/// For each retention override value (`"0"` and `"3"`), seeds a live index,
/// performs five publishes, and checks that the number of retained backups
/// equals the override. A missing backups directory counts as zero backups.
///
/// The environment variable is restored by a drop guard so that a failing
/// assertion inside the loop cannot leak an override into later tests.
#[test]
#[serial]
fn publish_staged_lexical_index_retention_cap_is_env_configurable() {
    const RETENTION_ENV: &str = "CASS_LEXICAL_PUBLISH_BACKUP_RETENTION";

    /// Puts `RETENTION_ENV` back to the captured value (or removes it) on drop,
    /// including during unwinding from a failed assertion.
    struct RetentionEnvRestore(Option<String>);
    impl Drop for RetentionEnvRestore {
        fn drop(&mut self) {
            // SAFETY: this test is `#[serial]`, so no other serialized test
            // mutates the process environment while it runs.
            unsafe {
                match self.0.take() {
                    Some(previous) => std::env::set_var(RETENTION_ENV, previous),
                    None => std::env::remove_var(RETENTION_ENV),
                }
            }
        }
    }

    let _restore_retention = RetentionEnvRestore(std::env::var(RETENTION_ENV).ok());

    for (cap_value, expected_retained) in [("0", 0_usize), ("3", 3_usize)] {
        // Every cap value gets its own scratch directory and live index.
        let tmp = TempDir::new().unwrap();
        let data_dir = tmp.path().join("data");
        fs::create_dir_all(&data_dir).unwrap();
        let index_path = index_dir(&data_dir).unwrap();
        // SAFETY: see `RetentionEnvRestore::drop`.
        unsafe {
            std::env::set_var(RETENTION_ENV, cap_value);
        }

        let seed = norm_conv(
            Some(&format!("retention-env-seed-{cap_value}")),
            vec![norm_msg(0, 1_700_000_600_000)],
        );
        let mut live = TantivyIndex::open_or_create(&index_path).unwrap();
        live.add_messages_with_conversation_id(&seed, &seed.messages, Some(100))
            .unwrap();
        live.commit().unwrap();
        drop(live);

        for iteration in 0..5_usize {
            let stage_root = TempDirBuilder::new()
                .prefix(&format!("cass-test-retention-env-{cap_value}-{iteration}."))
                .tempdir_in(data_dir.parent().unwrap())
                .unwrap();
            let staged_index_path = stage_root.path().join("staged");
            let conv = norm_conv(
                Some(&format!("retention-env-{cap_value}-iter-{iteration}")),
                vec![norm_msg(0, 1_700_000_610_000 + iteration as i64)],
            );
            let mut staged = TantivyIndex::open_or_create(&staged_index_path).unwrap();
            staged
                .add_messages_with_conversation_id(
                    &conv,
                    &conv.messages,
                    Some(200 + iteration as i64),
                )
                .unwrap();
            staged.commit().unwrap();
            drop(staged);
            publish_staged_lexical_index(&staged_index_path, &index_path).unwrap();
        }

        // The backups directory may not exist at all; treat that as empty.
        let backups_dir = lexical_publish_backups_dir(&index_path);
        let retained: Vec<_> = if backups_dir.exists() {
            fs::read_dir(&backups_dir)
                .unwrap()
                .map(|entry| entry.unwrap().path())
                .collect()
        } else {
            Vec::new()
        };
        assert_eq!(
            retained.len(),
            expected_retained,
            "CASS_LEXICAL_PUBLISH_BACKUP_RETENTION={cap_value} should produce {expected_retained} retained backups after 5 publishes; got {}: {retained:?}",
            retained.len()
        );
    }
}
/// Calls the publish entry point with a staged path that does not exist and
/// checks that it returns an error, that the live index reports the same
/// document count as before, and that no in-progress backup path appears.
#[test]
fn publish_staged_lexical_index_errors_cleanly_when_staged_path_does_not_exist() {
    /// Document count from the searchable summary of the index at `path`.
    fn docs_at(path: &std::path::Path) -> usize {
        crate::search::tantivy::searchable_index_summary(path)
            .unwrap()
            .unwrap()
            .docs
    }

    let scratch = TempDir::new().unwrap();
    let data_dir = scratch.path().join("data");
    fs::create_dir_all(&data_dir).unwrap();
    let index_path = index_dir(&data_dir).unwrap();

    // Seed a live index so a regression would be observable.
    let sentinel_conv = norm_conv(
        Some("missing-staged-sentinel"),
        vec![norm_msg(0, 1_700_000_700_000)],
    );
    {
        let mut live_writer = TantivyIndex::open_or_create(&index_path).unwrap();
        live_writer
            .add_messages_with_conversation_id(&sentinel_conv, &sentinel_conv.messages, Some(700))
            .unwrap();
        live_writer.commit().unwrap();
    }
    let live_docs_before = docs_at(&index_path);

    let nonexistent_staged = data_dir.parent().unwrap().join("definitely-not-staged");
    let staged_present = nonexistent_staged.exists();
    assert!(
        !staged_present,
        "precondition: staged path must be missing"
    );

    let result = publish_staged_lexical_index(&nonexistent_staged, &index_path);
    assert!(
        result.is_err(),
        "publish with missing staged path must return Err, got {result:?}"
    );

    let live_docs_after = docs_at(&index_path);
    assert_eq!(
        live_docs_after, live_docs_before,
        "live index must not regress after a missing-staged publish error"
    );
    let sidecar_present = lexical_publish_in_progress_backup_path(&index_path).exists();
    assert!(
        !sidecar_present,
        "no stale .publish-in-progress.bak should be left behind when publish errors early"
    );
}
/// Runs the interrupted-publish recovery routine three times against a healthy
/// one-document live index with no in-progress backup. Every call must
/// succeed, the document count must stay at one, and the retained-backup
/// directory must still not exist afterwards.
#[test]
fn recover_or_finalize_interrupted_lexical_publish_backup_is_idempotent_no_op() {
    let scratch = TempDir::new().unwrap();
    let data_dir = scratch.path().join("data");
    fs::create_dir_all(&data_dir).unwrap();
    let index_path = index_dir(&data_dir).unwrap();

    let seed_conv = norm_conv(
        Some("idempotent-recovery"),
        vec![norm_msg(0, 1_700_000_800_000)],
    );
    {
        let mut live_writer = TantivyIndex::open_or_create(&index_path).unwrap();
        live_writer
            .add_messages_with_conversation_id(&seed_conv, &seed_conv.messages, Some(800))
            .unwrap();
        live_writer.commit().unwrap();
    }

    // Preconditions: nothing for recovery to act on.
    let sidecar_present = lexical_publish_in_progress_backup_path(&index_path).exists();
    assert!(
        !sidecar_present,
        "precondition: no in-progress backup sidecar"
    );
    let backups_dir = lexical_publish_backups_dir(&index_path);
    assert!(
        !backups_dir.exists(),
        "precondition: no retained-backup dir yet"
    );

    for call in 0..3 {
        if let Err(err) = recover_or_finalize_interrupted_lexical_publish_backup(&index_path) {
            panic!("idempotent recovery call {call} must not error; got {err:#}");
        }
    }

    let live_docs = crate::search::tantivy::searchable_index_summary(&index_path)
        .unwrap()
        .unwrap()
        .docs;
    assert_eq!(
        live_docs, 1,
        "live index doc count must not change across no-op recovery calls"
    );
    assert!(
        !backups_dir.exists(),
        "no-op recovery must NOT create an empty backups directory; \
         found {backups_dir:?}"
    );
}
/// Simulates a crash after the live index was parked: the three-document live
/// index is renamed to the in-progress backup path, then a one-document staged
/// index is published. The staged index must become live, the in-progress path
/// must disappear, and the single retained backup must hold three documents.
#[test]
#[serial]
fn publish_staged_lexical_index_recovers_from_crash_between_park_and_swap() {
    /// Document count from the searchable summary of the index at `path`.
    fn docs_at(path: &std::path::Path) -> usize {
        crate::search::tantivy::searchable_index_summary(path)
            .unwrap()
            .unwrap()
            .docs
    }

    let scratch = TempDir::new().unwrap();
    let data_dir = scratch.path().join("data");
    fs::create_dir_all(&data_dir).unwrap();
    let index_path = index_dir(&data_dir).unwrap();

    // Prior live generation: three messages.
    let prior_messages = vec![
        norm_msg(0, 1_700_000_900_000),
        norm_msg(1, 1_700_000_900_100),
        norm_msg(2, 1_700_000_900_200),
    ];
    let prior_conv = norm_conv(Some("crash-prior"), prior_messages);
    {
        let mut prior_writer = TantivyIndex::open_or_create(&index_path).unwrap();
        prior_writer
            .add_messages_with_conversation_id(&prior_conv, &prior_conv.messages, Some(900))
            .unwrap();
        prior_writer.commit().unwrap();
    }
    assert_eq!(
        docs_at(&index_path),
        3,
        "precondition: prior-live index has 3 docs"
    );

    // Reproduce the on-disk state left by a crash after parking the old index.
    let in_progress_backup_path = lexical_publish_in_progress_backup_path(&index_path);
    fs::rename(&index_path, &in_progress_backup_path).unwrap();
    assert!(!index_path.exists());
    assert!(in_progress_backup_path.exists());

    // New staged generation: one message.
    let stage_root = TempDirBuilder::new()
        .prefix("cass-test-crash-recovery-stage.")
        .tempdir_in(data_dir.parent().unwrap())
        .unwrap();
    let staged_index_path = stage_root.path().join("staged");
    let incoming_conv = norm_conv(Some("crash-new"), vec![norm_msg(0, 1_700_000_910_000)]);
    {
        let mut staged_writer = TantivyIndex::open_or_create(&staged_index_path).unwrap();
        staged_writer
            .add_messages_with_conversation_id(&incoming_conv, &incoming_conv.messages, Some(901))
            .unwrap();
        staged_writer.commit().unwrap();
    }

    publish_staged_lexical_index(&staged_index_path, &index_path).unwrap();

    assert_eq!(
        docs_at(&index_path),
        1,
        "post-recovery live must be the new staged publish (1 doc)"
    );
    let sidecar_present = in_progress_backup_path.exists();
    assert!(
        !sidecar_present,
        "crash-window sidecar must be consumed by recovery"
    );

    let backups_dir = lexical_publish_backups_dir(&index_path);
    let mut retained = Vec::new();
    for entry in fs::read_dir(&backups_dir).unwrap() {
        retained.push(entry.unwrap().path());
    }
    assert_eq!(retained.len(), 1);
    assert_eq!(
        docs_at(&retained[0]),
        3,
        "retained backup must preserve the prior-live artifact recovered from the crash sidecar"
    );
}
/// Renames a three-document live index to the in-progress backup path and then
/// runs only the recovery routine (no new publish). The live path must come
/// back with three documents, the in-progress path must be gone, and the
/// retained-backup directory must not exist.
#[test]
fn recover_or_finalize_interrupted_lexical_publish_backup_restores_live_index_without_new_publish()
{
    let scratch = TempDir::new().unwrap();
    let data_dir = scratch.path().join("data");
    fs::create_dir_all(&data_dir).unwrap();
    let index_path = index_dir(&data_dir).unwrap();

    // Prior live generation: three messages.
    let prior_messages = vec![
        norm_msg(0, 1_700_000_905_000),
        norm_msg(1, 1_700_000_905_100),
        norm_msg(2, 1_700_000_905_200),
    ];
    let prior_conv = norm_conv(Some("restart-recovery-prior"), prior_messages);
    {
        let mut prior_writer = TantivyIndex::open_or_create(&index_path).unwrap();
        prior_writer
            .add_messages_with_conversation_id(&prior_conv, &prior_conv.messages, Some(905))
            .unwrap();
        prior_writer.commit().unwrap();
    }

    // Strand the live index at the in-progress backup path.
    let in_progress_backup_path = lexical_publish_in_progress_backup_path(&index_path);
    fs::rename(&index_path, &in_progress_backup_path).unwrap();
    let live_present_before = index_path.exists();
    assert!(
        !live_present_before,
        "precondition: live index path should be absent after the simulated hard-kill window"
    );
    let sidecar_present_before = in_progress_backup_path.exists();
    assert!(
        sidecar_present_before,
        "precondition: interrupted publish sidecar should hold the recoverable live index"
    );

    recover_or_finalize_interrupted_lexical_publish_backup(&index_path).unwrap();

    let live_present_after = index_path.exists();
    assert!(
        live_present_after,
        "startup recovery must restore the live index path from the stranded sidecar"
    );
    let sidecar_present_after = in_progress_backup_path.exists();
    assert!(
        !sidecar_present_after,
        "startup recovery must consume the stranded sidecar after restoring the live index"
    );
    let restored_docs = crate::search::tantivy::searchable_index_summary(&index_path)
        .unwrap()
        .unwrap()
        .docs;
    assert_eq!(
        restored_docs, 3,
        "startup recovery must restore the exact prior-live document surface"
    );
    let backups_dir_present = lexical_publish_backups_dir(&index_path).exists();
    assert!(
        !backups_dir_present,
        "pure restore recovery should not materialize retained-backup storage when there is no newer live index yet"
    );
}
/// Linux only. Injects a one-shot rename failure with raw OS error 28 at the
/// `LinuxParkPriorLiveToCanonicalSidecar` site and publishes a one-document
/// staged index over a three-document live index. The publish must fail with
/// rollback context, the live index must still hold three documents, the
/// staged index must still hold one, no in-progress backup path may remain,
/// and the retained-backup directory must be empty.
#[test]
#[serial]
#[cfg(target_os = "linux")]
fn publish_staged_lexical_index_rolls_back_when_enospc_blocks_linux_sidecar_park() {
    const ENOSPC_RAW_OS_ERROR: i32 = 28;

    /// Document count from the searchable summary of the index at `path`.
    fn docs_at(path: &std::path::Path) -> usize {
        crate::search::tantivy::searchable_index_summary(path)
            .unwrap()
            .unwrap()
            .docs
    }

    let scratch = TempDir::new().unwrap();
    let data_dir = scratch.path().join("data");
    fs::create_dir_all(&data_dir).unwrap();
    let index_path = index_dir(&data_dir).unwrap();

    // Prior live generation: three messages.
    let prior_messages = vec![
        norm_msg(0, 1_700_012_000_000),
        norm_msg(1, 1_700_012_000_100),
        norm_msg(2, 1_700_012_000_200),
    ];
    let prior_conv = norm_conv(Some("enospc-park-prior"), prior_messages);
    {
        let mut prior_writer = TantivyIndex::open_or_create(&index_path).unwrap();
        prior_writer
            .add_messages_with_conversation_id(&prior_conv, &prior_conv.messages, Some(1_200))
            .unwrap();
        prior_writer.commit().unwrap();
    }

    // New staged generation: one message.
    let stage_root = TempDirBuilder::new()
        .prefix("cass-test-enospc-park-stage.")
        .tempdir_in(data_dir.parent().unwrap())
        .unwrap();
    let staged_index_path = stage_root.path().join("staged");
    let incoming_conv = norm_conv(
        Some("enospc-park-new"),
        vec![norm_msg(0, 1_700_012_001_000)],
    );
    {
        let mut staged_writer = TantivyIndex::open_or_create(&staged_index_path).unwrap();
        staged_writer
            .add_messages_with_conversation_id(&incoming_conv, &incoming_conv.messages, Some(1_201))
            .unwrap();
        staged_writer.commit().unwrap();
    }

    // Arm the injected failure; the returned value is held for the rest of the test.
    let _fault_guard = inject_lexical_publish_rename_failure_once(
        LexicalPublishRenameSite::LinuxParkPriorLiveToCanonicalSidecar,
        ENOSPC_RAW_OS_ERROR,
    );

    let err = publish_staged_lexical_index(&staged_index_path, &index_path)
        .expect_err("ENOSPC while parking OLD must fail the publish");
    let err_text = format!("{err:#}");
    let mentions_rollback = err_text.contains("rolled back to keep previous live index");
    assert!(
        mentions_rollback,
        "expected rollback context in ENOSPC publish error, got: {err_text}"
    );

    assert_eq!(
        docs_at(&index_path),
        3,
        "ENOSPC while parking OLD must leave the prior live index intact after rollback"
    );
    assert_eq!(
        docs_at(&staged_index_path),
        1,
        "rollback must return the new generation to the staged tempdir instead of discarding it"
    );
    let sidecar_present = lexical_publish_in_progress_backup_path(&index_path).exists();
    assert!(
        !sidecar_present,
        "rollback must not leave a stale canonical sidecar behind"
    );

    let mut retained_backups = Vec::new();
    for entry in fs::read_dir(lexical_publish_backups_dir(&index_path)).unwrap() {
        retained_backups.push(entry.unwrap().path());
    }
    assert!(
        retained_backups.is_empty(),
        "failed pre-commit publish must not retain any prior-live backups yet; got {retained_backups:?}"
    );
}
/// Linux only. Injects a one-shot rename failure with raw OS error 28 at the
/// `LinuxRetainCanonicalSidecar` site and publishes a one-document staged
/// index over a three-document live index. The publish must still succeed
/// with the new generation live, the in-progress backup path must keep the
/// three old documents, and no retained backup may exist yet. A subsequent
/// recovery call must move the old generation into retained-backup storage
/// without touching the live index.
#[test]
#[serial]
#[cfg(target_os = "linux")]
fn publish_staged_lexical_index_recovers_after_enospc_blocks_linux_retained_backup_move() {
    const ENOSPC_RAW_OS_ERROR: i32 = 28;

    /// Document count from the searchable summary of the index at `path`.
    fn docs_at(path: &std::path::Path) -> usize {
        crate::search::tantivy::searchable_index_summary(path)
            .unwrap()
            .unwrap()
            .docs
    }

    let scratch = TempDir::new().unwrap();
    let data_dir = scratch.path().join("data");
    fs::create_dir_all(&data_dir).unwrap();
    let index_path = index_dir(&data_dir).unwrap();

    // Prior live generation: three messages.
    let prior_messages = vec![
        norm_msg(0, 1_700_012_100_000),
        norm_msg(1, 1_700_012_100_100),
        norm_msg(2, 1_700_012_100_200),
    ];
    let prior_conv = norm_conv(Some("enospc-retain-prior"), prior_messages);
    {
        let mut prior_writer = TantivyIndex::open_or_create(&index_path).unwrap();
        prior_writer
            .add_messages_with_conversation_id(&prior_conv, &prior_conv.messages, Some(1_210))
            .unwrap();
        prior_writer.commit().unwrap();
    }

    // New staged generation: one message.
    let stage_root = TempDirBuilder::new()
        .prefix("cass-test-enospc-retain-stage.")
        .tempdir_in(data_dir.parent().unwrap())
        .unwrap();
    let staged_index_path = stage_root.path().join("staged");
    let incoming_conv = norm_conv(
        Some("enospc-retain-new"),
        vec![norm_msg(0, 1_700_012_101_000)],
    );
    {
        let mut staged_writer = TantivyIndex::open_or_create(&staged_index_path).unwrap();
        staged_writer
            .add_messages_with_conversation_id(&incoming_conv, &incoming_conv.messages, Some(1_211))
            .unwrap();
        staged_writer.commit().unwrap();
    }

    // Arm the injected failure; the returned value is held for the rest of the test.
    let _fault_guard = inject_lexical_publish_rename_failure_once(
        LexicalPublishRenameSite::LinuxRetainCanonicalSidecar,
        ENOSPC_RAW_OS_ERROR,
    );
    publish_staged_lexical_index(&staged_index_path, &index_path)
        .expect("ENOSPC while retaining OLD after the swap should keep NEW live");

    // Phase 1: state immediately after the publish with the failed retain step.
    let canonical_sidecar = lexical_publish_in_progress_backup_path(&index_path);
    let sidecar_present_after_publish = canonical_sidecar.exists();
    assert!(
        sidecar_present_after_publish,
        "retain failure after commit must preserve the canonical sidecar for later recovery"
    );
    assert_eq!(
        docs_at(&canonical_sidecar),
        3,
        "the canonical sidecar must continue to hold the old live generation after retain ENOSPC"
    );
    assert_eq!(
        docs_at(&index_path),
        1,
        "retain ENOSPC must not roll back the already-published live generation"
    );
    let mut retained_backups = Vec::new();
    for entry in fs::read_dir(lexical_publish_backups_dir(&index_path)).unwrap() {
        retained_backups.push(entry.unwrap().path());
    }
    assert!(
        retained_backups.is_empty(),
        "retain ENOSPC should leave recovery work in the canonical sidecar, not partially retain backups: {retained_backups:?}"
    );

    // Phase 2: explicit recovery finishes the retain step.
    recover_or_finalize_interrupted_lexical_publish_backup(&index_path).unwrap();

    let sidecar_present_after_recovery = canonical_sidecar.exists();
    assert!(
        !sidecar_present_after_recovery,
        "recovery after retain ENOSPC must consume the canonical sidecar"
    );
    let mut retained_backups = Vec::new();
    for entry in fs::read_dir(lexical_publish_backups_dir(&index_path)).unwrap() {
        retained_backups.push(entry.unwrap().path());
    }
    assert_eq!(
        retained_backups.len(),
        1,
        "recovery should retain exactly one old live artifact after post-commit ENOSPC"
    );
    assert_eq!(
        docs_at(&retained_backups[0]),
        3,
        "recovery must preserve the old live generation in retained-backup storage"
    );
    assert_eq!(
        docs_at(&index_path),
        1,
        "recovery after retain ENOSPC must not disturb the already-published live generation"
    );
}
/// Simulates a crash between the swap and the retain step: the live index
/// already holds NEW (1 doc) while the canonical in-progress sidecar still
/// holds OLD (3 docs). Publishing a third generation must end with the third
/// generation live, the sidecar gone, and both OLD and NEW present in
/// retained-backup storage.
#[test]
#[serial]
fn publish_staged_lexical_index_recovers_from_crash_between_linux_swap_and_retain() {
    // Hold the retention override in a guard, as the neighbouring retention
    // test does, instead of setting and restoring the variable by hand. The
    // hand-written restore sat at the end of the test, so any failing
    // assertion skipped it and leaked the override into later serial tests.
    let _retention_two = set_env("CASS_LEXICAL_PUBLISH_BACKUP_RETENTION", "2");

    let tmp = TempDir::new().unwrap();
    let data_dir = tmp.path().join("data");
    fs::create_dir_all(&data_dir).unwrap();
    let index_path = index_dir(&data_dir).unwrap();

    // Live index: NEW generation, one document.
    let new_live_conv = norm_conv(
        Some("wkx5-linux-new-live"),
        vec![norm_msg(0, 1_700_009_000_000)],
    );
    let mut new_live = TantivyIndex::open_or_create(&index_path).unwrap();
    new_live
        .add_messages_with_conversation_id(&new_live_conv, &new_live_conv.messages, Some(1))
        .unwrap();
    new_live.commit().unwrap();
    drop(new_live);

    // Canonical sidecar: OLD generation, three documents.
    let canonical_sidecar = lexical_publish_in_progress_backup_path(&index_path);
    let old_backup_conv = norm_conv(
        Some("wkx5-linux-old-backup"),
        vec![
            norm_msg(0, 1_700_008_000_000),
            norm_msg(1, 1_700_008_000_100),
            norm_msg(2, 1_700_008_000_200),
        ],
    );
    let mut sidecar_index = TantivyIndex::open_or_create(&canonical_sidecar).unwrap();
    sidecar_index
        .add_messages_with_conversation_id(&old_backup_conv, &old_backup_conv.messages, Some(2))
        .unwrap();
    sidecar_index.commit().unwrap();
    drop(sidecar_index);

    assert!(
        canonical_sidecar.exists(),
        "precondition: canonical sidecar with OLD content"
    );
    assert_eq!(
        crate::search::tantivy::searchable_index_summary(&index_path)
            .unwrap()
            .unwrap()
            .docs,
        1,
        "precondition: live has NEW (1 doc)"
    );

    // Stage a third generation with two documents and publish it.
    let stage_root = TempDirBuilder::new()
        .prefix("cass-test-wkx5-linux-stage.")
        .tempdir_in(data_dir.parent().unwrap())
        .unwrap();
    let staged_index_path = stage_root.path().join("staged");
    let third_gen_conv = norm_conv(
        Some("wkx5-linux-third-gen"),
        vec![
            norm_msg(0, 1_700_010_000_000),
            norm_msg(1, 1_700_010_000_100),
        ],
    );
    let mut third_gen = TantivyIndex::open_or_create(&staged_index_path).unwrap();
    third_gen
        .add_messages_with_conversation_id(&third_gen_conv, &third_gen_conv.messages, Some(3))
        .unwrap();
    third_gen.commit().unwrap();
    drop(third_gen);

    publish_staged_lexical_index(&staged_index_path, &index_path).unwrap();

    assert_eq!(
        crate::search::tantivy::searchable_index_summary(&index_path)
            .unwrap()
            .unwrap()
            .docs,
        2,
        "live must be third-gen publish after recovery + publish"
    );
    assert!(
        !canonical_sidecar.exists(),
        "canonical sidecar must be moved into retained-backup storage by recovery"
    );

    let backups_dir = lexical_publish_backups_dir(&index_path);
    let retained: Vec<_> = fs::read_dir(&backups_dir)
        .unwrap()
        .map(|entry| entry.unwrap().path())
        .collect();
    assert_eq!(
        retained.len(),
        2,
        "both the A.5 sidecar (OLD, 3 docs) and the prior-live (NEW, 1 doc) must be retained; got {retained:?}"
    );

    // Directory order is unspecified, so compare sorted doc counts.
    let mut doc_counts: Vec<usize> = retained
        .iter()
        .map(|path| {
            crate::search::tantivy::searchable_index_summary(path)
                .unwrap()
                .unwrap()
                .docs
        })
        .collect();
    doc_counts.sort_unstable();
    assert_eq!(
        doc_counts,
        vec![1, 3],
        "retained backups must have exactly the expected doc counts (1 = NEW prior-live, 3 = OLD from A.5 sidecar)"
    );
}
/// Publishes several generations under a retention cap of 2, pins a reader on
/// the oldest retained backup, then publishes once more under a cap of 1 and
/// asserts that the oldest backup path is gone, the pinned reader still
/// reports its original doc count, and the live index holds the newest
/// generation.
#[test]
#[serial]
#[cfg(unix)]
fn publish_staged_lexical_index_prunes_retained_backup_without_breaking_open_reader() {
let tmp = TempDir::new().unwrap();
let data_dir = tmp.path().join("data");
fs::create_dir_all(&data_dir).unwrap();
let index_path = index_dir(&data_dir).unwrap();
// Builds and commits an index at `path` containing `doc_count` messages whose
// timestamps start at `base_ts` and step by 100.
let build_generation =
|path: &Path, label: &str, doc_count: usize, conversation_id: i64, base_ts: i64| {
let messages = (0..doc_count)
.map(|idx| norm_msg(idx as i64, base_ts + (idx as i64 * 100)))
.collect::<Vec<_>>();
let conv = norm_conv(Some(label), messages);
let mut index = TantivyIndex::open_or_create(path).unwrap();
index
.add_messages_with_conversation_id(&conv, &conv.messages, Some(conversation_id))
.unwrap();
index.commit().unwrap();
drop(index);
};
// Seed the live index with a single-document generation.
build_generation(
&index_path,
"retained-reader-seed",
1,
1_100,
1_700_011_000_000,
);
// Phase 1 runs inside a block so the retention=2 guard is dropped before the
// retention=1 guard below is created.
let (oldest_backup_path, oldest_backup_reader) = {
let _retention_two = set_env("CASS_LEXICAL_PUBLISH_BACKUP_RETENTION", "2");
// Two publishes (2 docs, then 3 docs) displace the 1-doc and 2-doc generations.
for (iteration, doc_count, conversation_id) in
[(1_usize, 2_usize, 1_101_i64), (2_usize, 3_usize, 1_102_i64)]
{
let stage_root = TempDirBuilder::new()
.prefix(&format!("cass-test-open-reader-retention-{iteration}."))
.tempdir_in(data_dir.parent().unwrap())
.unwrap();
let staged_index_path = stage_root.path().join("staged");
build_generation(
&staged_index_path,
&format!("retained-reader-stage-{iteration}"),
doc_count,
conversation_id,
1_700_011_100_000 + iteration as i64 * 1_000,
);
publish_staged_lexical_index(&staged_index_path, &index_path).unwrap();
}
let backups_dir = lexical_publish_backups_dir(&index_path);
// Pair each retained backup with its doc count so it can be identified.
let mut retained_with_docs: Vec<(usize, PathBuf)> = fs::read_dir(&backups_dir)
.unwrap()
.map(|entry| {
let path = entry.unwrap().path();
let docs = crate::search::tantivy::searchable_index_summary(&path)
.unwrap()
.unwrap()
.docs;
(docs, path)
})
.collect();
retained_with_docs.sort_by_key(|(docs, _path)| *docs);
assert_eq!(
retained_with_docs
.iter()
.map(|(docs, _)| *docs)
.collect::<Vec<_>>(),
vec![1, 2],
"retention=2 should retain the oldest two prior-live generations before pruning is tightened"
);
// After sorting by doc count, index 0 is the 1-doc generation.
let oldest_backup_path = retained_with_docs[0].1.clone();
let oldest_backup_index = TantivyIndex::open_or_create(&oldest_backup_path).unwrap();
let oldest_backup_reader = oldest_backup_index.reader().unwrap();
oldest_backup_reader.reload().unwrap();
assert_eq!(
oldest_backup_reader.searcher().num_docs(),
1,
"precondition: the pinned reader must observe the oldest retained backup generation"
);
// Only the reader leaves this block; the index handle is released here.
drop(oldest_backup_index);
(oldest_backup_path, oldest_backup_reader)
};
// Phase 2: tighten the cap to 1 and publish a 4-doc generation.
let _retention_one = set_env("CASS_LEXICAL_PUBLISH_BACKUP_RETENTION", "1");
let stage_root = TempDirBuilder::new()
.prefix("cass-test-open-reader-retention-prune.")
.tempdir_in(data_dir.parent().unwrap())
.unwrap();
let staged_index_path = stage_root.path().join("staged");
build_generation(
&staged_index_path,
"retained-reader-stage-prune",
4,
1_103,
1_700_011_200_000,
);
publish_staged_lexical_index(&staged_index_path, &index_path).unwrap();
assert!(
!oldest_backup_path.exists(),
"bounded retention should prune the oldest retained backup path once the cap drops to 1"
);
// The reader was opened before the prune and must still report its one document.
assert_eq!(
oldest_backup_reader.searcher().num_docs(),
1,
"an open reader pinned to a pruned retained backup must keep serving the prior doc surface"
);
assert_eq!(
crate::search::tantivy::searchable_index_summary(&index_path)
.unwrap()
.unwrap()
.docs,
4,
"live index must advance to the newly published generation while stale-reader GC runs"
);
let backups_dir = lexical_publish_backups_dir(&index_path);
let retained_doc_counts: Vec<usize> = fs::read_dir(&backups_dir)
.unwrap()
.map(|entry| {
let path = entry.unwrap().path();
crate::search::tantivy::searchable_index_summary(&path)
.unwrap()
.unwrap()
.docs
})
.collect();
// Only the 3-doc generation (live immediately before this publish) may remain.
assert_eq!(
retained_doc_counts,
vec![3],
"retention=1 should leave exactly the immediately prior live generation after pruning"
);
}
/// Rebuilds the lexical index from a database fixture extended with 13 extra
/// two-message conversations and asserts, via captured logs and persisted
/// artifacts, that the staged shard-build path published a federated shard set
/// without running a merge reduction, and that the equivalence evidence and
/// generation manifest agree with what the rebuild returned.
#[test]
#[serial]
fn rebuild_tantivy_from_db_publishes_bounded_final_frontier_without_reduction() {
let tmp = TempDir::new().unwrap();
let data_dir = tmp.path().join("data");
std::fs::create_dir_all(&data_dir).unwrap();
let db_path = data_dir.join("db.sqlite");
let storage = FrankenStorage::open(&db_path).unwrap();
ensure_fts_schema(&storage);
seed_lexical_rebuild_fixture(&storage);
let agent = Agent {
id: None,
slug: "codex".into(),
name: "Codex".into(),
version: Some("0.2.3".into()),
kind: AgentKind::Cli,
};
let agent_id = storage.ensure_agent(&agent).unwrap();
// Add conversations `lexical-fixture-3` through `lexical-fixture-15`, two
// messages each, on top of whatever the seed helper inserted.
for suffix in 3..=15 {
let base_ts = 1_700_000_000_000_i64 + (i64::from(suffix) * 1_000_i64);
let external_id = format!("lexical-fixture-{suffix}");
storage
.insert_conversation_tree(
agent_id,
None,
&Conversation {
id: None,
agent_slug: "codex".into(),
workspace: Some(PathBuf::from("/tmp/workspace")),
external_id: Some(external_id.clone()),
title: Some("Lexical rebuild fixture".into()),
source_path: PathBuf::from(format!("/tmp/{external_id}.jsonl")),
started_at: Some(base_ts),
ended_at: Some(base_ts + 100),
approx_tokens: Some(64),
metadata_json: serde_json::Value::Null,
messages: vec![
Message {
id: None,
idx: 0,
role: MessageRole::User,
author: Some("user".into()),
created_at: Some(base_ts + 10),
content: format!("{external_id}-first"),
extra_json: serde_json::json!({"opaque": true}),
snippets: Vec::new(),
},
Message {
id: None,
idx: 1,
role: MessageRole::Agent,
author: Some("assistant".into()),
created_at: Some(base_ts + 20),
content: format!("{external_id}-second"),
extra_json: serde_json::json!({"opaque": true}),
snippets: Vec::new(),
},
],
source_id: LOCAL_SOURCE_ID.into(),
origin_host: None,
},
)
.unwrap();
}
// Environment overrides for the rebuild; each guard is bound to a local so it
// stays alive until the end of the test.
let _workers = set_env("CASS_TANTIVY_REBUILD_WORKERS", "6");
let _writer_threads = set_env("CASS_TANTIVY_MAX_WRITER_THREADS", "2");
let _fetch_conversations = set_env("CASS_TANTIVY_REBUILD_BATCH_FETCH_CONVERSATIONS", "1");
let _startup_fetch_conversations = set_env(
"CASS_TANTIVY_REBUILD_INITIAL_BATCH_FETCH_CONVERSATIONS",
"1",
);
let _commit_conversations = set_env("CASS_TANTIVY_REBUILD_COMMIT_EVERY_CONVERSATIONS", "1");
let _startup_commit_conversations = set_env(
"CASS_TANTIVY_REBUILD_INITIAL_COMMIT_EVERY_CONVERSATIONS",
"1",
);
// Filled in from inside the log-capture closure so the evidence can be
// inspected after the closure returns.
let mut rebuild_equivalence: Option<super::LexicalRebuildEquivalenceEvidence> = None;
let logs = capture_logs(|| {
let rebuild = rebuild_tantivy_from_db_with_options(
&db_path,
&data_dir,
2,
None,
LexicalRebuildStartupOptions {
defer_initial_content_fingerprint: true,
},
None,
)
.unwrap();
assert_eq!(rebuild.indexed_docs, 30);
assert_eq!(rebuild.observed_messages, Some(30));
assert!(rebuild.exact_checkpoint_persisted);
rebuild_equivalence = rebuild.equivalence.clone();
});
let evidence = rebuild_equivalence
.expect("staged shard rebuild must emit equivalence evidence like the normal path");
// Log assertions: the staged path and federated publish must appear, the two
// merge-reduction messages must not.
assert!(
logs.contains("staged shard-build path"),
"expected staged shard-build log, got:\n{logs}"
);
assert!(
logs.contains(
"publishing staged lexical rebuild as federated lexical shard set without final assembly collapse"
),
"expected bounded final frontier to publish as a federated bundle, got:\n{logs}"
);
assert!(
!logs
.contains("draining staged lexical rebuild final merge frontier via merge workers"),
"bounded final frontier should not pay foreground reduction cost: {logs}"
);
assert!(
!logs.contains("running staged lexical rebuild merge round"),
"bounded final frontier should avoid the fallback merge-tree tail: {logs}"
);
let index_path = index_dir(&data_dir).unwrap();
let state = load_lexical_rebuild_state(&index_path)
.unwrap()
.expect("completed lexical rebuild state after bounded final-frontier publish");
// The persisted state's meta fingerprint must equal the one computed from the
// index directory now.
assert_eq!(
state.committed_meta_fingerprint.as_deref(),
index_meta_fingerprint(&index_path).unwrap().as_deref(),
"completed staged rebuild state should retain the live published meta fingerprint"
);
// The evidence file on disk must deserialize to the value the rebuild returned.
let evidence_path = lexical_rebuild_equivalence_evidence_path(&index_path);
let persisted_evidence: super::LexicalRebuildEquivalenceEvidence =
serde_json::from_slice(&std::fs::read(&evidence_path).unwrap()).unwrap();
assert_eq!(
persisted_evidence, evidence,
"staged shard rebuild must persist the same equivalence evidence it returns"
);
let manifest = lexical_generation::load_manifest(&index_path)
.unwrap()
.expect("staged shard rebuild must persist a published generation manifest");
assert_eq!(
manifest.build_state,
lexical_generation::LexicalGenerationBuildState::Validated
);
assert_eq!(
manifest.publish_state,
lexical_generation::LexicalGenerationPublishState::Published
);
assert_eq!(
manifest.equivalence_manifest_fingerprint.as_deref(),
Some(evidence.manifest_fingerprint.as_str()),
"staged shard generation manifest must point at the persisted equivalence evidence"
);
assert!(
logs.contains("lexical generation manifest published"),
"expected staged shard path to log generation manifest publish, got:\n{logs}"
);
assert_eq!(tantivy_doc_count_for_data_dir(&data_dir), 30);
}
/// Plants an incomplete `staged_shard_build` checkpoint on disk, runs a
/// rebuild, and asserts from the captured prep-profile logs that the
/// checkpoint was discarded and the restart steps occurred in the expected
/// relative order, ending with a completed checkpoint covering the fixture.
#[test]
#[serial]
fn rebuild_tantivy_from_db_discards_incomplete_staged_shard_checkpoint_and_restarts() {
    // Prep-profile log fragments whose presence and relative order are checked.
    const STEP_RESET: &str = r#"step="restart_from_zero_reset""#;
    const STEP_PLAN: &str = r#"step="plan_lexical_shards""#;
    const STEP_PRODUCER: &str = r#"step="start_packet_producer""#;
    const STEP_CHECKPOINT: &str = r#"step="persist_initial_checkpoint""#;
    const STEP_HANDOFF: &str = r#"step="first_batch_handoff""#;

    let tmp = TempDir::new().unwrap();
    let data_dir = tmp.path().join("data");
    std::fs::create_dir_all(&data_dir).unwrap();
    let db_path = data_dir.join("db.sqlite");
    let storage = FrankenStorage::open(&db_path).unwrap();
    ensure_fts_schema(&storage);
    seed_lexical_rebuild_fixture(&storage);

    let index_path = index_dir(&data_dir).unwrap();
    std::fs::create_dir_all(&index_path).unwrap();
    let fingerprint = lexical_storage_fingerprint_for_db(&db_path).unwrap();

    // A half-finished staged checkpoint: not completed, with pending work.
    let stale_checkpoint = serde_json::json!({
        "version": LEXICAL_REBUILD_STATE_VERSION,
        "schema_hash": crate::search::tantivy::SCHEMA_HASH,
        "db": {
            "db_path": db_path.display().to_string(),
            "total_conversations": 2,
            "total_messages": 0,
            "storage_fingerprint": fingerprint,
        },
        "page_size": LEXICAL_REBUILD_PAGE_SIZE,
        "committed_offset": 0,
        "committed_conversation_id": null,
        "processed_conversations": 1,
        "indexed_docs": 2,
        "committed_meta_fingerprint": null,
        "pending": {
            "next_offset": 1,
            "next_conversation_id": 1,
            "processed_conversations": 1,
            "indexed_docs": 2,
            "base_meta_fingerprint": null,
        },
        "completed": false,
        "updated_at_ms": FrankenStorage::now_millis(),
        "execution_mode": "staged_shard_build",
        "runtime": {
            "queue_depth": 1,
            "inflight_message_bytes": 128,
            "pending_batch_conversations": 1,
            "pending_batch_message_bytes": 128,
            "page_prep_workers": 1,
            "active_page_prep_jobs": 1,
            "ordered_buffered_pages": 0,
            "budget_generation": 0,
            "host_loadavg_1m_milli": null,
            "controller_mode": "",
            "controller_reason": "",
            "updated_at_ms": FrankenStorage::now_millis(),
        }
    });
    let stale_checkpoint_bytes = serde_json::to_vec_pretty(&stale_checkpoint).unwrap();
    std::fs::write(lexical_rebuild_state_path(&index_path), stale_checkpoint_bytes).unwrap();

    // Environment overrides; the guards stay bound until the test returns.
    let _prep_profile = set_env("CASS_PREP_PROFILE", "1");
    let _workers = set_env("CASS_TANTIVY_REBUILD_WORKERS", "6");
    let _writer_threads = set_env("CASS_TANTIVY_MAX_WRITER_THREADS", "2");
    let _fetch_conversations = set_env("CASS_TANTIVY_REBUILD_BATCH_FETCH_CONVERSATIONS", "1");
    let _startup_fetch_conversations = set_env(
        "CASS_TANTIVY_REBUILD_INITIAL_BATCH_FETCH_CONVERSATIONS",
        "1",
    );
    let _commit_conversations = set_env("CASS_TANTIVY_REBUILD_COMMIT_EVERY_CONVERSATIONS", "1");
    let _startup_commit_conversations = set_env(
        "CASS_TANTIVY_REBUILD_INITIAL_COMMIT_EVERY_CONVERSATIONS",
        "1",
    );

    let logs = capture_logs(|| {
        let outcome = rebuild_tantivy_from_db_with_options(
            &db_path,
            &data_dir,
            2,
            None,
            LexicalRebuildStartupOptions::default(),
            None,
        )
        .unwrap();
        assert_eq!(outcome.indexed_docs, 4);
        assert_eq!(outcome.observed_messages, Some(4));
    });

    assert!(
        logs.contains("discarding non-resumable staged lexical rebuild checkpoint"),
        "expected staged checkpoint reset log, got:\n{logs}"
    );
    for needle in [
        STEP_RESET,
        STEP_PLAN,
        STEP_PRODUCER,
        STEP_CHECKPOINT,
        STEP_HANDOFF,
    ] {
        assert!(
            logs.contains(needle),
            "expected staged restart prep-profile log fragment `{needle}`, got:\n{logs}"
        );
    }

    // Byte offset of the first occurrence of each step in the captured logs.
    let offset_of = |needle: &str, missing: &str| logs.find(needle).expect(missing);
    let restart_from_zero_reset = offset_of(STEP_RESET, "restart_from_zero_reset log position");
    let plan_lexical_shards = offset_of(STEP_PLAN, "plan_lexical_shards log position");
    let start_packet_producer = offset_of(STEP_PRODUCER, "start_packet_producer log position");
    let persist_initial_checkpoint =
        offset_of(STEP_CHECKPOINT, "persist_initial_checkpoint log position");
    let first_batch_handoff = offset_of(STEP_HANDOFF, "first_batch_handoff log position");

    assert!(
        plan_lexical_shards < restart_from_zero_reset,
        "9ct8r: staged shard planning must run BEFORE the restart-from-zero \
         reset so the reset can decide whether to skip the pre-wipe: {logs}"
    );
    assert!(
        plan_lexical_shards < start_packet_producer,
        "restart-from-zero rebuild should finish shard planning before producer startup overlap: {logs}"
    );
    assert!(
        start_packet_producer < persist_initial_checkpoint,
        "restart-from-zero staged rebuild should overlap producer startup before persisting the fresh startup checkpoint: {logs}"
    );
    assert!(
        start_packet_producer < first_batch_handoff,
        "restart-from-zero rebuild must start the producer before it hands off its first batch: {logs}"
    );

    let final_checkpoint = load_lexical_rebuild_checkpoint(&index_path)
        .unwrap()
        .expect("completed checkpoint after staged checkpoint reset");
    assert!(final_checkpoint.completed);
    assert_eq!(final_checkpoint.processed_conversations, 2);
    assert_eq!(final_checkpoint.indexed_docs, 4);
    assert_eq!(tantivy_doc_count_for_data_dir(&data_dir), 4);
}
/// Stores one three-message conversation, caps the commit thresholds at two
/// messages, and asserts that the rebuild logs the per-conversation fallback
/// and still indexes all three documents.
#[test]
#[serial]
fn rebuild_tantivy_from_db_falls_back_for_single_conversation_guardrail_outlier() {
    let tmp = TempDir::new().unwrap();
    let data_dir = tmp.path().join("data");
    std::fs::create_dir_all(&data_dir).unwrap();
    let db_path = data_dir.join("db.sqlite");
    let storage = FrankenStorage::open(&db_path).unwrap();
    ensure_fts_schema(&storage);

    let agent_id = storage
        .ensure_agent(&Agent {
            id: None,
            slug: "codex".into(),
            name: "Codex".into(),
            version: Some("0.2.3".into()),
            kind: AgentKind::Cli,
        })
        .unwrap();

    // (idx, role, created_at, content) for each message of the conversation.
    let messages = [
        (0, MessageRole::User, 1_700_000_000_010, "first"),
        (1, MessageRole::Agent, 1_700_000_000_020, "second"),
        (2, MessageRole::Tool, 1_700_000_000_030, "third"),
    ]
    .into_iter()
    .map(|(idx, role, created_at, content)| Message {
        id: None,
        idx,
        role,
        author: None,
        created_at: Some(created_at),
        content: content.into(),
        extra_json: serde_json::Value::Null,
        snippets: Vec::new(),
    })
    .collect();

    let oversized = Conversation {
        id: None,
        agent_slug: "codex".into(),
        workspace: Some(PathBuf::from("/tmp/workspace")),
        external_id: Some("oversized-single-conversation".into()),
        title: Some("Oversized single conversation".into()),
        source_path: PathBuf::from("/tmp/oversized-single-conversation.jsonl"),
        started_at: Some(1_700_000_000_000),
        ended_at: Some(1_700_000_000_100),
        approx_tokens: Some(64),
        metadata_json: serde_json::Value::Null,
        messages,
        source_id: LOCAL_SOURCE_ID.into(),
        origin_host: None,
    };
    storage
        .insert_conversation_tree(agent_id, None, &oversized)
        .unwrap();
    // Release the storage handle before the rebuild opens the database.
    drop(storage);

    let _conversation_limit = set_env("CASS_TANTIVY_REBUILD_BATCH_FETCH_CONVERSATIONS", "1");
    let _commit_messages = set_env("CASS_TANTIVY_REBUILD_COMMIT_EVERY_MESSAGES", "2");
    let _initial_commit_messages =
        set_env("CASS_TANTIVY_REBUILD_INITIAL_COMMIT_EVERY_MESSAGES", "2");

    let logs = capture_logs(|| {
        let outcome = rebuild_tantivy_from_db(&db_path, &data_dir, 1, None).unwrap();
        assert_eq!(outcome.indexed_docs, 3);
        assert_eq!(outcome.observed_messages, Some(3));
        assert!(outcome.exact_checkpoint_persisted);
    });

    assert!(
        logs.contains("falling back to per-conversation fetches"),
        "expected oversized single conversation fallback log, got:\n{logs}"
    );
    assert_eq!(tantivy_doc_count_for_data_dir(&data_dir), 3);
}
/// Inserts a message-less conversation between two two-message conversations
/// and asserts that the rebuild indexes four documents while the persisted
/// state still counts all three conversations.
#[test]
#[serial]
fn rebuild_tantivy_from_db_preserves_empty_conversation_gaps_in_stream() {
    let tmp = TempDir::new().unwrap();
    let data_dir = tmp.path().join("data");
    std::fs::create_dir_all(&data_dir).unwrap();
    let db_path = data_dir.join("db.sqlite");
    let storage = FrankenStorage::open(&db_path).unwrap();
    ensure_fts_schema(&storage);

    let agent_id = storage
        .ensure_agent(&Agent {
            id: None,
            slug: "codex".into(),
            name: "Codex".into(),
            version: Some("0.2.3".into()),
            kind: AgentKind::Cli,
        })
        .unwrap();

    // A user message at `base_ts + 10` followed by an agent reply at
    // `base_ts + 20`, with content derived from the conversation id.
    let user_then_agent = |external_id: &str, base_ts: i64| {
        vec![
            Message {
                id: None,
                idx: 0,
                role: MessageRole::User,
                author: Some("user".into()),
                created_at: Some(base_ts + 10),
                content: format!("{external_id}-first"),
                extra_json: serde_json::json!({"opaque": true}),
                snippets: Vec::new(),
            },
            Message {
                id: None,
                idx: 1,
                role: MessageRole::Agent,
                author: Some("assistant".into()),
                created_at: Some(base_ts + 20),
                content: format!("{external_id}-second"),
                extra_json: serde_json::json!({"opaque": true}),
                snippets: Vec::new(),
            },
        ]
    };
    // Stores one conversation with the given messages.
    let store_conversation = |external_id: &str, base_ts: i64, messages: Vec<Message>| {
        storage
            .insert_conversation_tree(
                agent_id,
                None,
                &Conversation {
                    id: None,
                    agent_slug: "codex".into(),
                    workspace: Some(PathBuf::from("/tmp/workspace")),
                    external_id: Some(external_id.to_string()),
                    title: Some("Lexical rebuild fixture".into()),
                    source_path: PathBuf::from(format!("/tmp/{external_id}.jsonl")),
                    started_at: Some(base_ts),
                    ended_at: Some(base_ts + 100),
                    approx_tokens: Some(64),
                    metadata_json: serde_json::Value::Null,
                    messages,
                    source_id: LOCAL_SOURCE_ID.into(),
                    origin_host: None,
                },
            )
            .unwrap();
    };

    store_conversation(
        "lexical-fixture-1",
        1_700_000_000_000_i64,
        user_then_agent("lexical-fixture-1", 1_700_000_000_000_i64),
    );
    // The gap: a conversation with no messages at all.
    store_conversation("lexical-fixture-empty", 1_700_000_000_500_i64, Vec::new());
    store_conversation(
        "lexical-fixture-2",
        1_700_000_001_000_i64,
        user_then_agent("lexical-fixture-2", 1_700_000_001_000_i64),
    );

    let outcome = rebuild_tantivy_from_db(&db_path, &data_dir, 3, None).unwrap();
    assert_eq!(outcome.indexed_docs, 4);
    assert_eq!(outcome.observed_messages, Some(4));
    assert_eq!(tantivy_doc_count_for_data_dir(&data_dir), 4);

    let index_path = index_dir(&data_dir).unwrap();
    let state = load_lexical_rebuild_state(&index_path).unwrap().unwrap();
    assert_eq!(state.committed_offset, 3);
    assert_eq!(state.processed_conversations, 3);
    assert!(state.completed);
}
/// Creates one file under each of five connector roots and asserts that
/// `classify_paths` reports every connector kind and an mtime for each entry.
#[test]
fn classify_paths_uses_latest_mtime_per_connector() {
    let tmp = TempDir::new().unwrap();
    let base = tmp.path();

    // Creates `relative` (and its parent directories) under the temp dir.
    let seed_file = |relative: &str, contents: &[u8]| -> PathBuf {
        let target = base.join(relative);
        std::fs::create_dir_all(target.parent().unwrap()).unwrap();
        std::fs::write(&target, contents).unwrap();
        target
    };

    let codex = seed_file(".codex/sessions/rollout-1.jsonl", b"{{}}\n{{}}");
    let claude = seed_file("project/.claude.json", b"{{}}");
    let aider = seed_file("repo/.aider.chat.history.md", b"user\nassistant");
    let cursor = seed_file("Cursor/User/globalStorage/state.vscdb", b"");
    let chatgpt = seed_file(
        "Library/Application Support/com.openai.chat/conversations-abc/data.json",
        b"{}",
    );

    let roots = vec![
        (ConnectorKind::Codex, ScanRoot::local(base.join(".codex"))),
        (ConnectorKind::Claude, ScanRoot::local(base.join("project"))),
        (ConnectorKind::Aider, ScanRoot::local(base.join("repo"))),
        (
            ConnectorKind::Cursor,
            ScanRoot::local(base.join("Cursor/User")),
        ),
        (
            ConnectorKind::ChatGpt,
            ScanRoot::local(base.join("Library/Application Support/com.openai.chat")),
        ),
    ];

    let classified = classify_paths(vec![codex, claude, aider, cursor, chatgpt], &roots, false);

    let kinds: std::collections::HashSet<_> =
        classified.iter().map(|entry| entry.0).collect();
    assert!(kinds.contains(&ConnectorKind::Codex));
    assert!(kinds.contains(&ConnectorKind::Claude));
    assert!(kinds.contains(&ConnectorKind::Aider));
    assert!(kinds.contains(&ConnectorKind::Cursor));
    assert!(kinds.contains(&ConnectorKind::ChatGpt));

    for entry in classified {
        assert!(entry.2.is_some(), "mtime should be captured");
    }
}
/// With the final flag set, a session file nested under a Claude root must be
/// classified as exactly one Claude entry whose path is the file itself.
#[test]
fn classify_paths_prefers_explicit_watch_once_paths() {
    let tmp = tempfile::tempdir().unwrap();
    let project_root = tmp.path().join("project");
    let session_dir = project_root.join("subagents");
    let session = session_dir.join("session.jsonl");
    std::fs::create_dir_all(&session_dir).unwrap();
    std::fs::write(&session, b"{}").unwrap();

    let roots = vec![(ConnectorKind::Claude, ScanRoot::local(project_root.clone()))];
    let explicit_paths = vec![session.clone()];
    let classified = classify_paths(explicit_paths, &roots, true);

    assert_eq!(classified.len(), 1);
    let only = &classified[0];
    assert_eq!(only.0, ConnectorKind::Claude);
    assert_eq!(only.1.path, session);
}
/// When three connectors share the same root, an explicit path under
/// `.codex/sessions` must be classified once, as Codex.
#[test]
fn classify_paths_hints_codex_connector_for_explicit_codex_paths() {
    let tmp = tempfile::tempdir().unwrap();
    let codex_root = tmp.path().join(".codex").join("sessions");
    let month_dir = codex_root.join("2026").join("03");
    let session = month_dir.join("rollout-1.jsonl");
    std::fs::create_dir_all(&month_dir).unwrap();
    std::fs::write(&session, b"{}").unwrap();

    // Codex, Claude and Gemini all claim the same scan root.
    let roots: Vec<_> = [
        ConnectorKind::Codex,
        ConnectorKind::Claude,
        ConnectorKind::Gemini,
    ]
    .into_iter()
    .map(|kind| (kind, ScanRoot::local(codex_root.clone())))
    .collect();

    let classified = classify_paths(vec![session.clone()], &roots, true);

    assert_eq!(classified.len(), 1);
    let only = &classified[0];
    assert_eq!(only.0, ConnectorKind::Codex);
    assert_eq!(only.1.path, session);
}
/// With no roots supplied, an explicit path under `.codex/sessions` must still
/// yield a single Codex entry with its third and fourth tuple fields populated.
#[test]
fn classify_paths_keeps_explicit_codex_path_without_detected_root() {
    let tmp = tempfile::tempdir().unwrap();
    let mut session = tmp.path().to_path_buf();
    session.extend([".codex", "sessions", "2026", "03", "rollout-1.jsonl"]);
    std::fs::create_dir_all(session.parent().unwrap()).unwrap();
    std::fs::write(&session, b"{}").unwrap();

    let classified = classify_paths(vec![session.clone()], &[], true);

    assert_eq!(classified.len(), 1);
    let only = &classified[0];
    assert_eq!(only.0, ConnectorKind::Codex);
    assert_eq!(only.1.path, session);
    assert!(only.2.is_some());
    assert!(only.3.is_some());
}
/// Calls `reindex_paths` with an explicit Codex rollout file and an empty root
/// list, then asserts that one conversation was indexed and two message rows
/// were stored.
#[test]
#[serial]
fn reindex_paths_watch_once_indexes_explicit_codex_path_without_detected_root() {
    let tmp = tempfile::tempdir().unwrap();
    let data_dir = tmp.path().join("cass-data");
    std::fs::create_dir_all(&data_dir).unwrap();

    let mut session = tmp.path().to_path_buf();
    session.extend([
        ".codex",
        "sessions",
        "2026",
        "05",
        "08",
        "rollout-explicit-watch-once.jsonl",
    ]);
    std::fs::create_dir_all(session.parent().unwrap()).unwrap();

    // Session metadata, one user message, and one tool-call output.
    let rollout_jsonl = r#"{"timestamp":"2026-05-08T23:09:00.000Z","type":"session_meta","payload":{"id":"explicit-watch-once","cwd":"/data/projects/ntm"}}
{"timestamp":"2026-05-08T23:09:01.000Z","type":"response_item","payload":{"type":"message","role":"user","content":[{"type":"input_text","text":"bd-2mb03 explicit watch once"}]}}
{"timestamp":"2026-05-08T23:09:02.000Z","type":"response_item","payload":{"type":"function_call_output","call_id":"call-explicit","output":"bd-2mb03 explicit tool output\n"}}
"#;
    std::fs::write(&session, rollout_jsonl).unwrap();

    let opts = super::IndexOptions {
        full: false,
        watch: false,
        force_rebuild: false,
        watch_once_paths: Some(vec![session.clone()]),
        db_path: data_dir.join("db.sqlite"),
        data_dir: data_dir.clone(),
        semantic: false,
        build_hnsw: false,
        embedder: "fastembed".to_string(),
        progress: None,
        watch_interval_secs: 30,
    };

    let storage = Mutex::new(FrankenStorage::open(&opts.db_path).unwrap());
    let index_path = index_dir(&opts.data_dir).unwrap();
    let state = Mutex::new(HashMap::new());
    let t_index = Mutex::new(None);

    let indexed = reindex_paths(
        &opts,
        vec![session],
        &[],
        &state,
        &storage,
        &t_index,
        &index_path,
        false,
    )
    .unwrap();
    assert_eq!(indexed, 1);

    let storage_guard = storage.lock().unwrap();
    let message_count: i64 = storage_guard
        .raw()
        .query_row_map("SELECT COUNT(*) FROM messages", &[], |row| row.get_typed(0))
        .unwrap();
    assert_eq!(message_count, 2);
}
/// Runs `run_index` twice against the same explicit Codex rollout file,
/// appending a tool-call output line between the runs, and asserts that the
/// database ends up with two message rows.
#[test]
#[serial]
fn run_index_watch_once_reindexes_changed_explicit_codex_path_already_in_db() {
    let tmp = tempfile::tempdir().unwrap();
    let data_dir = tmp.path().join("cass-data");
    std::fs::create_dir_all(&data_dir).unwrap();

    let mut session = tmp.path().to_path_buf();
    session.extend([
        ".codex",
        "sessions",
        "2026",
        "05",
        "08",
        "rollout-explicit-watch-once-repeat.jsonl",
    ]);
    std::fs::create_dir_all(session.parent().unwrap()).unwrap();

    let initial = r#"{"timestamp":"2026-05-08T23:09:00.000Z","type":"session_meta","payload":{"id":"explicit-watch-once-repeat","cwd":"/data/projects/ntm"}}
{"timestamp":"2026-05-08T23:09:01.000Z","type":"response_item","payload":{"type":"message","role":"user","content":[{"type":"input_text","text":"bd-2mb03 repeat first"}]}}
"#;
    let appended_tool_output = r#"{"timestamp":"2026-05-08T23:09:02.000Z","type":"response_item","payload":{"type":"function_call_output","call_id":"call-repeat","output":"bd-2mb03 repeat second\n"}}
"#;
    std::fs::write(&session, initial).unwrap();

    // Fresh watch-once options for each run, targeting only `session`.
    let watch_once_options = |data_dir: &Path, session: &Path| super::IndexOptions {
        full: false,
        watch: false,
        force_rebuild: false,
        watch_once_paths: Some(vec![session.to_path_buf()]),
        db_path: data_dir.join("db.sqlite"),
        data_dir: data_dir.to_path_buf(),
        semantic: false,
        build_hnsw: false,
        embedder: "fastembed".to_string(),
        progress: None,
        watch_interval_secs: 30,
    };

    run_index(watch_once_options(&data_dir, &session), None).unwrap();

    // Pause briefly before rewriting the file with the extra line appended.
    std::thread::sleep(std::time::Duration::from_millis(25));
    std::fs::write(&session, format!("{initial}{appended_tool_output}")).unwrap();

    run_index(watch_once_options(&data_dir, &session), None).unwrap();

    let db_path = data_dir.join("db.sqlite");
    let storage = FrankenStorage::open(&db_path).unwrap();
    let message_count: i64 = storage
        .raw()
        .query_row_map("SELECT COUNT(*) FROM messages", &[], |row| row.get_typed(0))
        .unwrap();
    assert_eq!(message_count, 2);
}
/// Writes a three-line Codex rollout fixture to `path`, creating the parent
/// directories first.
///
/// `id` is embedded as the session id and `marker` prefixes the text of the
/// user and assistant messages.
///
/// # Errors
///
/// Fails when `path` has no parent, or when creating the directories or
/// writing the file fails.
fn write_semantic_watch_once_codex_session(path: &Path, id: &str, marker: &str) -> Result<()> {
    let fixture_dir = path
        .parent()
        .context("semantic watch-once fixture path should have a parent")?;
    std::fs::create_dir_all(fixture_dir)?;

    let rollout = format!(
        r#"{{"timestamp":"2026-05-28T09:00:00.000Z","type":"session_meta","payload":{{"id":"{id}","cwd":"/data/projects/coding_agent_session_search"}}}}
{{"timestamp":"2026-05-28T09:00:01.000Z","type":"response_item","payload":{{"type":"message","role":"user","content":[{{"type":"input_text","text":"{marker} user semantic"}}]}}}}
{{"timestamp":"2026-05-28T09:00:02.000Z","type":"response_item","payload":{{"type":"message","role":"assistant","content":[{{"type":"output_text","text":"{marker} assistant semantic"}}]}}}}
"#,
    );
    std::fs::write(path, rollout)?;
    Ok(())
}
/// Builds the `IndexOptions` shared by the semantic watch-once tests.
///
/// The options target exactly `session` via `watch_once_paths`, keep the
/// database at `<data_dir>/db.sqlite`, enable `semantic` with the `"hash"`
/// embedder, and attach `progress` so callers can read stats afterwards.
fn semantic_watch_once_opts(
    data_dir: &Path,
    session: &Path,
    progress: Arc<IndexingProgress>,
) -> super::IndexOptions {
    let targeted_paths = vec![session.to_path_buf()];
    let db_path = data_dir.join("db.sqlite");
    super::IndexOptions {
        full: false,
        watch: false,
        force_rebuild: false,
        watch_once_paths: Some(targeted_paths),
        db_path,
        data_dir: data_dir.to_path_buf(),
        semantic: true,
        build_hnsw: false,
        embedder: "hash".to_string(),
        progress: Some(progress),
        watch_interval_secs: 30,
    }
}
/// Returns a clone of the `semantic_watch_once` stats recorded on `progress`.
///
/// # Errors
/// Fails when the stats mutex is poisoned or when no semantic watch-once
/// stats were recorded.
fn semantic_watch_once_stats(
    progress: &Arc<IndexingProgress>,
) -> Result<SemanticWatchOnceStats> {
    let guard = match progress.stats.lock() {
        Ok(guard) => guard,
        Err(err) => anyhow::bail!("semantic watch-once stats lock poisoned: {err}"),
    };
    let recorded = guard.semantic_watch_once.clone();
    recorded.context("semantic watch-once proof")
}
/// A semantic watch-once run against a fresh data dir must publish a fast-tier
/// manifest and a vector index that cover exactly the one targeted session
/// (one conversation, two documents).
#[test]
#[serial]
fn run_index_semantic_watch_once_publishes_targeted_manifest() -> Result<()> {
    let tmp = tempfile::tempdir()?;
    let data_dir = tmp.path().join("cass-data");
    std::fs::create_dir_all(&data_dir)?;
    let session = tmp.path().join(
        [
            ".codex",
            "sessions",
            "2026",
            "05",
            "28",
            "rollout-semantic-watch-once.jsonl",
        ]
        .iter()
        .collect::<PathBuf>(),
    );
    write_semantic_watch_once_codex_session(
        &session,
        "semantic-watch-once-fresh",
        "swonce-fresh",
    )?;

    let progress = Arc::new(IndexingProgress::default());
    let opts = semantic_watch_once_opts(&data_dir, &session, Arc::clone(&progress));
    run_index(opts, None)?;

    // Stats reported through the progress handle.
    let stats = semantic_watch_once_stats(&progress)?;
    anyhow::ensure!(stats.published, "semantic watch-once did not publish");
    anyhow::ensure!(
        stats.reason.as_str() == "fresh_watch_once_db",
        "wrong reason"
    );
    anyhow::ensure!(stats.tier.as_str() == "fast", "wrong tier");
    anyhow::ensure!(stats.selected_docs == 2, "wrong selected doc count");
    anyhow::ensure!(stats.embedded_docs == 2, "wrong embedded doc count");
    anyhow::ensure!(
        stats.manifest_after_db_fingerprint.as_deref() == Some("content-v1:1:1:2"),
        "wrong post-publish fingerprint"
    );

    // Manifest persisted on disk.
    let manifest = SemanticManifest::load_or_default(&data_dir)?;
    let artifact = manifest.fast_tier.as_ref().context("fast artifact")?;
    anyhow::ensure!(
        artifact.conversation_count == 1,
        "wrong conversation count"
    );
    anyhow::ensure!(artifact.doc_count == 2, "wrong doc count");
    anyhow::ensure!(
        artifact.db_fingerprint.as_str() == "content-v1:1:1:2",
        "wrong artifact fingerprint"
    );

    // Vector index persisted on disk.
    let index_file = vector_index_path(&data_dir, "fnv1a-384");
    let index = FsVectorIndex::open(&index_file)?;
    anyhow::ensure!(index.record_count() == 2, "wrong vector record count");
    Ok(())
}
/// After one session has been published, a second watch-once run targeting a
/// different session must report the append-only-prefix reason, embed only the
/// two new documents, and leave a manifest covering both conversations.
#[test]
#[serial]
fn run_index_semantic_watch_once_catches_up_append_only_prefix_manifest() -> Result<()> {
    let tmp = tempfile::tempdir()?;
    let data_dir = tmp.path().join("cass-data");
    std::fs::create_dir_all(&data_dir)?;
    let sessions_dir = tmp.path().join(
        [".codex", "sessions", "2026", "05", "28"]
            .iter()
            .collect::<PathBuf>(),
    );
    let first = sessions_dir.join("rollout-semantic-watch-once-first.jsonl");
    let second = first.with_file_name("rollout-semantic-watch-once-second.jsonl");
    write_semantic_watch_once_codex_session(&first, "semantic-watch-once-first", "swonce-one")?;
    write_semantic_watch_once_codex_session(
        &second,
        "semantic-watch-once-second",
        "swonce-two",
    )?;

    // First run seeds the manifest; its stats are not inspected.
    let first_progress = Arc::new(IndexingProgress::default());
    let first_opts = semantic_watch_once_opts(&data_dir, &first, first_progress);
    run_index(first_opts, None)?;

    // Second run is the one under test.
    let second_progress = Arc::new(IndexingProgress::default());
    let second_opts = semantic_watch_once_opts(&data_dir, &second, Arc::clone(&second_progress));
    run_index(second_opts, None)?;

    let stats = semantic_watch_once_stats(&second_progress)?;
    anyhow::ensure!(stats.published, "semantic watch-once did not publish");
    anyhow::ensure!(
        stats.reason.as_str() == "semantic_artifact_is_append_only_prefix",
        "wrong reason"
    );
    anyhow::ensure!(
        stats.manifest_before_db_fingerprint.as_deref() == Some("content-v1:1:1:2"),
        "wrong pre-publish fingerprint"
    );
    anyhow::ensure!(
        stats.manifest_after_db_fingerprint.as_deref() == Some("content-v1:2:2:4"),
        "wrong post-publish fingerprint"
    );
    anyhow::ensure!(stats.selected_docs == 2, "wrong selected doc count");
    anyhow::ensure!(stats.embedded_docs == 2, "wrong embedded doc count");

    let manifest = SemanticManifest::load_or_default(&data_dir)?;
    let artifact = manifest.fast_tier.as_ref().context("fast artifact")?;
    anyhow::ensure!(
        artifact.conversation_count == 2,
        "wrong conversation count"
    );
    anyhow::ensure!(artifact.doc_count == 4, "wrong doc count");
    anyhow::ensure!(
        artifact.db_fingerprint.as_str() == "content-v1:2:2:4",
        "wrong artifact fingerprint"
    );

    let index_file = vector_index_path(&data_dir, "fnv1a-384");
    let index = FsVectorIndex::open(&index_file)?;
    anyhow::ensure!(index.record_count() == 4, "wrong vector record count");
    Ok(())
}
/// Re-running semantic watch-once on an unchanged session must report the
/// already-covered reason, select and embed zero documents, and leave the
/// manifest fingerprint and vector record count untouched.
#[test]
#[serial]
fn run_index_semantic_watch_once_reports_already_covered_manifest() -> Result<()> {
    let tmp = tempfile::tempdir()?;
    let data_dir = tmp.path().join("cass-data");
    std::fs::create_dir_all(&data_dir)?;
    let session = tmp.path().join(
        [
            ".codex",
            "sessions",
            "2026",
            "05",
            "28",
            "rollout-semantic-watch-once-rerun.jsonl",
        ]
        .iter()
        .collect::<PathBuf>(),
    );
    write_semantic_watch_once_codex_session(
        &session,
        "semantic-watch-once-rerun",
        "swonce-rerun",
    )?;

    // First run publishes; the second run over the same file is under test.
    let first_progress = Arc::new(IndexingProgress::default());
    let first_opts = semantic_watch_once_opts(&data_dir, &session, first_progress);
    run_index(first_opts, None)?;
    let second_progress = Arc::new(IndexingProgress::default());
    let second_opts = semantic_watch_once_opts(&data_dir, &session, Arc::clone(&second_progress));
    run_index(second_opts, None)?;

    let stats = semantic_watch_once_stats(&second_progress)?;
    anyhow::ensure!(stats.published, "semantic watch-once did not publish");
    anyhow::ensure!(
        stats.reason.as_str() == "semantic_artifact_already_covers_db",
        "wrong reason"
    );
    anyhow::ensure!(
        stats.manifest_before_db_fingerprint.as_deref() == Some("content-v1:1:1:2"),
        "wrong pre-publish fingerprint"
    );
    anyhow::ensure!(
        stats.manifest_after_db_fingerprint.as_deref() == Some("content-v1:1:1:2"),
        "wrong post-publish fingerprint"
    );
    anyhow::ensure!(stats.selected_docs == 0, "wrong selected doc count");
    anyhow::ensure!(stats.embedded_docs == 0, "wrong embedded doc count");

    let manifest = SemanticManifest::load_or_default(&data_dir)?;
    let artifact = manifest.fast_tier.as_ref().context("fast artifact")?;
    anyhow::ensure!(
        artifact.conversation_count == 1,
        "wrong conversation count"
    );
    anyhow::ensure!(artifact.doc_count == 2, "wrong doc count");
    anyhow::ensure!(
        artifact.db_fingerprint.as_str() == "content-v1:1:1:2",
        "wrong artifact fingerprint"
    );

    let index_file = vector_index_path(&data_dir, "fnv1a-384");
    let index = FsVectorIndex::open(&index_file)?;
    anyhow::ensure!(index.record_count() == 2, "wrong vector record count");
    Ok(())
}
/// Semantic watch-once over a file that yields no conversation must fail with
/// an error whose text mentions "indexed zero conversations".
#[test]
#[serial]
fn run_index_semantic_watch_once_fails_when_no_conversation_is_indexed() -> Result<()> {
    let tmp = tempfile::tempdir()?;
    let data_dir = tmp.path().join("cass-data");
    std::fs::create_dir_all(&data_dir)?;
    let session = tmp.path().join("not-a-supported-watch-once-file.jsonl");
    std::fs::write(&session, "{}\n")?;

    let progress = Arc::new(IndexingProgress::default());
    let opts = semantic_watch_once_opts(&data_dir, &session, progress);
    let Err(failure) = run_index(opts, None) else {
        anyhow::bail!("zero-conversation semantic watch-once unexpectedly succeeded");
    };

    let rendered = failure.to_string();
    if !rendered.contains("indexed zero conversations") {
        anyhow::bail!("unexpected error: {rendered}");
    }
    Ok(())
}
/// Plain read and close-after-read access events must not trigger a reindex.
#[test]
fn watch_event_filter_ignores_read_access_noise() {
    let cases = [
        (
            AccessKind::Read,
            "read-only access events should not retrigger watch indexing",
        ),
        (
            AccessKind::Close(AccessMode::Read),
            "close-after-read events should not retrigger watch indexing",
        ),
    ];
    for (access, why) in cases {
        let event = notify::Event::new(notify::event::EventKind::Access(access))
            .add_path(PathBuf::from("/tmp/session.jsonl"));
        assert!(!watch_event_should_trigger_reindex(&event), "{why}");
    }
}
/// Close-after-write and write-time metadata events must still trigger a reindex.
#[test]
fn watch_event_filter_keeps_mutating_events() {
    let cases = [
        (
            notify::event::EventKind::Access(AccessKind::Close(AccessMode::Write)),
            "close-after-write events should still retrigger indexing",
        ),
        (
            notify::event::EventKind::Modify(ModifyKind::Metadata(MetadataKind::WriteTime)),
            "write-time metadata changes should still retrigger indexing",
        ),
    ];
    for (kind, why) in cases {
        let event = notify::Event::new(kind).add_path(PathBuf::from("/tmp/session.jsonl"));
        assert!(watch_event_should_trigger_reindex(&event), "{why}");
    }
}
/// An access-time metadata change must not trigger a reindex.
#[test]
fn watch_event_filter_ignores_access_time_metadata() {
    let kind = notify::event::EventKind::Modify(ModifyKind::Metadata(MetadataKind::AccessTime));
    let atime_event = notify::Event::new(kind).add_path(PathBuf::from("/tmp/session.jsonl"));
    let triggers = watch_event_should_trigger_reindex(&atime_event);
    assert!(
        !triggers,
        "access-time metadata changes are read noise and should be ignored"
    );
}
/// A file-removal event must not trigger a reindex.
#[test]
fn watch_event_filter_ignores_remove_events_without_delete_support() {
    let kind = notify::event::EventKind::Remove(notify::event::RemoveKind::File);
    let removal = notify::Event::new(kind).add_path(PathBuf::from("/tmp/session.jsonl"));
    let triggers = watch_event_should_trigger_reindex(&removal);
    assert!(
        !triggers,
        "remove events should be ignored until watch mode can remove stale indexed rows"
    );
}
/// Values written by `save_watch_state` must come back from `load_watch_state`.
#[test]
#[serial]
fn watch_state_round_trips_to_disk() {
    let tmp = TempDir::new().unwrap();
    let data_dir = tmp.path().join("data");
    std::fs::create_dir_all(&data_dir).unwrap();

    let saved = HashMap::from([(ConnectorKind::Codex, 123), (ConnectorKind::Gemini, 456)]);
    save_watch_state(&data_dir, &saved).unwrap();

    let loaded = load_watch_state(&data_dir);
    assert_eq!(loaded.get(&ConnectorKind::Codex).copied(), Some(123));
    assert_eq!(loaded.get(&ConnectorKind::Gemini).copied(), Some(456));
}
/// A second `save_watch_state` call must replace the earlier file's contents
/// rather than merge into them.
#[test]
#[serial]
fn watch_state_overwrites_existing_file() {
    let tmp = TempDir::new().unwrap();
    let data_dir = tmp.path().join("data");
    std::fs::create_dir_all(&data_dir).unwrap();

    let first = HashMap::from([(ConnectorKind::Codex, 111)]);
    save_watch_state(&data_dir, &first).unwrap();
    let second = HashMap::from([(ConnectorKind::Amp, 222)]);
    save_watch_state(&data_dir, &second).unwrap();

    let loaded = load_watch_state(&data_dir);
    assert_eq!(loaded.len(), 1);
    assert_eq!(loaded.get(&ConnectorKind::Amp).copied(), Some(222));
    assert!(loaded.get(&ConnectorKind::Codex).is_none());
}
/// Two temp paths derived from the same final path must differ from each
/// other while sharing the final path's parent directory.
#[test]
fn watch_state_temp_paths_are_unique() {
    let final_path = Path::new("/tmp/watch_state.json");
    let (first, second) = (
        unique_atomic_temp_path(final_path),
        unique_atomic_temp_path(final_path),
    );
    assert_ne!(first, second);

    let expected_parent = final_path.parent();
    assert_eq!(first.parent(), expected_parent);
    assert_eq!(second.parent(), expected_parent);
}
/// Creating a sidecar file where a symlink already exists must fail with
/// `AlreadyExists` and leave the symlink's target file untouched.
#[cfg(unix)]
#[test]
fn atomic_sidecar_file_creation_refuses_preexisting_symlink() {
    use std::os::unix::fs::symlink;

    let tmp = TempDir::new().unwrap();
    let target = tmp.path().join("protected.json");
    let sidecar = tmp.path().join(".watch_state.json.tmp");
    std::fs::write(&target, b"protected").unwrap();
    symlink(&target, &sidecar).unwrap();

    let outcome = create_new_atomic_sidecar_file(&sidecar);
    let err = outcome.expect_err("symlink collision should fail");
    assert_eq!(err.kind(), std::io::ErrorKind::AlreadyExists);

    let target_bytes = std::fs::read(&target).unwrap();
    assert_eq!(target_bytes, b"protected");
}
/// A `watch_state.json` holding a flat `{"Codex":…,"Gemini":…}` map must still
/// load into the corresponding `ConnectorKind` entries.
#[test]
#[serial]
fn watch_state_loads_legacy_map_format() {
    let tmp = TempDir::new().unwrap();
    let data_dir = tmp.path().join("data");
    std::fs::create_dir_all(&data_dir).unwrap();

    let state_file = data_dir.join("watch_state.json");
    std::fs::write(state_file, r#"{"Codex":123,"Gemini":456}"#).unwrap();

    let loaded = load_watch_state(&data_dir);
    assert_eq!(loaded.get(&ConnectorKind::Codex).copied(), Some(123));
    assert_eq!(loaded.get(&ConnectorKind::Gemini).copied(), Some(456));
}
/// The saved watch-state file must contain the short `"m"` and `"cx"` keys
/// and must not spell out the `Codex` variant name.
#[test]
#[serial]
fn watch_state_saves_compact_keys() {
    let tmp = TempDir::new().unwrap();
    let data_dir = tmp.path().join("data");
    std::fs::create_dir_all(&data_dir).unwrap();

    let state = HashMap::from([(ConnectorKind::Codex, 123)]);
    save_watch_state(&data_dir, &state).unwrap();

    let state_file = data_dir.join("watch_state.json");
    let raw = std::fs::read_to_string(state_file).unwrap();
    assert!(raw.contains("\"m\""));
    assert!(raw.contains("\"cx\""));
    assert!(!raw.contains("Codex"));
}
/// Indexing one Amp thread file through `reindex_paths` must persist a
/// positive `ConnectorKind::Amp` entry in the on-disk watch state.
///
/// `XDG_DATA_HOME` is pointed into the temp dir for the duration of the test
/// and restored (or removed) at the end.
#[test]
#[serial]
fn watch_state_updates_after_reindex_paths() {
    let tmp = TempDir::new().unwrap();
    let xdg = tmp.path().join("xdg_watch_state");
    std::fs::create_dir_all(&xdg).unwrap();
    let prev = dotenvy::var("XDG_DATA_HOME").ok();
    // SAFETY: presumably sound because `#[serial]` keeps env-mutating tests from
    // overlapping. NOTE(review): confirm no non-serial test reads the environment.
    unsafe { std::env::set_var("XDG_DATA_HOME", &xdg) };
    let data_dir = xdg.join("amp");
    std::fs::create_dir_all(&data_dir).unwrap();
    let amp_dir = data_dir.join("amp");
    std::fs::create_dir_all(&amp_dir).unwrap();
    let amp_file = amp_dir.join("thread-002.json");
    std::fs::write(
        &amp_file,
        r#"{
"id": "thread-002",
"title": "Amp test",
"messages": [
{"role":"user","text":"hi","createdAt":1700000000100},
{"role":"assistant","text":"hello","createdAt":1700000000200}
]
}"#,
    )
    .unwrap();
    let opts = super::IndexOptions {
        full: false,
        watch: false,
        force_rebuild: false,
        db_path: data_dir.join("agent_search.db"),
        data_dir: data_dir.clone(),
        semantic: false,
        build_hnsw: false,
        embedder: "fastembed".to_string(),
        progress: None,
        watch_once_paths: None,
        watch_interval_secs: 30,
    };
    let storage = FrankenStorage::open(&opts.db_path).unwrap();
    let index_path = index_dir(&opts.data_dir).unwrap();
    let t_index = TantivyIndex::open_or_create(&index_path).unwrap();
    // `reindex_paths` takes its shared state behind mutexes.
    let state = std::sync::Mutex::new(std::collections::HashMap::new());
    let storage = std::sync::Mutex::new(storage);
    let t_index = std::sync::Mutex::new(Some(t_index));
    let roots = vec![(ConnectorKind::Amp, ScanRoot::local(amp_dir))];
    reindex_paths(
        &opts,
        vec![amp_file.clone()],
        &roots,
        &state,
        &storage,
        &t_index,
        &index_path,
        false,
    )
    .unwrap();
    let loaded = load_watch_state(&data_dir);
    assert!(loaded.contains_key(&ConnectorKind::Amp));
    let ts = loaded.get(&ConnectorKind::Amp).copied().unwrap();
    assert!(ts > 0);
    // NOTE(review): this restore is skipped if an assertion above panics.
    if let Some(prev) = prev {
        // SAFETY: same serialization argument as the `set_var` above.
        unsafe { std::env::set_var("XDG_DATA_HOME", prev) };
    } else {
        // SAFETY: same serialization argument as the `set_var` above.
        unsafe { std::env::remove_var("XDG_DATA_HOME") };
    }
}
/// With `CASS_TEST_WATCH_INGEST_OOM_MIN_CONVS=2` and a chunk size of 4, a
/// watch reindex over four Amp files must still index all four, advance the
/// Amp watch state, and write no poison-quarantine file.
///
/// The env hook presumably simulates an out-of-memory failure for batches of
/// at least that many conversations — TODO confirm against the ingest code.
#[test]
#[serial]
fn watch_reindex_splits_oom_batches_and_still_advances_state() {
    let tmp = TempDir::new().unwrap();
    let xdg = tmp.path().join("xdg_watch_oom_split");
    std::fs::create_dir_all(&xdg).unwrap();
    let prev = dotenvy::var("XDG_DATA_HOME").ok();
    // SAFETY: presumably sound because `#[serial]` keeps env-mutating tests from
    // overlapping. NOTE(review): confirm no non-serial test reads the environment.
    unsafe { std::env::set_var("XDG_DATA_HOME", &xdg) };
    let _oom_guard = set_env("CASS_TEST_WATCH_INGEST_OOM_MIN_CONVS", "2");
    let _chunk_guard = set_env("CASS_WATCH_INGEST_CHUNK_SIZE", "4");
    let _window_guard = set_env("CASS_ACTIVE_SESSION_RECENT_WRITE_WINDOW_SECS", "0");
    let data_dir = xdg.join("amp");
    std::fs::create_dir_all(&data_dir).unwrap();
    let amp_dir = data_dir.join("amp");
    std::fs::create_dir_all(&amp_dir).unwrap();
    let now_u128 = std::time::SystemTime::now()
        .duration_since(std::time::UNIX_EPOCH)
        .unwrap()
        .as_millis();
    // Message timestamps sit ten seconds after "now".
    let base_ts = i64::try_from(now_u128)
        .unwrap_or(i64::MAX)
        .saturating_add(10_000);
    let mut paths = Vec::new();
    for idx in 0..4 {
        let path = amp_dir.join(format!("thread-oom-{idx}.json"));
        std::fs::write(
            &path,
            format!(
                r#"{{"id":"thread-oom-{idx}","messages":[{{"role":"user","text":"p{idx}","createdAt":{}}}]}}"#,
                base_ts + i64::from(idx)
            ),
        )
        .unwrap();
        paths.push(path);
    }
    let progress = Arc::new(super::IndexingProgress::default());
    let opts = super::IndexOptions {
        full: false,
        watch: true,
        force_rebuild: false,
        db_path: data_dir.join("agent_search.db"),
        data_dir: data_dir.clone(),
        semantic: false,
        build_hnsw: false,
        embedder: "fastembed".to_string(),
        progress: Some(progress.clone()),
        watch_once_paths: None,
        watch_interval_secs: 30,
    };
    let storage = FrankenStorage::open(&opts.db_path).unwrap();
    let index_path = index_dir(&opts.data_dir).unwrap();
    let t_index = TantivyIndex::open_or_create(&index_path).unwrap();
    let state = std::sync::Mutex::new(std::collections::HashMap::new());
    let storage = std::sync::Mutex::new(storage);
    let t_index = std::sync::Mutex::new(Some(t_index));
    let roots = vec![(ConnectorKind::Amp, ScanRoot::local(amp_dir))];
    let indexed = reindex_paths(
        &opts,
        paths,
        &roots,
        &state,
        &storage,
        &t_index,
        &index_path,
        false,
    )
    .unwrap();
    assert_eq!(indexed, 4);
    assert_eq!(progress.current.load(Ordering::Relaxed), 4);
    let conversation_rows: i64 = storage
        .lock()
        .unwrap()
        .raw()
        .query_row_map(
            "SELECT COUNT(*) FROM conversations",
            &[] as &[ParamValue],
            |row| row.get_typed(0),
        )
        .unwrap();
    assert_eq!(conversation_rows, 4);
    assert!(
        load_watch_state(&data_dir).contains_key(&ConnectorKind::Amp),
        "watch state should advance after split batches persist"
    );
    assert!(
        !data_dir
            .join("quarantine/watch_ingest_poison.jsonl")
            .exists(),
        "split batches should avoid quarantining conversations that fit singly"
    );
    // NOTE(review): this restore is skipped if an assertion above panics.
    if let Some(prev) = prev {
        // SAFETY: same serialization argument as the `set_var` above.
        unsafe { std::env::set_var("XDG_DATA_HOME", prev) };
    } else {
        // SAFETY: same serialization argument as the `set_var` above.
        unsafe { std::env::remove_var("XDG_DATA_HOME") };
    }
}
/// With `CASS_TEST_WATCH_INGEST_OOM_MIN_CONVS=1`, a watch reindex of a single
/// Amp file must write the poison-quarantine file while leaving the persisted
/// watch state empty, so the source stays eligible for a later retry.
#[test]
#[serial]
fn watch_reindex_quarantine_preserves_watermark_for_retry() {
    let tmp = TempDir::new().unwrap();
    let data_dir = tmp.path().join("watch-quarantine-retry");
    let amp_dir = data_dir.join("amp");
    // Also creates `data_dir`, since `amp_dir` is nested inside it.
    std::fs::create_dir_all(&amp_dir).unwrap();
    let now_u128 = std::time::SystemTime::now()
        .duration_since(std::time::UNIX_EPOCH)
        .unwrap()
        .as_millis();
    // The message timestamp sits ten seconds after "now".
    let created_at = i64::try_from(now_u128)
        .unwrap_or(i64::MAX)
        .saturating_add(10_000);
    let amp_file = amp_dir.join("thread-watch-quarantine-retry.json");
    std::fs::write(
        &amp_file,
        format!(
            r#"{{"id":"thread-watch-quarantine-retry","messages":[{{"role":"user","text":"retry me","createdAt":{created_at}}}]}}"#
        ),
    )
    .unwrap();
    let _oom_guard = set_env("CASS_TEST_WATCH_INGEST_OOM_MIN_CONVS", "1");
    let _window_guard = set_env("CASS_ACTIVE_SESSION_RECENT_WRITE_WINDOW_SECS", "0");
    let opts = super::IndexOptions {
        full: false,
        watch: true,
        force_rebuild: false,
        db_path: data_dir.join("agent_search.db"),
        data_dir: data_dir.clone(),
        semantic: false,
        build_hnsw: false,
        embedder: "fastembed".to_string(),
        progress: None,
        watch_once_paths: None,
        watch_interval_secs: 30,
    };
    let storage = FrankenStorage::open(&opts.db_path).unwrap();
    storage.run_migrations().unwrap();
    let index_path = index_dir(&opts.data_dir).unwrap();
    let t_index = TantivyIndex::open_or_create(&index_path).unwrap();
    let state = std::sync::Mutex::new(std::collections::HashMap::new());
    let storage = std::sync::Mutex::new(storage);
    let t_index = std::sync::Mutex::new(Some(t_index));
    let roots = vec![(ConnectorKind::Amp, ScanRoot::local(amp_dir))];
    let indexed = reindex_paths(
        &opts,
        vec![amp_file],
        &roots,
        &state,
        &storage,
        &t_index,
        &index_path,
        false,
    )
    .unwrap();
    assert_eq!(indexed, 1);
    assert!(
        load_watch_state(&data_dir).is_empty(),
        "quarantined watch sources must remain eligible for later retry"
    );
    assert!(
        data_dir
            .join("quarantine/watch_ingest_poison.jsonl")
            .exists(),
        "watch OOM should still be recorded for operator visibility"
    );
}
/// With `CASS_TEST_INCREMENTAL_LEXICAL_UPDATE_OOM=1`, an explicit watch-once
/// reindex must still persist the conversation and its message to SQLite,
/// write no poison-quarantine file, and leave the Tantivy slot set to `None`.
#[test]
#[serial]
fn explicit_watch_once_defers_lexical_oom_without_quarantining_source() {
    let tmp = TempDir::new().unwrap();
    let xdg = tmp.path().join("xdg_watch_once_lexical_oom");
    std::fs::create_dir_all(&xdg).unwrap();
    let prev = dotenvy::var("XDG_DATA_HOME").ok();
    // SAFETY: presumably sound because `#[serial]` keeps env-mutating tests from
    // overlapping. NOTE(review): confirm no non-serial test reads the environment.
    unsafe { std::env::set_var("XDG_DATA_HOME", &xdg) };
    let _lexical_oom_guard = set_env("CASS_TEST_INCREMENTAL_LEXICAL_UPDATE_OOM", "1");
    let data_dir = xdg.join("amp");
    std::fs::create_dir_all(&data_dir).unwrap();
    let amp_dir = data_dir.join("amp");
    std::fs::create_dir_all(&amp_dir).unwrap();
    let amp_file = amp_dir.join("thread-watch-once-lexical-oom.json");
    std::fs::write(
        &amp_file,
        r#"{"id":"thread-watch-once-lexical-oom","messages":[{"role":"user","text":"persist me","createdAt":1700000000100}]}"#,
    )
    .unwrap();
    let opts = super::IndexOptions {
        full: false,
        watch: false,
        force_rebuild: false,
        db_path: data_dir.join("agent_search.db"),
        data_dir: data_dir.clone(),
        semantic: false,
        build_hnsw: false,
        embedder: "fastembed".to_string(),
        progress: None,
        watch_once_paths: Some(vec![amp_file.clone()]),
        watch_interval_secs: 30,
    };
    let storage = FrankenStorage::open(&opts.db_path).unwrap();
    let index_path = index_dir(&opts.data_dir).unwrap();
    let t_index = TantivyIndex::open_or_create(&index_path).unwrap();
    let state = std::sync::Mutex::new(std::collections::HashMap::new());
    let storage = std::sync::Mutex::new(storage);
    let t_index = std::sync::Mutex::new(Some(t_index));
    // The scan root here is the file itself rather than its directory.
    let roots = vec![(ConnectorKind::Amp, ScanRoot::local(amp_file.clone()))];
    let indexed = reindex_paths(
        &opts,
        vec![amp_file.clone()],
        &roots,
        &state,
        &storage,
        &t_index,
        &index_path,
        false,
    )
    .unwrap();
    assert_eq!(indexed, 1);
    let (conversation_rows, message_rows): (i64, i64) = {
        let storage = storage.lock().unwrap();
        let conversations = storage
            .raw()
            .query_row_map(
                "SELECT COUNT(*) FROM conversations",
                &[] as &[ParamValue],
                |row| row.get_typed(0),
            )
            .unwrap();
        let messages = storage
            .raw()
            .query_row_map(
                "SELECT COUNT(*) FROM messages",
                &[] as &[ParamValue],
                |row| row.get_typed(0),
            )
            .unwrap();
        (conversations, messages)
    };
    assert_eq!(conversation_rows, 1);
    assert_eq!(message_rows, 1);
    assert!(
        !data_dir
            .join("quarantine/watch_ingest_poison.jsonl")
            .exists(),
        "lexical update OOM must not quarantine a source conversation that persisted to SQLite"
    );
    assert!(
        t_index.lock().unwrap().is_none(),
        "dirty Tantivy writer should be dropped after deferred lexical update"
    );
    // NOTE(review): this restore is skipped if an assertion above panics.
    if let Some(prev) = prev {
        // SAFETY: same serialization argument as the `set_var` above.
        unsafe { std::env::set_var("XDG_DATA_HOME", prev) };
    } else {
        // SAFETY: same serialization argument as the `set_var` above.
        unsafe { std::env::remove_var("XDG_DATA_HOME") };
    }
}
/// When the targeted file is listed in `CASS_TEST_ACTIVE_SESSION_SOURCE_PATHS`,
/// an explicit watch-once reindex must index nothing, persist no rows, leave
/// `last_indexed_at` unset, and write no poison-quarantine file.
#[test]
#[serial]
fn explicit_watch_once_skips_active_session_source_without_watermarking() {
    let _active_skip_reset = ActiveSessionSkipReset;
    let tmp = TempDir::new().unwrap();
    let data_dir = tmp.path().join("active-source-skip");
    let amp_dir = data_dir.join("amp");
    // Also creates `data_dir`, since `amp_dir` is nested inside it.
    std::fs::create_dir_all(&amp_dir).unwrap();
    let amp_file = amp_dir.join("thread-active-source.json");
    std::fs::write(
        &amp_file,
        r#"{"id":"thread-active-source","messages":[{"role":"user","text":"not ready yet","createdAt":1700000000100}]}"#,
    )
    .unwrap();
    // Encode the single path in the platform's path-list format.
    let active_paths = std::env::join_paths([amp_file.as_os_str()]).unwrap();
    let active_paths = active_paths.to_string_lossy().to_string();
    let _active_guard = set_env("CASS_TEST_ACTIVE_SESSION_SOURCE_PATHS", &active_paths);
    let opts = super::IndexOptions {
        full: false,
        watch: false,
        force_rebuild: false,
        db_path: data_dir.join("agent_search.db"),
        data_dir: data_dir.clone(),
        semantic: false,
        build_hnsw: false,
        embedder: "fastembed".to_string(),
        progress: None,
        watch_once_paths: Some(vec![amp_file.clone()]),
        watch_interval_secs: 30,
    };
    let storage = FrankenStorage::open(&opts.db_path).unwrap();
    storage.run_migrations().unwrap();
    let index_path = index_dir(&opts.data_dir).unwrap();
    let t_index = TantivyIndex::open_or_create(&index_path).unwrap();
    let state = std::sync::Mutex::new(std::collections::HashMap::new());
    let storage = std::sync::Mutex::new(storage);
    let t_index = std::sync::Mutex::new(Some(t_index));
    let roots = vec![(ConnectorKind::Amp, ScanRoot::local(amp_file.clone()))];
    let indexed = reindex_paths(
        &opts,
        vec![amp_file.clone()],
        &roots,
        &state,
        &storage,
        &t_index,
        &index_path,
        false,
    )
    .unwrap();
    assert_eq!(indexed, 0);
    let (conversation_rows, message_rows, last_indexed_at): (i64, i64, Option<i64>) = {
        let storage = storage.lock().unwrap();
        let conversations = storage
            .raw()
            .query_row_map(
                "SELECT COUNT(*) FROM conversations",
                &[] as &[ParamValue],
                |row| row.get_typed(0),
            )
            .unwrap();
        let messages = storage
            .raw()
            .query_row_map(
                "SELECT COUNT(*) FROM messages",
                &[] as &[ParamValue],
                |row| row.get_typed(0),
            )
            .unwrap();
        (
            conversations,
            messages,
            storage.get_last_indexed_at().unwrap(),
        )
    };
    assert_eq!(conversation_rows, 0);
    assert_eq!(message_rows, 0);
    assert_eq!(
        last_indexed_at, None,
        "active source skips must not advance last_indexed_at"
    );
    assert!(
        !data_dir
            .join("quarantine/watch_ingest_poison.jsonl")
            .exists(),
        "active source skips are retryable, not poison quarantine"
    );
}
/// With `CASS_ACTIVE_SESSION_RECENT_WRITE_WINDOW_SECS=3600`, a watch-mode
/// reindex of a just-written file must index nothing, leave `last_indexed_at`
/// unset, and write no poison-quarantine file.
#[test]
#[serial]
fn watch_mode_recent_session_source_skips_without_quarantine_or_watermarking() {
    let _active_skip_reset = ActiveSessionSkipReset;
    let tmp = TempDir::new().unwrap();
    let data_dir = tmp.path().join("recent-active-source-skip");
    let amp_dir = data_dir.join("amp");
    // Also creates `data_dir`, since `amp_dir` is nested inside it.
    std::fs::create_dir_all(&amp_dir).unwrap();
    let amp_file = amp_dir.join("thread-recent-active-source.json");
    std::fs::write(
        &amp_file,
        r#"{"id":"thread-recent-active-source","messages":[{"role":"user","text":"still streaming","createdAt":1700000000100}]}"#,
    )
    .unwrap();
    let _window_guard = set_env("CASS_ACTIVE_SESSION_RECENT_WRITE_WINDOW_SECS", "3600");
    let opts = super::IndexOptions {
        full: false,
        watch: true,
        force_rebuild: false,
        db_path: data_dir.join("agent_search.db"),
        data_dir: data_dir.clone(),
        semantic: false,
        build_hnsw: false,
        embedder: "fastembed".to_string(),
        progress: None,
        watch_once_paths: None,
        watch_interval_secs: 30,
    };
    let storage = FrankenStorage::open(&opts.db_path).unwrap();
    storage.run_migrations().unwrap();
    let index_path = index_dir(&opts.data_dir).unwrap();
    let t_index = TantivyIndex::open_or_create(&index_path).unwrap();
    let state = std::sync::Mutex::new(std::collections::HashMap::new());
    let storage = std::sync::Mutex::new(storage);
    let t_index = std::sync::Mutex::new(Some(t_index));
    let roots = vec![(ConnectorKind::Amp, ScanRoot::local(amp_file.clone()))];
    let indexed = reindex_paths(
        &opts,
        vec![amp_file.clone()],
        &roots,
        &state,
        &storage,
        &t_index,
        &index_path,
        false,
    )
    .unwrap();
    assert_eq!(indexed, 0);
    let last_indexed_at = storage.lock().unwrap().get_last_indexed_at().unwrap();
    assert_eq!(
        last_indexed_at, None,
        "recent active source skips must not advance last_indexed_at"
    );
    assert!(
        !data_dir
            .join("quarantine/watch_ingest_poison.jsonl")
            .exists(),
        "recent active source skips are retryable, not poison quarantine"
    );
}
/// `sort_watch_conversations_for_watermark` must place conversations without
/// a `started_at` first, followed by the rest in ascending `started_at` order.
#[test]
fn watch_conversation_sort_orders_missing_then_ascending_watermarks() {
    // Minimal conversation: only source, id and start time vary.
    let make = |source: &str, external_id: &str, started_at: Option<i64>| NormalizedConversation {
        agent_slug: "amp".to_string(),
        source_path: PathBuf::from(source),
        external_id: Some(external_id.to_string()),
        title: None,
        workspace: None,
        started_at,
        ended_at: None,
        messages: Vec::new(),
        metadata: serde_json::Value::Null,
    };
    let mut convs = vec![
        make("/tmp/c", "c", Some(30)),
        make("/tmp/a", "a", None),
        make("/tmp/b", "b", Some(10)),
    ];

    sort_watch_conversations_for_watermark(&mut convs);

    let mut ordered = Vec::with_capacity(convs.len());
    for sorted in &convs {
        ordered.push(sorted.external_id.as_deref().unwrap());
    }
    assert_eq!(ordered, ["a", "b", "c"]);
}
/// The semantic delta from `reindex_paths_with_semantic_delta` must carry
/// every message on the first run and only the newly appended message when
/// the same file is rewritten with one extra entry.
#[test]
#[serial]
fn reindex_paths_carries_only_new_packet_semantic_delta_messages() {
    let tmp = TempDir::new().unwrap();
    let xdg = tmp.path().join("xdg_watch_semantic_delta");
    std::fs::create_dir_all(&xdg).unwrap();
    let prev = dotenvy::var("XDG_DATA_HOME").ok();
    // SAFETY: presumably sound because `#[serial]` keeps env-mutating tests from
    // overlapping. NOTE(review): confirm no non-serial test reads the environment.
    unsafe { std::env::set_var("XDG_DATA_HOME", &xdg) };
    let _window_guard = set_env("CASS_ACTIVE_SESSION_RECENT_WRITE_WINDOW_SECS", "0");
    let data_dir = xdg.join("amp");
    std::fs::create_dir_all(&data_dir).unwrap();
    let amp_dir = data_dir.join("amp");
    std::fs::create_dir_all(&amp_dir).unwrap();
    let amp_file = amp_dir.join("thread-semantic.json");
    let now_u128 = std::time::SystemTime::now()
        .duration_since(std::time::UNIX_EPOCH)
        .unwrap()
        .as_millis();
    // Message timestamps start ten seconds after "now".
    let base_ts = i64::try_from(now_u128)
        .unwrap_or(i64::MAX)
        .saturating_add(10_000);
    std::fs::write(
        &amp_file,
        format!(
            r#"{{
"id": "thread-semantic",
"title": "Amp semantic delta test",
"messages": [
{{"role":"user","text":"hi","createdAt":{base_ts}}},
{{"role":"assistant","text":"hello","createdAt":{assistant_ts}}}
]
}}"#,
            assistant_ts = base_ts + 100,
        ),
    )
    .unwrap();
    let opts = super::IndexOptions {
        full: false,
        watch: true,
        force_rebuild: false,
        db_path: data_dir.join("agent_search.db"),
        data_dir: data_dir.clone(),
        semantic: true,
        build_hnsw: false,
        embedder: "hash".to_string(),
        progress: None,
        watch_once_paths: None,
        watch_interval_secs: 30,
    };
    let storage = FrankenStorage::open(&opts.db_path).unwrap();
    let index_path = index_dir(&opts.data_dir).unwrap();
    let t_index = TantivyIndex::open_or_create(&index_path).unwrap();
    let state = std::sync::Mutex::new(std::collections::HashMap::new());
    let storage = std::sync::Mutex::new(storage);
    let t_index = std::sync::Mutex::new(Some(t_index));
    let roots = vec![(ConnectorKind::Amp, ScanRoot::local(amp_dir.clone()))];
    // First pass: both messages are new.
    let mut first_delta = WatchSemanticDelta::default();
    let indexed = reindex_paths_with_semantic_delta(
        &opts,
        vec![amp_file.clone()],
        &roots,
        &state,
        &storage,
        &t_index,
        &index_path,
        false,
        Some(&mut first_delta),
    )
    .unwrap();
    assert_eq!(indexed, 1);
    let first_contents: Vec<_> = first_delta
        .inputs
        .iter()
        .map(|input| input.content.as_str())
        .collect();
    assert_eq!(first_contents, vec!["hi", "hello"]);
    assert!(first_delta.max_message_id.is_some());
    // Brief pause before rewriting the file — presumably so its mtime differs.
    std::thread::sleep(Duration::from_millis(25));
    std::fs::write(
        &amp_file,
        format!(
            r#"{{
"id": "thread-semantic",
"title": "Amp semantic delta test",
"messages": [
{{"role":"user","text":"hi","createdAt":{base_ts}}},
{{"role":"assistant","text":"hello","createdAt":{assistant_ts}}},
{{"role":"assistant","text":"followup","createdAt":{followup_ts}}}
]
}}"#,
            assistant_ts = base_ts + 100,
            followup_ts = base_ts + 200,
        ),
    )
    .unwrap();
    // Second pass: only the appended "followup" message should be reported.
    let mut second_delta = WatchSemanticDelta::default();
    let indexed = reindex_paths_with_semantic_delta(
        &opts,
        vec![amp_file.clone()],
        &roots,
        &state,
        &storage,
        &t_index,
        &index_path,
        false,
        Some(&mut second_delta),
    )
    .unwrap();
    assert_eq!(indexed, 1);
    assert_eq!(second_delta.inputs.len(), 1);
    assert_eq!(second_delta.inputs[0].content, "followup");
    assert_eq!(
        semantic_role_name(second_delta.inputs[0].role),
        Some("assistant")
    );
    assert!(second_delta.max_message_id.is_some());
    // NOTE(review): this restore is skipped if an assertion above panics.
    if let Some(prev) = prev {
        // SAFETY: same serialization argument as the `set_var` above.
        unsafe { std::env::set_var("XDG_DATA_HOME", prev) };
    } else {
        // SAFETY: same serialization argument as the `set_var` above.
        unsafe { std::env::remove_var("XDG_DATA_HOME") };
    }
}
/// Even when the in-memory watch state for Amp holds a far-future value
/// (`i64::MAX / 4`), `reindex_paths` must still index the triggered file.
#[test]
#[serial]
fn reindex_paths_uses_oldest_trigger_window_when_state_is_newer() {
    let tmp = TempDir::new().unwrap();
    let xdg = tmp.path().join("xdg_oldest_window");
    std::fs::create_dir_all(&xdg).unwrap();
    let prev = dotenvy::var("XDG_DATA_HOME").ok();
    // SAFETY: presumably sound because `#[serial]` keeps env-mutating tests from
    // overlapping. NOTE(review): confirm no non-serial test reads the environment.
    unsafe { std::env::set_var("XDG_DATA_HOME", &xdg) };
    let data_dir = xdg.join("amp");
    std::fs::create_dir_all(&data_dir).unwrap();
    let amp_dir = data_dir.join("amp");
    std::fs::create_dir_all(&amp_dir).unwrap();
    let amp_file = amp_dir.join("thread-window.json");
    let now_u128 = std::time::SystemTime::now()
        .duration_since(std::time::UNIX_EPOCH)
        .unwrap()
        .as_millis();
    // The message timestamp sits ten seconds after the current time.
    let now = i64::try_from(now_u128)
        .unwrap_or(i64::MAX)
        .saturating_add(10_000);
    std::fs::write(
        &amp_file,
        format!(r#"{{"id":"tw","messages":[{{"role":"user","text":"p","createdAt":{now}}}]}}"#),
    )
    .unwrap();
    let opts = super::IndexOptions {
        full: false,
        watch: false,
        force_rebuild: false,
        watch_once_paths: None,
        db_path: data_dir.join("db.sqlite"),
        data_dir: data_dir.clone(),
        semantic: false,
        build_hnsw: false,
        embedder: "fastembed".to_string(),
        progress: None,
        watch_interval_secs: 30,
    };
    let storage = FrankenStorage::open(&opts.db_path).unwrap();
    let index_path = index_dir(&opts.data_dir).unwrap();
    let t_index = TantivyIndex::open_or_create(&index_path).unwrap();
    // Seed the watch state with a value far beyond the message timestamp.
    let mut initial = HashMap::new();
    initial.insert(ConnectorKind::Amp, i64::MAX / 4);
    let state = Mutex::new(initial);
    let storage = Mutex::new(storage);
    let t_index = Mutex::new(Some(t_index));
    let roots = vec![(ConnectorKind::Amp, ScanRoot::local(amp_dir))];
    let indexed = reindex_paths(
        &opts,
        vec![amp_file],
        &roots,
        &state,
        &storage,
        &t_index,
        &index_path,
        false,
    )
    .unwrap();
    assert!(
        indexed > 0,
        "expected indexing to use trigger min_ts instead of stale future watch-state"
    );
    // NOTE(review): this restore is skipped if the assertion above panics.
    if let Some(prev) = prev {
        // SAFETY: same serialization argument as the `set_var` above.
        unsafe { std::env::set_var("XDG_DATA_HOME", prev) };
    } else {
        // SAFETY: same serialization argument as the `set_var` above.
        unsafe { std::env::remove_var("XDG_DATA_HOME") };
    }
}
/// When the triggered file is not valid JSON, `reindex_paths` must index
/// nothing, keep the seeded Amp watch-state value, leave the Tantivy slot at
/// `None`, and leave `last_indexed_at` unset.
#[test]
#[serial]
fn reindex_paths_does_not_advance_watch_state_when_scan_yields_no_conversations() {
    let tmp = TempDir::new().unwrap();
    let xdg = tmp.path().join("xdg_zero_scan");
    std::fs::create_dir_all(&xdg).unwrap();
    let prev = dotenvy::var("XDG_DATA_HOME").ok();
    // SAFETY: presumably sound because `#[serial]` keeps env-mutating tests from
    // overlapping. NOTE(review): confirm no non-serial test reads the environment.
    unsafe { std::env::set_var("XDG_DATA_HOME", &xdg) };
    let data_dir = xdg.join("amp");
    std::fs::create_dir_all(&data_dir).unwrap();
    let amp_dir = data_dir.join("amp");
    std::fs::create_dir_all(&amp_dir).unwrap();
    let amp_file = amp_dir.join("thread-zero.json");
    std::fs::write(&amp_file, "not valid json").unwrap();
    let opts = super::IndexOptions {
        full: false,
        watch: false,
        force_rebuild: false,
        watch_once_paths: None,
        db_path: data_dir.join("db.sqlite"),
        data_dir: data_dir.clone(),
        semantic: false,
        build_hnsw: false,
        embedder: "fastembed".to_string(),
        progress: None,
        watch_interval_secs: 30,
    };
    let storage = FrankenStorage::open(&opts.db_path).unwrap();
    let index_path = index_dir(&opts.data_dir).unwrap();
    let mut initial = HashMap::new();
    initial.insert(ConnectorKind::Amp, 10_000);
    let state = Mutex::new(initial);
    let storage = Mutex::new(storage);
    // The Tantivy slot starts empty so the test can check it stays empty.
    let t_index = Mutex::new(None);
    let roots = vec![(ConnectorKind::Amp, ScanRoot::local(amp_dir))];
    let indexed = reindex_paths(
        &opts,
        vec![amp_file],
        &roots,
        &state,
        &storage,
        &t_index,
        &index_path,
        false,
    )
    .unwrap();
    assert_eq!(
        indexed, 0,
        "fixture should produce no indexed conversations"
    );
    assert_eq!(
        state.lock().unwrap().get(&ConnectorKind::Amp),
        Some(&10_000)
    );
    assert!(
        t_index.lock().unwrap().is_none(),
        "empty watch scan must not open Tantivy"
    );
    assert_eq!(
        storage.lock().unwrap().get_last_indexed_at().unwrap(),
        None,
        "empty watch scan must not bump last_indexed_at"
    );
    // NOTE(review): this restore is skipped if an assertion above panics.
    if let Some(prev) = prev {
        // SAFETY: same serialization argument as the `set_var` above.
        unsafe { std::env::set_var("XDG_DATA_HOME", prev) };
    } else {
        // SAFETY: same serialization argument as the `set_var` above.
        unsafe { std::env::remove_var("XDG_DATA_HOME") };
    }
}
/// A watch reindex of one Amp conversation must publish progress counters.
///
/// After `reindex_paths` returns, the shared `IndexingProgress` is expected
/// to report `total == 1`, `current == 1`, and `phase == 0`.
#[test]
#[serial]
fn reindex_paths_updates_progress() {
    /// Puts `XDG_DATA_HOME` back to its captured value on drop, so a failing
    /// assertion cannot leak this test's override into later tests.
    struct XdgDataHomeRestore(Option<String>);

    impl Drop for XdgDataHomeRestore {
        fn drop(&mut self) {
            // SAFETY: runs inside the same `#[serial]` test that installed the
            // override; see the note on the `set_var` call in the test body.
            match self.0.take() {
                Some(prev) => unsafe { std::env::set_var("XDG_DATA_HOME", prev) },
                None => unsafe { std::env::remove_var("XDG_DATA_HOME") },
            }
        }
    }

    let tmp = TempDir::new().unwrap();
    let xdg = tmp.path().join("xdg_progress");
    std::fs::create_dir_all(&xdg).unwrap();
    let _xdg_restore = XdgDataHomeRestore(dotenvy::var("XDG_DATA_HOME").ok());
    // SAFETY: the test is `#[serial]`, so other serial tests do not touch the
    // environment at the same time. NOTE(review): assumes no non-serial test
    // reads `XDG_DATA_HOME` concurrently — confirm.
    unsafe { std::env::set_var("XDG_DATA_HOME", &xdg) };

    let data_dir = xdg.join("amp");
    std::fs::create_dir_all(&data_dir).unwrap();
    let sessions_dir = data_dir.join("amp");
    std::fs::create_dir_all(&sessions_dir).unwrap();
    let session_file = sessions_dir.join("thread-progress.json");

    // Stamp the message ten seconds into the future (saturating on overflow).
    let now_u128 = std::time::SystemTime::now()
        .duration_since(std::time::UNIX_EPOCH)
        .unwrap()
        .as_millis();
    let now = i64::try_from(now_u128)
        .unwrap_or(i64::MAX)
        .saturating_add(10_000);
    std::fs::write(
        &session_file,
        format!(r#"{{"id":"tp","messages":[{{"role":"user","text":"p","createdAt":{now}}}]}}"#),
    )
    .unwrap();

    let progress = Arc::new(super::IndexingProgress::default());
    let opts = super::IndexOptions {
        full: false,
        watch: false,
        force_rebuild: false,
        watch_once_paths: None,
        db_path: data_dir.join("db.sqlite"),
        data_dir: data_dir.clone(),
        semantic: false,
        build_hnsw: false,
        embedder: "fastembed".to_string(),
        progress: Some(progress.clone()),
        watch_interval_secs: 30,
    };
    let storage = FrankenStorage::open(&opts.db_path).unwrap();
    let index_path = index_dir(&opts.data_dir).unwrap();
    let t_index = TantivyIndex::open_or_create(&index_path).unwrap();
    let state = Mutex::new(HashMap::new());
    let storage = Mutex::new(storage);
    let t_index = Mutex::new(Some(t_index));

    reindex_paths(
        &opts,
        vec![session_file],
        &[(ConnectorKind::Amp, ScanRoot::local(sessions_dir))],
        &state,
        &storage,
        &t_index,
        &index_path,
        false,
    )
    .unwrap();

    assert_eq!(progress.total.load(Ordering::Relaxed), 1);
    assert_eq!(progress.current.load(Ordering::Relaxed), 1);
    assert_eq!(progress.phase.load(Ordering::Relaxed), 0);

    // Explicit teardown: release the index, close storage, then drop state.
    drop(t_index);
    storage.into_inner().unwrap().close().unwrap();
    drop(state);
}
/// An explicit watch-once import must index a file even when the in-memory
/// watermark for its connector is far in the future.
///
/// The fixture message carries `createdAt: 1000` while the seeded Amp
/// watermark is `i64::MAX / 4`. The test expects exactly one indexed
/// conversation and matching per-connector statistics in `IndexingProgress`.
#[test]
#[serial]
fn reindex_paths_watch_once_ignores_file_mtime_since_ts() {
    /// Puts `XDG_DATA_HOME` back to its captured value on drop, so a failing
    /// assertion cannot leak this test's override into later tests.
    struct XdgDataHomeRestore(Option<String>);

    impl Drop for XdgDataHomeRestore {
        fn drop(&mut self) {
            // SAFETY: runs inside the same `#[serial]` test that installed the
            // override; see the note on the `set_var` call in the test body.
            match self.0.take() {
                Some(prev) => unsafe { std::env::set_var("XDG_DATA_HOME", prev) },
                None => unsafe { std::env::remove_var("XDG_DATA_HOME") },
            }
        }
    }

    let tmp = TempDir::new().unwrap();
    let xdg = tmp.path().join("xdg_watch_once_old_messages");
    std::fs::create_dir_all(&xdg).unwrap();
    let _xdg_restore = XdgDataHomeRestore(dotenvy::var("XDG_DATA_HOME").ok());
    // SAFETY: the test is `#[serial]`, so other serial tests do not touch the
    // environment at the same time. NOTE(review): assumes no non-serial test
    // reads `XDG_DATA_HOME` concurrently — confirm.
    unsafe { std::env::set_var("XDG_DATA_HOME", &xdg) };

    let data_dir = xdg.join("amp");
    std::fs::create_dir_all(&data_dir).unwrap();
    let sessions_dir = data_dir.join("amp");
    std::fs::create_dir_all(&sessions_dir).unwrap();
    let session_file = sessions_dir.join("thread-old.json");
    std::fs::write(
        &session_file,
        r#"{"id":"old","messages":[{"role":"user","text":"p","createdAt":1000}]}"#,
    )
    .unwrap();

    let progress = Arc::new(IndexingProgress::default());
    let opts = super::IndexOptions {
        full: false,
        watch: false,
        force_rebuild: false,
        watch_once_paths: Some(vec![session_file.clone()]),
        db_path: data_dir.join("db.sqlite"),
        data_dir: data_dir.clone(),
        semantic: false,
        build_hnsw: false,
        embedder: "fastembed".to_string(),
        progress: Some(progress.clone()),
        watch_interval_secs: 30,
    };
    let storage = FrankenStorage::open(&opts.db_path).unwrap();
    let index_path = index_dir(&opts.data_dir).unwrap();
    let t_index = TantivyIndex::open_or_create(&index_path).unwrap();

    // A watermark far beyond the fixture's timestamp.
    let mut initial = HashMap::new();
    initial.insert(ConnectorKind::Amp, i64::MAX / 4);
    let state = Mutex::new(initial);
    let storage = Mutex::new(storage);
    let t_index = Mutex::new(Some(t_index));

    let indexed = reindex_paths(
        &opts,
        vec![session_file],
        &[(ConnectorKind::Amp, ScanRoot::local(sessions_dir))],
        &state,
        &storage,
        &t_index,
        &index_path,
        false,
    )
    .unwrap();

    assert_eq!(
        indexed, 1,
        "explicit watch_once imports should ignore file mtime watermarks"
    );
    {
        let stats = progress.stats.lock().unwrap();
        assert_eq!(stats.total_conversations, 1);
        assert_eq!(stats.total_messages, 1);
        assert_eq!(stats.connectors.len(), 1);
        assert_eq!(stats.connectors[0].name, "amp");
        assert_eq!(stats.connectors[0].conversations, 1);
        assert_eq!(stats.connectors[0].messages, 1);
    }
}
/// An unchanged file named by an explicit watch-once run is skipped once
/// `last_indexed_at` has moved past it.
///
/// Sequence exercised:
/// 1. A first `reindex_paths` call indexes the fixture (`1` conversation).
/// 2. With `last_indexed_at` forced to `0`, the pre-open skip check must
///    return `false`.
/// 3. With `last_indexed_at` set ten seconds into the future, both the
///    pre-open skip check and the startup skip check must return `true`.
/// 4. A second `reindex_paths` call must then report `0`.
#[test]
#[serial]
fn reindex_paths_watch_once_skips_unchanged_indexed_file() {
    /// Puts `XDG_DATA_HOME` back to its captured value on drop, so a failing
    /// assertion cannot leak this test's override into later tests.
    struct XdgDataHomeRestore(Option<String>);

    impl Drop for XdgDataHomeRestore {
        fn drop(&mut self) {
            // SAFETY: runs inside the same `#[serial]` test that installed the
            // override; see the note on the `set_var` call in the test body.
            match self.0.take() {
                Some(prev) => unsafe { std::env::set_var("XDG_DATA_HOME", prev) },
                None => unsafe { std::env::remove_var("XDG_DATA_HOME") },
            }
        }
    }

    let tmp = TempDir::new().unwrap();
    let xdg = tmp.path().join("xdg_watch_once_unchanged");
    std::fs::create_dir_all(&xdg).unwrap();
    let _xdg_restore = XdgDataHomeRestore(dotenvy::var("XDG_DATA_HOME").ok());
    // SAFETY: the test is `#[serial]`, so other serial tests do not touch the
    // environment at the same time. NOTE(review): assumes no non-serial test
    // reads `XDG_DATA_HOME` concurrently — confirm.
    unsafe { std::env::set_var("XDG_DATA_HOME", &xdg) };

    let data_dir = xdg.join("amp");
    std::fs::create_dir_all(&data_dir).unwrap();
    let sessions_dir = data_dir.join("amp");
    std::fs::create_dir_all(&sessions_dir).unwrap();
    let session_file = sessions_dir.join("thread-unchanged.json");
    std::fs::write(
        &session_file,
        r#"{"id":"unchanged","messages":[{"role":"user","text":"p","createdAt":1700000000100}]}"#,
    )
    .unwrap();

    let opts = super::IndexOptions {
        full: false,
        watch: false,
        force_rebuild: false,
        watch_once_paths: Some(vec![session_file.clone()]),
        db_path: data_dir.join("db.sqlite"),
        data_dir: data_dir.clone(),
        semantic: false,
        build_hnsw: false,
        embedder: "fastembed".to_string(),
        progress: None,
        watch_interval_secs: 30,
    };
    let storage = FrankenStorage::open(&opts.db_path).unwrap();
    let index_path = index_dir(&opts.data_dir).unwrap();
    let t_index = TantivyIndex::open_or_create(&index_path).unwrap();
    let state = Mutex::new(HashMap::new());
    let storage = Mutex::new(storage);
    let t_index = Mutex::new(Some(t_index));
    // One shared root description reused by every call below.
    let roots = [(ConnectorKind::Amp, ScanRoot::local(sessions_dir))];

    let first = reindex_paths(
        &opts,
        vec![session_file.clone()],
        &roots,
        &state,
        &storage,
        &t_index,
        &index_path,
        false,
    )
    .unwrap();
    assert_eq!(first, 1);

    // Without a settled marker the skip check must decline.
    storage.lock().unwrap().set_last_indexed_at(0).unwrap();
    let before_last_indexed_marker = {
        let guard = storage.lock().unwrap();
        should_skip_unchanged_explicit_watch_once_paths(&opts, &guard, &roots).unwrap()
    };
    assert!(
        !before_last_indexed_marker,
        "indexed files without a settled last_indexed_at marker must still flow through watch-once ingest"
    );

    // Move the marker ten seconds into the future.
    storage
        .lock()
        .unwrap()
        .set_last_indexed_at(FrankenStorage::now_millis().saturating_add(10_000))
        .unwrap();
    let preopen_skip = {
        let guard = storage.lock().unwrap();
        should_skip_unchanged_explicit_watch_once_paths(&opts, &guard, &roots).unwrap()
    };
    assert!(
        preopen_skip,
        "unchanged explicit watch-once files can skip before Tantivy is opened"
    );
    let startup_skip = {
        let guard = storage.lock().unwrap();
        can_skip_unchanged_explicit_watch_once_index_run(&opts, &guard, &index_path).unwrap()
    };
    assert!(
        startup_skip,
        "unchanged explicit watch-once files with current lexical assets can skip startup maintenance"
    );

    let second = reindex_paths(
        &opts,
        vec![session_file],
        &roots,
        &state,
        &storage,
        &t_index,
        &index_path,
        false,
    )
    .unwrap();
    assert_eq!(
        second, 0,
        "unchanged explicit watch-once files already indexed before last_indexed_at should skip storage ingest"
    );
}
/// An explicit watch-once import must not move the connector watermark,
/// neither in memory nor in the persisted watch state.
///
/// A watermark of `123_456` is saved with `save_watch_state` and seeded into
/// the in-memory map. After indexing one conversation the test expects both
/// the in-memory map and `load_watch_state` to still return `123_456`.
#[test]
#[serial]
fn reindex_paths_watch_once_does_not_advance_persistent_watch_state() {
    /// Puts `XDG_DATA_HOME` back to its captured value on drop, so a failing
    /// assertion cannot leak this test's override into later tests.
    struct XdgDataHomeRestore(Option<String>);

    impl Drop for XdgDataHomeRestore {
        fn drop(&mut self) {
            // SAFETY: runs inside the same `#[serial]` test that installed the
            // override; see the note on the `set_var` call in the test body.
            match self.0.take() {
                Some(prev) => unsafe { std::env::set_var("XDG_DATA_HOME", prev) },
                None => unsafe { std::env::remove_var("XDG_DATA_HOME") },
            }
        }
    }

    let tmp = TempDir::new().unwrap();
    let xdg = tmp.path().join("xdg_watch_once_state_isolation");
    std::fs::create_dir_all(&xdg).unwrap();
    let _xdg_restore = XdgDataHomeRestore(dotenvy::var("XDG_DATA_HOME").ok());
    // SAFETY: the test is `#[serial]`, so other serial tests do not touch the
    // environment at the same time. NOTE(review): assumes no non-serial test
    // reads `XDG_DATA_HOME` concurrently — confirm.
    unsafe { std::env::set_var("XDG_DATA_HOME", &xdg) };

    let data_dir = xdg.join("amp");
    std::fs::create_dir_all(&data_dir).unwrap();
    let sessions_dir = data_dir.join("amp");
    std::fs::create_dir_all(&sessions_dir).unwrap();
    let session_file = sessions_dir.join("thread-watch-once.json");
    std::fs::write(
        &session_file,
        r#"{"id":"watch-once","messages":[{"role":"user","text":"p","createdAt":1700000000100}]}"#,
    )
    .unwrap();

    // Persist a known watermark before the run.
    let persisted_ts = 123_456_i64;
    let mut persisted_state = HashMap::new();
    persisted_state.insert(ConnectorKind::Amp, persisted_ts);
    save_watch_state(&data_dir, &persisted_state).unwrap();

    let opts = super::IndexOptions {
        full: false,
        watch: false,
        force_rebuild: false,
        watch_once_paths: Some(vec![session_file.clone()]),
        db_path: data_dir.join("db.sqlite"),
        data_dir: data_dir.clone(),
        semantic: false,
        build_hnsw: false,
        embedder: "fastembed".to_string(),
        progress: None,
        watch_interval_secs: 30,
    };
    let storage = FrankenStorage::open(&opts.db_path).unwrap();
    let index_path = index_dir(&opts.data_dir).unwrap();
    let t_index = TantivyIndex::open_or_create(&index_path).unwrap();
    let state = Mutex::new(persisted_state.clone());
    let storage = Mutex::new(storage);
    let t_index = Mutex::new(Some(t_index));

    let indexed = reindex_paths(
        &opts,
        vec![session_file],
        &[(ConnectorKind::Amp, ScanRoot::local(sessions_dir))],
        &state,
        &storage,
        &t_index,
        &index_path,
        false,
    )
    .unwrap();
    assert_eq!(indexed, 1);

    // In-memory watermark is unchanged...
    {
        let in_memory = state.lock().unwrap();
        assert_eq!(
            in_memory.get(&ConnectorKind::Amp).copied(),
            Some(persisted_ts)
        );
    }
    // ...and so is the value read back from disk.
    let loaded = load_watch_state(&data_dir);
    assert_eq!(loaded.get(&ConnectorKind::Amp).copied(), Some(persisted_ts));
}
/// `inject_provenance` with a local origin adds `cass.origin` to metadata
/// that previously had no `cass` key, with `source_id` and `kind` both
/// equal to `"local"`.
#[test]
fn inject_provenance_adds_cass_origin_to_metadata() {
    let mut conversation = norm_conv(Some("test"), vec![norm_msg(0, 100)]);
    assert!(conversation.metadata.get("cass").is_none());

    inject_provenance(&mut conversation, &Origin::local());

    let origin_obj = conversation
        .metadata
        .get("cass")
        .expect("cass field should exist")
        .get("origin")
        .expect("origin should exist");
    for (field, expected) in [("source_id", "local"), ("kind", "local")] {
        assert_eq!(origin_obj.get(field).unwrap().as_str(), Some(expected));
    }
}
/// `inject_provenance` with a remote origin records the source id, the
/// `"ssh"` kind, and the host under `cass.origin`.
#[test]
fn inject_provenance_handles_remote_origin() {
    let mut conversation = norm_conv(Some("test"), vec![norm_msg(0, 100)]);
    let remote = Origin::remote_with_host("laptop", "user@laptop.local");

    inject_provenance(&mut conversation, &remote);

    let origin_obj = conversation
        .metadata
        .get("cass")
        .expect("cass field should exist")
        .get("origin")
        .expect("origin should exist");
    let expected_fields = [
        ("source_id", "laptop"),
        ("kind", "ssh"),
        ("host", "user@laptop.local"),
    ];
    for (field, expected) in expected_fields {
        assert_eq!(origin_obj.get(field).unwrap().as_str(), Some(expected));
    }
}
/// Compacting a Codex message `extra` at the threshold size keeps the
/// existing `cass` fields, adds `cass.model` and `cass.attachments`, and
/// removes the raw `payload`, `response`, and `attachment_refs` keys.
#[test]
fn large_codex_extra_compaction_preserves_cass_metadata() {
    let mut conversation = norm_conv(Some("codex-large"), vec![norm_msg(0, 100)]);
    conversation.agent_slug = "codex".to_string();
    conversation.messages[0].extra = serde_json::json!({
        "payload": {
            "delta": "duplicated raw codex event payload"
        },
        "response": {
            "model": "gpt-5.4"
        },
        "attachment_refs": [
            "file:src/lib.rs"
        ],
        "cass": {
            "token_usage": {
                "input_tokens": 11,
                "output_tokens": 7
            },
            "event_line": 42
        }
    });

    compact_large_connector_extras_for_size(
        "codex",
        &mut conversation,
        Some(CODEX_INDEXER_EXTRA_COMPACT_THRESHOLD_BYTES),
    );

    let compacted = &conversation.messages[0].extra;

    // Pre-existing `cass` values survive untouched.
    let preserved = [
        ("/cass/token_usage/input_tokens", serde_json::json!(11)),
        ("/cass/token_usage/output_tokens", serde_json::json!(7)),
        ("/cass/event_line", serde_json::json!(42)),
    ];
    for (path, expected) in &preserved {
        assert_eq!(compacted.pointer(path), Some(expected));
    }

    // Values lifted out of the raw event into `cass`.
    let model = compacted
        .pointer("/cass/model")
        .and_then(serde_json::Value::as_str);
    assert_eq!(model, Some("gpt-5.4"));
    assert_eq!(
        compacted.pointer("/cass/attachments"),
        Some(&serde_json::json!(["file:src/lib.rs"]))
    );

    // The bulky raw keys are gone.
    for removed in ["payload", "response", "attachment_refs"] {
        assert!(compacted.get(removed).is_none());
    }
}
/// Compaction leaves `extra` untouched in two cases: a Codex conversation
/// sized one byte under the threshold, and a `claude_code` conversation at
/// the threshold.
#[test]
fn large_extra_compaction_skips_small_or_non_codex_sources() {
    let untouched = serde_json::json!({
        "payload": {
            "delta": "keep me"
        },
        "response": {
            "model": "gpt-5.4"
        },
        "cass": {
            "token_usage": {
                "input_tokens": 3
            }
        }
    });

    // Case 1: Codex, but below the size threshold.
    let mut below_threshold = norm_conv(Some("codex-small"), vec![norm_msg(0, 100)]);
    below_threshold.agent_slug = "codex".to_string();
    below_threshold.messages[0].extra = untouched.clone();
    compact_large_connector_extras_for_size(
        "codex",
        &mut below_threshold,
        Some(CODEX_INDEXER_EXTRA_COMPACT_THRESHOLD_BYTES - 1),
    );
    assert_eq!(below_threshold.messages[0].extra, untouched);

    // Case 2: at the threshold, but a different connector.
    let mut other_connector = norm_conv(Some("claude-large"), vec![norm_msg(0, 100)]);
    other_connector.agent_slug = "claude-code".to_string();
    other_connector.messages[0].extra = untouched.clone();
    compact_large_connector_extras_for_size(
        "claude_code",
        &mut other_connector,
        Some(CODEX_INDEXER_EXTRA_COMPACT_THRESHOLD_BYTES),
    );
    assert_eq!(other_connector.messages[0].extra, untouched);
}
/// With empty metadata, `persist::map_to_internal` produces a conversation
/// whose `source_id` is `"local"` and whose `origin_host` is `None`.
#[test]
fn extract_provenance_returns_local_for_empty_metadata() {
    let normalized = NormalizedConversation {
        agent_slug: "test".into(),
        external_id: None,
        title: None,
        workspace: None,
        source_path: PathBuf::from("/test"),
        started_at: None,
        ended_at: None,
        metadata: serde_json::json!({}),
        messages: vec![],
    };

    let internal = persist::map_to_internal(&normalized);

    assert_eq!(internal.source_id, "local");
    assert!(internal.origin_host.is_none());
}
/// A fully populated `cass.origin` object is carried through
/// `persist::map_to_internal` as `source_id` and `origin_host`.
#[test]
fn extract_provenance_extracts_remote_origin() {
    let normalized = NormalizedConversation {
        agent_slug: "test".into(),
        external_id: None,
        title: None,
        workspace: None,
        source_path: PathBuf::from("/test"),
        started_at: None,
        ended_at: None,
        metadata: serde_json::json!({
            "cass": {
                "origin": {
                    "source_id": "laptop",
                    "kind": "ssh",
                    "host": "user@laptop.local"
                }
            }
        }),
        messages: vec![],
    };

    let internal = persist::map_to_internal(&normalized);

    assert_eq!(internal.source_id, "laptop");
    assert_eq!(internal.origin_host.as_deref(), Some("user@laptop.local"));
}
/// When `cass.origin.source_id` is whitespace-only, the host string is
/// expected to be used as the `source_id` as well as the `origin_host`.
#[test]
fn extract_provenance_infers_remote_source_from_host_without_source_id() {
    let normalized = NormalizedConversation {
        agent_slug: "test".into(),
        external_id: None,
        title: None,
        workspace: None,
        source_path: PathBuf::from("/test"),
        started_at: None,
        ended_at: None,
        // Blank source id, no `kind`: only the host identifies the origin.
        metadata: serde_json::json!({
            "cass": {
                "origin": {
                    "source_id": "   ",
                    "host": "user@laptop.local"
                }
            }
        }),
        messages: vec![],
    };

    let internal = persist::map_to_internal(&normalized);

    assert_eq!(internal.source_id, "user@laptop.local");
    assert_eq!(internal.origin_host.as_deref(), Some("user@laptop.local"));
}
/// With a freshly opened database, `build_scan_roots` returns at least one
/// root and the first one is the non-remote `"local"` source.
#[test]
#[serial]
fn build_scan_roots_creates_local_root() {
    let _guard = ignore_sources_config();
    let tmp = TempDir::new().unwrap();
    let data_dir = tmp.path().join("data");
    std::fs::create_dir_all(&data_dir).unwrap();
    let storage = FrankenStorage::open(&data_dir.join("db.sqlite")).unwrap();

    let roots = build_scan_roots(&storage, &data_dir);

    assert!(!roots.is_empty());
    let first = &roots[0];
    assert_eq!(first.origin.source_id, "local");
    assert!(!first.origin.is_remote());
}
/// A registered SSH source whose mirror directory exists on disk shows up
/// as a second, remote scan root carrying its host label and platform.
#[test]
#[serial]
fn build_scan_roots_includes_remote_mirror_if_exists() {
    let _guard = ignore_sources_config();
    let tmp = TempDir::new().unwrap();
    let data_dir = tmp.path().join("data");
    std::fs::create_dir_all(&data_dir).unwrap();
    let storage = FrankenStorage::open(&data_dir.join("db.sqlite")).unwrap();

    // Register the remote source...
    let laptop = crate::sources::provenance::Source {
        id: "laptop".to_string(),
        kind: SourceKind::Ssh,
        host_label: Some("user@laptop.local".to_string()),
        machine_id: None,
        platform: Some("linux".to_string()),
        config_json: None,
        created_at: None,
        updated_at: None,
    };
    storage.upsert_source(&laptop).unwrap();
    // ...and give it a mirror directory under `remotes/<id>/mirror`.
    let mirror_dir = data_dir.join("remotes").join("laptop").join("mirror");
    std::fs::create_dir_all(&mirror_dir).unwrap();

    let roots = build_scan_roots(&storage, &data_dir);

    assert_eq!(roots.len(), 2);
    let found = roots.iter().find(|root| root.origin.source_id == "laptop");
    assert!(found.is_some());
    let remote = found.unwrap();
    assert!(remote.origin.is_remote());
    assert_eq!(remote.origin.host.as_deref(), Some("user@laptop.local"));
    assert_eq!(remote.platform, Some(Platform::Linux));
}
/// A registered SSH source without a mirror directory contributes no scan
/// root; only the `"local"` root is returned.
#[test]
#[serial]
fn build_scan_roots_skips_nonexistent_mirror() {
    let _guard = ignore_sources_config();
    let tmp = TempDir::new().unwrap();
    let data_dir = tmp.path().join("data");
    std::fs::create_dir_all(&data_dir).unwrap();
    let storage = FrankenStorage::open(&data_dir.join("db.sqlite")).unwrap();

    let unmirrored = crate::sources::provenance::Source {
        id: "nonexistent".to_string(),
        kind: SourceKind::Ssh,
        host_label: Some("user@host".to_string()),
        machine_id: None,
        platform: None,
        config_json: None,
        created_at: None,
        updated_at: None,
    };
    storage.upsert_source(&unmirrored).unwrap();

    // The only mirror on disk belongs to "laptop", an id that is not
    // registered; the registered "nonexistent" source has no mirror.
    // NOTE(review): confirm this orphan mirror is intentional and not a
    // leftover from the neighbouring test's fixture.
    let orphan_mirror = data_dir.join("remotes").join("laptop").join("mirror");
    std::fs::create_dir_all(&orphan_mirror).unwrap();

    let roots = build_scan_roots(&storage, &data_dir);

    assert_eq!(roots.len(), 1);
    assert_eq!(roots[0].origin.source_id, "local");
}
/// A `Local` source whose `config_json` lists an existing path is returned
/// as an extra non-remote scan root pointing at that path.
#[test]
#[serial]
fn build_scan_roots_includes_configured_local_source_paths() {
    let _guard = ignore_sources_config();
    let tmp = TempDir::new().unwrap();
    let data_dir = tmp.path().join("data");
    let backup_root = tmp.path().join("backup-root");
    for dir in [&data_dir, &backup_root] {
        std::fs::create_dir_all(dir).unwrap();
    }
    let storage = FrankenStorage::open(&data_dir.join("db.sqlite")).unwrap();

    let source_config = serde_json::json!({
        "paths": [backup_root.to_string_lossy().to_string()],
        "path_mappings": [],
        "sync_schedule": {
            "enabled": false
        }
    });
    let backup_source = crate::sources::provenance::Source {
        id: "backup-local".to_string(),
        kind: SourceKind::Local,
        host_label: None,
        machine_id: None,
        platform: Some("linux".to_string()),
        config_json: Some(source_config),
        created_at: None,
        updated_at: None,
    };
    storage.upsert_source(&backup_source).unwrap();

    let roots = build_scan_roots(&storage, &data_dir);

    assert_eq!(roots.len(), 2);
    let backup_scan_root = roots
        .iter()
        .find(|root| root.origin.source_id == "backup-local")
        .expect("configured local backup root should be included");
    assert_eq!(backup_scan_root.path, backup_root);
    assert!(!backup_scan_root.origin.is_remote());
    assert_eq!(backup_scan_root.platform, Some(Platform::Linux));
}
/// A scan root with no rewrite mappings leaves the workspace as-is and does
/// not record `cass.workspace_original`.
#[test]
fn apply_workspace_rewrite_no_rewrites() {
    let original = PathBuf::from("/home/user/projects/app");
    let mut conversation = norm_conv(None, vec![norm_msg(0, 1000)]);
    conversation.workspace = Some(original.clone());
    let root = crate::connectors::ScanRoot::local(PathBuf::from("/"));

    apply_workspace_rewrite(&mut conversation, &root);

    assert_eq!(conversation.workspace, Some(original));
    assert!(
        conversation
            .metadata
            .pointer("/cass/workspace_original")
            .is_none()
    );
}
/// A conversation without a workspace stays without one even when the scan
/// root carries a rewrite mapping.
#[test]
fn apply_workspace_rewrite_no_workspace() {
    let mut conversation = norm_conv(None, vec![norm_msg(0, 1000)]);
    conversation.workspace = None;

    let mut root = crate::connectors::ScanRoot::local(PathBuf::from("/"));
    root.workspace_rewrites = vec![crate::sources::config::PathMapping::new(
        "/home/user",
        "/Users/me",
    )];

    apply_workspace_rewrite(&mut conversation, &root);

    assert!(conversation.workspace.is_none());
}
/// A matching prefix mapping rewrites the workspace and stores the
/// pre-rewrite path under `cass.workspace_original`.
#[test]
fn apply_workspace_rewrite_applies_mapping() {
    let mut conversation = norm_conv(None, vec![norm_msg(0, 1000)]);
    conversation.workspace = Some(PathBuf::from("/home/user/projects/app"));

    let mut root = crate::connectors::ScanRoot::local(PathBuf::from("/"));
    root.workspace_rewrites = vec![crate::sources::config::PathMapping::new(
        "/home/user",
        "/Users/me",
    )];

    apply_workspace_rewrite(&mut conversation, &root);

    assert_eq!(
        conversation.workspace,
        Some(PathBuf::from("/Users/me/projects/app"))
    );
    let recorded_original = conversation
        .metadata
        .pointer("/cass/workspace_original")
        .and_then(|value| value.as_str());
    assert_eq!(recorded_original, Some("/home/user/projects/app"));
}
/// When several mappings match, the one with the longest matching prefix
/// is expected to win.
///
/// Two overlapping mappings are configured, with the shorter prefix listed
/// first so that a first-match implementation would pick the wrong one.
/// NOTE(review): the expected value assumes `apply_workspace_rewrite`
/// selects the longest matching prefix, as the test name states — confirm
/// against the implementation.
#[test]
fn apply_workspace_rewrite_longest_prefix_match() {
    let mut conv = norm_conv(None, vec![norm_msg(0, 1000)]);
    conv.workspace = Some(PathBuf::from("/home/user/projects/special/app"));

    let mappings = vec![
        // Shorter prefix: also matches, but must lose.
        crate::sources::config::PathMapping::new("/home/user", "/Users/me"),
        // Longer, more specific prefix: must be the one applied.
        crate::sources::config::PathMapping::new(
            "/home/user/projects/special",
            "/Volumes/Special",
        ),
    ];
    let mut root = crate::connectors::ScanRoot::local(PathBuf::from("/"));
    root.workspace_rewrites = mappings;

    apply_workspace_rewrite(&mut conv, &root);

    assert_eq!(conv.workspace, Some(PathBuf::from("/Volumes/Special/app")));
}
/// A workspace outside every mapping prefix is left unchanged and no
/// `cass.workspace_original` is recorded.
#[test]
fn apply_workspace_rewrite_no_match() {
    let unrelated = PathBuf::from("/opt/other/path");
    let mut conversation = norm_conv(None, vec![norm_msg(0, 1000)]);
    conversation.workspace = Some(unrelated.clone());

    let mut root = crate::connectors::ScanRoot::local(PathBuf::from("/"));
    root.workspace_rewrites = vec![crate::sources::config::PathMapping::new(
        "/home/user",
        "/Users/me",
    )];

    apply_workspace_rewrite(&mut conversation, &root);

    assert_eq!(conversation.workspace, Some(unrelated));
    assert!(
        conversation
            .metadata
            .pointer("/cass/workspace_original")
            .is_none()
    );
}
/// A mapping restricted to the `cursor` agent does not rewrite the
/// workspace of a `claude-code` conversation.
#[test]
fn apply_workspace_rewrite_with_agent_filter() {
    let workspace = PathBuf::from("/home/user/projects/app");
    let mut conversation = norm_conv(None, vec![norm_msg(0, 1000)]);
    conversation.agent_slug = "claude-code".to_string();
    conversation.workspace = Some(workspace.clone());

    let cursor_only = crate::sources::config::PathMapping::with_agents(
        "/home/user/projects",
        "/Volumes/Work",
        vec!["cursor".to_string()],
    );
    let mut root = crate::connectors::ScanRoot::local(PathBuf::from("/"));
    root.workspace_rewrites = vec![cursor_only];

    apply_workspace_rewrite(&mut conversation, &root);

    assert_eq!(conversation.workspace, Some(workspace));
}
/// Recording `cass.workspace_original` must not disturb a `cass.origin`
/// object that was already present in the metadata.
#[test]
fn apply_workspace_rewrite_preserves_existing_metadata() {
    let mut conversation = norm_conv(None, vec![norm_msg(0, 1000)]);
    conversation.workspace = Some(PathBuf::from("/home/user/app"));
    conversation.metadata = serde_json::json!({
        "cass": {
            "origin": {
                "source_id": "laptop",
                "kind": "ssh",
                "host": "user@laptop.local"
            }
        }
    });

    let mut root = crate::connectors::ScanRoot::local(PathBuf::from("/"));
    root.workspace_rewrites = vec![crate::sources::config::PathMapping::new(
        "/home/user",
        "/Users/me",
    )];

    apply_workspace_rewrite(&mut conversation, &root);

    let read_str = |pointer: &str| {
        conversation
            .metadata
            .pointer(pointer)
            .and_then(|value| value.as_str())
    };
    assert_eq!(read_str("/cass/origin/source_id"), Some("laptop"));
    assert_eq!(read_str("/cass/workspace_original"), Some("/home/user/app"));
}
/// Table-driven check of `StaleAction::from_env_str`, including the
/// upper-case `WARN` spelling and the unrecognised value `unknown`, which is
/// expected to map to `Warn`.
#[test]
fn stale_action_from_env_str_parses_correctly() {
    let cases = [
        ("warn", StaleAction::Warn),
        ("WARN", StaleAction::Warn),
        ("rebuild", StaleAction::Rebuild),
        ("auto", StaleAction::Rebuild),
        ("fix", StaleAction::Rebuild),
        ("none", StaleAction::None),
        ("off", StaleAction::None),
        ("0", StaleAction::None),
        ("false", StaleAction::None),
        ("unknown", StaleAction::Warn),
    ];
    for (raw, expected) in cases {
        assert_eq!(StaleAction::from_env_str(raw), expected);
    }
}
/// Pins the values produced by `StaleConfig::default()` and checks that the
/// default configuration counts as enabled.
#[test]
fn stale_config_default_values() {
    let defaults = StaleConfig::default();

    assert_eq!(defaults.threshold_hours, 24);
    assert_eq!(defaults.action, StaleAction::Warn);
    assert_eq!(defaults.check_interval_mins, 60);
    assert_eq!(defaults.min_zero_scans, 10);
    assert!(defaults.is_enabled());
}
/// A configuration whose action is `StaleAction::None` reports itself as
/// disabled.
#[test]
fn stale_config_none_action_disables_detection() {
    let disabled = StaleConfig {
        action: StaleAction::None,
        ..StaleConfig::default()
    };

    assert!(!disabled.is_enabled());
}
/// A scan that indexed something bumps `total_ingests`, keeps the zero-scan
/// streak at `0`, and makes `seconds_since_last_ingest` available.
#[test]
fn stale_detector_records_successful_ingest() {
    let detector = StaleDetector::new(StaleConfig::default());

    // Fresh detector: nothing recorded yet.
    {
        let fresh = detector.stats();
        assert_eq!(fresh.total_ingests, 0);
        assert_eq!(fresh.consecutive_zero_scans, 0);
    }

    detector.record_scan(5);

    {
        let after = detector.stats();
        assert_eq!(after.total_ingests, 1);
        assert_eq!(after.consecutive_zero_scans, 0);
        assert!(after.seconds_since_last_ingest.is_some());
    }
}
/// Consecutive zero-result scans accumulate, and a non-zero scan resets the
/// streak to `0`.
#[test]
fn stale_detector_tracks_zero_scans() {
    let detector = StaleDetector::new(StaleConfig::default());

    // (conversations reported by the scan, expected streak afterwards)
    for (scanned, expected_streak) in [(0, 1), (0, 2), (1, 0)] {
        detector.record_scan(scanned);
        assert_eq!(detector.stats().consecutive_zero_scans, expected_streak);
    }
}
/// `reset` clears the zero-scan streak; afterwards
/// `seconds_since_last_ingest` is still reported as `Some`.
#[test]
fn stale_detector_reset_clears_state() {
    let detector = StaleDetector::new(StaleConfig::default());
    for _ in 0..2 {
        detector.record_scan(0);
    }
    assert_eq!(detector.stats().consecutive_zero_scans, 2);

    detector.reset();

    {
        let after_reset = detector.stats();
        assert_eq!(after_reset.consecutive_zero_scans, 0);
        assert!(after_reset.seconds_since_last_ingest.is_some());
    }
}
/// A failed watch reindex is reported as zero indexed conversations, counts
/// as a zero scan for stale detection, returns the progress phase to `0`,
/// and stores the context-prefixed error in `last_error`.
#[test]
fn finalize_watch_reindex_result_records_error_and_resets_phase() {
    let detector = StaleDetector::new(StaleConfig::default());
    let progress = Arc::new(IndexingProgress::default());
    // Pretend a run was mid-flight when the failure happened.
    progress.phase.store(2, Ordering::Relaxed);

    let failure = Err(anyhow::anyhow!("boom"));
    let indexed = finalize_watch_reindex_result(
        failure,
        &detector,
        Some(&progress),
        "watch incremental reindex",
    );

    assert_eq!(
        indexed, 0,
        "failed watch reindex should report zero indexed"
    );
    assert_eq!(
        detector.stats().consecutive_zero_scans,
        1,
        "failed watch reindex should still count as a zero-result scan for stale detection"
    );
    assert_eq!(
        progress.phase.load(Ordering::Relaxed),
        0,
        "failed watch reindex should reset progress phase back to idle"
    );
    // Tolerate a poisoned lock: only the stored message matters here.
    let recorded_error = progress
        .last_error
        .lock()
        .unwrap_or_else(|poisoned| poisoned.into_inner());
    assert_eq!(
        recorded_error.as_deref(),
        Some("watch incremental reindex: boom"),
        "failed watch reindex should surface the real error"
    );
}
/// Dropping a `RunIndexProgressReset` guard returns `phase` to `0` and
/// clears `is_rebuilding`, while leaving a previously stored `last_error`
/// intact.
#[test]
fn run_index_progress_reset_guard_resets_idle_state_without_clobbering_error() {
    let progress = Arc::new(IndexingProgress::default());
    progress.phase.store(2, Ordering::Relaxed);
    progress.is_rebuilding.store(true, Ordering::Relaxed);
    {
        let mut slot = progress
            .last_error
            .lock()
            .unwrap_or_else(|poisoned| poisoned.into_inner());
        *slot = Some("boom".to_string());
    }

    // Create the guard and let it fall out of scope immediately.
    drop(RunIndexProgressReset::new(Some(progress.clone())));

    assert_eq!(progress.phase.load(Ordering::Relaxed), 0);
    assert!(
        !progress.is_rebuilding.load(Ordering::Relaxed),
        "drop guard should clear stale rebuild state after failures"
    );
    let recorded_error = progress
        .last_error
        .lock()
        .unwrap_or_else(|poisoned| poisoned.into_inner());
    assert_eq!(
        recorded_error.as_deref(),
        Some("boom"),
        "idle-state cleanup should not erase the real error"
    );
}
/// If `meta.json` differs from the fingerprint captured when a pending
/// commit was recorded, reconciliation promotes the pending values to the
/// committed ones.
///
/// The test also expects the runtime snapshot to be reset to its default,
/// both in the returned state and in the checkpoint reloaded from disk.
#[test]
fn reconcile_pending_lexical_commit_promotes_committed_offset_when_meta_changes() {
    let tmp = TempDir::new().unwrap();
    let index_path = tmp.path().join("index");
    fs::create_dir_all(&index_path).unwrap();
    let meta_path = index_path.join("meta.json");
    fs::write(&meta_path, b"before").unwrap();

    let db_state = LexicalRebuildDbState {
        db_path: "/tmp/agent_search.db".to_string(),
        total_conversations: 400,
        total_messages: 800,
        storage_fingerprint: "seed:400".to_string(),
    };
    let mut checkpoint = LexicalRebuildState::new(db_state, LEXICAL_REBUILD_PAGE_SIZE);

    // Non-default runtime telemetry, so the reset is observable.
    let runtime = LexicalRebuildPipelineRuntimeSnapshot {
        queue_depth: 3,
        inflight_message_bytes: 65_536,
        max_message_bytes_in_flight: 131_072,
        pending_batch_conversations: 9,
        pending_batch_message_bytes: 131_072,
        page_prep_workers: 6,
        active_page_prep_jobs: 2,
        ordered_buffered_pages: 4,
        budget_generation: 1,
        producer_budget_wait_count: 2,
        producer_budget_wait_ms: 15,
        producer_handoff_wait_count: 1,
        producer_handoff_wait_ms: 7,
        host_loadavg_1m_milli: None,
        controller_mode: "pressure_limited".to_string(),
        controller_reason: "queue_depth_3_reached_pipeline_capacity_3".to_string(),
        staged_merge_workers_max: 0,
        staged_merge_allowed_jobs: 0,
        staged_merge_active_jobs: 0,
        staged_merge_ready_artifacts: 0,
        staged_merge_ready_groups: 0,
        staged_merge_controller_reason: String::new(),
        staged_shard_build_workers_max: 0,
        staged_shard_build_allowed_jobs: 0,
        staged_shard_build_active_jobs: 0,
        staged_shard_build_pending_jobs: 0,
        staged_shard_build_controller_reason: String::new(),
        updated_at_ms: 1_733_000_124_000_i64,
        ..LexicalRebuildPipelineRuntimeSnapshot::default()
    };
    checkpoint.set_runtime(&runtime);

    // Record the pending commit against the current ("before") meta.
    let fingerprint_at_record = index_meta_fingerprint(&index_path).unwrap();
    checkpoint.record_pending_commit(Some(200), 200, 600, fingerprint_at_record);
    persist_lexical_rebuild_state(&index_path, &checkpoint).unwrap();

    // Change the meta file before reconciling.
    fs::write(&meta_path, b"after").unwrap();

    let reconciled = reconcile_pending_lexical_commit(&index_path, checkpoint).unwrap();

    assert_eq!(reconciled.committed_offset, 200);
    assert_eq!(reconciled.committed_conversation_id, Some(200));
    assert_eq!(reconciled.processed_conversations, 200);
    assert_eq!(reconciled.indexed_docs, 600);
    assert!(reconciled.pending.is_none());
    assert_eq!(
        reconciled.runtime,
        LexicalRebuildPipelineRuntimeSnapshot::default()
    );

    let persisted = load_lexical_rebuild_state(&index_path)
        .unwrap()
        .expect("reconciled checkpoint should persist");
    assert_eq!(
        persisted.runtime,
        LexicalRebuildPipelineRuntimeSnapshot::default()
    );
}
/// If `meta.json` still matches the fingerprint captured when the pending
/// commit was recorded, reconciliation drops the pending batch and keeps
/// the previously committed values.
///
/// The test also expects the checkpoint to still report a pending resume,
/// and the runtime snapshot to be reset to its default both in memory and
/// on disk.
#[test]
fn reconcile_pending_lexical_commit_rolls_back_uncommitted_batch_when_meta_unchanged() {
    let tmp = TempDir::new().unwrap();
    let index_path = tmp.path().join("index");
    fs::create_dir_all(&index_path).unwrap();
    fs::write(index_path.join("meta.json"), b"stable").unwrap();

    let db_state = LexicalRebuildDbState {
        db_path: "/tmp/agent_search.db".to_string(),
        total_conversations: 400,
        total_messages: 800,
        storage_fingerprint: "seed:400".to_string(),
    };
    let mut checkpoint = LexicalRebuildState::new(db_state.clone(), LEXICAL_REBUILD_PAGE_SIZE);

    // Committed values the rollback must fall back to.
    checkpoint.committed_offset = 100;
    checkpoint.committed_conversation_id = Some(100);
    checkpoint.processed_conversations = 100;
    checkpoint.indexed_docs = 250;

    // Non-default runtime telemetry, so the reset is observable.
    let runtime = LexicalRebuildPipelineRuntimeSnapshot {
        queue_depth: 2,
        inflight_message_bytes: 32_768,
        max_message_bytes_in_flight: 65_536,
        pending_batch_conversations: 4,
        pending_batch_message_bytes: 65_536,
        page_prep_workers: 4,
        active_page_prep_jobs: 1,
        ordered_buffered_pages: 3,
        budget_generation: 2,
        producer_budget_wait_count: 1,
        producer_budget_wait_ms: 11,
        producer_handoff_wait_count: 2,
        producer_handoff_wait_ms: 22,
        host_loadavg_1m_milli: None,
        controller_mode: "steady".to_string(),
        controller_reason: "first_durable_commit_promoted_steady_budget".to_string(),
        staged_merge_workers_max: 0,
        staged_merge_allowed_jobs: 0,
        staged_merge_active_jobs: 0,
        staged_merge_ready_artifacts: 0,
        staged_merge_ready_groups: 0,
        staged_merge_controller_reason: String::new(),
        staged_shard_build_workers_max: 0,
        staged_shard_build_allowed_jobs: 0,
        staged_shard_build_active_jobs: 0,
        staged_shard_build_pending_jobs: 0,
        staged_shard_build_controller_reason: String::new(),
        updated_at_ms: 1_733_000_224_000_i64,
        ..LexicalRebuildPipelineRuntimeSnapshot::default()
    };
    checkpoint.set_runtime(&runtime);

    // Record a pending batch; the meta file is left untouched afterwards.
    let fingerprint_at_record = index_meta_fingerprint(&index_path).unwrap();
    checkpoint.record_pending_commit(Some(200), 200, 600, fingerprint_at_record);
    persist_lexical_rebuild_state(&index_path, &checkpoint).unwrap();

    let reconciled = reconcile_pending_lexical_commit(&index_path, checkpoint).unwrap();

    assert_eq!(reconciled.committed_offset, 100);
    assert_eq!(reconciled.committed_conversation_id, Some(100));
    assert_eq!(reconciled.processed_conversations, 100);
    assert_eq!(reconciled.indexed_docs, 250);
    assert!(reconciled.pending.is_none());
    assert_eq!(
        reconciled.runtime,
        LexicalRebuildPipelineRuntimeSnapshot::default()
    );

    let status = matching_lexical_rebuild_state_status(&index_path, &db_state).unwrap();
    assert!(status.has_pending_resume);

    let persisted = load_lexical_rebuild_state(&index_path)
        .unwrap()
        .expect("rolled-back checkpoint should persist");
    assert_eq!(
        persisted.runtime,
        LexicalRebuildPipelineRuntimeSnapshot::default()
    );
}
/// A checkpoint persisted with a page size of `200` (passed literally
/// instead of `LEXICAL_REBUILD_PAGE_SIZE`) is still reported as a pending
/// resume for the same database state.
#[test]
fn legacy_lexical_rebuild_page_size_still_counts_as_pending_rebuild() {
    let tmp = TempDir::new().unwrap();
    let index_path = tmp.path().join("index");
    fs::create_dir_all(&index_path).unwrap();

    let db_state = LexicalRebuildDbState {
        db_path: "/tmp/agent_search.db".to_string(),
        total_conversations: 400,
        total_messages: 800,
        storage_fingerprint: "seed:400".to_string(),
    };
    let legacy_checkpoint = LexicalRebuildState::new(db_state.clone(), 200);
    persist_lexical_rebuild_state(&index_path, &legacy_checkpoint).unwrap();

    let has_pending_resume = matching_lexical_rebuild_state_status(&index_path, &db_state)
        .unwrap()
        .has_pending_resume;
    assert!(has_pending_resume);
}
/// A persisted checkpoint must match the current run even when the two sides spell
/// the same database path differently.
///
/// The checkpoint records the path with a redundant `.` component, while the
/// current state uses the output of `crate::normalize_path_identity`.
#[test]
fn pending_lexical_rebuild_matches_equivalent_db_path_spellings() {
    let tmp = TempDir::new().unwrap();
    let index_path = tmp.path().join("index");
    fs::create_dir_all(&index_path).unwrap();

    // The database file is created on disk before its path is normalized.
    let db_path = tmp.path().join("agent_search.db");
    fs::write(&db_path, b"db").unwrap();
    // Same file, spelled through an extra `.` path component.
    let db_path_variant = tmp.path().join(".").join("agent_search.db");

    let checkpoint_db_state = LexicalRebuildDbState {
        db_path: db_path_variant.to_string_lossy().into_owned(),
        total_conversations: 400,
        total_messages: 800,
        storage_fingerprint: "seed:400".to_string(),
    };
    let current_db_state = LexicalRebuildDbState {
        db_path: crate::normalize_path_identity(&db_path)
            .to_string_lossy()
            .into_owned(),
        total_conversations: 400,
        total_messages: 800,
        storage_fingerprint: "seed:400".to_string(),
    };

    let state = LexicalRebuildState::new(checkpoint_db_state, 200);
    persist_lexical_rebuild_state(&index_path, &state).unwrap();

    // The argument is a borrow of `current_db_state`; the `&curren` prefix had been
    // collapsed into a `¤` character, which is not valid Rust.
    let status = matching_lexical_rebuild_state_status(&index_path, &current_db_state).unwrap();
    assert!(status.has_pending_resume);
}
/// `matches_run` accepts a checkpoint whose storage fingerprint differs from the
/// current one when the path and conversation total agree.
///
/// Both fingerprints here are colon-separated numeric strings rather than
/// `content-v1:`-prefixed ones. The checkpoint also carries `total_messages: 0`
/// and page size 200, while the current state reports 800 messages and the run
/// uses `LEXICAL_REBUILD_PAGE_SIZE`.
#[test]
fn legacy_lexical_rebuild_matches_despite_storage_fingerprint_drift_when_counts_match() {
    let checkpoint_db_state = LexicalRebuildDbState {
        db_path: "/tmp/agent_search.db".to_string(),
        total_conversations: 400,
        total_messages: 0,
        storage_fingerprint: "22396870656:1776366130822:8536672:1776366130775".to_string(),
    };
    let current_db_state = LexicalRebuildDbState {
        db_path: "/tmp/agent_search.db".to_string(),
        total_conversations: 400,
        total_messages: 800,
        storage_fingerprint: "22396870656:1776384918595:8577872:1776384918548".to_string(),
    };

    let state = LexicalRebuildState::new(checkpoint_db_state, 200);

    // Restores the `&current_db_state` borrow that had been corrupted into
    // `¤t_db_state`.
    assert!(state.matches_run(&current_db_state, LEXICAL_REBUILD_PAGE_SIZE));
}
/// `matches_run` rejects a checkpoint when the `content-v1:` storage fingerprint
/// differs from the current one, even though the path, totals and page size are
/// identical.
#[test]
fn lexical_rebuild_rejects_resume_when_content_fingerprint_changes() {
    let checkpoint_db_state = LexicalRebuildDbState {
        db_path: "/tmp/agent_search.db".to_string(),
        total_conversations: 400,
        total_messages: 0,
        storage_fingerprint: "content-v1:400:1200:4800".to_string(),
    };
    // Only the last fingerprint component differs (4800 -> 4801).
    let current_db_state = LexicalRebuildDbState {
        db_path: "/tmp/agent_search.db".to_string(),
        total_conversations: 400,
        total_messages: 0,
        storage_fingerprint: "content-v1:400:1200:4801".to_string(),
    };

    let state = LexicalRebuildState::new(checkpoint_db_state, LEXICAL_REBUILD_PAGE_SIZE);

    // Restores the `&current_db_state` borrow that had been corrupted into
    // `¤t_db_state`.
    assert!(!state.matches_run(&current_db_state, LEXICAL_REBUILD_PAGE_SIZE));
}
/// Normalizing a checkpoint created with page size 200 must leave it at
/// `LEXICAL_REBUILD_PAGE_SIZE`, both in memory and in the reloaded state file.
#[test]
fn normalize_legacy_lexical_rebuild_page_size_adopts_current_contract() {
    let tmp = TempDir::new().unwrap();
    let checkpoint_dir = tmp.path().join("index");
    fs::create_dir_all(&checkpoint_dir).unwrap();

    let mut checkpoint = LexicalRebuildState::new(
        LexicalRebuildDbState {
            db_path: "/tmp/agent_search.db".to_string(),
            total_conversations: 400,
            total_messages: 800,
            storage_fingerprint: "seed:400".to_string(),
        },
        200,
    );
    persist_lexical_rebuild_state(&checkpoint_dir, &checkpoint).unwrap();

    normalize_lexical_rebuild_state_for_current_run(&checkpoint_dir, &mut checkpoint).unwrap();

    // In-memory state adopts the current page size.
    assert_eq!(checkpoint.page_size, LEXICAL_REBUILD_PAGE_SIZE);

    // The on-disk state agrees.
    let reloaded = load_lexical_rebuild_state(&checkpoint_dir)
        .unwrap()
        .expect("normalized checkpoint");
    assert_eq!(reloaded.page_size, LEXICAL_REBUILD_PAGE_SIZE);
}
/// A checkpoint that only carries offsets (no committed conversation id and a
/// pending commit without `next_conversation_id`) must have both ids filled in by
/// the resume-cursor upgrade.
///
/// The expected ids are the first two rows returned by the storage listing over
/// the seeded fixture.
#[test]
fn upgrade_legacy_lexical_rebuild_resume_cursor_records_stable_conversation_ids() {
    let tmp = TempDir::new().unwrap();
    let data_dir = tmp.path().join("data");
    std::fs::create_dir_all(&data_dir).unwrap();
    let db_path = data_dir.join("db.sqlite");
    let storage = FrankenStorage::open(&db_path).unwrap();
    ensure_fts_schema(&storage);
    seed_lexical_rebuild_fixture(&storage);

    // Read the reference conversation ids from the seeded database.
    let seeded_rows = storage
        .list_conversations_for_lexical_rebuild_after_id(2, 0, &HashMap::new(), &HashMap::new())
        .unwrap();
    let expected_committed_id = seeded_rows[0].id.expect("first conversation id");
    let expected_pending_id = seeded_rows[1].id.expect("second conversation id");

    let index_path = index_dir(&data_dir).unwrap();
    std::fs::create_dir_all(&index_path).unwrap();

    // Offset-only checkpoint: one conversation committed, a second one pending.
    let db_state = lexical_rebuild_db_state(&storage, &db_path).unwrap();
    let mut checkpoint = LexicalRebuildState::new(db_state, LEXICAL_REBUILD_PAGE_SIZE);
    checkpoint.committed_offset = 1;
    checkpoint.processed_conversations = 1;
    checkpoint.indexed_docs = 2;
    checkpoint.pending = Some(PendingLexicalCommit {
        next_offset: 2,
        next_conversation_id: None,
        processed_conversations: 2,
        indexed_docs: 4,
        base_meta_fingerprint: None,
    });

    upgrade_lexical_rebuild_state_resume_cursor_if_needed(&storage, &index_path, &mut checkpoint)
        .unwrap();

    assert_eq!(
        checkpoint.committed_conversation_id,
        Some(expected_committed_id)
    );
    let upgraded_pending_id = checkpoint
        .pending
        .as_ref()
        .and_then(|commit| commit.next_conversation_id);
    assert_eq!(upgraded_pending_id, Some(expected_pending_id));
}
/// A checkpoint persisted with page size 13 must not be reported as a pending
/// resume, even though the database state matches.
#[test]
fn incompatible_lexical_rebuild_page_size_does_not_count_as_pending_rebuild() {
    let tmp = TempDir::new().unwrap();
    let checkpoint_dir = tmp.path().join("index");
    fs::create_dir_all(&checkpoint_dir).unwrap();

    let db_state = LexicalRebuildDbState {
        storage_fingerprint: "seed:400".to_string(),
        total_messages: 800,
        total_conversations: 400,
        db_path: "/tmp/agent_search.db".to_string(),
    };

    let odd_page_checkpoint = LexicalRebuildState::new(db_state.clone(), 13);
    persist_lexical_rebuild_state(&checkpoint_dir, &odd_page_checkpoint).unwrap();

    let resume_status =
        matching_lexical_rebuild_state_status(&checkpoint_dir, &db_state).unwrap();
    assert!(!resume_status.has_pending_resume);
}
/// After `clear_lexical_rebuild_state`, loading the snapshot for the same database
/// path yields `None`; before clearing it yields `Some`.
#[test]
fn clear_lexical_rebuild_state_removes_stale_snapshot() {
    let tmp = TempDir::new().unwrap();
    let checkpoint_dir = tmp.path().join("index");
    fs::create_dir_all(&checkpoint_dir).unwrap();

    let db_state = LexicalRebuildDbState {
        db_path: "/tmp/agent_search.db".to_string(),
        total_conversations: 12,
        total_messages: 24,
        storage_fingerprint: "seed:12".to_string(),
    };
    let checkpoint = LexicalRebuildState::new(db_state, LEXICAL_REBUILD_PAGE_SIZE);
    persist_lexical_rebuild_state(&checkpoint_dir, &checkpoint).unwrap();

    let db_path = Path::new("/tmp/agent_search.db");

    // Snapshot is visible while the state file exists.
    let before_clear = load_lexical_rebuild_snapshot(&checkpoint_dir, db_path).unwrap();
    assert!(before_clear.is_some());

    clear_lexical_rebuild_state(&checkpoint_dir).unwrap();

    // ...and gone once the state has been cleared.
    let after_clear = load_lexical_rebuild_snapshot(&checkpoint_dir, db_path).unwrap();
    assert!(after_clear.is_none());
}
/// A freshly created checkpoint on which no runtime was ever set must not surface
/// an active pipeline runtime.
#[test]
fn load_active_lexical_rebuild_pipeline_runtime_ignores_empty_snapshot() {
    let tmp = TempDir::new().unwrap();
    let checkpoint_dir = tmp.path().join("index");
    fs::create_dir_all(&checkpoint_dir).unwrap();

    let db_state = LexicalRebuildDbState {
        db_path: "/tmp/agent_search.db".to_string(),
        total_conversations: 12,
        total_messages: 24,
        storage_fingerprint: "seed:12".to_string(),
    };
    let checkpoint = LexicalRebuildState::new(db_state, LEXICAL_REBUILD_PAGE_SIZE);
    persist_lexical_rebuild_state(&checkpoint_dir, &checkpoint).unwrap();

    let active_runtime = load_active_lexical_rebuild_pipeline_runtime(
        &checkpoint_dir,
        Path::new("/tmp/agent_search.db"),
    )
    .unwrap();
    assert!(active_runtime.is_none());
}
/// Preparing an in-progress checkpoint for an active run must reset its runtime
/// snapshot to the default value, in memory and in the persisted state file.
#[test]
fn prepare_lexical_rebuild_state_for_active_run_clears_stale_runtime_and_persists() {
    let tmp = TempDir::new().unwrap();
    let checkpoint_dir = tmp.path().join("index");
    fs::create_dir_all(&checkpoint_dir).unwrap();

    let db_state = LexicalRebuildDbState {
        db_path: "/tmp/agent_search.db".to_string(),
        total_conversations: 12,
        total_messages: 24,
        storage_fingerprint: "seed:12".to_string(),
    };
    let mut checkpoint = LexicalRebuildState::new(db_state, LEXICAL_REBUILD_PAGE_SIZE);
    checkpoint.committed_offset = 4;
    checkpoint.committed_conversation_id = Some(4);
    checkpoint.processed_conversations = 4;
    checkpoint.indexed_docs = 8;

    // Runtime telemetry left behind by an earlier run.
    let leftover_runtime = LexicalRebuildPipelineRuntimeSnapshot {
        queue_depth: 2,
        inflight_message_bytes: 32_768,
        max_message_bytes_in_flight: 65_536,
        pending_batch_conversations: 4,
        pending_batch_message_bytes: 65_536,
        page_prep_workers: 4,
        active_page_prep_jobs: 1,
        ordered_buffered_pages: 3,
        budget_generation: 2,
        producer_budget_wait_count: 1,
        producer_budget_wait_ms: 11,
        producer_handoff_wait_count: 2,
        producer_handoff_wait_ms: 22,
        host_loadavg_1m_milli: None,
        controller_mode: "steady".to_string(),
        controller_reason: "first_durable_commit_promoted_steady_budget".to_string(),
        staged_merge_workers_max: 0,
        staged_merge_allowed_jobs: 0,
        staged_merge_active_jobs: 0,
        staged_merge_ready_artifacts: 0,
        staged_merge_ready_groups: 0,
        staged_merge_controller_reason: String::new(),
        staged_shard_build_workers_max: 0,
        staged_shard_build_allowed_jobs: 0,
        staged_shard_build_active_jobs: 0,
        staged_shard_build_pending_jobs: 0,
        staged_shard_build_controller_reason: String::new(),
        updated_at_ms: 1_733_000_224_000_i64,
        ..Default::default()
    };
    checkpoint.set_runtime(&leftover_runtime);
    persist_lexical_rebuild_state(&checkpoint_dir, &checkpoint).unwrap();

    prepare_lexical_rebuild_state_for_active_run(&checkpoint_dir, &mut checkpoint).unwrap();

    assert_eq!(
        checkpoint.runtime,
        LexicalRebuildPipelineRuntimeSnapshot::default()
    );
    let reloaded = load_lexical_rebuild_state(&checkpoint_dir)
        .unwrap()
        .expect("prepared checkpoint should persist");
    assert_eq!(
        reloaded.runtime,
        LexicalRebuildPipelineRuntimeSnapshot::default()
    );
}
/// Preparing a checkpoint that is already marked completed must keep the
/// `completed` flag set, both in memory and in the persisted state file.
#[test]
fn prepare_lexical_rebuild_state_for_active_run_preserves_completed_runtime_snapshot() {
    let tmp = TempDir::new().unwrap();
    let checkpoint_dir = tmp.path().join("index");
    fs::create_dir_all(&checkpoint_dir).unwrap();

    let db_state = LexicalRebuildDbState {
        db_path: "/tmp/agent_search.db".to_string(),
        total_conversations: 12,
        total_messages: 24,
        storage_fingerprint: "seed:12".to_string(),
    };
    let mut checkpoint = LexicalRebuildState::new(db_state, LEXICAL_REBUILD_PAGE_SIZE);
    checkpoint.mark_completed(Some("stable-meta".to_string()));
    persist_lexical_rebuild_state(&checkpoint_dir, &checkpoint).unwrap();

    prepare_lexical_rebuild_state_for_active_run(&checkpoint_dir, &mut checkpoint).unwrap();

    assert!(checkpoint.completed);
    let reloaded = load_lexical_rebuild_state(&checkpoint_dir)
        .unwrap()
        .expect("completed checkpoint should persist");
    assert!(reloaded.completed);
}
/// Persisting a fresh state at active-run start must replace a previously stored
/// checkpoint: the reloaded file carries the fresh database totals and
/// fingerprint, zeroed progress, and a default runtime snapshot.
#[test]
fn persist_lexical_rebuild_state_for_active_run_start_overwrites_stale_checkpoint() {
    let tmp = TempDir::new().unwrap();
    let checkpoint_dir = tmp.path().join("index");
    fs::create_dir_all(&checkpoint_dir).unwrap();

    // Old checkpoint with progress and runtime telemetry already recorded.
    let stale_db_state = LexicalRebuildDbState {
        db_path: "/tmp/agent_search.db".to_string(),
        total_conversations: 999,
        total_messages: 1998,
        storage_fingerprint: "stale-seed".to_string(),
    };
    let mut stale_checkpoint = LexicalRebuildState::new(stale_db_state, LEXICAL_REBUILD_PAGE_SIZE);
    stale_checkpoint.committed_offset = 500;
    stale_checkpoint.committed_conversation_id = Some(500);
    stale_checkpoint.processed_conversations = 500;
    stale_checkpoint.indexed_docs = 1000;
    let stale_runtime = LexicalRebuildPipelineRuntimeSnapshot {
        queue_depth: 2,
        inflight_message_bytes: 32_768,
        max_message_bytes_in_flight: 65_536,
        pending_batch_conversations: 4,
        pending_batch_message_bytes: 65_536,
        page_prep_workers: 4,
        active_page_prep_jobs: 1,
        ordered_buffered_pages: 3,
        budget_generation: 2,
        producer_budget_wait_count: 1,
        producer_budget_wait_ms: 11,
        producer_handoff_wait_count: 2,
        producer_handoff_wait_ms: 22,
        host_loadavg_1m_milli: None,
        controller_mode: "steady".to_string(),
        controller_reason: "first_durable_commit_promoted_steady_budget".to_string(),
        staged_merge_workers_max: 0,
        staged_merge_allowed_jobs: 0,
        staged_merge_active_jobs: 0,
        staged_merge_ready_artifacts: 0,
        staged_merge_ready_groups: 0,
        staged_merge_controller_reason: String::new(),
        staged_shard_build_workers_max: 0,
        staged_shard_build_allowed_jobs: 0,
        staged_shard_build_active_jobs: 0,
        staged_shard_build_pending_jobs: 0,
        staged_shard_build_controller_reason: String::new(),
        updated_at_ms: 1_733_000_224_000_i64,
        ..Default::default()
    };
    stale_checkpoint.set_runtime(&stale_runtime);
    persist_lexical_rebuild_state(&checkpoint_dir, &stale_checkpoint).unwrap();

    // Brand-new state for a different database snapshot.
    let fresh_db_state = LexicalRebuildDbState {
        db_path: "/tmp/agent_search.db".to_string(),
        total_conversations: 12,
        total_messages: 24,
        storage_fingerprint: "fresh-seed".to_string(),
    };
    let fresh_checkpoint = LexicalRebuildState::new(fresh_db_state, LEXICAL_REBUILD_PAGE_SIZE);
    persist_lexical_rebuild_state_for_active_run_start(&checkpoint_dir, &fresh_checkpoint)
        .unwrap();

    let reloaded = load_lexical_rebuild_state(&checkpoint_dir)
        .unwrap()
        .expect("fresh startup checkpoint should persist");

    // Database identity comes from the fresh state.
    assert_eq!(reloaded.db.total_conversations, 12);
    assert_eq!(reloaded.db.total_messages, 24);
    assert_eq!(reloaded.db.storage_fingerprint, "fresh-seed");

    // No progress or runtime carried over from the stale checkpoint.
    assert_eq!(reloaded.committed_offset, 0);
    assert_eq!(reloaded.committed_conversation_id, None);
    assert_eq!(reloaded.processed_conversations, 0);
    assert_eq!(reloaded.indexed_docs, 0);
    assert_eq!(
        reloaded.runtime,
        LexicalRebuildPipelineRuntimeSnapshot::default()
    );
}
/// Re-reporting the same pending progress (`Some(6)`, 6, 12) with a new runtime
/// snapshot must still replace the runtime in memory and on disk, while the
/// pending cursor stays at conversation id 6.
#[test]
fn persist_pending_lexical_rebuild_progress_refreshes_runtime_when_progress_is_unchanged() {
    let tmp = TempDir::new().unwrap();
    let checkpoint_dir = tmp.path().join("index");
    fs::create_dir_all(&checkpoint_dir).unwrap();
    fs::write(checkpoint_dir.join("meta.json"), b"stable-meta").unwrap();

    let db_state = LexicalRebuildDbState {
        db_path: "/tmp/agent_search.db".to_string(),
        total_conversations: 12,
        total_messages: 24,
        storage_fingerprint: "seed:12".to_string(),
    };
    let mut checkpoint = LexicalRebuildState::new(db_state, LEXICAL_REBUILD_PAGE_SIZE);
    checkpoint.committed_offset = 4;
    checkpoint.committed_conversation_id = Some(4);
    checkpoint.processed_conversations = 4;
    checkpoint.indexed_docs = 8;

    let base_meta = index_meta_fingerprint(&checkpoint_dir).unwrap();
    checkpoint.record_pending_commit(Some(6), 6, 12, base_meta);

    // Runtime snapshot stored alongside the initial pending commit.
    let seeded_runtime = LexicalRebuildPipelineRuntimeSnapshot {
        queue_depth: 1,
        inflight_message_bytes: 1_024,
        max_message_bytes_in_flight: 4_096,
        pending_batch_conversations: 1,
        pending_batch_message_bytes: 2_048,
        page_prep_workers: 2,
        active_page_prep_jobs: 1,
        ordered_buffered_pages: 0,
        budget_generation: 1,
        producer_budget_wait_count: 1,
        producer_budget_wait_ms: 3,
        producer_handoff_wait_count: 0,
        producer_handoff_wait_ms: 0,
        host_loadavg_1m_milli: None,
        controller_mode: "startup".to_string(),
        controller_reason: "seeded-runtime".to_string(),
        staged_merge_workers_max: 0,
        staged_merge_allowed_jobs: 0,
        staged_merge_active_jobs: 0,
        staged_merge_ready_artifacts: 0,
        staged_merge_ready_groups: 0,
        staged_merge_controller_reason: String::new(),
        staged_shard_build_workers_max: 0,
        staged_shard_build_allowed_jobs: 0,
        staged_shard_build_active_jobs: 0,
        staged_shard_build_pending_jobs: 0,
        staged_shard_build_controller_reason: String::new(),
        updated_at_ms: 1_733_000_111_000_i64,
        ..Default::default()
    };
    checkpoint.set_runtime(&seeded_runtime);
    persist_lexical_rebuild_state(&checkpoint_dir, &checkpoint).unwrap();

    // Newer telemetry reported with identical progress numbers.
    let refreshed_runtime = LexicalRebuildPipelineRuntimeSnapshot {
        queue_depth: 3,
        inflight_message_bytes: 4_096,
        max_message_bytes_in_flight: 16_384,
        pending_batch_conversations: 2,
        pending_batch_message_bytes: 8_192,
        page_prep_workers: 4,
        active_page_prep_jobs: 2,
        ordered_buffered_pages: 1,
        budget_generation: 2,
        producer_budget_wait_count: 2,
        producer_budget_wait_ms: 9,
        producer_handoff_wait_count: 1,
        producer_handoff_wait_ms: 5,
        host_loadavg_1m_milli: Some(7_250),
        controller_mode: "pressure_limited".to_string(),
        controller_reason: "queue_depth_3_reached_pipeline_capacity_3".to_string(),
        staged_merge_workers_max: 0,
        staged_merge_allowed_jobs: 0,
        staged_merge_active_jobs: 0,
        staged_merge_ready_artifacts: 0,
        staged_merge_ready_groups: 0,
        staged_merge_controller_reason: String::new(),
        staged_shard_build_workers_max: 0,
        staged_shard_build_allowed_jobs: 0,
        staged_shard_build_active_jobs: 0,
        staged_shard_build_pending_jobs: 0,
        staged_shard_build_controller_reason: String::new(),
        updated_at_ms: 1_733_000_222_000_i64,
        ..Default::default()
    };
    persist_pending_lexical_rebuild_progress(
        &checkpoint_dir,
        &mut checkpoint,
        Some(6),
        6,
        12,
        &refreshed_runtime,
    )
    .unwrap();

    assert_eq!(checkpoint.runtime, refreshed_runtime);

    let reloaded = load_lexical_rebuild_state(&checkpoint_dir)
        .unwrap()
        .expect("heartbeat checkpoint should persist");
    assert_eq!(reloaded.runtime, refreshed_runtime);
    let reloaded_cursor = reloaded
        .pending
        .as_ref()
        .map(|commit| commit.next_conversation_id);
    assert_eq!(reloaded_cursor, Some(Some(6)));
}
/// A heartbeat reporting older progress (`Some(5)`, 5, 10) than the recorded
/// pending commit (`Some(6)`, 6, 12) must not move the reported progress
/// backwards, yet the runtime snapshot it carries must still be stored.
#[test]
fn stale_stage_heartbeat_does_not_rewind_rebuild_checkpoint() {
    let tmp = TempDir::new().unwrap();
    let checkpoint_dir = tmp.path().join("index");
    fs::create_dir_all(&checkpoint_dir).unwrap();
    fs::write(checkpoint_dir.join("meta.json"), b"stable-meta").unwrap();

    let db_state = LexicalRebuildDbState {
        db_path: "/tmp/agent_search.db".to_string(),
        total_conversations: 12,
        total_messages: 24,
        storage_fingerprint: "seed:12".to_string(),
    };
    let mut checkpoint = LexicalRebuildState::new(db_state, LEXICAL_REBUILD_PAGE_SIZE);
    checkpoint.committed_offset = 4;
    checkpoint.committed_conversation_id = Some(4);
    checkpoint.processed_conversations = 4;
    checkpoint.indexed_docs = 8;

    let base_meta = index_meta_fingerprint(&checkpoint_dir).unwrap();
    checkpoint.record_pending_commit(Some(6), 6, 12, base_meta);
    persist_lexical_rebuild_state(&checkpoint_dir, &checkpoint).unwrap();

    // Telemetry attached to the late-arriving, lower-progress heartbeat.
    let stale_stage_runtime = LexicalRebuildPipelineRuntimeSnapshot {
        queue_depth: 2,
        inflight_message_bytes: 2_048,
        max_message_bytes_in_flight: 8_192,
        pending_batch_conversations: 1,
        pending_batch_message_bytes: 4_096,
        page_prep_workers: 2,
        active_page_prep_jobs: 1,
        producer_handoff_wait_count: 1,
        producer_handoff_wait_ms: 5,
        controller_mode: "startup".to_string(),
        controller_reason: "stale-stage-heartbeat".to_string(),
        updated_at_ms: 1_733_000_777_000_i64,
        ..Default::default()
    };
    persist_pending_lexical_rebuild_progress(
        &checkpoint_dir,
        &mut checkpoint,
        Some(5),
        5,
        10,
        &stale_stage_runtime,
    )
    .unwrap();

    // In-memory view keeps the higher progress values.
    assert_eq!(
        checkpoint.reported_processed_conversations(),
        6,
        "stale stage heartbeat must not move the restart checkpoint backwards"
    );
    assert_eq!(checkpoint.reported_indexed_docs(), 12);
    assert_eq!(
        checkpoint.reported_committed_conversation_id(),
        Some(6),
        "stale next_conversation_id must not replace the monotone pending cursor"
    );
    assert_eq!(
        checkpoint.runtime, stale_stage_runtime,
        "runtime telemetry should still refresh for attach/status visibility"
    );

    // The persisted view agrees with the in-memory one.
    let reloaded = load_lexical_rebuild_state(&checkpoint_dir)
        .unwrap()
        .expect("monotone checkpoint should remain persisted");
    assert_eq!(reloaded.reported_processed_conversations(), 6);
    assert_eq!(reloaded.reported_indexed_docs(), 12);
    assert_eq!(reloaded.reported_committed_conversation_id(), Some(6));
    assert_eq!(reloaded.runtime, stale_stage_runtime);
}
/// When a caller supplies a previously captured base meta fingerprint, the
/// persisted pending commit must record that value rather than the fingerprint of
/// the `meta.json` currently on disk.
#[test]
fn persist_pending_lexical_rebuild_progress_can_reuse_cached_base_meta_fingerprint() {
    let tmp = TempDir::new().unwrap();
    let checkpoint_dir = tmp.path().join("index");
    fs::create_dir_all(&checkpoint_dir).unwrap();
    let meta_file = checkpoint_dir.join("meta.json");

    // Capture the fingerprint of the old meta, then overwrite the file so the
    // live fingerprint is taken over different contents.
    fs::write(&meta_file, b"old-live-meta").unwrap();
    let cached_base_meta_fingerprint = index_meta_fingerprint(&checkpoint_dir).unwrap();
    fs::write(&meta_file, b"new-live-meta").unwrap();

    let heartbeat_runtime = LexicalRebuildPipelineRuntimeSnapshot {
        queue_depth: 2,
        inflight_message_bytes: 2_048,
        max_message_bytes_in_flight: 8_192,
        pending_batch_conversations: 1,
        pending_batch_message_bytes: 4_096,
        page_prep_workers: 2,
        active_page_prep_jobs: 1,
        ordered_buffered_pages: 0,
        budget_generation: 1,
        producer_budget_wait_count: 1,
        producer_budget_wait_ms: 2,
        producer_handoff_wait_count: 0,
        producer_handoff_wait_ms: 0,
        host_loadavg_1m_milli: None,
        controller_mode: "startup".to_string(),
        controller_reason: "cached-live-meta".to_string(),
        staged_merge_workers_max: 0,
        staged_merge_allowed_jobs: 0,
        staged_merge_active_jobs: 0,
        staged_merge_ready_artifacts: 0,
        staged_merge_ready_groups: 0,
        staged_merge_controller_reason: String::new(),
        staged_shard_build_workers_max: 0,
        staged_shard_build_allowed_jobs: 0,
        staged_shard_build_active_jobs: 0,
        staged_shard_build_pending_jobs: 0,
        staged_shard_build_controller_reason: String::new(),
        updated_at_ms: 1_733_000_333_000_i64,
        ..Default::default()
    };

    let db_state = LexicalRebuildDbState {
        db_path: "/tmp/agent_search.db".to_string(),
        total_conversations: 12,
        total_messages: 24,
        storage_fingerprint: "seed:12".to_string(),
    };
    let mut checkpoint = LexicalRebuildState::new(db_state, LEXICAL_REBUILD_PAGE_SIZE);

    persist_pending_lexical_rebuild_progress_with_base_meta_fingerprint(
        &checkpoint_dir,
        &mut checkpoint,
        Some(5),
        5,
        10,
        &heartbeat_runtime,
        cached_base_meta_fingerprint.as_deref(),
    )
    .unwrap();

    let reloaded = load_lexical_rebuild_state(&checkpoint_dir)
        .unwrap()
        .expect("pending checkpoint should persist");

    // The stored fingerprint is the cached one...
    assert_eq!(
        reloaded
            .pending
            .as_ref()
            .and_then(|commit| commit.base_meta_fingerprint.as_deref()),
        cached_base_meta_fingerprint.as_deref()
    );
    // ...and differs from what the rewritten meta.json fingerprints to now.
    assert_ne!(
        reloaded
            .pending
            .as_ref()
            .and_then(|commit| commit.base_meta_fingerprint.as_deref()),
        index_meta_fingerprint(&checkpoint_dir).unwrap().as_deref()
    );
}
/// Committing progress with the eighth argument set to `false` finalizes the
/// in-memory state but leaves the on-disk checkpoint with a pending commit and no
/// committed meta fingerprint.
///
/// Reconciling that on-disk checkpoint must then arrive at the same committed
/// values the in-memory state already holds.
#[test]
fn commit_lexical_rebuild_progress_without_finalized_checkpoint_persist_leaves_recoverable_pending_state()
{
    let tmp = TempDir::new().unwrap();
    let data_dir = tmp.path().join("data");
    fs::create_dir_all(&data_dir).unwrap();
    let index_path = index_dir(&data_dir).unwrap();

    // Stage a single one-message conversation in the Tantivy index.
    let mut tantivy = TantivyIndex::open_or_create(&index_path).unwrap();
    let convo = norm_conv(Some("final-commit"), vec![norm_msg(0, 1_700_000_000_000)]);
    tantivy
        .add_messages_with_conversation_id(&convo, &convo.messages, Some(1))
        .unwrap();

    let db_state = LexicalRebuildDbState {
        db_path: "/tmp/agent_search.db".to_string(),
        total_conversations: 1,
        total_messages: 1,
        storage_fingerprint: "seed:1".to_string(),
    };
    let mut checkpoint = LexicalRebuildState::new(db_state, LEXICAL_REBUILD_PAGE_SIZE);

    let commit_runtime = LexicalRebuildPipelineRuntimeSnapshot {
        queue_depth: 0,
        inflight_message_bytes: 0,
        max_message_bytes_in_flight: 4_096,
        pending_batch_conversations: 1,
        pending_batch_message_bytes: 128,
        page_prep_workers: 1,
        active_page_prep_jobs: 0,
        ordered_buffered_pages: 0,
        budget_generation: 1,
        producer_budget_wait_count: 0,
        producer_budget_wait_ms: 0,
        producer_handoff_wait_count: 1,
        producer_handoff_wait_ms: 4,
        host_loadavg_1m_milli: None,
        controller_mode: "steady".to_string(),
        controller_reason: "final-commit".to_string(),
        staged_merge_workers_max: 0,
        staged_merge_allowed_jobs: 0,
        staged_merge_active_jobs: 0,
        staged_merge_ready_artifacts: 0,
        staged_merge_ready_groups: 0,
        staged_merge_controller_reason: String::new(),
        staged_shard_build_workers_max: 0,
        staged_shard_build_allowed_jobs: 0,
        staged_shard_build_active_jobs: 0,
        staged_shard_build_pending_jobs: 0,
        staged_shard_build_controller_reason: String::new(),
        updated_at_ms: 1_733_000_555_000_i64,
        ..Default::default()
    };

    commit_lexical_rebuild_progress(
        &index_path,
        &mut checkpoint,
        Some(1),
        1,
        1,
        &commit_runtime,
        &mut tantivy,
        false,
        None,
    )
    .unwrap();

    // In memory the commit is fully applied.
    assert!(checkpoint.pending.is_none());
    assert_eq!(checkpoint.committed_offset, 1);
    assert_eq!(checkpoint.committed_conversation_id, Some(1));
    assert_eq!(checkpoint.processed_conversations, 1);
    assert_eq!(checkpoint.indexed_docs, 1);
    assert_eq!(
        checkpoint.committed_meta_fingerprint.as_deref(),
        index_meta_fingerprint(&index_path).unwrap().as_deref()
    );

    // On disk the checkpoint still shows the commit as pending.
    let on_disk = load_lexical_rebuild_state(&index_path)
        .unwrap()
        .expect("pending checkpoint should remain on disk until completion");
    assert!(on_disk.pending.is_some());
    assert_eq!(on_disk.committed_meta_fingerprint, None);
    assert_eq!(on_disk.runtime, commit_runtime);

    // Reconciliation promotes the pending commit to committed.
    let recovered = reconcile_pending_lexical_commit(&index_path, on_disk).unwrap();
    assert!(recovered.pending.is_none());
    assert_eq!(recovered.committed_offset, 1);
    assert_eq!(recovered.committed_conversation_id, Some(1));
    assert_eq!(recovered.processed_conversations, 1);
    assert_eq!(recovered.indexed_docs, 1);
    assert_eq!(
        recovered.committed_meta_fingerprint.as_deref(),
        index_meta_fingerprint(&index_path).unwrap().as_deref()
    );
}
/// With the force flag off, one conversation counted against a threshold of 8,
/// and a 60-second interval measured from "now", the staged-progress helper must
/// report that nothing was written, leave the counter untouched, and create no
/// state file.
#[test]
fn maybe_persist_staged_lexical_rebuild_progress_skips_write_until_heartbeat_is_due() {
    let tmp = TempDir::new().unwrap();
    let checkpoint_dir = tmp.path().join("index");
    fs::create_dir_all(&checkpoint_dir).unwrap();
    fs::write(checkpoint_dir.join("meta.json"), b"stable-meta").unwrap();

    let db_state = LexicalRebuildDbState {
        db_path: "/tmp/agent_search.db".to_string(),
        total_conversations: 12,
        total_messages: 24,
        storage_fingerprint: "seed:12".to_string(),
    };
    let mut checkpoint = LexicalRebuildState::new(db_state, LEXICAL_REBUILD_PAGE_SIZE);

    let staged_runtime = LexicalRebuildPipelineRuntimeSnapshot {
        queue_depth: 1,
        inflight_message_bytes: 1_024,
        max_message_bytes_in_flight: 4_096,
        pending_batch_conversations: 1,
        pending_batch_message_bytes: 2_048,
        page_prep_workers: 2,
        active_page_prep_jobs: 1,
        ordered_buffered_pages: 0,
        budget_generation: 1,
        producer_budget_wait_count: 1,
        producer_budget_wait_ms: 3,
        producer_handoff_wait_count: 0,
        producer_handoff_wait_ms: 0,
        host_loadavg_1m_milli: None,
        controller_mode: "startup".to_string(),
        controller_reason: "staged-heartbeat".to_string(),
        staged_merge_workers_max: 1,
        staged_merge_allowed_jobs: 0,
        staged_merge_active_jobs: 0,
        staged_merge_ready_artifacts: 0,
        staged_merge_ready_groups: 0,
        staged_merge_controller_reason: "warming".to_string(),
        staged_shard_build_workers_max: 4,
        staged_shard_build_allowed_jobs: 2,
        staged_shard_build_active_jobs: 2,
        staged_shard_build_pending_jobs: 3,
        staged_shard_build_controller_reason: "backlog".to_string(),
        updated_at_ms: 1_733_000_444_000_i64,
        ..Default::default()
    };

    let base_meta = index_meta_fingerprint(&checkpoint_dir).unwrap();
    let mut unpersisted_conversations: usize = 1;
    let mut last_persist_at = Instant::now();

    let wrote_checkpoint = maybe_persist_staged_lexical_rebuild_progress(
        &checkpoint_dir,
        &mut checkpoint,
        Some(1),
        1,
        2,
        &staged_runtime,
        base_meta.as_deref(),
        false,
        &mut unpersisted_conversations,
        8,
        &mut last_persist_at,
        Duration::from_secs(60),
        None,
        None,
    )
    .unwrap();

    assert!(!wrote_checkpoint);
    assert_eq!(unpersisted_conversations, 1);
    assert!(
        load_lexical_rebuild_state(&checkpoint_dir).unwrap().is_none(),
        "staged progress should not rewrite the state file before the heartbeat is due"
    );
}
/// With the force flag on, the staged-progress helper must persist a refreshed
/// runtime snapshot even though the reported progress (`Some(6)`, 6, 12) equals
/// the already-recorded pending commit and the counter is zero.
#[test]
fn maybe_persist_staged_lexical_rebuild_progress_force_refreshes_runtime_without_new_progress()
{
    let tmp = TempDir::new().unwrap();
    let checkpoint_dir = tmp.path().join("index");
    fs::create_dir_all(&checkpoint_dir).unwrap();
    fs::write(checkpoint_dir.join("meta.json"), b"stable-meta").unwrap();

    let db_state = LexicalRebuildDbState {
        db_path: "/tmp/agent_search.db".to_string(),
        total_conversations: 12,
        total_messages: 24,
        storage_fingerprint: "seed:12".to_string(),
    };
    let mut checkpoint = LexicalRebuildState::new(db_state, LEXICAL_REBUILD_PAGE_SIZE);

    let base_meta = index_meta_fingerprint(&checkpoint_dir).unwrap();
    checkpoint.record_pending_commit(Some(6), 6, 12, base_meta.clone());

    // Runtime snapshot stored before the forced heartbeat.
    let seeded_runtime = LexicalRebuildPipelineRuntimeSnapshot {
        queue_depth: 1,
        inflight_message_bytes: 1_024,
        max_message_bytes_in_flight: 4_096,
        pending_batch_conversations: 1,
        pending_batch_message_bytes: 2_048,
        page_prep_workers: 2,
        active_page_prep_jobs: 1,
        ordered_buffered_pages: 0,
        budget_generation: 1,
        producer_budget_wait_count: 1,
        producer_budget_wait_ms: 3,
        producer_handoff_wait_count: 0,
        producer_handoff_wait_ms: 0,
        host_loadavg_1m_milli: None,
        controller_mode: "startup".to_string(),
        controller_reason: "seeded-runtime".to_string(),
        staged_merge_workers_max: 1,
        staged_merge_allowed_jobs: 0,
        staged_merge_active_jobs: 0,
        staged_merge_ready_artifacts: 0,
        staged_merge_ready_groups: 0,
        staged_merge_controller_reason: "warming".to_string(),
        staged_shard_build_workers_max: 4,
        staged_shard_build_allowed_jobs: 2,
        staged_shard_build_active_jobs: 2,
        staged_shard_build_pending_jobs: 3,
        staged_shard_build_controller_reason: "backlog".to_string(),
        updated_at_ms: 1_733_000_555_000_i64,
        ..Default::default()
    };
    checkpoint.set_runtime(&seeded_runtime);
    persist_lexical_rebuild_state(&checkpoint_dir, &checkpoint).unwrap();

    // Newer telemetry delivered by the forced heartbeat.
    let refreshed_runtime = LexicalRebuildPipelineRuntimeSnapshot {
        queue_depth: 0,
        inflight_message_bytes: 0,
        max_message_bytes_in_flight: 4_096,
        pending_batch_conversations: 0,
        pending_batch_message_bytes: 0,
        page_prep_workers: 2,
        active_page_prep_jobs: 0,
        ordered_buffered_pages: 0,
        budget_generation: 2,
        producer_budget_wait_count: 1,
        producer_budget_wait_ms: 3,
        producer_handoff_wait_count: 1,
        producer_handoff_wait_ms: 5,
        host_loadavg_1m_milli: Some(4_250),
        controller_mode: "merge_tail".to_string(),
        controller_reason: "draining_eager_merges".to_string(),
        staged_merge_workers_max: 2,
        staged_merge_allowed_jobs: 2,
        staged_merge_active_jobs: 1,
        staged_merge_ready_artifacts: 3,
        staged_merge_ready_groups: 1,
        staged_merge_controller_reason: "merge_backlog".to_string(),
        staged_shard_build_workers_max: 4,
        staged_shard_build_allowed_jobs: 0,
        staged_shard_build_active_jobs: 0,
        staged_shard_build_pending_jobs: 0,
        staged_shard_build_controller_reason: "builders_idle".to_string(),
        updated_at_ms: 1_733_000_666_000_i64,
        ..Default::default()
    };

    let mut unpersisted_conversations: usize = 0;
    let mut last_persist_at = Instant::now();
    let wrote_checkpoint = maybe_persist_staged_lexical_rebuild_progress(
        &checkpoint_dir,
        &mut checkpoint,
        Some(6),
        6,
        12,
        &refreshed_runtime,
        base_meta.as_deref(),
        true,
        &mut unpersisted_conversations,
        64,
        &mut last_persist_at,
        Duration::from_secs(60),
        None,
        None,
    )
    .unwrap();

    assert!(wrote_checkpoint);
    assert_eq!(checkpoint.runtime, refreshed_runtime);

    let reloaded = load_lexical_rebuild_state(&checkpoint_dir)
        .unwrap()
        .expect("forced staged heartbeat should persist");
    assert_eq!(reloaded.runtime, refreshed_runtime);
    assert_eq!(
        reloaded
            .pending
            .as_ref()
            .map(|commit| commit.next_conversation_id),
        Some(Some(6)),
        "forced staged heartbeat should keep the existing pending progress visible"
    );
}
/// The loaded checkpoint summary must expose the pending commit's conversation
/// id, processed count and indexed-doc count, while `committed_offset` keeps the
/// committed value of 4.
#[test]
fn load_lexical_rebuild_checkpoint_reports_pending_progress() {
    let tmp = TempDir::new().unwrap();
    let checkpoint_dir = tmp.path().join("index");
    fs::create_dir_all(&checkpoint_dir).unwrap();
    fs::write(checkpoint_dir.join("meta.json"), b"stable-meta").unwrap();

    let db_state = LexicalRebuildDbState {
        db_path: "/tmp/agent_search.db".to_string(),
        total_conversations: 12,
        total_messages: 24,
        storage_fingerprint: "seed:12".to_string(),
    };
    let mut rebuild_state = LexicalRebuildState::new(db_state, LEXICAL_REBUILD_PAGE_SIZE);
    rebuild_state.committed_offset = 4;
    rebuild_state.committed_conversation_id = Some(4);
    rebuild_state.processed_conversations = 4;
    rebuild_state.indexed_docs = 20;

    // Pending commit is ahead of the committed values above.
    let base_meta = index_meta_fingerprint(&checkpoint_dir).unwrap();
    rebuild_state.record_pending_commit(Some(7), 7, 35, base_meta);
    persist_lexical_rebuild_state(&checkpoint_dir, &rebuild_state).unwrap();

    let summary = load_lexical_rebuild_checkpoint(&checkpoint_dir)
        .unwrap()
        .expect("checkpoint should load");

    assert_eq!(summary.committed_offset, 4);
    assert_eq!(summary.committed_conversation_id, Some(7));
    assert_eq!(
        summary.processed_conversations, 7,
        "pending rebuild progress should stay visible to status/health readers"
    );
    assert_eq!(
        summary.indexed_docs, 35,
        "pending indexed doc counts should stay visible to status/health readers"
    );
}
/// Updating only the storage's last-indexed timestamp must leave the lexical
/// storage fingerprint unchanged, and refreshing the completed checkpoint
/// afterwards must report a completed rebuild covering every stored conversation
/// and message.
#[test]
fn refresh_completed_lexical_rebuild_checkpoint_preserves_content_fingerprint_across_meta_only_writes()
{
    let tmp = TempDir::new().unwrap();
    let data_dir = tmp.path().join("data");
    fs::create_dir_all(&data_dir).unwrap();
    let db_path = data_dir.join("agent_search.db");
    let storage = FrankenStorage::open(&db_path).unwrap();
    ensure_fts_schema(&storage);

    // Seed the database with one agent and one single-message conversation.
    let agent = crate::model::types::Agent {
        id: None,
        slug: "tester".into(),
        name: "Tester".into(),
        version: None,
        kind: crate::model::types::AgentKind::Cli,
    };
    let agent_id = storage.ensure_agent(&agent).unwrap();
    let conv = norm_conv(
        Some("checkpoint-refresh"),
        vec![norm_msg(0, 1_700_000_000_000)],
    );
    let stored_conversation = crate::model::types::Conversation {
        id: None,
        agent_slug: conv.agent_slug.clone(),
        workspace: conv.workspace.clone(),
        external_id: conv.external_id.clone(),
        title: conv.title.clone(),
        source_path: conv.source_path.clone(),
        started_at: conv.started_at,
        ended_at: conv.ended_at,
        approx_tokens: None,
        metadata_json: conv.metadata.clone(),
        messages: conv
            .messages
            .iter()
            .map(|msg| crate::model::types::Message {
                id: None,
                idx: msg.idx,
                role: crate::model::types::MessageRole::User,
                author: msg.author.clone(),
                created_at: msg.created_at,
                content: msg.content.clone(),
                extra_json: msg.extra.clone(),
                snippets: Vec::new(),
            })
            .collect(),
        source_id: "local".to_string(),
        origin_host: None,
    };
    storage
        .insert_conversation_tree(agent_id, None, &stored_conversation)
        .unwrap();

    // Build and commit the matching Tantivy index, then release it.
    let index_path = index_dir(&data_dir).unwrap();
    let mut tantivy = TantivyIndex::open_or_create(&index_path).unwrap();
    tantivy
        .add_messages_with_conversation_id(&conv, &conv.messages, Some(1))
        .unwrap();
    tantivy.commit().unwrap();
    drop(tantivy);

    let total_conversations = count_total_conversations_exact(&storage).unwrap();
    let total_messages = count_total_messages_exact(&storage).unwrap();
    let original_fingerprint = lexical_rebuild_storage_fingerprint(&db_path).unwrap();

    // Persist a completed checkpoint for the current database state.
    let db_state = lexical_rebuild_db_state(&storage, &db_path).unwrap();
    let mut rebuild_state = LexicalRebuildState::new(db_state, LEXICAL_REBUILD_PAGE_SIZE);
    rebuild_state.mark_completed(index_meta_fingerprint(&index_path).unwrap());
    persist_lexical_rebuild_state(&index_path, &rebuild_state).unwrap();

    // Short pause before the meta-only write; NOTE(review): presumably so any
    // timestamp-sensitive input would change — confirm.
    std::thread::sleep(Duration::from_millis(5));
    storage
        .set_last_indexed_at(FrankenStorage::now_millis())
        .unwrap();

    let changed_fingerprint = lexical_rebuild_storage_fingerprint(&db_path).unwrap();
    assert_eq!(
        original_fingerprint, changed_fingerprint,
        "meta-only writes should not churn the lexical content fingerprint"
    );

    refresh_completed_lexical_rebuild_checkpoint(&storage, &db_path, &data_dir).unwrap();

    let summary = load_lexical_rebuild_checkpoint(&index_path)
        .unwrap()
        .expect("refreshed checkpoint");
    assert!(summary.completed);
    assert_eq!(summary.storage_fingerprint, original_fingerprint);
    assert_eq!(summary.total_conversations, total_conversations);
    assert_eq!(summary.processed_conversations, total_conversations);
    assert_eq!(
        summary.committed_offset,
        i64::try_from(total_conversations).unwrap_or(i64::MAX)
    );
    assert_eq!(summary.committed_conversation_id, Some(1));
    assert_eq!(summary.indexed_docs, total_messages);
}
/// After a plain inline ingest there is no checkpoint on disk; the refresh is
/// expected to create a completed one describing the ingested conversation.
#[test]
#[serial]
fn refresh_completed_lexical_rebuild_checkpoint_bootstraps_missing_state_from_live_tantivy() {
    let scratch = TempDir::new().unwrap();
    let data_dir = scratch.path().join("data");
    fs::create_dir_all(&data_dir).unwrap();
    let db_path = data_dir.join("agent_search.db");
    let storage = FrankenStorage::open(&db_path).unwrap();
    ensure_fts_schema(&storage);

    // Ingest one two-message conversation; the index handle is released when
    // this scope ends.
    {
        let mut live_index =
            TantivyIndex::open_or_create(&index_dir(&data_dir).unwrap()).unwrap();
        let batch = vec![norm_conv(
            Some("bootstrap"),
            vec![
                norm_msg(0, 1_700_000_000_000),
                norm_msg(1, 1_700_000_000_100),
            ],
        )];
        ingest_batch(
            &storage,
            Some(&mut live_index),
            &data_dir,
            &batch,
            &None,
            LexicalPopulationStrategy::IncrementalInline,
            false,
        )
        .unwrap();
        live_index.commit().unwrap();
    }

    let index_path = index_dir(&data_dir).unwrap();
    let preexisting = load_lexical_rebuild_checkpoint(&index_path).unwrap();
    assert!(
        preexisting.is_none(),
        "plain incremental ingest should start with no lexical checkpoint"
    );

    refresh_completed_lexical_rebuild_checkpoint(&storage, &db_path, &data_dir).unwrap();

    let bootstrapped = load_lexical_rebuild_checkpoint(&index_path)
        .unwrap()
        .expect("bootstrapped checkpoint");
    assert!(bootstrapped.completed);
    assert_eq!(bootstrapped.total_conversations, 1);
    assert_eq!(bootstrapped.processed_conversations, 1);
    assert_eq!(bootstrapped.indexed_docs, 2);
    assert_eq!(
        bootstrapped.storage_fingerprint,
        lexical_rebuild_storage_fingerprint(&db_path).unwrap()
    );
}
/// Running the refresh a second time against an already exact completed
/// checkpoint must leave both the persisted state and the state file's
/// modification time untouched.
#[test]
#[serial]
fn refresh_completed_lexical_rebuild_checkpoint_skips_rewriting_exact_completed_state() {
    let scratch = TempDir::new().unwrap();
    let data_dir = scratch.path().join("data");
    fs::create_dir_all(&data_dir).unwrap();
    let db_path = data_dir.join("agent_search.db");
    let storage = FrankenStorage::open(&db_path).unwrap();
    ensure_fts_schema(&storage);

    let batch = vec![norm_conv(
        Some("exact-refresh"),
        vec![
            norm_msg(0, 1_700_000_000_000),
            norm_msg(1, 1_700_000_000_100),
        ],
    )];
    // The index handle only lives for the ingest and commit.
    {
        let mut live_index =
            TantivyIndex::open_or_create(&index_dir(&data_dir).unwrap()).unwrap();
        ingest_batch(
            &storage,
            Some(&mut live_index),
            &data_dir,
            &batch,
            &None,
            LexicalPopulationStrategy::IncrementalInline,
            false,
        )
        .unwrap();
        live_index.commit().unwrap();
    }

    let index_path = index_dir(&data_dir).unwrap();

    // First refresh establishes the completed checkpoint.
    refresh_completed_lexical_rebuild_checkpoint(&storage, &db_path, &data_dir).unwrap();
    let state_path = lexical_rebuild_state_path(&index_path);
    let state_before = load_lexical_rebuild_state(&index_path)
        .unwrap()
        .expect("exact completed checkpoint");
    let mtime_before = fs::metadata(&state_path).unwrap().modified().unwrap();

    // Pause briefly so a rewrite would get a later modification time.
    std::thread::sleep(Duration::from_millis(5));

    // Second refresh should be a no-op.
    refresh_completed_lexical_rebuild_checkpoint(&storage, &db_path, &data_dir).unwrap();
    let state_after = load_lexical_rebuild_state(&index_path)
        .unwrap()
        .expect("exact completed checkpoint should remain present");
    let mtime_after = fs::metadata(&state_path).unwrap().modified().unwrap();

    assert_eq!(
        state_after, state_before,
        "refresh should not rewrite an already exact completed checkpoint"
    );
    assert_eq!(
        mtime_after, mtime_before,
        "refresh should not touch the checkpoint file when no fields changed"
    );
}
/// Storage holds two conversations but the index directory is replaced with
/// one containing only the first; the refresh must not create a completed
/// checkpoint from that sparse index.
#[test]
#[serial]
fn refresh_completed_lexical_rebuild_checkpoint_skips_sparse_live_tantivy() {
    let scratch = TempDir::new().unwrap();
    let data_dir = scratch.path().join("data");
    fs::create_dir_all(&data_dir).unwrap();
    let db_path = data_dir.join("agent_search.db");
    let storage = FrankenStorage::open(&db_path).unwrap();
    ensure_fts_schema(&storage);

    let conv_a = vec![norm_conv(
        Some("sparse-a"),
        vec![norm_msg(0, 1_700_000_000_000)],
    )];
    let conv_b = vec![norm_conv(
        Some("sparse-b"),
        vec![norm_msg(0, 1_700_000_001_000)],
    )];

    // Ingest both conversations, one batch each, through the canonical index.
    {
        let mut canonical_index =
            TantivyIndex::open_or_create(&index_dir(&data_dir).unwrap()).unwrap();
        for batch in [&conv_a, &conv_b] {
            ingest_batch(
                &storage,
                Some(&mut canonical_index),
                &data_dir,
                batch,
                &None,
                LexicalPopulationStrategy::IncrementalInline,
                false,
            )
            .unwrap();
        }
    }

    // Move the full index aside and rebuild the live directory with only the
    // first conversation's messages.
    let index_path = index_dir(&data_dir).unwrap();
    let sparse_backup = data_dir.join("index-full-backup");
    fs::rename(&index_path, &sparse_backup).unwrap();
    fs::create_dir_all(&index_path).unwrap();
    {
        let mut sparse_index = TantivyIndex::open_or_create(&index_path).unwrap();
        let only_conv = &conv_a[0];
        sparse_index
            .add_messages_with_conversation_id(only_conv, &only_conv.messages, Some(1))
            .unwrap();
        sparse_index.commit().unwrap();
    }

    refresh_completed_lexical_rebuild_checkpoint(&storage, &db_path, &data_dir).unwrap();

    let checkpoint = load_lexical_rebuild_checkpoint(&index_path).unwrap();
    assert!(
        checkpoint.is_none(),
        "sparse live Tantivy should not bootstrap a completed lexical checkpoint"
    );
}
/// The final-state refresh, given exact totals, should leave a completed
/// checkpoint whose totals and storage fingerprint match storage as it stands
/// after the `last_indexed_at` write.
#[test]
fn final_checkpoint_refresh_uses_settled_storage_fingerprint() {
    let scratch = TempDir::new().unwrap();
    let data_dir = scratch.path().join("data");
    fs::create_dir_all(&data_dir).unwrap();
    let db_path = data_dir.join("agent_search.db");
    let mut storage = FrankenStorage::open(&db_path).unwrap();
    ensure_fts_schema(&storage);

    // Seed canonical storage with a single one-message conversation.
    let agent = crate::model::types::Agent {
        id: None,
        slug: "tester".into(),
        name: "Tester".into(),
        version: None,
        kind: crate::model::types::AgentKind::Cli,
    };
    let agent_id = storage.ensure_agent(&agent).unwrap();
    let conv = norm_conv(
        Some("checkpoint-settled"),
        vec![norm_msg(0, 1_700_000_000_000)],
    );
    let canonical = crate::model::types::Conversation {
        id: None,
        agent_slug: conv.agent_slug.clone(),
        workspace: conv.workspace.clone(),
        external_id: conv.external_id.clone(),
        title: conv.title.clone(),
        source_path: conv.source_path.clone(),
        started_at: conv.started_at,
        ended_at: conv.ended_at,
        approx_tokens: None,
        metadata_json: conv.metadata.clone(),
        messages: conv
            .messages
            .iter()
            .map(|msg| crate::model::types::Message {
                id: None,
                idx: msg.idx,
                role: crate::model::types::MessageRole::User,
                author: msg.author.clone(),
                created_at: msg.created_at,
                content: msg.content.clone(),
                extra_json: msg.extra.clone(),
                snippets: Vec::new(),
            })
            .collect(),
        source_id: "local".to_string(),
        origin_host: None,
    };
    storage
        .insert_conversation_tree(agent_id, None, &canonical)
        .unwrap();

    // Mirror the conversation into the lexical index, then release the handle.
    let index_path = index_dir(&data_dir).unwrap();
    {
        let mut lexical = TantivyIndex::open_or_create(&index_path).unwrap();
        lexical
            .add_messages_with_conversation_id(&conv, &conv.messages, Some(1))
            .unwrap();
        lexical.commit().unwrap();
    }

    // Metadata write happens before the checkpoint state is captured.
    storage
        .set_last_indexed_at(FrankenStorage::now_millis())
        .unwrap();
    let total_conversations = count_total_conversations_exact(&storage).unwrap();
    let total_messages = count_total_messages_exact(&storage).unwrap();

    let mut completed_state = LexicalRebuildState::new(
        lexical_rebuild_db_state(&storage, &db_path).unwrap(),
        LEXICAL_REBUILD_PAGE_SIZE,
    );
    completed_state.mark_completed(index_meta_fingerprint(&index_path).unwrap());
    persist_lexical_rebuild_state(&index_path, &completed_state).unwrap();

    let exact_totals = Some((total_conversations, total_messages));
    refresh_completed_lexical_rebuild_checkpoint_for_final_state(
        &mut storage,
        &db_path,
        &data_dir,
        false,
        exact_totals,
    )
    .unwrap();

    let refreshed = load_lexical_rebuild_checkpoint(&index_path)
        .unwrap()
        .expect("refreshed checkpoint");
    assert!(refreshed.completed);
    assert_eq!(refreshed.total_conversations, total_conversations);
    assert_eq!(refreshed.processed_conversations, total_conversations);
    assert_eq!(refreshed.indexed_docs, total_messages);
    assert_eq!(
        refreshed.storage_fingerprint,
        lexical_rebuild_storage_fingerprint(&db_path).unwrap()
    );
}
/// A persisted completed state built from the current DB state should be
/// reported as a completed checkpoint with zero docs and zero exact totals,
/// and not as a pending resume.
#[test]
fn matching_lexical_rebuild_state_status_recognizes_completed_checkpoint() {
    let scratch = TempDir::new().unwrap();
    let data_dir = scratch.path().join("data");
    fs::create_dir_all(&data_dir).unwrap();
    let db_path = data_dir.join("agent_search.db");
    let storage = FrankenStorage::open(&db_path).unwrap();
    ensure_fts_schema(&storage);
    let db_state = lexical_rebuild_db_state(&storage, &db_path).unwrap();
    let index_path = index_dir(&data_dir).unwrap();

    // Persist a completed checkpoint for the (empty) database.
    let mut completed = LexicalRebuildState::new(db_state.clone(), LEXICAL_REBUILD_PAGE_SIZE);
    completed.mark_completed(Some("stable-meta".to_string()));
    persist_lexical_rebuild_state(&index_path, &completed).unwrap();

    let report = matching_lexical_rebuild_state_status(&index_path, &db_state).unwrap();
    assert!(!report.has_pending_resume);
    assert!(report.has_completed_checkpoint);
    assert_eq!(report.completed_indexed_docs, Some(0));
    assert_eq!(report.completed_exact_totals, Some((0, 0)));
}
/// With no checkpoint in the index directory, the status helper must return
/// the default status without ever invoking the DB-state loader closure.
#[test]
fn matching_lexical_rebuild_state_status_if_present_skips_db_state_without_checkpoint() {
    let scratch = TempDir::new().unwrap();
    let index_path = scratch.path().join("index");
    fs::create_dir_all(&index_path).unwrap();

    // Flipped to `true` only if the loader closure runs.
    let loader_invoked = std::cell::Cell::new(false);
    let load_db_state = || {
        loader_invoked.set(true);
        Ok(LexicalRebuildDbState {
            db_path: "unused.db".to_string(),
            total_conversations: 1,
            total_messages: 0,
            storage_fingerprint: "content-v1:1:1:1".to_string(),
        })
    };

    let report =
        matching_lexical_rebuild_state_status_if_present(&index_path, load_db_state).unwrap();

    assert_eq!(report, MatchingLexicalRebuildStateStatus::default());
    assert!(!loader_invoked.get());
}
/// A completed checkpoint should be matched by DB path and conversation count
/// alone, and the returned status should echo the checkpoint's own docs,
/// totals and stored fingerprint.
#[test]
fn completed_checkpoint_can_match_without_content_fingerprint() {
    let scratch = TempDir::new().unwrap();
    let db_path = scratch.path().join("agent_search.db");
    fs::write(&db_path, b"db").unwrap();
    let index_path = scratch.path().join("index");
    fs::create_dir_all(&index_path).unwrap();

    // Persist a completed checkpoint with 7 conversations and 42 indexed docs.
    let recorded_db_state = LexicalRebuildDbState {
        db_path: db_path.to_string_lossy().into_owned(),
        total_conversations: 7,
        total_messages: 0,
        storage_fingerprint: "content-v1:7:42:42".to_string(),
    };
    let mut completed = LexicalRebuildState::new(recorded_db_state, LEXICAL_REBUILD_PAGE_SIZE);
    completed.indexed_docs = 42;
    completed.mark_completed(Some("meta-fingerprint".to_string()));
    persist_lexical_rebuild_state(&index_path, &completed).unwrap();

    let lookup = matching_completed_lexical_rebuild_state_status_without_fingerprint(
        &index_path,
        &db_path,
        7,
    )
    .unwrap();
    let report =
        lookup.expect("completed checkpoints do not need an exact storage fingerprint");

    assert!(report.has_completed_checkpoint);
    assert_eq!(report.completed_indexed_docs, Some(42));
    assert_eq!(report.completed_exact_totals, Some((7, 42)));
    assert_eq!(
        report.completed_storage_fingerprint.as_deref(),
        Some("content-v1:7:42:42")
    );
}
/// A checkpoint that was never marked completed must not be returned by the
/// fingerprint-free completed-checkpoint lookup.
#[test]
fn incomplete_checkpoint_still_requires_content_fingerprint() {
    let scratch = TempDir::new().unwrap();
    let db_path = scratch.path().join("agent_search.db");
    fs::write(&db_path, b"db").unwrap();
    let index_path = scratch.path().join("index");
    fs::create_dir_all(&index_path).unwrap();

    // Persist a freshly created (not completed) state.
    let recorded_db_state = LexicalRebuildDbState {
        db_path: db_path.to_string_lossy().into_owned(),
        total_conversations: 7,
        total_messages: 0,
        storage_fingerprint: "content-v1:7:42:42".to_string(),
    };
    let pending = LexicalRebuildState::new(recorded_db_state, LEXICAL_REBUILD_PAGE_SIZE);
    persist_lexical_rebuild_state(&index_path, &pending).unwrap();

    let lookup = matching_completed_lexical_rebuild_state_status_without_fingerprint(
        &index_path,
        &db_path,
        7,
    )
    .unwrap();
    assert!(
        lookup.is_none(),
        "incomplete shared-writer checkpoints must keep the exact-fingerprint resume path"
    );
}
/// A pending checkpoint in `StagedShardBuild` mode should be reported by the
/// fingerprint-free lookup, while a pending checkpoint left in its default
/// execution mode should not.
#[test]
fn nonresumable_staged_checkpoint_can_restart_without_current_fingerprint() {
    let scratch = TempDir::new().unwrap();
    let db_path = scratch.path().join("agent_search.db");
    fs::write(&db_path, b"db").unwrap();
    let recorded_db_state = LexicalRebuildDbState {
        db_path: db_path.to_string_lossy().into_owned(),
        total_conversations: 400,
        total_messages: 0,
        storage_fingerprint: "content-v1:400:1200:4800".to_string(),
    };

    // Case 1: staged shard build checkpoint.
    let staged_index_path = scratch.path().join("index");
    fs::create_dir_all(&staged_index_path).unwrap();
    let mut staged_state =
        LexicalRebuildState::new(recorded_db_state.clone(), LEXICAL_REBUILD_PAGE_SIZE);
    staged_state.set_execution_mode(LexicalRebuildExecutionMode::StagedShardBuild);
    persist_lexical_rebuild_state(&staged_index_path, &staged_state).unwrap();

    let staged_report = nonresumable_pending_lexical_rebuild_status_without_fingerprint(
        &staged_index_path,
        &db_path,
        400,
    )
    .unwrap()
    .expect("staged checkpoint should route to restart without exact fingerprint");
    assert!(staged_report.has_pending_resume);
    assert!(!staged_report.has_completed_checkpoint);

    // Case 2: same DB state, execution mode left untouched.
    let shared_index_path = scratch.path().join("shared-index");
    fs::create_dir_all(&shared_index_path).unwrap();
    let shared_state = LexicalRebuildState::new(recorded_db_state, LEXICAL_REBUILD_PAGE_SIZE);
    persist_lexical_rebuild_state(&shared_index_path, &shared_state).unwrap();

    let shared_lookup = nonresumable_pending_lexical_rebuild_status_without_fingerprint(
        &shared_index_path,
        &db_path,
        400,
    )
    .unwrap();
    assert!(
        shared_lookup.is_none(),
        "shared-writer checkpoints still require the exact fingerprint path"
    );
}
/// The readonly-DB probe should report a staged-shard-build checkpoint as a
/// pending resume and return the database's conversation count (zero here).
#[test]
fn readonly_db_probe_classifies_nonresumable_staged_checkpoint() {
    let scratch = TempDir::new().unwrap();
    let db_path = scratch.path().join("agent_search.db");

    // Create an empty database on disk and close it again before probing.
    let storage = FrankenStorage::open(&db_path).unwrap();
    assert_eq!(count_total_conversations_exact(&storage).unwrap(), 0);
    storage.close().unwrap();

    let index_path = scratch.path().join("index");
    fs::create_dir_all(&index_path).unwrap();
    let recorded_db_state = LexicalRebuildDbState {
        db_path: db_path.to_string_lossy().into_owned(),
        total_conversations: 0,
        total_messages: 0,
        storage_fingerprint: "content-v1:0:0:0".to_string(),
    };
    let mut staged_state = LexicalRebuildState::new(recorded_db_state, LEXICAL_REBUILD_PAGE_SIZE);
    staged_state.set_execution_mode(LexicalRebuildExecutionMode::StagedShardBuild);
    persist_lexical_rebuild_state(&index_path, &staged_state).unwrap();

    let probe =
        nonresumable_pending_lexical_rebuild_status_from_readonly_db(&index_path, &db_path)
            .unwrap();
    let (report, probed_conversations) =
        probe.expect("readonly probe should classify matching staged checkpoint");
    assert!(report.has_pending_resume);
    assert_eq!(probed_conversations, 0);
}
/// Only the first combination below permits skipping the final checkpoint
/// refresh; every later call changes exactly one input relative to it and
/// must be rejected.
#[test]
fn skip_noop_final_checkpoint_refresh_requires_matching_completed_checkpoint_and_no_mutations()
{
    let unknown_counts = None;
    let untouched = CanonicalMutationCounts::default();
    let one_new_message = CanonicalMutationCounts {
        inserted_conversations: 0,
        inserted_messages: 1,
    };
    // Completed checkpoint for an empty store, shared by most cases.
    let completed = MatchingLexicalRebuildStateStatus {
        has_pending_resume: false,
        has_completed_checkpoint: true,
        completed_indexed_docs: Some(0),
        completed_exact_totals: Some((0, 0)),
        completed_storage_fingerprint: Some("content-v1:0:0:0".to_string()),
    };
    // Status describing the absence of any checkpoint.
    let absent = MatchingLexicalRebuildStateStatus {
        has_pending_resume: false,
        has_completed_checkpoint: false,
        completed_indexed_docs: None,
        completed_exact_totals: None,
        completed_storage_fingerprint: None,
    };

    // Baseline: both flags off, completed checkpoint, no counts, no mutations.
    assert!(should_skip_noop_final_lexical_checkpoint_refresh(
        false,
        false,
        &completed,
        unknown_counts,
        untouched,
    ));
    // First flag set.
    assert!(!should_skip_noop_final_lexical_checkpoint_refresh(
        true,
        false,
        &completed,
        unknown_counts,
        untouched,
    ));
    // No completed checkpoint.
    assert!(!should_skip_noop_final_lexical_checkpoint_refresh(
        false,
        false,
        &absent,
        unknown_counts,
        untouched,
    ));
    // Exact counts supplied that differ from the checkpoint's (0, 0).
    assert!(!should_skip_noop_final_lexical_checkpoint_refresh(
        false,
        false,
        &completed,
        Some((1, 2)),
        untouched,
    ));
    // A message was inserted during the run.
    assert!(!should_skip_noop_final_lexical_checkpoint_refresh(
        false,
        false,
        &completed,
        unknown_counts,
        one_new_message,
    ));
    // Second flag set.
    assert!(!should_skip_noop_final_lexical_checkpoint_refresh(
        false,
        true,
        &completed,
        unknown_counts,
        untouched,
    ));
}
/// The checkpoint is preserved only for `(true, false, false)` with a
/// completed checkpoint; changing any one flag or using the default status
/// must yield `false`.
#[test]
fn preserve_matching_completed_lexical_checkpoint_during_full_scan_requires_plain_full_scan_with_checkpoint()
{
    let with_checkpoint = MatchingLexicalRebuildStateStatus {
        has_pending_resume: false,
        has_completed_checkpoint: true,
        completed_indexed_docs: Some(42),
        completed_exact_totals: Some((7, 42)),
        completed_storage_fingerprint: Some("content-v1:7:42:42".to_string()),
    };
    let without_checkpoint = MatchingLexicalRebuildStateStatus::default();

    // Baseline combination that preserves the checkpoint.
    assert!(
        should_preserve_matching_completed_lexical_checkpoint_during_full_scan(
            true,
            false,
            false,
            &with_checkpoint
        )
    );
    // First flag cleared.
    assert!(
        !should_preserve_matching_completed_lexical_checkpoint_during_full_scan(
            false,
            false,
            false,
            &with_checkpoint
        )
    );
    // Second flag set.
    assert!(
        !should_preserve_matching_completed_lexical_checkpoint_during_full_scan(
            true,
            true,
            false,
            &with_checkpoint
        )
    );
    // Third flag set.
    assert!(
        !should_preserve_matching_completed_lexical_checkpoint_during_full_scan(
            true,
            false,
            true,
            &with_checkpoint
        )
    );
    // Default status instead of a completed checkpoint.
    assert!(
        !should_preserve_matching_completed_lexical_checkpoint_during_full_scan(
            true,
            false,
            false,
            &without_checkpoint,
        )
    );
}
/// The preflight is requested only when both flags are `false`.
#[test]
fn preflight_existing_tantivy_reader_is_skipped_for_resume_or_full_rebuilds() {
    // (first flag, second flag, expected decision)
    let cases = [
        (false, false, true),
        (true, false, false),
        (false, true, false),
    ];
    for (first_flag, second_flag, expected) in cases {
        assert_eq!(
            should_preflight_existing_tantivy_reader(first_flag, second_flag),
            expected
        );
    }
}
/// The live-doc probe is requested only for the baseline combination; each
/// later call changes a single argument and must return `false`.
#[test]
fn live_tantivy_doc_probe_for_post_full_skip_only_runs_for_noop_full_candidates() {
    let completed = MatchingLexicalRebuildStateStatus {
        has_pending_resume: false,
        has_completed_checkpoint: true,
        completed_indexed_docs: Some(42),
        completed_exact_totals: Some((7, 42)),
        completed_storage_fingerprint: Some("content-v1:7:42:42".to_string()),
    };
    let no_checkpoint = MatchingLexicalRebuildStateStatus::default();
    let untouched = CanonicalMutationCounts::default();
    let one_new_conversation = CanonicalMutationCounts {
        inserted_conversations: 1,
        inserted_messages: 0,
    };

    // Baseline: probe is wanted.
    assert!(should_probe_live_tantivy_docs_for_post_full_scan_skip(
        true,
        false,
        0,
        &completed,
        untouched,
        None,
    ));
    // First flag cleared.
    assert!(!should_probe_live_tantivy_docs_for_post_full_scan_skip(
        false,
        false,
        0,
        &completed,
        untouched,
        None,
    ));
    // Second flag set.
    assert!(!should_probe_live_tantivy_docs_for_post_full_scan_skip(
        true,
        true,
        0,
        &completed,
        untouched,
        None,
    ));
    // Non-zero third argument.
    assert!(!should_probe_live_tantivy_docs_for_post_full_scan_skip(
        true,
        false,
        1,
        &completed,
        untouched,
        None,
    ));
    // A conversation was inserted.
    assert!(!should_probe_live_tantivy_docs_for_post_full_scan_skip(
        true,
        false,
        0,
        &completed,
        one_new_conversation,
        None,
    ));
    // No completed checkpoint.
    assert!(!should_probe_live_tantivy_docs_for_post_full_scan_skip(
        true,
        false,
        0,
        &no_checkpoint,
        untouched,
        None,
    ));
    // A doc count is already supplied.
    assert!(!should_probe_live_tantivy_docs_for_post_full_scan_skip(
        true,
        false,
        0,
        &completed,
        untouched,
        Some(42),
    ));
}
/// An authoritative rebuild is forced whenever either flag is set, and not
/// when both are clear.
#[test]
fn authoritative_rebuild_requirement_only_treats_canonical_storage_reset_as_rebuild() {
    // (first flag, second flag, expected decision)
    let cases = [
        (false, false, false),
        (true, false, true),
        (false, true, true),
    ];
    for (first_flag, second_flag, expected) in cases {
        assert_eq!(
            should_force_authoritative_rebuild(first_flag, second_flag),
            expected
        );
    }
}
/// With `disabled_agents = ["openclaw"]` in `cass/sources.toml` under
/// `XDG_CONFIG_HOME`, filtering must drop the `openclaw` factory and keep
/// `codex`.
#[test]
#[serial]
fn configured_connector_factories_skip_disabled_agents_from_sources_config() {
    let scratch = TempDir::new().unwrap();
    let config_home = scratch.path().join("xdg-config");
    fs::create_dir_all(config_home.join("cass")).unwrap();
    let sources_toml = config_home.join("cass").join("sources.toml");
    fs::write(sources_toml, "disabled_agents = [\"openclaw\"]\n").unwrap();

    // Environment overrides stay in effect while these guards are alive.
    let _config_home_guard = set_env_var("XDG_CONFIG_HOME", config_home.to_string_lossy());
    let _sources_guard = unset_env_var("CASS_IGNORE_SOURCES_CONFIG");

    let candidates = vec![
        ("openclaw", never_constructed_connector_factory),
        ("codex", never_constructed_connector_factory),
    ];
    let surviving = filter_disabled_connector_factories(candidates);
    let names = surviving
        .into_iter()
        .map(|(name, _)| name)
        .collect::<Vec<_>>();
    assert_eq!(names, vec!["codex"]);
}
/// The authoritative rebuild may be skipped only for the baseline
/// combination, where the supplied doc count (42) equals the checkpoint's
/// indexed docs; each later call changes one argument and must return `false`.
#[test]
fn skip_post_full_scan_authoritative_rebuild_requires_matching_completed_checkpoint_doc_match_and_no_mutations()
{
    let completed = MatchingLexicalRebuildStateStatus {
        has_pending_resume: false,
        has_completed_checkpoint: true,
        completed_indexed_docs: Some(42),
        completed_exact_totals: Some((7, 42)),
        completed_storage_fingerprint: Some("content-v1:7:42:42".to_string()),
    };
    let no_checkpoint = MatchingLexicalRebuildStateStatus::default();
    let untouched = CanonicalMutationCounts::default();
    let one_new_conversation = CanonicalMutationCounts {
        inserted_conversations: 1,
        inserted_messages: 0,
    };

    // Baseline: skip is allowed.
    assert!(should_skip_post_full_scan_authoritative_rebuild(
        true,
        false,
        0,
        &completed,
        untouched,
        Some(42),
    ));
    // First flag cleared.
    assert!(!should_skip_post_full_scan_authoritative_rebuild(
        false,
        false,
        0,
        &completed,
        untouched,
        Some(42),
    ));
    // Second flag set.
    assert!(!should_skip_post_full_scan_authoritative_rebuild(
        true,
        true,
        0,
        &completed,
        untouched,
        Some(42),
    ));
    // Non-zero third argument.
    assert!(!should_skip_post_full_scan_authoritative_rebuild(
        true,
        false,
        1,
        &completed,
        untouched,
        Some(42),
    ));
    // A conversation was inserted.
    assert!(!should_skip_post_full_scan_authoritative_rebuild(
        true,
        false,
        0,
        &completed,
        one_new_conversation,
        Some(42),
    ));
    // No completed checkpoint.
    assert!(!should_skip_post_full_scan_authoritative_rebuild(
        true,
        false,
        0,
        &no_checkpoint,
        untouched,
        Some(42),
    ));
    // Doc count differs from the checkpoint's 42.
    assert!(!should_skip_post_full_scan_authoritative_rebuild(
        true,
        false,
        0,
        &completed,
        untouched,
        Some(41),
    ));
    // Doc count not available.
    assert!(!should_skip_post_full_scan_authoritative_rebuild(
        true,
        false,
        0,
        &completed,
        untouched,
        None,
    ));
}
/// The exact-totals helper should echo the totals it is given and report the
/// same storage fingerprint as `lexical_rebuild_storage_fingerprint`.
#[test]
fn lexical_rebuild_db_state_with_exact_totals_matches_settled_storage() {
    let scratch = TempDir::new().unwrap();
    let data_dir = scratch.path().join("data");
    fs::create_dir_all(&data_dir).unwrap();
    let db_path = data_dir.join("agent_search.db");
    let storage = FrankenStorage::open(&db_path).unwrap();
    ensure_fts_schema(&storage);

    // Seed canonical storage with one two-message conversation.
    let agent = crate::model::types::Agent {
        id: None,
        slug: "tester".into(),
        name: "Tester".into(),
        version: None,
        kind: crate::model::types::AgentKind::Cli,
    };
    let agent_id = storage.ensure_agent(&agent).unwrap();
    let conv = norm_conv(
        Some("exact-totals-helper"),
        vec![
            norm_msg(0, 1_700_000_000_000),
            norm_msg(1, 1_700_000_000_001),
        ],
    );
    let canonical = crate::model::types::Conversation {
        id: None,
        agent_slug: conv.agent_slug.clone(),
        workspace: conv.workspace.clone(),
        external_id: conv.external_id.clone(),
        title: conv.title.clone(),
        source_path: conv.source_path.clone(),
        started_at: conv.started_at,
        ended_at: conv.ended_at,
        approx_tokens: None,
        metadata_json: conv.metadata.clone(),
        messages: conv
            .messages
            .iter()
            .map(|msg| crate::model::types::Message {
                id: None,
                idx: msg.idx,
                role: crate::model::types::MessageRole::User,
                author: msg.author.clone(),
                created_at: msg.created_at,
                content: msg.content.clone(),
                extra_json: msg.extra.clone(),
                snippets: Vec::new(),
            })
            .collect(),
        source_id: "local".to_string(),
        origin_host: None,
    };
    storage
        .insert_conversation_tree(agent_id, None, &canonical)
        .unwrap();

    let total_conversations = count_total_conversations_exact(&storage).unwrap();
    let total_messages = count_total_messages_exact(&storage).unwrap();
    let snapshot = lexical_rebuild_db_state_with_exact_totals(
        &storage,
        &db_path,
        total_conversations,
        total_messages,
    )
    .unwrap();

    assert_eq!(snapshot.total_conversations, total_conversations);
    assert_eq!(snapshot.total_messages, total_messages);
    assert_eq!(
        snapshot.storage_fingerprint,
        lexical_rebuild_storage_fingerprint(&db_path).unwrap()
    );
}
/// With the boolean argument set to `true`, both the scan timestamp (123) and
/// the indexed-at timestamp (456) must be readable back from storage.
#[test]
fn persist_final_index_run_metadata_updates_last_scan_ts_and_last_indexed_at_together() {
    let scratch = TempDir::new().unwrap();
    let db_path = scratch.path().join("agent_search.db");
    let storage = FrankenStorage::open(&db_path).unwrap();

    persist_final_index_run_metadata(&storage, &db_path, true, 123, 456).unwrap();

    let scan_ts = storage.get_last_scan_ts().unwrap();
    let indexed_at = storage.get_last_indexed_at().unwrap();
    assert_eq!(scan_ts, Some(123));
    assert_eq!(indexed_at, Some(456));
}
/// With the boolean argument set to `false`, the previously stored scan
/// timestamp (123) must survive while the indexed-at timestamp is updated.
#[test]
fn persist_final_index_run_metadata_preserves_last_scan_ts_for_lexical_resume() {
    let scratch = TempDir::new().unwrap();
    let db_path = scratch.path().join("agent_search.db");
    let storage = FrankenStorage::open(&db_path).unwrap();
    // Pre-existing scan timestamp that must not be replaced by 999.
    storage.set_last_scan_ts(123).unwrap();

    persist_final_index_run_metadata(&storage, &db_path, false, 999, 456).unwrap();

    let scan_ts = storage.get_last_scan_ts().unwrap();
    let indexed_at = storage.get_last_indexed_at().unwrap();
    assert_eq!(scan_ts, Some(123));
    assert_eq!(indexed_at, Some(456));
}
/// When the injected writer fails, the call must still return `Ok`, log the
/// deferred-update warning including the writer's error text, and must not
/// log the success message.
#[test]
fn persist_final_index_run_metadata_logs_warning_and_returns_ok_when_retries_exhausted() {
    let scratch = TempDir::new().unwrap();
    let db_path = scratch.path().join("agent_search.db");

    let logs = capture_logs(|| {
        // The writer closure always reports a busy database.
        let result =
            persist_final_index_run_metadata_with_writer(&db_path, true, 999, 456, || {
                Err(anyhow::anyhow!(
                    "ephemeral writer preflight write failed for updating final \
                     last_indexed_at after index run at {}: database is busy",
                    db_path.display()
                ))
            });
        assert!(
            result.is_ok(),
            "metadata-only contention must not fail a fully-committed rebuild; \
             got {result:?}"
        );
    });

    let has_deferred_warning =
        logs.contains("deferred final index-run metadata update after retries exhausted");
    assert!(
        has_deferred_warning,
        "expected deferred-update warning in logs; got:\n{logs}"
    );
    let has_writer_error = logs.contains("database is busy");
    assert!(
        has_writer_error,
        "expected underlying writer error surfaced in warn log; got:\n{logs}"
    );
    let has_success_info = logs.contains("updated last_indexed_at for status display");
    assert!(
        !has_success_info,
        "must not emit the success INFO log when the writer retry was swallowed; got:\n{logs}"
    );
}
/// When the injected writer succeeds, both success messages must be logged
/// and the deferred-update warning must be absent.
#[test]
fn persist_final_index_run_metadata_with_writer_logs_success_info_when_writer_succeeds() {
    let scratch = TempDir::new().unwrap();
    let db_path = scratch.path().join("agent_search.db");

    let logs = capture_logs(|| {
        let always_ok = || Ok(());
        let result =
            persist_final_index_run_metadata_with_writer(&db_path, true, 111, 222, always_ok);
        assert!(result.is_ok());
    });

    let has_scan_ts_info = logs.contains("updated last_scan_ts for incremental indexing");
    assert!(
        has_scan_ts_info,
        "expected scan_ts INFO log; got:\n{logs}"
    );
    let has_indexed_at_info = logs.contains("updated last_indexed_at for status display");
    assert!(
        has_indexed_at_info,
        "expected indexed_at INFO log; got:\n{logs}"
    );
    let has_deferred_warning = logs.contains("deferred final index-run metadata update");
    assert!(
        !has_deferred_warning,
        "must not emit deferred-update warn on happy path; got:\n{logs}"
    );
}
/// Refreshing through a DB path spelled with an extra `.` component must
/// store the same `db_path` that `normalize_path_identity` yields for the
/// plain path.
#[test]
fn final_checkpoint_refresh_normalizes_db_path_identity() {
    let scratch = TempDir::new().unwrap();
    let data_dir = scratch.path().join("data");
    fs::create_dir_all(&data_dir).unwrap();
    let db_path = data_dir.join("agent_search.db");
    // Same file, reached through a redundant `.` path component.
    let db_path_variant = data_dir.join(".").join("agent_search.db");
    let mut storage = FrankenStorage::open(&db_path).unwrap();
    ensure_fts_schema(&storage);

    // Seed canonical storage with a single one-message conversation.
    let agent = crate::model::types::Agent {
        id: None,
        slug: "tester".into(),
        name: "Tester".into(),
        version: None,
        kind: crate::model::types::AgentKind::Cli,
    };
    let agent_id = storage.ensure_agent(&agent).unwrap();
    let conv = norm_conv(
        Some("checkpoint-path-normalize"),
        vec![norm_msg(0, 1_700_000_000_000)],
    );
    let canonical = crate::model::types::Conversation {
        id: None,
        agent_slug: conv.agent_slug.clone(),
        workspace: conv.workspace.clone(),
        external_id: conv.external_id.clone(),
        title: conv.title.clone(),
        source_path: conv.source_path.clone(),
        started_at: conv.started_at,
        ended_at: conv.ended_at,
        approx_tokens: None,
        metadata_json: conv.metadata.clone(),
        messages: conv
            .messages
            .iter()
            .map(|msg| crate::model::types::Message {
                id: None,
                idx: msg.idx,
                role: crate::model::types::MessageRole::User,
                author: msg.author.clone(),
                created_at: msg.created_at,
                content: msg.content.clone(),
                extra_json: msg.extra.clone(),
                snippets: Vec::new(),
            })
            .collect(),
        source_id: "local".to_string(),
        origin_host: None,
    };
    storage
        .insert_conversation_tree(agent_id, None, &canonical)
        .unwrap();

    // Mirror the conversation into the lexical index, then release the handle.
    let index_path = index_dir(&data_dir).unwrap();
    {
        let mut lexical = TantivyIndex::open_or_create(&index_path).unwrap();
        lexical
            .add_messages_with_conversation_id(&conv, &conv.messages, Some(1))
            .unwrap();
        lexical.commit().unwrap();
    }
    storage
        .set_last_indexed_at(FrankenStorage::now_millis())
        .unwrap();

    // Persist a completed checkpoint recorded under the plain DB path.
    let mut completed_state = LexicalRebuildState::new(
        lexical_rebuild_db_state(&storage, &db_path).unwrap(),
        LEXICAL_REBUILD_PAGE_SIZE,
    );
    completed_state.mark_completed(index_meta_fingerprint(&index_path).unwrap());
    persist_lexical_rebuild_state(&index_path, &completed_state).unwrap();

    // Refresh using the variant spelling of the path.
    refresh_completed_lexical_rebuild_checkpoint_for_final_state(
        &mut storage,
        &db_path_variant,
        &data_dir,
        false,
        None,
    )
    .unwrap();

    let refreshed = load_lexical_rebuild_checkpoint(&index_path)
        .unwrap()
        .expect("refreshed checkpoint");
    assert_eq!(
        refreshed.db_path,
        crate::normalize_path_identity(&db_path)
            .to_string_lossy()
            .into_owned()
    );
}
/// A commit is due once the second argument reaches the fifth (5_000) and
/// not one below it, with the other inputs held constant.
#[test]
fn should_commit_lexical_rebuild_when_message_count_threshold_is_hit() {
    // Exactly at the 5_000 threshold.
    let at_threshold =
        should_commit_lexical_rebuild(10, 5_000, 1_024, 1_000, 5_000, 16 * 1024 * 1024);
    assert!(at_threshold);

    // One short of the threshold.
    let below_threshold =
        should_commit_lexical_rebuild(10, 4_999, 1_024, 1_000, 5_000, 16 * 1024 * 1024);
    assert!(!below_threshold);
}
/// A commit is due once the third argument reaches the sixth (16 MiB) and
/// not one byte below it, with the other inputs held constant.
#[test]
fn should_commit_lexical_rebuild_when_message_byte_threshold_is_hit() {
    // Exactly at the 16 MiB threshold.
    let at_threshold = should_commit_lexical_rebuild(
        10,
        100,
        16 * 1024 * 1024,
        1_000,
        5_000,
        16 * 1024 * 1024,
    );
    assert!(at_threshold);

    // One byte short of the threshold.
    let below_threshold = should_commit_lexical_rebuild(
        10,
        100,
        (16 * 1024 * 1024) - 1,
        1_000,
        5_000,
        16 * 1024 * 1024,
    );
    assert!(!below_threshold);
}
/// For a freshly created state, each commit interval must equal the smaller
/// of the initial interval and the regular interval.
#[test]
fn lexical_rebuild_commit_intervals_keep_initial_slice_bounded_before_first_commit() {
    let db_state = LexicalRebuildDbState {
        db_path: "/tmp/agent_search.db".to_string(),
        total_conversations: 400,
        total_messages: 800,
        storage_fingerprint: "seed:400".to_string(),
    };
    let fresh_state = LexicalRebuildState::new(db_state, LEXICAL_REBUILD_PAGE_SIZE);

    let (conversations, messages, message_bytes) =
        lexical_rebuild_commit_intervals_for_state(&fresh_state);

    let expected_conversations = lexical_rebuild_initial_commit_interval_conversations()
        .min(lexical_rebuild_commit_interval_conversations());
    assert_eq!(conversations, expected_conversations);

    let expected_messages = lexical_rebuild_initial_commit_interval_messages()
        .min(lexical_rebuild_commit_interval_messages());
    assert_eq!(messages, expected_messages);

    let expected_message_bytes = lexical_rebuild_initial_commit_interval_message_bytes()
        .min(lexical_rebuild_commit_interval_message_bytes());
    assert_eq!(message_bytes, expected_message_bytes);
}
/// Once the state records committed progress (offset and processed count of
/// 2_048), the commit intervals must equal the regular interval values.
#[test]
fn lexical_rebuild_commit_intervals_return_to_steady_state_after_first_commit() {
    let db_state = LexicalRebuildDbState {
        db_path: "/tmp/agent_search.db".to_string(),
        total_conversations: 400,
        total_messages: 800,
        storage_fingerprint: "seed:400".to_string(),
    };
    let mut progressed_state = LexicalRebuildState::new(db_state, LEXICAL_REBUILD_PAGE_SIZE);
    progressed_state.committed_offset = 2_048;
    progressed_state.processed_conversations = 2_048;

    let (conversations, messages, message_bytes) =
        lexical_rebuild_commit_intervals_for_state(&progressed_state);

    let expected_conversations = lexical_rebuild_commit_interval_conversations();
    assert_eq!(conversations, expected_conversations);
    let expected_messages = lexical_rebuild_commit_interval_messages();
    assert_eq!(messages, expected_messages);
    let expected_message_bytes = lexical_rebuild_commit_interval_message_bytes();
    assert_eq!(message_bytes, expected_message_bytes);
}
/// With only 10 ms elapsed against a 2 s interval, progress is persisted when
/// the first argument reaches the second (250) and not at 249.
#[test]
fn should_persist_lexical_rebuild_progress_when_conversation_threshold_is_hit() {
    let elapsed = Duration::from_millis(10);
    let interval = Duration::from_secs(2);

    let at_threshold = should_persist_lexical_rebuild_progress(250, 250, elapsed, interval);
    assert!(at_threshold);

    let below_threshold = should_persist_lexical_rebuild_progress(249, 250, elapsed, interval);
    assert!(!below_threshold);
}
#[test]
fn should_persist_lexical_rebuild_progress_when_time_threshold_is_hit() {
    // The count (1) stays far below its limit (250); only the third
    // argument moves, from exactly two seconds to one millisecond less.
    let two_seconds = Duration::from_secs(2);
    let cases = [
        (Duration::from_secs(2), true),
        (Duration::from_millis(1999), false),
    ];
    for (third_argument, expect_persist) in cases {
        let decision =
            should_persist_lexical_rebuild_progress(1, 250, third_argument, two_seconds);
        assert_eq!(decision, expect_persist);
    }
}
#[test]
#[serial]
fn initial_batch_fetch_limit_defaults_to_bounded_warmup_chunk() {
    // The returned value is bound to a named local so it lives until the end
    // of the test (presumably a restore-on-drop guard — confirm against
    // `set_env`).
    let _env_guard = set_env(
        "CASS_TANTIVY_REBUILD_INITIAL_BATCH_FETCH_CONVERSATIONS",
        "0",
    );

    // (default_limit passed in, expected result, failure message)
    let cases = [
        (
            16,
            16,
            "initial chunk should respect smaller steady-state limits",
        ),
        (
            128,
            32,
            "initial chunk should default to one SQL batch when steady-state is larger",
        ),
        (
            1,
            1,
            "clamp to default_limit when it is the smaller value",
        ),
        (
            0,
            1,
            "never return 0 even if default_limit degenerates to 0",
        ),
    ];
    for (default_limit, expected, reason) in cases {
        assert_eq!(
            lexical_rebuild_initial_batch_fetch_conversation_limit(default_limit),
            expected,
            "{reason}"
        );
    }
}
#[test]
fn finalize_watch_reindex_result_clears_stale_error_on_success() {
    let detector = StaleDetector::new(StaleConfig::default());
    let progress = Arc::new(IndexingProgress::default());

    // Seed an old error; the lock guard is released at the end of the block,
    // before the function under test runs.
    {
        let mut slot = progress
            .last_error
            .lock()
            .unwrap_or_else(|poisoned| poisoned.into_inner());
        *slot = Some(String::from("old"));
    }

    let indexed = finalize_watch_reindex_result(
        Ok(3),
        &detector,
        Some(&progress),
        "watch incremental reindex",
    );

    assert_eq!(indexed, 3);
    assert_eq!(detector.stats().total_ingests, 1);

    // Take a snapshot so the lock is not held during the comparison.
    let last_error = progress
        .last_error
        .lock()
        .unwrap_or_else(|poisoned| poisoned.into_inner())
        .clone();
    assert_eq!(
        last_error.as_deref(),
        None,
        "successful watch reindex should clear stale error diagnostics"
    );
}
#[test]
fn finalize_watch_once_reindex_result_propagates_error_and_resets_phase() {
    let detector = StaleDetector::new(StaleConfig::default());
    let progress = Arc::new(IndexingProgress::default());
    // Start from a non-zero phase so the reset to 0 is observable.
    progress.phase.store(2, Ordering::Relaxed);

    let outcome = finalize_watch_once_reindex_result(
        Err(anyhow::anyhow!("boom")),
        &detector,
        Some(&progress),
        "watch incremental reindex",
    );
    let error = outcome.expect_err("watch-once failures must propagate to the CLI");
    assert_eq!(error.to_string(), "boom");

    let zero_scans = detector.stats().consecutive_zero_scans;
    assert_eq!(
        zero_scans, 1,
        "failed watch-once reindex should still count as a zero-result scan for stale detection"
    );

    let phase_after = progress.phase.load(Ordering::Relaxed);
    assert_eq!(
        phase_after, 0,
        "failed watch-once reindex should reset progress phase back to idle"
    );

    // Take a snapshot so the lock is not held during the comparison.
    let last_error = progress
        .last_error
        .lock()
        .unwrap_or_else(|poisoned| poisoned.into_inner())
        .clone();
    assert_eq!(
        last_error.as_deref(),
        Some("watch incremental reindex: boom"),
        "failed watch-once reindex should surface the real error"
    );
}
#[test]
fn finalize_watch_once_reindex_result_clears_stale_error_on_success() {
    let detector = StaleDetector::new(StaleConfig::default());
    let progress = Arc::new(IndexingProgress::default());

    // Seed an old error; the lock guard is released at the end of the block,
    // before the function under test runs.
    {
        let mut slot = progress
            .last_error
            .lock()
            .unwrap_or_else(|poisoned| poisoned.into_inner());
        *slot = Some(String::from("old"));
    }

    let outcome = finalize_watch_once_reindex_result(
        Ok(5),
        &detector,
        Some(&progress),
        "watch incremental reindex",
    );
    let indexed = outcome.expect("watch-once success should be preserved");

    assert_eq!(indexed, 5);
    assert_eq!(detector.stats().total_ingests, 1);

    // Take a snapshot so the lock is not held during the comparison.
    let last_error = progress
        .last_error
        .lock()
        .unwrap_or_else(|poisoned| poisoned.into_inner())
        .clone();
    assert_eq!(
        last_error.as_deref(),
        None,
        "successful watch-once reindex should clear stale error diagnostics"
    );
}
#[test]
fn stale_detector_check_respects_disabled() {
    // With the action set to `StaleAction::None`, twenty zero-result scans
    // must still leave `check_stale` returning `None`.
    let config = StaleConfig {
        action: StaleAction::None,
        ..StaleConfig::default()
    };
    let detector = StaleDetector::new(config);
    (0..20).for_each(|_| {
        detector.record_scan(0);
    });
    assert!(detector.check_stale().is_none());
}
#[test]
fn stale_detector_requires_min_zero_scans() {
    let config = StaleConfig {
        min_zero_scans: 5,
        check_interval_mins: 0,
        threshold_hours: 0,
        ..StaleConfig::default()
    };
    let detector = StaleDetector::new(config);

    // Four zero-result scans are one short of `min_zero_scans`, so the
    // detector must not report staleness yet.
    (0..4).for_each(|_| {
        detector.record_scan(0);
    });
    assert!(detector.check_stale().is_none());

    // NOTE(review): the fifth scan is recorded but nothing is asserted
    // afterwards. Presumably the outcome of `check_stale` at this point also
    // depends on elapsed time — confirm and add an explicit assertion.
    detector.record_scan(0);
}
#[test]
fn stale_stats_serializes_correctly() {
    let sample = StaleStats {
        consecutive_zero_scans: 5,
        total_ingests: 10,
        seconds_since_last_ingest: Some(3600),
        warning_emitted: true,
        config_action: String::from("Warn"),
        config_threshold_hours: 24,
    };
    let encoded = serde_json::to_string(&sample).unwrap();
    // Only the presence of these two key names in the output is checked.
    for key in ["consecutive_zero_scans", "total_ingests"] {
        assert!(encoded.contains(key));
    }
}
#[test]
fn quarantine_failed_seed_bundle_moves_sidecars_and_uses_unique_paths() {
    let tmp = TempDir::new().unwrap();
    let root = tmp.path();
    let db_path = root.join("agent_search.db");
    let wal_path = root.join("agent_search.db-wal");
    let shm_path = root.join("agent_search.db-shm");

    // First bundle: database plus both sidecar files.
    std::fs::write(&db_path, b"db-one").unwrap();
    std::fs::write(&wal_path, b"wal-one").unwrap();
    std::fs::write(&shm_path, b"shm-one").unwrap();

    let first_backup = quarantine_failed_seed_bundle(&db_path)
        .unwrap()
        .expect("first quarantine path");
    let first_name = first_backup
        .file_name()
        .unwrap()
        .to_string_lossy()
        .into_owned();

    assert_eq!(std::fs::read(&first_backup).unwrap(), b"db-one");
    // Sidecars are expected next to the backup, named `<backup>-wal` and
    // `<backup>-shm`.
    for (suffix, contents) in [("wal", b"wal-one"), ("shm", b"shm-one")] {
        let sidecar = first_backup.with_file_name(format!("{first_name}-{suffix}"));
        assert_eq!(std::fs::read(sidecar).unwrap(), contents);
    }
    // None of the originals may remain in place.
    for original in [&db_path, &wal_path, &shm_path] {
        assert!(!original.exists());
    }

    // Second bundle at the same location must get a distinct backup path.
    std::fs::write(&db_path, b"db-two").unwrap();
    std::fs::write(&wal_path, b"wal-two").unwrap();
    std::fs::write(&shm_path, b"shm-two").unwrap();

    let second_backup = quarantine_failed_seed_bundle(&db_path)
        .unwrap()
        .expect("second quarantine path");
    assert_ne!(
        first_backup, second_backup,
        "repeated quarantines should not collide on backup path"
    );
    assert_eq!(std::fs::read(&second_backup).unwrap(), b"db-two");
}
#[test]
fn record_pending_commit_derives_next_offset_from_processed_conversations() {
    let mut state = LexicalRebuildState::new(
        LexicalRebuildDbState {
            db_path: String::from("/tmp/agent_search.db"),
            total_conversations: 100,
            total_messages: 200,
            storage_fingerprint: String::from("seed:100"),
        },
        LEXICAL_REBUILD_PAGE_SIZE,
    );

    state.record_pending_commit(Some(42), 17, 34, None);

    // The staged checkpoint must carry offset 17 (matching the processed
    // count passed in) and the conversation id that was supplied.
    let staged = state.pending.as_ref().unwrap();
    assert_eq!(
        (
            staged.next_offset,
            staged.next_conversation_id,
            staged.processed_conversations,
        ),
        (17, Some(42), 17)
    );
}
#[test]
fn finalize_commit_derives_committed_offset_from_processed_conversations() {
    let mut state = LexicalRebuildState::new(
        LexicalRebuildDbState {
            db_path: String::from("/tmp/agent_search.db"),
            total_conversations: 100,
            total_messages: 200,
            storage_fingerprint: String::from("seed:100"),
        },
        LEXICAL_REBUILD_PAGE_SIZE,
    );

    // Stage a commit, then finalize it with a fingerprint.
    state.record_pending_commit(Some(42), 17, 34, None);
    state.finalize_commit(Some(String::from("fp-abc")));

    // The committed fields must now reflect the staged values.
    assert_eq!(state.committed_offset, 17);
    assert_eq!(state.committed_conversation_id, Some(42));
    assert_eq!(state.processed_conversations, 17);
    assert_eq!(state.indexed_docs, 34);
}
#[test]
fn keyset_checkpoint_round_trips_through_serialization() {
    let mut state = LexicalRebuildState::new(
        LexicalRebuildDbState {
            db_path: String::from("/tmp/agent_search.db"),
            total_conversations: 500,
            total_messages: 1000,
            storage_fingerprint: String::from("content-v1:500:999:5000"),
        },
        LEXICAL_REBUILD_PAGE_SIZE,
    );

    // One finalized commit followed by a second one that is only staged, so
    // both the committed and the pending halves are populated.
    state.record_pending_commit(Some(999), 250, 500, None);
    state.finalize_commit(None);
    state.record_pending_commit(Some(1500), 300, 600, None);

    let encoded = serde_json::to_string(&state).unwrap();
    let decoded: LexicalRebuildState = serde_json::from_str(&encoded).unwrap();

    assert_eq!(decoded.committed_offset, 250);
    assert_eq!(decoded.committed_conversation_id, Some(999));
    assert_eq!(decoded.processed_conversations, 250);

    let staged = decoded.pending.as_ref().unwrap();
    assert_eq!(
        (
            staged.next_offset,
            staged.next_conversation_id,
            staged.processed_conversations,
        ),
        (300, Some(1500), 300)
    );
}
#[test]
fn legacy_checkpoint_without_conversation_id_deserializes() {
    // This document has neither `committed_conversation_id` nor
    // `pending.next_conversation_id`; both must deserialize as `None`.
    let legacy_json = r#"{
"version": 3,
"schema_hash": "test-hash",
"db": {
"db_path": "/tmp/agent_search.db",
"total_conversations": 100,
"total_messages": 200,
"storage_fingerprint": "seed:100"
},
"page_size": 64,
"committed_offset": 50,
"processed_conversations": 50,
"indexed_docs": 100,
"committed_meta_fingerprint": null,
"pending": {
"next_offset": 60,
"processed_conversations": 60,
"indexed_docs": 120,
"base_meta_fingerprint": null
},
"completed": false,
"updated_at_ms": 1700000000000
}"#;

    let decoded: LexicalRebuildState = serde_json::from_str(legacy_json).unwrap();

    assert_eq!(decoded.committed_offset, 50);
    assert_eq!(decoded.committed_conversation_id, None);
    assert_eq!(decoded.processed_conversations, 50);

    let staged = decoded.pending.as_ref().unwrap();
    assert_eq!(staged.next_offset, 60);
    assert_eq!(staged.next_conversation_id, None);
    assert_eq!(staged.processed_conversations, 60);
}
#[test]
fn checkpoint_progress_is_monotone_across_commits() {
    let mut state = LexicalRebuildState::new(
        LexicalRebuildDbState {
            db_path: String::from("/tmp/agent_search.db"),
            total_conversations: 1000,
            total_messages: 5000,
            storage_fingerprint: String::from("seed:1000"),
        },
        LEXICAL_REBUILD_PAGE_SIZE,
    );

    // Commit one conversation at a time with strictly increasing ids; the
    // running processed count starts at 1 and each step indexes 5 docs.
    let conversation_ids = [10, 25, 100, 500, 999];
    for (cid, processed) in conversation_ids.iter().copied().zip(1usize..) {
        let docs = processed * 5;
        state.record_pending_commit(Some(cid), processed, docs, None);
        state.finalize_commit(None);

        assert_eq!(state.committed_conversation_id, Some(cid));
        assert_eq!(state.processed_conversations, processed);
        assert_eq!(state.indexed_docs, docs);
        assert_eq!(state.committed_offset, processed as i64);
    }

    // Final state reflects the last of the five commits.
    assert_eq!(state.committed_conversation_id, Some(999));
    assert_eq!(state.processed_conversations, 5);
    assert_eq!(state.committed_offset, 5);
}
}