Skip to main content

fathomdb_engine/
admin.rs

1use std::fmt::Write as _;
2use std::fs;
3use std::io;
4use std::path::{Path, PathBuf};
5use std::sync::Arc;
6use std::sync::mpsc::SyncSender;
7use std::time::SystemTime;
8
9use fathomdb_schema::{SchemaError, SchemaManager};
10use rusqlite::{DatabaseName, OptionalExtension, TransactionBehavior};
11use serde::{Deserialize, Serialize};
12use sha2::{Digest, Sha256};
13
14use crate::rebuild_actor::{RebuildMode, RebuildRequest, RebuildStateRow};
15
16use crate::{
17    EngineError, ProjectionRepairReport, ProjectionService,
18    embedder::{BatchEmbedder, QueryEmbedder, QueryEmbedderIdentity},
19    ids::new_id,
20    operational::{
21        OperationalCollectionKind, OperationalCollectionRecord, OperationalCompactionReport,
22        OperationalCurrentRow, OperationalFilterClause, OperationalFilterField,
23        OperationalFilterFieldType, OperationalFilterMode, OperationalFilterValue,
24        OperationalHistoryValidationIssue, OperationalHistoryValidationReport,
25        OperationalMutationRow, OperationalPurgeReport, OperationalReadReport,
26        OperationalReadRequest, OperationalRegisterRequest, OperationalRepairReport,
27        OperationalRetentionActionKind, OperationalRetentionPlanItem,
28        OperationalRetentionPlanReport, OperationalRetentionRunItem, OperationalRetentionRunReport,
29        OperationalSecondaryIndexDefinition, OperationalSecondaryIndexRebuildReport,
30        OperationalTraceReport, extract_secondary_index_entries_for_current,
31        extract_secondary_index_entries_for_mutation, parse_operational_secondary_indexes_json,
32        parse_operational_validation_contract, validate_operational_payload_against_contract,
33    },
34    projection::ProjectionTarget,
35    sqlite,
36};
37
/// Results of a physical and structural integrity check on the database.
///
/// Produced by [`AdminService::check_integrity`]; each counter is the result
/// of one query issued by that method.
#[derive(Clone, Debug, PartialEq, Eq, Serialize)]
pub struct IntegrityReport {
    /// `true` when the first row returned by `PRAGMA integrity_check` is `"ok"`.
    pub physical_ok: bool,
    /// `true` when `pragma_foreign_key_check` reports zero violations.
    pub foreign_keys_ok: bool,
    /// Chunks joined to an active (non-superseded) node that have no matching
    /// `fts_nodes` row.
    pub missing_fts_rows: usize,
    /// Missing property-FTS rows, counted with the same extraction logic as
    /// the write/rebuild path rather than a pure-SQL check.
    pub missing_property_fts_rows: usize,
    /// Number of `logical_id`s that have more than one non-superseded node row.
    pub duplicate_active_logical_ids: usize,
    /// `operational_mutations` rows plus `operational_current` rows whose
    /// `collection_name` has no row in `operational_collections`.
    pub operational_missing_collections: usize,
    /// `operational_current` rows whose `last_mutation_id` matches no row in
    /// `operational_mutations`.
    pub operational_missing_last_mutations: usize,
    /// One human-readable message per non-zero counter above.
    pub warnings: Vec<String>,
}
50
/// A registered FTS property projection schema for a node kind.
///
/// This type derives `Serialize` only; it is an output record and cannot be
/// deserialized back through serde.
#[derive(Clone, Debug, PartialEq, Eq, Serialize)]
pub struct FtsPropertySchemaRecord {
    /// The node kind this schema applies to.
    pub kind: String,
    /// Flat display list of registered JSON property paths
    /// (e.g. `["$.name", "$.title"]`). For recursive entries this lists
    /// only the root path; mode information is carried by
    /// [`Self::entries`].
    pub property_paths: Vec<String>,
    /// Full per-entry schema shape with mode
    /// ([`FtsPropertyPathMode::Scalar`] | [`FtsPropertyPathMode::Recursive`]).
    /// Read this field for mode-accurate round-trip of the registered
    /// schema.
    pub entries: Vec<FtsPropertyPathSpec>,
    /// Subtree paths excluded from recursive walks. Empty for
    /// scalar-only schemas or recursive schemas with no exclusions.
    pub exclude_paths: Vec<String>,
    /// Separator used when concatenating extracted values.
    pub separator: String,
    /// Schema format version.
    pub format_version: i64,
}
74
/// Extraction mode for a single registered FTS property path.
///
/// Serialized in `snake_case` (`"scalar"` / `"recursive"`). The `Default`
/// value is [`FtsPropertyPathMode::Scalar`].
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq, Serialize)]
#[serde(rename_all = "snake_case")]
pub enum FtsPropertyPathMode {
    /// Resolve the path and append the scalar value(s). Matches legacy
    /// pre-Phase-4 behaviour.
    #[default]
    Scalar,
    /// Recursively walk every scalar leaf rooted at the path. Each leaf
    /// contributes one entry to the position map.
    Recursive,
}
87
/// A single registered property-FTS path with its extraction mode.
///
/// The struct is `#[non_exhaustive]`, so code outside this crate cannot build
/// it with a struct literal and has to go through the constructor functions
/// ([`FtsPropertyPathSpec::scalar`], [`FtsPropertyPathSpec::recursive`]).
/// `PartialEq` is derived; `Eq` is implemented by hand because `weight` is an
/// `f32`.
#[non_exhaustive]
#[derive(Clone, Debug, PartialEq, Serialize)]
pub struct FtsPropertyPathSpec {
    /// JSON path to the property (must start with `$.`).
    pub path: String,
    /// Whether to treat this path as a scalar or recursively walk it.
    pub mode: FtsPropertyPathMode,
    /// Optional BM25 weight multiplier for this path (1.0 = default).
    /// Must satisfy `0.0 < weight <= 1000.0` when set.
    pub weight: Option<f32>,
}
100
// f32 does not implement Eq (due to NaN), but weights in practice are
// always finite values set by callers, so reflexivity holds.
//
// Caveat: nothing in this impl enforces that. A spec whose weight is
// `Some(f32::NAN)` compares unequal to itself through the derived
// `PartialEq`, which breaks the `Eq` contract. Do not use unvalidated specs
// as keys or in code that relies on `a == a`.
impl Eq for FtsPropertyPathSpec {}
104
105impl FtsPropertyPathSpec {
106    #[must_use]
107    pub fn scalar(path: impl Into<String>) -> Self {
108        Self {
109            path: path.into(),
110            mode: FtsPropertyPathMode::Scalar,
111            weight: None,
112        }
113    }
114
115    #[must_use]
116    pub fn recursive(path: impl Into<String>) -> Self {
117        Self {
118            path: path.into(),
119            mode: FtsPropertyPathMode::Recursive,
120            weight: None,
121        }
122    }
123
124    /// Set the BM25 weight multiplier for this path.
125    ///
126    /// The weight must satisfy `0.0 < weight <= 1000.0` at registration
127    /// time; this builder method does not validate — validation happens in
128    /// `register_fts_property_schema_with_entries`.
129    #[must_use]
130    pub fn with_weight(mut self, weight: f32) -> Self {
131        self.weight = Some(weight);
132        self
133    }
134}
135
/// Options controlling how a safe database export is performed.
#[derive(Clone, Copy, Debug)]
pub struct SafeExportOptions {
    /// When true, runs `PRAGMA wal_checkpoint(FULL)` before copying and fails if
    /// any WAL frames could not be applied (busy != 0). Set to false only in
    /// tests that seed a database without WAL mode.
    pub force_checkpoint: bool,
}

impl Default for SafeExportOptions {
    /// Default options: the pre-copy WAL checkpoint is enabled.
    fn default() -> Self {
        let force_checkpoint = true;
        Self { force_checkpoint }
    }
}
152
// Must match PROTOCOL_VERSION in fathomdb-admin-bridge.rs
// NOTE(review): presumably the value reported in
// `SafeExportManifest::protocol_version` — confirm against the export code,
// which is outside this section.
const EXPORT_PROTOCOL_VERSION: u32 = 1;
155
/// Manifest describing a completed safe export.
///
/// Derives `Serialize` only; it is written out, never parsed back through
/// serde by this type.
#[derive(Clone, Debug, Serialize)]
pub struct SafeExportManifest {
    /// Unix timestamp (seconds since epoch) when the export was created.
    pub exported_at: u64,
    /// SHA-256 hex digest of the exported database file.
    pub sha256: String,
    /// Schema version recorded in `fathom_schema_migrations` at export time.
    pub schema_version: u32,
    /// Bridge protocol version compiled into this binary.
    pub protocol_version: u32,
    /// Number of `SQLite` pages in the exported database file.
    pub page_count: u64,
}
170
/// Report from tracing all rows associated with a given `source_ref`.
///
/// NOTE(review): the code that fills this report is outside this section.
/// The per-field notes below are inferred from the field names — confirm
/// against the producer before relying on them.
#[derive(Clone, Debug, PartialEq, Eq, Serialize)]
pub struct TraceReport {
    /// The `source_ref` that was traced.
    pub source_ref: String,
    /// Presumably the number of node rows found for `source_ref`.
    pub node_rows: usize,
    /// Presumably the number of edge rows found for `source_ref`.
    pub edge_rows: usize,
    /// Presumably the number of action rows found for `source_ref`.
    pub action_rows: usize,
    /// Presumably the number of operational mutation rows found for `source_ref`.
    pub operational_mutation_rows: usize,
    /// Presumably the logical IDs of the matched nodes.
    pub node_logical_ids: Vec<String>,
    /// Presumably the IDs of the matched actions.
    pub action_ids: Vec<String>,
    /// Presumably the IDs of the matched operational mutations.
    pub operational_mutation_ids: Vec<String>,
}
183
/// An edge that was skipped during a restore because an endpoint is missing.
///
/// Carried in [`LogicalRestoreReport::skipped_edges`].
#[derive(Clone, Debug, PartialEq, Eq, Serialize)]
pub struct SkippedEdge {
    /// Logical ID of the edge that was not restored.
    pub edge_logical_id: String,
    /// The endpoint that was missing.
    /// NOTE(review): presumably the endpoint node's logical ID — confirm
    /// against the restore code.
    pub missing_endpoint: String,
}
190
/// Report from restoring a retired logical ID back to active state.
///
/// NOTE(review): the restore implementation is outside this section. The
/// `restored_*` fields are documented from their names only — confirm exact
/// semantics against the producer.
#[derive(Clone, Debug, PartialEq, Eq, Serialize)]
pub struct LogicalRestoreReport {
    /// The logical ID the restore was requested for.
    pub logical_id: String,
    /// Presumably `true` when the restore had nothing to do.
    pub was_noop: bool,
    /// Presumably the number of node rows restored.
    pub restored_node_rows: usize,
    /// Presumably the number of edge rows restored.
    pub restored_edge_rows: usize,
    /// Presumably the number of chunk rows restored.
    pub restored_chunk_rows: usize,
    /// Presumably the number of chunk FTS rows restored.
    pub restored_fts_rows: usize,
    /// Presumably the number of property FTS rows restored.
    pub restored_property_fts_rows: usize,
    /// Presumably the number of vector rows restored.
    pub restored_vec_rows: usize,
    /// Edges that were not restored because an endpoint is missing.
    pub skipped_edges: Vec<SkippedEdge>,
    /// Free-form notes attached by the restore operation.
    pub notes: Vec<String>,
}
205
/// Report from permanently purging all rows for a logical ID.
///
/// NOTE(review): the purge implementation is outside this section. The
/// `deleted_*` fields are documented from their names only — confirm exact
/// semantics against the producer.
#[derive(Clone, Debug, PartialEq, Eq, Serialize)]
pub struct LogicalPurgeReport {
    /// The logical ID the purge was requested for.
    pub logical_id: String,
    /// Presumably `true` when the purge had nothing to delete.
    pub was_noop: bool,
    /// Presumably the number of node rows deleted.
    pub deleted_node_rows: usize,
    /// Presumably the number of edge rows deleted.
    pub deleted_edge_rows: usize,
    /// Presumably the number of chunk rows deleted.
    pub deleted_chunk_rows: usize,
    /// Presumably the number of FTS rows deleted.
    pub deleted_fts_rows: usize,
    /// Presumably the number of vector rows deleted.
    pub deleted_vec_rows: usize,
    /// Free-form notes attached by the purge operation.
    pub notes: Vec<String>,
}
218
/// Options controlling provenance event purging behavior.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct ProvenancePurgeOptions {
    /// Required when deserializing (no serde default).
    /// NOTE(review): presumably "report what would be purged without
    /// deleting" — confirm against the purge implementation.
    pub dry_run: bool,
    /// Event types to keep. Deserializes to an empty list when the field is
    /// omitted (`#[serde(default)]`).
    #[serde(default)]
    pub preserve_event_types: Vec<String>,
}
226
/// Report from a provenance event purge operation.
///
/// NOTE(review): the purge implementation is outside this section; field
/// notes are inferred from names — confirm against the producer.
#[derive(Clone, Debug, Serialize)]
pub struct ProvenancePurgeReport {
    /// Presumably the number of provenance events deleted (or that would be
    /// deleted in a dry run — TODO confirm).
    pub events_deleted: u64,
    /// Presumably the number of events kept.
    pub events_preserved: u64,
    /// Presumably the timestamp of the oldest event still present, `None`
    /// when no events remain — units not visible here.
    pub oldest_remaining: Option<i64>,
}
234
/// Service providing administrative operations (integrity checks, exports, restores, purges).
///
/// The service holds no open connection: `connect` opens a new one from
/// `database_path` on each call.
#[derive(Debug)]
pub struct AdminService {
    /// Path of the database file, as passed to the constructor.
    database_path: PathBuf,
    /// Schema manager used to bootstrap every connection opened by `connect`.
    schema_manager: Arc<SchemaManager>,
    /// Projection service built from the same path and schema manager.
    projections: ProjectionService,
    /// Sender side of the rebuild actor's channel.  `None` when the engine
    /// was opened without a rebuild actor (e.g. in tests that use
    /// [`AdminService::new`] directly).
    rebuild_sender: Option<SyncSender<RebuildRequest>>,
}
246
/// Results of a semantic consistency check on the graph data.
///
/// Returned by [`AdminService::check_semantics`].
#[derive(Clone, Debug, PartialEq, Eq, Serialize)]
pub struct SemanticReport {
    /// Chunks whose `node_logical_id` has no active node.
    pub orphaned_chunks: usize,
    /// Active nodes with a NULL `source_ref` (loss of provenance).
    pub null_source_ref_nodes: usize,
    /// Steps referencing a `run_id` that does not exist in the runs table.
    pub broken_step_fk: usize,
    /// Actions referencing a `step_id` that does not exist in the steps table.
    pub broken_action_fk: usize,
    /// FTS rows whose `chunk_id` does not exist in the chunks table.
    pub stale_fts_rows: usize,
    /// FTS rows whose node has been superseded (`superseded_at` IS NOT NULL on all active rows).
    pub fts_rows_for_superseded_nodes: usize,
    /// Property FTS rows whose node has been superseded or does not exist.
    pub stale_property_fts_rows: usize,
    /// Property FTS rows whose kind has no registered FTS property schema.
    pub orphaned_property_fts_rows: usize,
    /// Property FTS rows whose `kind` does not match the active node's actual kind.
    pub mismatched_kind_property_fts_rows: usize,
    /// Active logical IDs with more than one per-kind FTS property row.
    pub duplicate_property_fts_rows: usize,
    /// Property FTS rows whose `text_content` no longer matches the canonical extraction.
    pub drifted_property_fts_rows: usize,
    /// Active edges where at least one endpoint has no active node.
    pub dangling_edges: usize,
    /// `logical_ids` where every version has been superseded (no active row).
    pub orphaned_supersession_chains: usize,
    /// Vec rows whose backing chunk no longer exists in the chunks table.
    pub stale_vec_rows: usize,
    /// Compatibility counter for vec rows whose chunk points at missing node history.
    pub vec_rows_for_superseded_nodes: usize,
    /// Latest-state keys whose latest mutation is a `put` but no current row exists.
    pub missing_operational_current_rows: usize,
    /// Current rows that do not match the latest mutation state.
    pub stale_operational_current_rows: usize,
    /// Mutations written after the owning collection was disabled.
    pub disabled_collection_mutations: usize,
    /// Access metadata rows whose `logical_id` no longer has any node history.
    pub orphaned_last_access_metadata_rows: usize,
    /// Human-readable warnings accompanying the counters above.
    /// NOTE(review): the code that fills this list is not visible in this
    /// section — confirm which counters produce a message.
    pub warnings: Vec<String>,
}
290
/// Configuration for regenerating vector embeddings.
///
/// 0.4.0 architectural invariant: vector identity is the embedder's
/// responsibility, not the regeneration config's. This struct carries only
/// WHERE the vectors live and HOW to chunk/preprocess them — never WHAT
/// model produced them. The embedder supplied at regen-call time is the
/// single source of truth for `model_identity`, `model_version`,
/// `dimension`, and `normalization_policy`; the resulting vector profile
/// is stamped directly from [`QueryEmbedder::identity`].
///
/// 0.5.0 breaking change: `table_name` is removed. The vec table name is now
/// derived from `kind` via [`fathomdb_schema::vec_kind_table_name`].
///
/// Deserialization is strict: `deny_unknown_fields` rejects any key other
/// than the four fields below (so a legacy `table_name` key is an error).
#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case", deny_unknown_fields)]
pub struct VectorRegenerationConfig {
    /// Node kind to regenerate; the vec table name is derived from it.
    pub kind: String,
    /// Profile name.
    /// NOTE(review): presumably bounded by `MAX_PROFILE_LEN` — validation is
    /// not visible in this section.
    pub profile: String,
    /// Identifier of the chunking policy (HOW content is chunked).
    pub chunking_policy: String,
    /// Identifier of the preprocessing policy (HOW content is preprocessed).
    pub preprocessing_policy: String,
}
311
/// Report from a vector embedding regeneration run.
///
/// NOTE(review): the regeneration code is outside this section; field notes
/// are inferred from names — confirm against the producer.
#[derive(Clone, Debug, PartialEq, Eq, Serialize)]
pub struct VectorRegenerationReport {
    /// Profile name the run was performed for.
    pub profile: String,
    /// Name of the vec table that was written.
    pub table_name: String,
    /// Presumably the embedding dimension used for the run.
    pub dimension: usize,
    /// Presumably the number of chunks considered.
    pub total_chunks: usize,
    /// Presumably the number of vector rows written.
    pub regenerated_rows: usize,
    /// Presumably whether the vector contract was persisted.
    pub contract_persisted: bool,
    /// Free-form notes attached by the run.
    pub notes: Vec<String>,
}
323
/// Stored FTS tokenizer profile for a node kind.
///
/// Created and updated by [`AdminService::set_fts_profile`]; read back by
/// [`AdminService::get_fts_profile`]. Backed by the `projection_profiles`
/// row with `facet = 'fts'` for the kind.
#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub struct FtsProfile {
    /// Node kind this profile applies to (e.g. `"Article"`).
    pub kind: String,
    /// FTS5 tokenizer string (e.g. `"porter unicode61 remove_diacritics 2"`).
    /// Stored under the `$.tokenizer` key of the row's `config_json`.
    pub tokenizer: String,
    /// Unix timestamp when the profile was last activated, or `None` if never.
    pub active_at: Option<i64>,
    /// Unix timestamp when the profile row was first created.
    pub created_at: i64,
}
338
/// Stored vector embedding profile (global, kind-agnostic).
///
/// Created and updated by [`AdminService::set_vec_profile`], which writes the
/// `projection_profiles` row with `kind = '*'` and `facet = 'vec'`.
/// [`AdminService::get_vec_profile`] returns the same shape for a caller-
/// supplied `kind`.
#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub struct VecProfile {
    /// Identifier for the embedding model (e.g. `"openai/text-embedding-3-small"`).
    /// `get_vec_profile` yields an empty string when the stored JSON has no
    /// `model_identity`.
    pub model_identity: String,
    /// Optional version string for the model.
    pub model_version: Option<String>,
    /// Number of dimensions produced by the model. `get_vec_profile` reports
    /// `0` when the stored value is absent or does not fit in a `u32`.
    pub dimensions: u32,
    /// Unix timestamp when the profile was last activated, or `None` if never.
    pub active_at: Option<i64>,
    /// Unix timestamp when the profile row was first created.
    pub created_at: i64,
}
355
/// Estimated cost of rebuilding a projection (FTS table or vector embeddings).
///
/// Returned by [`AdminService::preview_projection_impact`]. The estimates are
/// fixed linear functions of the row count, not measurements.
#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub struct ProjectionImpact {
    /// Number of rows that would be processed during a full rebuild.
    pub rows_to_rebuild: u64,
    /// Rough estimated rebuild time in seconds: `rows / 5000` for the `fts`
    /// facet, `rows / 100` for `vec` (integer division, so small tables
    /// report `0`).
    pub estimated_seconds: u64,
    /// Estimated temporary disk space required during rebuild, in bytes:
    /// `rows * 200` for `fts`, `rows * 1536` for `vec`.
    pub temp_db_size_bytes: u64,
    /// The tokenizer currently stored in `projection_profiles`, if any.
    /// Only populated for the `fts` facet; always `None` for `vec`.
    pub current_tokenizer: Option<String>,
    /// Reserved for future use; always `None` currently.
    pub target_tokenizer: Option<String>,
}
372
/// Well-known tokenizer preset names mapped to their FTS5 tokenizer strings.
pub const TOKENIZER_PRESETS: &[(&str, &str)] = &[
    (
        "recall-optimized-english",
        "porter unicode61 remove_diacritics 2",
    ),
    ("precision-optimized", "unicode61 remove_diacritics 2"),
    ("global-cjk", "icu"),
    ("substring-trigram", "trigram"),
    ("source-code", "unicode61 tokenchars '._-$@'"),
];

/// Translate a tokenizer preset name into its FTS5 tokenizer string.
///
/// The lookup is an exact, case-sensitive match against the names in
/// [`TOKENIZER_PRESETS`]. Anything that is not a preset name is assumed to
/// already be a raw tokenizer string and is handed back untouched.
pub fn resolve_tokenizer_preset(input: &str) -> &str {
    TOKENIZER_PRESETS
        .iter()
        .find_map(|&(name, value)| (name == input).then_some(value))
        .unwrap_or(input)
}
397
// Format version and size limits used by the admin operations.
// NOTE(review): none of these constants is referenced in the part of the file
// visible here; the notes below are inferred from the names — confirm at the
// use sites.

// Presumably the format version stamped on persisted vector contracts.
const CURRENT_VECTOR_CONTRACT_FORMAT_VERSION: i64 = 1;
// Presumably the maximum accepted length of a profile name.
const MAX_PROFILE_LEN: usize = 128;
// Presumably the maximum accepted length of a chunking/preprocessing policy string.
const MAX_POLICY_LEN: usize = 128;
// 32 KiB; presumably the upper bound on a serialized contract.
const MAX_CONTRACT_JSON_BYTES: usize = 32 * 1024;
// Presumably the upper bound on audit metadata size, in bytes.
const MAX_AUDIT_METADATA_BYTES: usize = 2048;
// Presumably the read limit applied when an operational read gives none.
const DEFAULT_OPERATIONAL_READ_LIMIT: usize = 100;
// Presumably the largest read limit an operational read may request.
const MAX_OPERATIONAL_READ_LIMIT: usize = 1000;
405
406/// Thread-safe handle to the shared [`AdminService`].
407#[derive(Clone, Debug)]
408pub struct AdminHandle {
409    inner: Arc<AdminService>,
410}
411
412impl AdminHandle {
413    /// Wrap an [`AdminService`] in a shared handle.
414    #[must_use]
415    pub fn new(service: AdminService) -> Self {
416        Self {
417            inner: Arc::new(service),
418        }
419    }
420
421    /// Clone the inner `Arc` to the [`AdminService`].
422    #[must_use]
423    pub fn service(&self) -> Arc<AdminService> {
424        Arc::clone(&self.inner)
425    }
426}
427
428impl AdminService {
429    /// Create a new admin service for the database at the given path.
430    #[must_use]
431    pub fn new(path: impl AsRef<Path>, schema_manager: Arc<SchemaManager>) -> Self {
432        let database_path = path.as_ref().to_path_buf();
433        let projections = ProjectionService::new(&database_path, Arc::clone(&schema_manager));
434        Self {
435            database_path,
436            schema_manager,
437            projections,
438            rebuild_sender: None,
439        }
440    }
441
442    /// Create a new admin service wired to the background rebuild actor.
443    #[must_use]
444    pub fn new_with_rebuild(
445        path: impl AsRef<Path>,
446        schema_manager: Arc<SchemaManager>,
447        rebuild_sender: SyncSender<RebuildRequest>,
448    ) -> Self {
449        let database_path = path.as_ref().to_path_buf();
450        let projections = ProjectionService::new(&database_path, Arc::clone(&schema_manager));
451        Self {
452            database_path,
453            schema_manager,
454            projections,
455            rebuild_sender: Some(rebuild_sender),
456        }
457    }
458
    /// Open a new connection to `database_path` and bootstrap the schema on it.
    ///
    /// With the `sqlite-vec` feature enabled the connection is opened through
    /// `sqlite::open_connection_with_vec`; otherwise through
    /// `sqlite::open_connection`. Exactly one of the two `let conn` bindings
    /// survives cfg-stripping. Every call opens a fresh connection — nothing
    /// is cached on `self`.
    ///
    /// # Errors
    /// Propagates any failure from opening the connection or from
    /// `SchemaManager::bootstrap`.
    fn connect(&self) -> Result<rusqlite::Connection, EngineError> {
        #[cfg(feature = "sqlite-vec")]
        let conn = sqlite::open_connection_with_vec(&self.database_path)?;
        #[cfg(not(feature = "sqlite-vec"))]
        let conn = sqlite::open_connection(&self.database_path)?;
        self.schema_manager.bootstrap(&conn)?;
        Ok(conn)
    }
467
468    /// Persist or update the FTS tokenizer profile for a node kind.
469    ///
470    /// `tokenizer_str` may be a preset name (see [`TOKENIZER_PRESETS`]) or a
471    /// raw FTS5 tokenizer string.  The resolved string is validated before
472    /// being written to `projection_profiles`.
473    ///
474    /// # Errors
475    /// Returns [`EngineError`] if the tokenizer string contains disallowed
476    /// characters, or if the database write fails.
477    pub fn set_fts_profile(
478        &self,
479        kind: &str,
480        tokenizer_str: &str,
481    ) -> Result<FtsProfile, EngineError> {
482        let resolved = resolve_tokenizer_preset(tokenizer_str);
483        // Allowed chars: alphanumeric, space, apostrophe, dot, underscore, hyphen, dollar, at
484        if !resolved
485            .chars()
486            .all(|c| c.is_alphanumeric() || "'._-$@ ".contains(c))
487        {
488            return Err(EngineError::Bridge(format!(
489                "invalid tokenizer string: {resolved:?}"
490            )));
491        }
492        let conn = self.connect()?;
493        conn.execute(
494            r"INSERT INTO projection_profiles (kind, facet, config_json, active_at, created_at)
495              VALUES (?1, 'fts', json_object('tokenizer', ?2), unixepoch(), unixepoch())
496              ON CONFLICT(kind, facet) DO UPDATE SET
497                  config_json = json_object('tokenizer', ?2),
498                  active_at   = unixepoch()",
499            rusqlite::params![kind, resolved],
500        )?;
501        let row = conn.query_row(
502            "SELECT kind, json_extract(config_json, '$.tokenizer'), active_at, created_at \
503             FROM projection_profiles WHERE kind = ?1 AND facet = 'fts'",
504            rusqlite::params![kind],
505            |row| {
506                Ok(FtsProfile {
507                    kind: row.get(0)?,
508                    tokenizer: row.get(1)?,
509                    active_at: row.get(2)?,
510                    created_at: row.get(3)?,
511                })
512            },
513        )?;
514        Ok(row)
515    }
516
517    /// Retrieve the FTS tokenizer profile for a node kind.
518    ///
519    /// Returns `None` if no profile has been set for `kind`.
520    ///
521    /// # Errors
522    /// Returns [`EngineError`] if the database query fails.
523    pub fn get_fts_profile(&self, kind: &str) -> Result<Option<FtsProfile>, EngineError> {
524        let conn = self.connect()?;
525        let result = conn
526            .query_row(
527                "SELECT kind, json_extract(config_json, '$.tokenizer'), active_at, created_at \
528                 FROM projection_profiles WHERE kind = ?1 AND facet = 'fts'",
529                rusqlite::params![kind],
530                |row| {
531                    Ok(FtsProfile {
532                        kind: row.get(0)?,
533                        tokenizer: row.get(1)?,
534                        active_at: row.get(2)?,
535                        created_at: row.get(3)?,
536                    })
537                },
538            )
539            .optional()?;
540        Ok(result)
541    }
542
543    /// Retrieve the vector embedding profile for a specific node `kind`.
544    ///
545    /// Reads from `projection_profiles` under `(kind=<kind>, facet='vec')`.
546    /// Returns `None` if no vector profile has been persisted for this kind yet.
547    ///
548    /// # Errors
549    /// Returns [`EngineError`] if the database query fails.
550    pub fn get_vec_profile(&self, kind: &str) -> Result<Option<VecProfile>, EngineError> {
551        let conn = self.connect()?;
552        let result = conn
553            .query_row(
554                "SELECT \
555                   json_extract(config_json, '$.model_identity'), \
556                   json_extract(config_json, '$.model_version'), \
557                   CAST(json_extract(config_json, '$.dimensions') AS INTEGER), \
558                   active_at, \
559                   created_at \
560                 FROM projection_profiles WHERE kind = ?1 AND facet = 'vec'",
561                rusqlite::params![kind],
562                |row| {
563                    Ok(VecProfile {
564                        model_identity: row.get::<_, Option<String>>(0)?.unwrap_or_default(),
565                        model_version: row.get(1)?,
566                        dimensions: {
567                            let d: i64 = row.get::<_, Option<i64>>(2)?.unwrap_or(0);
568                            u32::try_from(d).unwrap_or(0)
569                        },
570                        active_at: row.get(3)?,
571                        created_at: row.get(4)?,
572                    })
573                },
574            )
575            .optional()?;
576        Ok(result)
577    }
578
579    /// Write or update the global vector profile from a JSON identity string.
580    ///
581    /// This is a private helper called after a successful vector regeneration.
582    /// Errors are logged as warnings and not propagated to the caller.
583    #[allow(dead_code)]
584    fn set_vec_profile_inner(
585        conn: &rusqlite::Connection,
586        identity_json: &str,
587    ) -> Result<VecProfile, rusqlite::Error> {
588        conn.execute(
589            r"INSERT INTO projection_profiles (kind, facet, config_json, active_at, created_at)
590              VALUES ('*', 'vec', ?1, unixepoch(), unixepoch())
591              ON CONFLICT(kind, facet) DO UPDATE SET
592                  config_json = ?1,
593                  active_at   = unixepoch()",
594            rusqlite::params![identity_json],
595        )?;
596        conn.query_row(
597            "SELECT \
598               json_extract(config_json, '$.model_identity'), \
599               json_extract(config_json, '$.model_version'), \
600               CAST(json_extract(config_json, '$.dimensions') AS INTEGER), \
601               active_at, \
602               created_at \
603             FROM projection_profiles WHERE kind = '*' AND facet = 'vec'",
604            [],
605            |row| {
606                Ok(VecProfile {
607                    model_identity: row.get(0)?,
608                    model_version: row.get(1)?,
609                    dimensions: {
610                        let d: i64 = row.get(2)?;
611                        u32::try_from(d).unwrap_or(0)
612                    },
613                    active_at: row.get(3)?,
614                    created_at: row.get(4)?,
615                })
616            },
617        )
618    }
619
620    /// Persist or update the global vector profile from a JSON config string.
621    ///
622    /// `config_json` must be valid JSON with at least a `model_identity`
623    /// field and `dimensions`.  The JSON is stored verbatim in the
624    /// `projection_profiles` table under `kind='*'`, `facet='vec'`.
625    ///
626    /// # Errors
627    /// Returns [`EngineError`] if the database write fails.
628    pub fn set_vec_profile(&self, config_json: &str) -> Result<VecProfile, EngineError> {
629        let conn = self.connect()?;
630        Self::set_vec_profile_inner(&conn, config_json).map_err(EngineError::Sqlite)
631    }
632
633    /// Estimate the cost of rebuilding a projection.
634    ///
635    /// For facet `"fts"`: counts active nodes of `kind`.
636    /// For facet `"vec"`: counts all chunks.
637    ///
638    /// # Errors
639    /// Returns [`EngineError`] for unknown facets or database errors.
640    pub fn preview_projection_impact(
641        &self,
642        kind: &str,
643        facet: &str,
644    ) -> Result<ProjectionImpact, EngineError> {
645        let conn = self.connect()?;
646        match facet {
647            "fts" => {
648                let rows: u64 = conn
649                    .query_row(
650                        "SELECT count(*) FROM nodes WHERE kind = ?1 AND superseded_at IS NULL",
651                        rusqlite::params![kind],
652                        |row| row.get::<_, i64>(0),
653                    )
654                    .map(i64::cast_unsigned)?;
655                let current_tokenizer = self.get_fts_profile(kind)?.map(|p| p.tokenizer);
656                Ok(ProjectionImpact {
657                    rows_to_rebuild: rows,
658                    estimated_seconds: rows / 5000,
659                    temp_db_size_bytes: rows * 200,
660                    current_tokenizer,
661                    target_tokenizer: None,
662                })
663            }
664            "vec" => {
665                let rows: u64 = conn
666                    .query_row("SELECT count(*) FROM chunks", [], |row| {
667                        row.get::<_, i64>(0)
668                    })
669                    .map(i64::cast_unsigned)?;
670                Ok(ProjectionImpact {
671                    rows_to_rebuild: rows,
672                    estimated_seconds: rows / 100,
673                    temp_db_size_bytes: rows * 1536,
674                    current_tokenizer: None,
675                    target_tokenizer: None,
676                })
677            }
678            other => Err(EngineError::Bridge(format!(
679                "unknown projection facet: {other:?}"
680            ))),
681        }
682    }
683
    /// Run the physical and structural integrity checks and summarize them.
    ///
    /// Opens a fresh connection, runs `PRAGMA integrity_check` and
    /// `pragma_foreign_key_check`, then counts missing chunk-FTS rows,
    /// missing property-FTS rows, duplicate active `logical_id`s and
    /// operational rows with dangling references. One warning string is
    /// added per non-zero counter. The method only reads; it repairs nothing.
    ///
    /// # Errors
    /// Returns [`EngineError`] if the database connection fails or any SQL query fails.
    pub fn check_integrity(&self) -> Result<IntegrityReport, EngineError> {
        let conn = self.connect()?;

        // Only the first row of the pragma result is read; a healthy database
        // answers with the single row "ok".
        let physical_result: String =
            conn.query_row("PRAGMA integrity_check", [], |row| row.get(0))?;
        // Number of foreign-key violations reported by SQLite.
        let foreign_key_count: i64 =
            conn.query_row("SELECT count(*) FROM pragma_foreign_key_check", [], |row| {
                row.get(0)
            })?;
        // Chunks of active nodes that have no row in `fts_nodes`.
        let missing_fts_rows: i64 = conn.query_row(
            r"
            SELECT count(*)
            FROM chunks c
            JOIN nodes n
              ON n.logical_id = c.node_logical_id
             AND n.superseded_at IS NULL
            WHERE NOT EXISTS (
                SELECT 1
                FROM fts_nodes f
                WHERE f.chunk_id = c.id
            )
            ",
            [],
            |row| row.get(0),
        )?;
        // Logical IDs with more than one non-superseded node row.
        let duplicate_active: i64 = conn.query_row(
            r"
            SELECT count(*)
            FROM (
                SELECT logical_id
                FROM nodes
                WHERE superseded_at IS NULL
                GROUP BY logical_id
                HAVING count(*) > 1
            )
            ",
            [],
            |row| row.get(0),
        )?;
        // Mutation rows plus current rows whose collection is not registered.
        let operational_missing_collections: i64 = conn.query_row(
            r"
            SELECT (
                SELECT count(*)
                FROM operational_mutations m
                LEFT JOIN operational_collections c ON c.name = m.collection_name
                WHERE c.name IS NULL
            ) + (
                SELECT count(*)
                FROM operational_current oc
                LEFT JOIN operational_collections c ON c.name = oc.collection_name
                WHERE c.name IS NULL
            )
            ",
            [],
            |row| row.get(0),
        )?;
        // Current rows whose `last_mutation_id` points at no mutation row.
        let operational_missing_last_mutations: i64 = conn.query_row(
            r"
            SELECT count(*)
            FROM operational_current oc
            LEFT JOIN operational_mutations m ON m.id = oc.last_mutation_id
            WHERE m.id IS NULL
            ",
            [],
            |row| row.get(0),
        )?;

        // Count missing property FTS rows using the same extraction logic as
        // write/rebuild. A pure-SQL check would overcount: nodes whose declared
        // paths legitimately normalize to no values correctly have no row.
        let missing_property_fts_rows = count_missing_property_fts_rows(&conn)?;

        // One message per failing counter. `physical_ok` and `foreign_keys_ok`
        // are reported through their booleans only and add no warning here.
        let mut warnings = Vec::new();
        if missing_fts_rows > 0 {
            warnings.push("missing FTS projections detected".to_owned());
        }
        if missing_property_fts_rows > 0 {
            warnings.push("missing property FTS projections detected".to_owned());
        }
        if duplicate_active > 0 {
            warnings.push("duplicate active logical_ids detected".to_owned());
        }
        if operational_missing_collections > 0 {
            warnings.push("operational rows reference missing collections".to_owned());
        }
        if operational_missing_last_mutations > 0 {
            warnings.push("operational current rows reference missing last mutations".to_owned());
        }

        // FIX(review): was `as usize` — unsound on 32-bit targets, wraps negatives silently.
        // Options: (A) try_from().unwrap_or(0) — masks corruption, (B) try_from().expect() —
        // panics on corruption, (C) propagate error. Chose (B) here: a negative count(*)
        // signals data corruption, and the integrity report would be meaningless anyway.
        // NOTE(review): `i64_to_usize` is defined outside this section — confirm it
        // really implements option (B).
        Ok(IntegrityReport {
            physical_ok: physical_result == "ok",
            foreign_keys_ok: foreign_key_count == 0,
            missing_fts_rows: i64_to_usize(missing_fts_rows),
            missing_property_fts_rows: i64_to_usize(missing_property_fts_rows),
            duplicate_active_logical_ids: i64_to_usize(duplicate_active),
            operational_missing_collections: i64_to_usize(operational_missing_collections),
            operational_missing_last_mutations: i64_to_usize(operational_missing_last_mutations),
            warnings,
        })
    }
790
791    /// # Errors
792    /// Returns [`EngineError`] if the database connection fails or any SQL query fails.
793    #[allow(clippy::too_many_lines)]
794    pub fn check_semantics(&self) -> Result<SemanticReport, EngineError> {
795        let conn = self.connect()?;
796
797        let orphaned_chunks: i64 = conn.query_row(
798            r"
799            SELECT count(*)
800            FROM chunks c
801            WHERE NOT EXISTS (
802                SELECT 1 FROM nodes n
803                WHERE n.logical_id = c.node_logical_id
804            )
805            ",
806            [],
807            |row| row.get(0),
808        )?;
809
810        let null_source_ref_nodes: i64 = conn.query_row(
811            "SELECT count(*) FROM nodes WHERE source_ref IS NULL AND superseded_at IS NULL",
812            [],
813            |row| row.get(0),
814        )?;
815
816        let broken_step_fk: i64 = conn.query_row(
817            r"
818            SELECT count(*) FROM steps s
819            WHERE NOT EXISTS (SELECT 1 FROM runs r WHERE r.id = s.run_id)
820            ",
821            [],
822            |row| row.get(0),
823        )?;
824
825        let broken_action_fk: i64 = conn.query_row(
826            r"
827            SELECT count(*) FROM actions a
828            WHERE NOT EXISTS (SELECT 1 FROM steps s WHERE s.id = a.step_id)
829            ",
830            [],
831            |row| row.get(0),
832        )?;
833
834        let stale_fts_rows: i64 = conn.query_row(
835            r"
836            SELECT count(*) FROM fts_nodes f
837            WHERE NOT EXISTS (SELECT 1 FROM chunks c WHERE c.id = f.chunk_id)
838            ",
839            [],
840            |row| row.get(0),
841        )?;
842
843        let fts_rows_for_superseded_nodes: i64 = conn.query_row(
844            r"
845            SELECT count(*) FROM fts_nodes f
846            WHERE NOT EXISTS (
847                SELECT 1 FROM nodes n
848                WHERE n.logical_id = f.node_logical_id AND n.superseded_at IS NULL
849            )
850            ",
851            [],
852            |row| row.get(0),
853        )?;
854
855        let (
856            stale_property_fts_rows,
857            orphaned_property_fts_rows,
858            mismatched_kind_property_fts_rows,
859            duplicate_property_fts_rows,
860        ) = count_per_kind_property_fts_issues(&conn)?;
861
862        let drifted_property_fts_rows = count_drifted_property_fts_rows(&conn)?;
863
864        let dangling_edges: i64 = conn.query_row(
865            r"
866            SELECT count(*) FROM edges e
867            WHERE e.superseded_at IS NULL AND (
868                NOT EXISTS (SELECT 1 FROM nodes n WHERE n.logical_id = e.source_logical_id AND n.superseded_at IS NULL)
869                OR
870                NOT EXISTS (SELECT 1 FROM nodes n WHERE n.logical_id = e.target_logical_id AND n.superseded_at IS NULL)
871            )
872            ",
873            [],
874            |row| row.get(0),
875        )?;
876
877        let orphaned_supersession_chains: i64 = conn.query_row(
878            r"
879            SELECT count(*) FROM (
880                SELECT logical_id FROM nodes
881                GROUP BY logical_id
882                HAVING count(*) > 0 AND sum(CASE WHEN superseded_at IS NULL THEN 1 ELSE 0 END) = 0
883            )
884            ",
885            [],
886            |row| row.get(0),
887        )?;
888
889        // Vec stale row detection — iterates per-kind vec tables from projection_profiles.
890        #[cfg(feature = "sqlite-vec")]
891        let (stale_vec_rows, vec_rows_for_superseded_nodes): (i64, i64) = {
892            let kinds: Vec<String> =
893                match conn.prepare("SELECT kind FROM projection_profiles WHERE facet = 'vec'") {
894                    Ok(mut stmt) => stmt
895                        .query_map([], |row| row.get(0))
896                        .map_err(EngineError::Sqlite)?
897                        .collect::<Result<Vec<_>, _>>()
898                        .map_err(EngineError::Sqlite)?,
899                    Err(rusqlite::Error::SqliteFailure(_, Some(ref msg)))
900                        if msg.contains("no such table: projection_profiles") =>
901                    {
902                        vec![]
903                    }
904                    Err(e) => return Err(EngineError::Sqlite(e)),
905                };
906            let mut stale = 0i64;
907            let mut superseded = 0i64;
908            for kind in &kinds {
909                let table = fathomdb_schema::vec_kind_table_name(kind);
910                let stale_sql = format!(
911                    "SELECT count(*) FROM {table} v \
912                     WHERE NOT EXISTS (SELECT 1 FROM chunks c WHERE c.id = v.chunk_id)"
913                );
914                let superseded_sql = format!(
915                    "SELECT count(*) FROM {table} v \
916                     JOIN chunks c ON c.id = v.chunk_id \
917                     WHERE NOT EXISTS (SELECT 1 FROM nodes n WHERE n.logical_id = c.node_logical_id)"
918                );
919                stale += match conn.query_row(&stale_sql, [], |row| row.get(0)) {
920                    Ok(n) => n,
921                    Err(rusqlite::Error::SqliteFailure(_, Some(ref msg)))
922                        if msg.contains("no such table:")
923                            || msg.contains("no such module: vec0") =>
924                    {
925                        0
926                    }
927                    Err(e) => return Err(EngineError::Sqlite(e)),
928                };
929                superseded += match conn.query_row(&superseded_sql, [], |row| row.get(0)) {
930                    Ok(n) => n,
931                    Err(rusqlite::Error::SqliteFailure(_, Some(ref msg)))
932                        if msg.contains("no such table:")
933                            || msg.contains("no such module: vec0") =>
934                    {
935                        0
936                    }
937                    Err(e) => return Err(EngineError::Sqlite(e)),
938                };
939            }
940            (stale, superseded)
941        };
942        #[cfg(not(feature = "sqlite-vec"))]
943        let stale_vec_rows: i64 = 0;
944        #[cfg(not(feature = "sqlite-vec"))]
945        let vec_rows_for_superseded_nodes: i64 = 0;
946        let missing_operational_current_rows: i64 = conn.query_row(
947            r"
948            SELECT count(*)
949            FROM operational_mutations m
950            JOIN operational_collections c
951              ON c.name = m.collection_name
952             AND c.kind = 'latest_state'
953            WHERE m.op_kind = 'put'
954              AND NOT EXISTS (
955                    SELECT 1
956                    FROM operational_mutations newer
957                    WHERE newer.collection_name = m.collection_name
958                      AND newer.record_key = m.record_key
959                      AND newer.mutation_order > m.mutation_order
960                )
961              AND NOT EXISTS (
962                    SELECT 1
963                    FROM operational_current oc
964                    WHERE oc.collection_name = m.collection_name
965                      AND oc.record_key = m.record_key
966                )
967            ",
968            [],
969            |row| row.get(0),
970        )?;
971        let stale_operational_current_rows: i64 = conn.query_row(
972            r"
973            SELECT count(*)
974            FROM operational_current oc
975            JOIN operational_collections c
976              ON c.name = oc.collection_name
977             AND c.kind = 'latest_state'
978            LEFT JOIN operational_mutations m ON m.id = oc.last_mutation_id
979            WHERE m.id IS NULL
980               OR m.collection_name != oc.collection_name
981               OR m.record_key != oc.record_key
982               OR m.op_kind != 'put'
983               OR m.payload_json != oc.payload_json
984               OR EXISTS (
985                    SELECT 1
986                    FROM operational_mutations newer
987                    WHERE newer.collection_name = oc.collection_name
988                      AND newer.record_key = oc.record_key
989                      AND newer.mutation_order > m.mutation_order
990                )
991            ",
992            [],
993            |row| row.get(0),
994        )?;
995        let disabled_collection_mutations: i64 = conn.query_row(
996            r"
997            SELECT count(*)
998            FROM operational_mutations m
999            JOIN operational_collections c ON c.name = m.collection_name
1000            WHERE c.disabled_at IS NOT NULL AND m.created_at > c.disabled_at
1001            ",
1002            [],
1003            |row| row.get(0),
1004        )?;
1005        let orphaned_last_access_metadata_rows: i64 = conn.query_row(
1006            r"
1007            SELECT count(*)
1008            FROM node_access_metadata am
1009            WHERE NOT EXISTS (
1010                SELECT 1 FROM nodes n WHERE n.logical_id = am.logical_id
1011            )
1012            ",
1013            [],
1014            |row| row.get(0),
1015        )?;
1016
1017        let mut warnings = Vec::new();
1018        if orphaned_chunks > 0 {
1019            warnings.push(format!(
1020                "{orphaned_chunks} orphaned chunk(s) with no surviving node history"
1021            ));
1022        }
1023        if null_source_ref_nodes > 0 {
1024            warnings.push(format!(
1025                "{null_source_ref_nodes} active node(s) with null source_ref"
1026            ));
1027        }
1028        if broken_step_fk > 0 {
1029            warnings.push(format!(
1030                "{broken_step_fk} step(s) referencing non-existent run"
1031            ));
1032        }
1033        if broken_action_fk > 0 {
1034            warnings.push(format!(
1035                "{broken_action_fk} action(s) referencing non-existent step"
1036            ));
1037        }
1038        if stale_fts_rows > 0 {
1039            warnings.push(format!(
1040                "{stale_fts_rows} stale FTS row(s) referencing missing chunk"
1041            ));
1042        }
1043        if fts_rows_for_superseded_nodes > 0 {
1044            warnings.push(format!(
1045                "{fts_rows_for_superseded_nodes} FTS row(s) for superseded node(s)"
1046            ));
1047        }
1048        if stale_property_fts_rows > 0 {
1049            warnings.push(format!(
1050                "{stale_property_fts_rows} stale property FTS row(s) for superseded/missing node(s)"
1051            ));
1052        }
1053        if orphaned_property_fts_rows > 0 {
1054            warnings.push(format!(
1055                "{orphaned_property_fts_rows} orphaned property FTS row(s) for unregistered kind(s)"
1056            ));
1057        }
1058        if mismatched_kind_property_fts_rows > 0 {
1059            warnings.push(format!(
1060                "{mismatched_kind_property_fts_rows} property FTS row(s) whose kind does not match the active node"
1061            ));
1062        }
1063        if duplicate_property_fts_rows > 0 {
1064            warnings.push(format!(
1065                "{duplicate_property_fts_rows} active logical ID(s) with duplicate property FTS rows"
1066            ));
1067        }
1068        if drifted_property_fts_rows > 0 {
1069            warnings.push(format!(
1070                "{drifted_property_fts_rows} property FTS row(s) with stale text_content"
1071            ));
1072        }
1073        if dangling_edges > 0 {
1074            warnings.push(format!(
1075                "{dangling_edges} active edge(s) with missing endpoint node"
1076            ));
1077        }
1078        if orphaned_supersession_chains > 0 {
1079            warnings.push(format!(
1080                "{orphaned_supersession_chains} logical_id(s) with all versions superseded"
1081            ));
1082        }
1083        if stale_vec_rows > 0 {
1084            warnings.push(format!(
1085                "{stale_vec_rows} stale vec row(s) referencing missing chunk"
1086            ));
1087        }
1088        if vec_rows_for_superseded_nodes > 0 {
1089            warnings.push(format!(
1090                "{vec_rows_for_superseded_nodes} vec row(s) whose node history is missing"
1091            ));
1092        }
1093        if missing_operational_current_rows > 0 {
1094            warnings.push(format!(
1095                "{missing_operational_current_rows} latest-state key(s) missing operational_current rows"
1096            ));
1097        }
1098        if stale_operational_current_rows > 0 {
1099            warnings.push(format!(
1100                "{stale_operational_current_rows} stale operational_current row(s)"
1101            ));
1102        }
1103        if disabled_collection_mutations > 0 {
1104            warnings.push(format!(
1105                "{disabled_collection_mutations} mutation(s) were written after collection disable"
1106            ));
1107        }
1108        if orphaned_last_access_metadata_rows > 0 {
1109            warnings.push(format!(
1110                "{orphaned_last_access_metadata_rows} last_access metadata row(s) reference missing node history"
1111            ));
1112        }
1113
1114        Ok(SemanticReport {
1115            orphaned_chunks: i64_to_usize(orphaned_chunks),
1116            null_source_ref_nodes: i64_to_usize(null_source_ref_nodes),
1117            broken_step_fk: i64_to_usize(broken_step_fk),
1118            broken_action_fk: i64_to_usize(broken_action_fk),
1119            stale_fts_rows: i64_to_usize(stale_fts_rows),
1120            fts_rows_for_superseded_nodes: i64_to_usize(fts_rows_for_superseded_nodes),
1121            stale_property_fts_rows: i64_to_usize(stale_property_fts_rows),
1122            orphaned_property_fts_rows: i64_to_usize(orphaned_property_fts_rows),
1123            mismatched_kind_property_fts_rows: i64_to_usize(mismatched_kind_property_fts_rows),
1124            duplicate_property_fts_rows: i64_to_usize(duplicate_property_fts_rows),
1125            drifted_property_fts_rows: i64_to_usize(drifted_property_fts_rows),
1126            dangling_edges: i64_to_usize(dangling_edges),
1127            orphaned_supersession_chains: i64_to_usize(orphaned_supersession_chains),
1128            stale_vec_rows: i64_to_usize(stale_vec_rows),
1129            vec_rows_for_superseded_nodes: i64_to_usize(vec_rows_for_superseded_nodes),
1130            missing_operational_current_rows: i64_to_usize(missing_operational_current_rows),
1131            stale_operational_current_rows: i64_to_usize(stale_operational_current_rows),
1132            disabled_collection_mutations: i64_to_usize(disabled_collection_mutations),
1133            orphaned_last_access_metadata_rows: i64_to_usize(orphaned_last_access_metadata_rows),
1134            warnings,
1135        })
1136    }
1137
1138    /// # Errors
1139    /// Returns [`EngineError`] if the collection metadata is invalid or the insert fails.
1140    pub fn register_operational_collection(
1141        &self,
1142        request: &OperationalRegisterRequest,
1143    ) -> Result<OperationalCollectionRecord, EngineError> {
1144        if request.name.trim().is_empty() {
1145            return Err(EngineError::InvalidWrite(
1146                "operational collection name must not be empty".to_owned(),
1147            ));
1148        }
1149        if request.schema_json.is_empty() {
1150            return Err(EngineError::InvalidWrite(
1151                "operational collection schema_json must not be empty".to_owned(),
1152            ));
1153        }
1154        if request.retention_json.is_empty() {
1155            return Err(EngineError::InvalidWrite(
1156                "operational collection retention_json must not be empty".to_owned(),
1157            ));
1158        }
1159        if request.filter_fields_json.is_empty() {
1160            return Err(EngineError::InvalidWrite(
1161                "operational collection filter_fields_json must not be empty".to_owned(),
1162            ));
1163        }
1164        parse_operational_validation_contract(&request.validation_json)
1165            .map_err(EngineError::InvalidWrite)?;
1166        parse_operational_secondary_indexes_json(&request.secondary_indexes_json, request.kind)
1167            .map_err(EngineError::InvalidWrite)?;
1168        if request.format_version <= 0 {
1169            return Err(EngineError::InvalidWrite(
1170                "operational collection format_version must be positive".to_owned(),
1171            ));
1172        }
1173        parse_operational_filter_fields(&request.filter_fields_json)
1174            .map_err(EngineError::InvalidWrite)?;
1175
1176        let mut conn = self.connect()?;
1177        let tx = conn.transaction_with_behavior(TransactionBehavior::Immediate)?;
1178        tx.execute(
1179            "INSERT INTO operational_collections \
1180             (name, kind, schema_json, retention_json, filter_fields_json, validation_json, secondary_indexes_json, format_version, created_at) \
1181             VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, unixepoch())",
1182            rusqlite::params![
1183                request.name.as_str(),
1184                request.kind.as_str(),
1185                request.schema_json.as_str(),
1186                request.retention_json.as_str(),
1187                request.filter_fields_json.as_str(),
1188                request.validation_json.as_str(),
1189                request.secondary_indexes_json.as_str(),
1190                request.format_version,
1191            ],
1192        )?;
1193        persist_simple_provenance_event(
1194            &tx,
1195            "operational_collection_registered",
1196            request.name.as_str(),
1197            Some(serde_json::json!({
1198                "kind": request.kind.as_str(),
1199                "format_version": request.format_version,
1200            })),
1201        )?;
1202        tx.commit()?;
1203
1204        self.describe_operational_collection(&request.name)?
1205            .ok_or_else(|| {
1206                EngineError::Bridge("registered collection missing after commit".to_owned())
1207            })
1208    }
1209
1210    /// # Errors
1211    /// Returns [`EngineError`] if the database query fails.
1212    pub fn describe_operational_collection(
1213        &self,
1214        name: &str,
1215    ) -> Result<Option<OperationalCollectionRecord>, EngineError> {
1216        let conn = self.connect()?;
1217        load_operational_collection_record(&conn, name)
1218    }
1219
1220    /// # Errors
1221    /// Returns [`EngineError`] if the collection is missing, the filter contract is invalid,
1222    /// or existing mutation backfill fails.
1223    pub fn update_operational_collection_filters(
1224        &self,
1225        name: &str,
1226        filter_fields_json: &str,
1227    ) -> Result<OperationalCollectionRecord, EngineError> {
1228        if filter_fields_json.is_empty() {
1229            return Err(EngineError::InvalidWrite(
1230                "operational collection filter_fields_json must not be empty".to_owned(),
1231            ));
1232        }
1233        let declared_fields = parse_operational_filter_fields(filter_fields_json)
1234            .map_err(EngineError::InvalidWrite)?;
1235
1236        let mut conn = self.connect()?;
1237        let tx = conn.transaction_with_behavior(TransactionBehavior::Immediate)?;
1238        load_operational_collection_record(&tx, name)?.ok_or_else(|| {
1239            EngineError::InvalidWrite(format!("operational collection '{name}' is not registered"))
1240        })?;
1241        tx.execute(
1242            "UPDATE operational_collections SET filter_fields_json = ?2 WHERE name = ?1",
1243            rusqlite::params![name, filter_fields_json],
1244        )?;
1245        tx.execute(
1246            "DELETE FROM operational_filter_values WHERE collection_name = ?1",
1247            [name],
1248        )?;
1249
1250        let mut mutation_stmt = tx.prepare(
1251            "SELECT id, payload_json FROM operational_mutations \
1252             WHERE collection_name = ?1 ORDER BY mutation_order",
1253        )?;
1254        let mutations = mutation_stmt
1255            .query_map([name], |row| {
1256                Ok((row.get::<_, String>(0)?, row.get::<_, String>(1)?))
1257            })?
1258            .collect::<Result<Vec<_>, _>>()?;
1259        drop(mutation_stmt);
1260
1261        let mut insert_filter_value = tx.prepare_cached(
1262            "INSERT INTO operational_filter_values \
1263             (mutation_id, collection_name, field_name, string_value, integer_value) \
1264             VALUES (?1, ?2, ?3, ?4, ?5)",
1265        )?;
1266        let mut inserted_values = 0usize;
1267        for (mutation_id, payload_json) in &mutations {
1268            for filter_value in
1269                extract_operational_filter_values(&declared_fields, payload_json.as_str())
1270            {
1271                insert_filter_value.execute(rusqlite::params![
1272                    mutation_id,
1273                    name,
1274                    filter_value.field_name,
1275                    filter_value.string_value,
1276                    filter_value.integer_value,
1277                ])?;
1278                inserted_values += 1;
1279            }
1280        }
1281        drop(insert_filter_value);
1282
1283        persist_simple_provenance_event(
1284            &tx,
1285            "operational_collection_filter_fields_updated",
1286            name,
1287            Some(serde_json::json!({
1288                "field_count": declared_fields.len(),
1289                "mutations_backfilled": mutations.len(),
1290                "inserted_filter_values": inserted_values,
1291            })),
1292        )?;
1293        let updated = load_operational_collection_record(&tx, name)?.ok_or_else(|| {
1294            EngineError::Bridge("operational collection missing after filter update".to_owned())
1295        })?;
1296        tx.commit()?;
1297        Ok(updated)
1298    }
1299
1300    /// # Errors
1301    /// Returns [`EngineError`] if the collection is missing or the validation contract is invalid.
1302    pub fn update_operational_collection_validation(
1303        &self,
1304        name: &str,
1305        validation_json: &str,
1306    ) -> Result<OperationalCollectionRecord, EngineError> {
1307        parse_operational_validation_contract(validation_json)
1308            .map_err(EngineError::InvalidWrite)?;
1309
1310        let mut conn = self.connect()?;
1311        let tx = conn.transaction_with_behavior(TransactionBehavior::Immediate)?;
1312        load_operational_collection_record(&tx, name)?.ok_or_else(|| {
1313            EngineError::InvalidWrite(format!("operational collection '{name}' is not registered"))
1314        })?;
1315        tx.execute(
1316            "UPDATE operational_collections SET validation_json = ?2 WHERE name = ?1",
1317            rusqlite::params![name, validation_json],
1318        )?;
1319        persist_simple_provenance_event(
1320            &tx,
1321            "operational_collection_validation_updated",
1322            name,
1323            Some(serde_json::json!({
1324                "has_validation": !validation_json.is_empty(),
1325            })),
1326        )?;
1327        let updated = load_operational_collection_record(&tx, name)?.ok_or_else(|| {
1328            EngineError::Bridge("operational collection missing after validation update".to_owned())
1329        })?;
1330        tx.commit()?;
1331        Ok(updated)
1332    }
1333
1334    /// # Errors
1335    /// Returns [`EngineError`] if the collection is missing, the contract is invalid,
1336    /// or derived index rebuild fails.
1337    pub fn update_operational_collection_secondary_indexes(
1338        &self,
1339        name: &str,
1340        secondary_indexes_json: &str,
1341    ) -> Result<OperationalCollectionRecord, EngineError> {
1342        let mut conn = self.connect()?;
1343        let tx = conn.transaction_with_behavior(TransactionBehavior::Immediate)?;
1344        let record = load_operational_collection_record(&tx, name)?.ok_or_else(|| {
1345            EngineError::InvalidWrite(format!("operational collection '{name}' is not registered"))
1346        })?;
1347        let indexes = parse_operational_secondary_indexes_json(secondary_indexes_json, record.kind)
1348            .map_err(EngineError::InvalidWrite)?;
1349        tx.execute(
1350            "UPDATE operational_collections SET secondary_indexes_json = ?2 WHERE name = ?1",
1351            rusqlite::params![name, secondary_indexes_json],
1352        )?;
1353        let (mutation_entries_rebuilt, current_entries_rebuilt) =
1354            rebuild_operational_secondary_index_entries(&tx, &record.name, record.kind, &indexes)?;
1355        persist_simple_provenance_event(
1356            &tx,
1357            "operational_collection_secondary_indexes_updated",
1358            name,
1359            Some(serde_json::json!({
1360                "index_count": indexes.len(),
1361                "mutation_entries_rebuilt": mutation_entries_rebuilt,
1362                "current_entries_rebuilt": current_entries_rebuilt,
1363            })),
1364        )?;
1365        let updated = load_operational_collection_record(&tx, name)?.ok_or_else(|| {
1366            EngineError::Bridge(
1367                "operational collection missing after secondary index update".to_owned(),
1368            )
1369        })?;
1370        tx.commit()?;
1371        Ok(updated)
1372    }
1373
1374    /// # Errors
1375    /// Returns [`EngineError`] if the collection is missing or rebuild fails.
1376    pub fn rebuild_operational_secondary_indexes(
1377        &self,
1378        name: &str,
1379    ) -> Result<OperationalSecondaryIndexRebuildReport, EngineError> {
1380        let mut conn = self.connect()?;
1381        let tx = conn.transaction_with_behavior(TransactionBehavior::Immediate)?;
1382        let record = load_operational_collection_record(&tx, name)?.ok_or_else(|| {
1383            EngineError::InvalidWrite(format!("operational collection '{name}' is not registered"))
1384        })?;
1385        let indexes =
1386            parse_operational_secondary_indexes_json(&record.secondary_indexes_json, record.kind)
1387                .map_err(EngineError::InvalidWrite)?;
1388        let (mutation_entries_rebuilt, current_entries_rebuilt) =
1389            rebuild_operational_secondary_index_entries(&tx, &record.name, record.kind, &indexes)?;
1390        persist_simple_provenance_event(
1391            &tx,
1392            "operational_secondary_indexes_rebuilt",
1393            name,
1394            Some(serde_json::json!({
1395                "index_count": indexes.len(),
1396                "mutation_entries_rebuilt": mutation_entries_rebuilt,
1397                "current_entries_rebuilt": current_entries_rebuilt,
1398            })),
1399        )?;
1400        tx.commit()?;
1401        Ok(OperationalSecondaryIndexRebuildReport {
1402            collection_name: name.to_owned(),
1403            mutation_entries_rebuilt,
1404            current_entries_rebuilt,
1405        })
1406    }
1407
1408    /// # Errors
1409    /// Returns [`EngineError`] if the collection is missing or its validation contract is invalid.
1410    pub fn validate_operational_collection_history(
1411        &self,
1412        name: &str,
1413    ) -> Result<OperationalHistoryValidationReport, EngineError> {
1414        let conn = self.connect()?;
1415        let record = load_operational_collection_record(&conn, name)?.ok_or_else(|| {
1416            EngineError::InvalidWrite(format!("operational collection '{name}' is not registered"))
1417        })?;
1418        let Some(contract) = parse_operational_validation_contract(&record.validation_json)
1419            .map_err(EngineError::InvalidWrite)?
1420        else {
1421            return Err(EngineError::InvalidWrite(format!(
1422                "operational collection '{name}' has no validation_json configured"
1423            )));
1424        };
1425
1426        let mut stmt = conn.prepare(
1427            "SELECT id, record_key, op_kind, payload_json FROM operational_mutations \
1428             WHERE collection_name = ?1 ORDER BY mutation_order",
1429        )?;
1430        let rows = stmt
1431            .query_map([name], |row| {
1432                Ok((
1433                    row.get::<_, String>(0)?,
1434                    row.get::<_, String>(1)?,
1435                    row.get::<_, String>(2)?,
1436                    row.get::<_, String>(3)?,
1437                ))
1438            })?
1439            .collect::<Result<Vec<_>, _>>()?;
1440        drop(stmt);
1441
1442        let mut checked_rows = 0usize;
1443        let mut issues = Vec::new();
1444        for (mutation_id, record_key, op_kind, payload_json) in rows {
1445            if op_kind == "delete" {
1446                continue;
1447            }
1448            checked_rows += 1;
1449            if let Err(message) =
1450                validate_operational_payload_against_contract(&contract, payload_json.as_str())
1451            {
1452                issues.push(OperationalHistoryValidationIssue {
1453                    mutation_id,
1454                    record_key,
1455                    op_kind,
1456                    message,
1457                });
1458            }
1459        }
1460
1461        Ok(OperationalHistoryValidationReport {
1462            collection_name: name.to_owned(),
1463            checked_rows,
1464            invalid_row_count: issues.len(),
1465            issues,
1466        })
1467    }
1468
1469    /// # Errors
1470    /// Returns [`EngineError`] if the database query fails.
1471    pub fn disable_operational_collection(
1472        &self,
1473        name: &str,
1474    ) -> Result<OperationalCollectionRecord, EngineError> {
1475        let mut conn = self.connect()?;
1476        let tx = conn.transaction_with_behavior(TransactionBehavior::Immediate)?;
1477        let record = load_operational_collection_record(&tx, name)?.ok_or_else(|| {
1478            EngineError::InvalidWrite(format!("operational collection '{name}' is not registered"))
1479        })?;
1480        let changed = if record.disabled_at.is_none() {
1481            tx.execute(
1482                "UPDATE operational_collections SET disabled_at = unixepoch() WHERE name = ?1",
1483                [name],
1484            )?;
1485            true
1486        } else {
1487            false
1488        };
1489        let record = load_operational_collection_record(&tx, name)?.ok_or_else(|| {
1490            EngineError::Bridge("operational collection missing after disable".to_owned())
1491        })?;
1492        persist_simple_provenance_event(
1493            &tx,
1494            "operational_collection_disabled",
1495            name,
1496            Some(serde_json::json!({
1497                "disabled_at": record.disabled_at,
1498                "changed": changed,
1499            })),
1500        )?;
1501        tx.commit()?;
1502        Ok(record)
1503    }
1504
1505    /// # Errors
1506    /// Returns [`EngineError`] if the database query fails.
1507    pub fn compact_operational_collection(
1508        &self,
1509        name: &str,
1510        dry_run: bool,
1511    ) -> Result<OperationalCompactionReport, EngineError> {
1512        let mut conn = self.connect()?;
1513        let tx = conn.transaction_with_behavior(TransactionBehavior::Immediate)?;
1514        let collection = load_operational_collection_record(&tx, name)?.ok_or_else(|| {
1515            EngineError::InvalidWrite(format!("operational collection '{name}' is not registered"))
1516        })?;
1517        validate_append_only_operational_collection(&collection, "compact")?;
1518        let (mutation_ids, before_timestamp) =
1519            operational_compaction_candidates(&tx, &collection.retention_json, name)?;
1520        if dry_run {
1521            drop(tx);
1522            return Ok(OperationalCompactionReport {
1523                collection_name: name.to_owned(),
1524                deleted_mutations: mutation_ids.len(),
1525                dry_run: true,
1526                before_timestamp,
1527            });
1528        }
1529        let mut delete_stmt =
1530            tx.prepare_cached("DELETE FROM operational_mutations WHERE id = ?1")?;
1531        for mutation_id in &mutation_ids {
1532            delete_stmt.execute([mutation_id.as_str()])?;
1533        }
1534        drop(delete_stmt);
1535        persist_simple_provenance_event(
1536            &tx,
1537            "operational_collection_compacted",
1538            name,
1539            Some(serde_json::json!({
1540                "deleted_mutations": mutation_ids.len(),
1541                "before_timestamp": before_timestamp,
1542            })),
1543        )?;
1544        tx.commit()?;
1545        Ok(OperationalCompactionReport {
1546            collection_name: name.to_owned(),
1547            deleted_mutations: mutation_ids.len(),
1548            dry_run: false,
1549            before_timestamp,
1550        })
1551    }
1552
1553    /// # Errors
1554    /// Returns [`EngineError`] if the database query fails.
1555    pub fn purge_operational_collection(
1556        &self,
1557        name: &str,
1558        before_timestamp: i64,
1559    ) -> Result<OperationalPurgeReport, EngineError> {
1560        let mut conn = self.connect()?;
1561        let tx = conn.transaction_with_behavior(TransactionBehavior::Immediate)?;
1562        let collection = load_operational_collection_record(&tx, name)?.ok_or_else(|| {
1563            EngineError::InvalidWrite(format!("operational collection '{name}' is not registered"))
1564        })?;
1565        validate_append_only_operational_collection(&collection, "purge")?;
1566        let deleted_mutations = tx.execute(
1567            "DELETE FROM operational_mutations WHERE collection_name = ?1 AND created_at < ?2",
1568            rusqlite::params![name, before_timestamp],
1569        )?;
1570        persist_simple_provenance_event(
1571            &tx,
1572            "operational_collection_purged",
1573            name,
1574            Some(serde_json::json!({
1575                "deleted_mutations": deleted_mutations,
1576                "before_timestamp": before_timestamp,
1577            })),
1578        )?;
1579        tx.commit()?;
1580        Ok(OperationalPurgeReport {
1581            collection_name: name.to_owned(),
1582            deleted_mutations,
1583            before_timestamp,
1584        })
1585    }
1586
1587    /// # Errors
1588    /// Returns [`EngineError`] if collection selection or policy parsing fails.
1589    pub fn plan_operational_retention(
1590        &self,
1591        now_timestamp: i64,
1592        collection_names: Option<&[String]>,
1593        max_collections: Option<usize>,
1594    ) -> Result<OperationalRetentionPlanReport, EngineError> {
1595        let conn = self.connect()?;
1596        let records = load_operational_retention_records(&conn, collection_names, max_collections)?;
1597        let mut items = Vec::with_capacity(records.len());
1598        for record in records {
1599            items.push(plan_operational_retention_item(
1600                &conn,
1601                &record,
1602                now_timestamp,
1603            )?);
1604        }
1605        Ok(OperationalRetentionPlanReport {
1606            planned_at: now_timestamp,
1607            collections_examined: items.len(),
1608            items,
1609        })
1610    }
1611
1612    /// # Errors
1613    /// Returns [`EngineError`] if collection selection, policy parsing, or execution fails.
1614    pub fn run_operational_retention(
1615        &self,
1616        now_timestamp: i64,
1617        collection_names: Option<&[String]>,
1618        max_collections: Option<usize>,
1619        dry_run: bool,
1620    ) -> Result<OperationalRetentionRunReport, EngineError> {
1621        let mut conn = self.connect()?;
1622        let records = load_operational_retention_records(&conn, collection_names, max_collections)?;
1623        let mut items = Vec::with_capacity(records.len());
1624        let mut collections_acted_on = 0usize;
1625
1626        for record in records {
1627            let tx = conn.transaction_with_behavior(TransactionBehavior::Immediate)?;
1628            let item = run_operational_retention_item(&tx, &record, now_timestamp, dry_run)?;
1629            if item.deleted_mutations > 0 {
1630                collections_acted_on += 1;
1631            }
1632            if dry_run || item.action_kind == OperationalRetentionActionKind::Noop {
1633                drop(tx);
1634            } else {
1635                tx.commit()?;
1636            }
1637            items.push(item);
1638        }
1639
1640        Ok(OperationalRetentionRunReport {
1641            executed_at: now_timestamp,
1642            collections_examined: items.len(),
1643            collections_acted_on,
1644            dry_run,
1645            items,
1646        })
1647    }
1648
1649    /// # Errors
1650    /// Returns [`EngineError`] if the database query fails.
1651    pub fn trace_operational_collection(
1652        &self,
1653        collection_name: &str,
1654        record_key: Option<&str>,
1655    ) -> Result<OperationalTraceReport, EngineError> {
1656        let conn = self.connect()?;
1657        ensure_operational_collection_registered(&conn, collection_name)?;
1658        let mutations = if let Some(record_key) = record_key {
1659            let mut stmt = conn.prepare(
1660                "SELECT id, collection_name, record_key, op_kind, payload_json, source_ref, created_at \
1661                 FROM operational_mutations \
1662                 WHERE collection_name = ?1 AND record_key = ?2 \
1663                 ORDER BY mutation_order",
1664            )?;
1665            stmt.query_map([collection_name, record_key], map_operational_mutation_row)?
1666                .collect::<Result<Vec<_>, _>>()?
1667        } else {
1668            let mut stmt = conn.prepare(
1669                "SELECT id, collection_name, record_key, op_kind, payload_json, source_ref, created_at \
1670                 FROM operational_mutations \
1671                 WHERE collection_name = ?1 \
1672                 ORDER BY mutation_order",
1673            )?;
1674            stmt.query_map([collection_name], map_operational_mutation_row)?
1675                .collect::<Result<Vec<_>, _>>()?
1676        };
1677        let current_rows = if let Some(record_key) = record_key {
1678            let mut stmt = conn.prepare(
1679                "SELECT collection_name, record_key, payload_json, updated_at, last_mutation_id \
1680                 FROM operational_current \
1681                 WHERE collection_name = ?1 AND record_key = ?2 \
1682                 ORDER BY updated_at, record_key",
1683            )?;
1684            stmt.query_map([collection_name, record_key], map_operational_current_row)?
1685                .collect::<Result<Vec<_>, _>>()?
1686        } else {
1687            let mut stmt = conn.prepare(
1688                "SELECT collection_name, record_key, payload_json, updated_at, last_mutation_id \
1689                 FROM operational_current \
1690                 WHERE collection_name = ?1 \
1691                 ORDER BY updated_at, record_key",
1692            )?;
1693            stmt.query_map([collection_name], map_operational_current_row)?
1694                .collect::<Result<Vec<_>, _>>()?
1695        };
1696
1697        Ok(OperationalTraceReport {
1698            collection_name: collection_name.to_owned(),
1699            record_key: record_key.map(str::to_owned),
1700            mutation_count: mutations.len(),
1701            current_count: current_rows.len(),
1702            mutations,
1703            current_rows,
1704        })
1705    }
1706
1707    /// # Errors
1708    /// Returns [`EngineError`] if the collection contract is invalid or the filtered read fails.
1709    pub fn read_operational_collection(
1710        &self,
1711        request: &OperationalReadRequest,
1712    ) -> Result<OperationalReadReport, EngineError> {
1713        if request.collection_name.trim().is_empty() {
1714            return Err(EngineError::InvalidWrite(
1715                "operational read collection_name must not be empty".to_owned(),
1716            ));
1717        }
1718        if request.filters.is_empty() {
1719            return Err(EngineError::InvalidWrite(
1720                "operational read requires at least one filter clause".to_owned(),
1721            ));
1722        }
1723
1724        let conn = self.connect()?;
1725        let record = load_operational_collection_record(&conn, &request.collection_name)?
1726            .ok_or_else(|| {
1727                EngineError::InvalidWrite(format!(
1728                    "operational collection '{}' is not registered",
1729                    request.collection_name
1730                ))
1731            })?;
1732        validate_append_only_operational_collection(&record, "read")?;
1733        let declared_fields = parse_operational_filter_fields(&record.filter_fields_json)
1734            .map_err(EngineError::InvalidWrite)?;
1735        let secondary_indexes =
1736            parse_operational_secondary_indexes_json(&record.secondary_indexes_json, record.kind)
1737                .map_err(EngineError::InvalidWrite)?;
1738        let applied_limit = operational_read_limit(request.limit)?;
1739        let filters = compile_operational_read_filters(&request.filters, &declared_fields)?;
1740        if let Some(report) = execute_operational_secondary_index_read(
1741            &conn,
1742            &request.collection_name,
1743            &filters,
1744            &secondary_indexes,
1745            applied_limit,
1746        )? {
1747            return Ok(report);
1748        }
1749        execute_operational_filtered_read(&conn, &request.collection_name, &filters, applied_limit)
1750    }
1751
1752    /// # Errors
1753    /// Returns [`EngineError`] if the database query fails or collection validation fails.
1754    pub fn rebuild_operational_current(
1755        &self,
1756        collection_name: Option<&str>,
1757    ) -> Result<OperationalRepairReport, EngineError> {
1758        let mut conn = self.connect()?;
1759        let tx = conn.transaction_with_behavior(TransactionBehavior::Immediate)?;
1760        let collections = if let Some(name) = collection_name {
1761            let maybe_kind: Option<String> = tx
1762                .query_row(
1763                    "SELECT kind FROM operational_collections WHERE name = ?1",
1764                    [name],
1765                    |row| row.get(0),
1766                )
1767                .optional()?;
1768            let Some(kind) = maybe_kind else {
1769                return Err(EngineError::InvalidWrite(format!(
1770                    "operational collection '{name}' is not registered"
1771                )));
1772            };
1773            if kind != OperationalCollectionKind::LatestState.as_str() {
1774                return Err(EngineError::InvalidWrite(format!(
1775                    "operational collection '{name}' is not latest_state"
1776                )));
1777            }
1778            vec![name.to_owned()]
1779        } else {
1780            let mut stmt = tx.prepare(
1781                "SELECT name FROM operational_collections WHERE kind = 'latest_state' ORDER BY name",
1782            )?;
1783            stmt.query_map([], |row| row.get::<_, String>(0))?
1784                .collect::<Result<Vec<_>, _>>()?
1785        };
1786
1787        let rebuilt_rows = rebuild_operational_current_rows(&tx, &collections)?;
1788        for collection in &collections {
1789            let record = load_operational_collection_record(&tx, collection)?.ok_or_else(|| {
1790                EngineError::Bridge(format!(
1791                    "operational collection '{collection}' missing during current rebuild"
1792                ))
1793            })?;
1794            let indexes = parse_operational_secondary_indexes_json(
1795                &record.secondary_indexes_json,
1796                record.kind,
1797            )
1798            .map_err(EngineError::InvalidWrite)?;
1799            if !indexes.is_empty() {
1800                rebuild_operational_secondary_index_entries(
1801                    &tx,
1802                    &record.name,
1803                    record.kind,
1804                    &indexes,
1805                )?;
1806            }
1807        }
1808
1809        persist_simple_provenance_event(
1810            &tx,
1811            "operational_current_rebuilt",
1812            collection_name.unwrap_or("*"),
1813            Some(serde_json::json!({
1814                "collections_rebuilt": collections.len(),
1815                "current_rows_rebuilt": rebuilt_rows,
1816            })),
1817        )?;
1818        tx.commit()?;
1819
1820        Ok(OperationalRepairReport {
1821            collections_rebuilt: collections.len(),
1822            current_rows_rebuilt: rebuilt_rows,
1823        })
1824    }
1825
1826    /// # Errors
1827    /// Returns [`EngineError`] if the database connection fails or the projection rebuild fails.
1828    pub fn rebuild_projections(
1829        &self,
1830        target: ProjectionTarget,
1831    ) -> Result<ProjectionRepairReport, EngineError> {
1832        self.projections.rebuild_projections(target)
1833    }
1834
1835    /// # Errors
1836    /// Returns [`EngineError`] if the database connection fails or the projection rebuild fails.
1837    pub fn rebuild_missing_projections(&self) -> Result<ProjectionRepairReport, EngineError> {
1838        self.projections.rebuild_missing_projections()
1839    }
1840
1841    /// Register (or update) an FTS property projection schema for the given node kind.
1842    ///
1843    /// After registration, any node of this kind will have the declared JSON property
1844    /// paths extracted, concatenated, and indexed in the per-kind `fts_props_<kind>` FTS5 table.
1845    ///
1846    /// # Errors
1847    /// Returns [`EngineError`] if `property_paths` is empty, contains duplicates,
1848    /// or if the database write fails.
1849    pub fn register_fts_property_schema(
1850        &self,
1851        kind: &str,
1852        property_paths: &[String],
1853        separator: Option<&str>,
1854    ) -> Result<FtsPropertySchemaRecord, EngineError> {
1855        let specs: Vec<FtsPropertyPathSpec> = property_paths
1856            .iter()
1857            .map(|p| FtsPropertyPathSpec::scalar(p.clone()))
1858            .collect();
1859        self.register_fts_property_schema_with_entries(
1860            kind,
1861            &specs,
1862            separator,
1863            &[],
1864            RebuildMode::Eager,
1865        )
1866    }
1867
1868    /// Register (or update) an FTS property projection schema with
1869    /// per-path modes and optional exclude paths.
1870    ///
1871    /// Under `RebuildMode::Eager` (the legacy mode), the full rebuild runs
1872    /// inside the registration transaction — same behavior as before Pack 7.
1873    ///
1874    /// Under `RebuildMode::Async` (the 0.4.1 default), the schema row is
1875    /// persisted in a short IMMEDIATE transaction, a rebuild-state row is
1876    /// upserted, and the actual rebuild is handed off to the background
1877    /// `RebuildActor`.  The register call returns in <100ms even for large
1878    /// kinds.
1879    ///
1880    /// # Errors
1881    /// Returns [`EngineError`] if the paths are invalid, the JSON
1882    /// serialization fails, or the (schema-persist / rebuild) transaction fails.
1883    pub fn register_fts_property_schema_with_entries(
1884        &self,
1885        kind: &str,
1886        entries: &[FtsPropertyPathSpec],
1887        separator: Option<&str>,
1888        exclude_paths: &[String],
1889        mode: RebuildMode,
1890    ) -> Result<FtsPropertySchemaRecord, EngineError> {
1891        let paths: Vec<String> = entries.iter().map(|e| e.path.clone()).collect();
1892        validate_fts_property_paths(&paths)?;
1893        for p in exclude_paths {
1894            if !p.starts_with("$.") {
1895                return Err(EngineError::InvalidWrite(format!(
1896                    "exclude_paths entries must start with '$.' but got: {p}"
1897                )));
1898            }
1899        }
1900        for e in entries {
1901            if let Some(w) = e.weight
1902                && !(w > 0.0 && w <= 1000.0)
1903            {
1904                return Err(EngineError::Bridge(format!(
1905                    "weight out of range: {w} (must satisfy 0.0 < weight <= 1000.0)"
1906                )));
1907            }
1908        }
1909        let separator = separator.unwrap_or(" ");
1910        let paths_json = serialize_property_paths_json(entries, exclude_paths)?;
1911
1912        match mode {
1913            RebuildMode::Eager => self.register_fts_property_schema_eager(
1914                kind,
1915                entries,
1916                separator,
1917                exclude_paths,
1918                &paths,
1919                &paths_json,
1920            ),
1921            RebuildMode::Async => self.register_fts_property_schema_async(
1922                kind,
1923                entries,
1924                separator,
1925                &paths,
1926                &paths_json,
1927            ),
1928        }
1929    }
1930
1931    /// Eager path: existing transactional behavior unchanged.
1932    fn register_fts_property_schema_eager(
1933        &self,
1934        kind: &str,
1935        entries: &[FtsPropertyPathSpec],
1936        separator: &str,
1937        exclude_paths: &[String],
1938        paths: &[String],
1939        paths_json: &str,
1940    ) -> Result<FtsPropertySchemaRecord, EngineError> {
1941        let mut conn = self.connect()?;
1942        let tx = conn.transaction_with_behavior(TransactionBehavior::Immediate)?;
1943
1944        // Determine whether the registration introduces a recursive path
1945        // that was not present in the previously-registered schema for
1946        // this kind. If so, we must eagerly rebuild property FTS rows and
1947        // position map for every active node of this kind within the same
1948        // transaction.
1949        let previous_row: Option<(String, String)> = tx
1950            .query_row(
1951                "SELECT property_paths_json, separator FROM fts_property_schemas WHERE kind = ?1",
1952                [kind],
1953                |row| {
1954                    let json: String = row.get(0)?;
1955                    let sep: String = row.get(1)?;
1956                    Ok((json, sep))
1957                },
1958            )
1959            .optional()?;
1960        let had_previous_schema = previous_row.is_some();
1961        let previous_recursive_paths: Vec<String> = previous_row
1962            .map(|(json, sep)| crate::writer::parse_property_schema_json(&json, &sep))
1963            .map_or(Vec::new(), |schema| {
1964                schema
1965                    .paths
1966                    .into_iter()
1967                    .filter(|p| p.mode == crate::writer::PropertyPathMode::Recursive)
1968                    .map(|p| p.path)
1969                    .collect()
1970            });
1971        let new_recursive_paths: Vec<&str> = entries
1972            .iter()
1973            .filter(|e| e.mode == FtsPropertyPathMode::Recursive)
1974            .map(|e| e.path.as_str())
1975            .collect();
1976        let introduces_new_recursive = new_recursive_paths
1977            .iter()
1978            .any(|p| !previous_recursive_paths.iter().any(|prev| prev == p));
1979
1980        tx.execute(
1981            "INSERT INTO fts_property_schemas (kind, property_paths_json, separator) \
1982             VALUES (?1, ?2, ?3) \
1983             ON CONFLICT(kind) DO UPDATE SET property_paths_json = ?2, separator = ?3",
1984            rusqlite::params![kind, paths_json, separator],
1985        )?;
1986
1987        // Eager transactional rebuild: always fire on any registration or update.
1988        // First-time registrations must populate the per-kind FTS table from any
1989        // existing nodes; updates must clear and re-populate so stale rows don't
1990        // linger. This covers recursive-path additions AND scalar-only
1991        // re-registrations where only the path or separator changed. (P4-P2-1)
1992        let _ = (introduces_new_recursive, had_previous_schema);
1993        let needs_rebuild = true;
1994        if needs_rebuild {
1995            let any_weight = entries.iter().any(|e| e.weight.is_some());
1996            let tok = fathomdb_schema::resolve_fts_tokenizer(&tx, kind)
1997                .map_err(|e| EngineError::Bridge(e.to_string()))?;
1998            if any_weight {
1999                // Per-spec column mode: drop and recreate the table with one column
2000                // per spec. Data population into per-spec columns is future work;
2001                // the table is left empty after recreation.
2002                create_or_replace_fts_kind_table(&tx, kind, entries, &tok)?;
2003                tx.execute(
2004                    "DELETE FROM fts_node_property_positions WHERE kind = ?1",
2005                    [kind],
2006                )?;
2007                // Skip insert_property_fts_rows_for_kind — it uses text_content
2008                // which is not present in the per-spec column layout.
2009            } else {
2010                // Legacy text_content mode: drop and recreate the table to ensure
2011                // the correct single-column layout (handles weighted-to-unweighted
2012                // downgrade where a stale per-spec table might otherwise remain).
2013                create_or_replace_fts_kind_table(&tx, kind, &[], &tok)?;
2014                tx.execute(
2015                    "DELETE FROM fts_node_property_positions WHERE kind = ?1",
2016                    [kind],
2017                )?;
2018                // Scope the rebuild to `kind` only. The multi-kind
2019                // `insert_property_fts_rows` iterates over every registered
2020                // schema and would re-insert rows for siblings that were not
2021                // deleted above, duplicating their FTS entries.
2022                crate::projection::insert_property_fts_rows_for_kind(&tx, kind)?;
2023            }
2024        }
2025
2026        persist_simple_provenance_event(
2027            &tx,
2028            "fts_property_schema_registered",
2029            kind,
2030            Some(serde_json::json!({
2031                "property_paths": paths,
2032                "separator": separator,
2033                "exclude_paths": exclude_paths,
2034                "eager_rebuild": needs_rebuild,
2035            })),
2036        )?;
2037        tx.commit()?;
2038
2039        self.describe_fts_property_schema(kind)?.ok_or_else(|| {
2040            EngineError::Bridge("registered FTS property schema missing after commit".to_owned())
2041        })
2042    }
2043
2044    /// Async path: schema persisted in a short tx; rebuild handed to actor.
2045    fn register_fts_property_schema_async(
2046        &self,
2047        kind: &str,
2048        entries: &[FtsPropertyPathSpec],
2049        separator: &str,
2050        paths: &[String],
2051        paths_json: &str,
2052    ) -> Result<FtsPropertySchemaRecord, EngineError> {
2053        let mut conn = self.connect()?;
2054        let tx = conn.transaction_with_behavior(TransactionBehavior::Immediate)?;
2055
2056        // Detect first-registration vs re-registration.
2057        let had_previous_schema: bool = tx
2058            .query_row(
2059                "SELECT count(*) FROM fts_property_schemas WHERE kind = ?1",
2060                rusqlite::params![kind],
2061                |r| r.get::<_, i64>(0),
2062            )
2063            .unwrap_or(0)
2064            > 0;
2065
2066        // Upsert schema row (fast — just a metadata write).
2067        tx.execute(
2068            "INSERT INTO fts_property_schemas (kind, property_paths_json, separator) \
2069             VALUES (?1, ?2, ?3) \
2070             ON CONFLICT(kind) DO UPDATE SET property_paths_json = ?2, separator = ?3",
2071            rusqlite::params![kind, paths_json, separator],
2072        )?;
2073
2074        // Always drop and recreate the per-kind FTS table to ensure the schema
2075        // matches the registered spec layout. This handles weighted-to-unweighted
2076        // downgrade where a stale per-spec table would otherwise remain.
2077        let any_weight = entries.iter().any(|e| e.weight.is_some());
2078        let tok = fathomdb_schema::resolve_fts_tokenizer(&tx, kind)
2079            .map_err(|e| EngineError::Bridge(e.to_string()))?;
2080        if any_weight {
2081            create_or_replace_fts_kind_table(&tx, kind, entries, &tok)?;
2082        } else {
2083            // Legacy text_content layout — pass empty specs so
2084            // create_or_replace_fts_kind_table uses the single text_content column.
2085            create_or_replace_fts_kind_table(&tx, kind, &[], &tok)?;
2086        }
2087
2088        // Retrieve the rowid of the schema row as schema_id.
2089        let schema_id: i64 = tx.query_row(
2090            "SELECT rowid FROM fts_property_schemas WHERE kind = ?1",
2091            rusqlite::params![kind],
2092            |r| r.get(0),
2093        )?;
2094
2095        let now_ms = crate::rebuild_actor::now_unix_ms_pub();
2096        let is_first = i64::from(!had_previous_schema);
2097
2098        // Upsert rebuild state row.
2099        tx.execute(
2100            "INSERT INTO fts_property_rebuild_state \
2101             (kind, schema_id, state, rows_done, started_at, is_first_registration) \
2102             VALUES (?1, ?2, 'PENDING', 0, ?3, ?4) \
2103             ON CONFLICT(kind) DO UPDATE SET \
2104                 schema_id = excluded.schema_id, \
2105                 state = 'PENDING', \
2106                 rows_total = NULL, \
2107                 rows_done = 0, \
2108                 started_at = excluded.started_at, \
2109                 last_progress_at = NULL, \
2110                 error_message = NULL, \
2111                 is_first_registration = excluded.is_first_registration",
2112            rusqlite::params![kind, schema_id, now_ms, is_first],
2113        )?;
2114
2115        persist_simple_provenance_event(
2116            &tx,
2117            "fts_property_schema_registered",
2118            kind,
2119            Some(serde_json::json!({
2120                "property_paths": paths,
2121                "separator": separator,
2122                "mode": "async",
2123            })),
2124        )?;
2125        tx.commit()?;
2126
2127        // Enqueue the rebuild request if the actor is available.
2128        // try_send is non-blocking: if the channel is full (capacity 64), the
2129        // request is dropped. The state row stays PENDING and the caller can
2130        // observe this via get_property_fts_rebuild_state. No automatic retry
2131        // in 0.4.1 — caller must re-invoke register to re-enqueue.
2132        if let Some(sender) = &self.rebuild_sender
2133            && sender
2134                .try_send(RebuildRequest {
2135                    kind: kind.to_owned(),
2136                    schema_id,
2137                })
2138                .is_err()
2139        {
2140            trace_warn!(
2141                kind = %kind,
2142                "rebuild channel full; rebuild request dropped — state remains PENDING"
2143            );
2144        }
2145
2146        self.describe_fts_property_schema(kind)?.ok_or_else(|| {
2147            EngineError::Bridge("registered FTS property schema missing after commit".to_owned())
2148        })
2149    }
2150
2151    /// Return the rebuild state row for a kind, if one exists.
2152    ///
2153    /// # Errors
2154    /// Returns [`EngineError`] if the database query fails.
2155    pub fn get_property_fts_rebuild_state(
2156        &self,
2157        kind: &str,
2158    ) -> Result<Option<RebuildStateRow>, EngineError> {
2159        let conn = self.connect()?;
2160        let row = conn
2161            .query_row(
2162                "SELECT kind, schema_id, state, rows_total, rows_done, \
2163                 started_at, is_first_registration, error_message \
2164                 FROM fts_property_rebuild_state WHERE kind = ?1",
2165                rusqlite::params![kind],
2166                |r| {
2167                    Ok(RebuildStateRow {
2168                        kind: r.get(0)?,
2169                        schema_id: r.get(1)?,
2170                        state: r.get(2)?,
2171                        rows_total: r.get(3)?,
2172                        rows_done: r.get(4)?,
2173                        started_at: r.get(5)?,
2174                        is_first_registration: r.get::<_, i64>(6)? != 0,
2175                        error_message: r.get(7)?,
2176                    })
2177                },
2178            )
2179            .optional()?;
2180        Ok(row)
2181    }
2182
2183    /// Return the count of rows in `fts_property_rebuild_staging` for a kind.
2184    /// Used by tests to verify the staging table was populated.
2185    ///
2186    /// # Errors
2187    /// Returns [`EngineError`] if the database query fails.
2188    pub fn count_staging_rows(&self, kind: &str) -> Result<i64, EngineError> {
2189        let conn = self.connect()?;
2190        let count: i64 = conn.query_row(
2191            "SELECT count(*) FROM fts_property_rebuild_staging WHERE kind = ?1",
2192            rusqlite::params![kind],
2193            |r| r.get(0),
2194        )?;
2195        Ok(count)
2196    }
2197
2198    /// Return whether a specific node is present in `fts_property_rebuild_staging`.
2199    /// Used by tests to verify the double-write path.
2200    ///
2201    /// # Errors
2202    /// Returns [`EngineError`] if the database query fails.
2203    pub fn staging_row_exists(
2204        &self,
2205        kind: &str,
2206        node_logical_id: &str,
2207    ) -> Result<bool, EngineError> {
2208        let conn = self.connect()?;
2209        let count: i64 = conn.query_row(
2210            "SELECT count(*) FROM fts_property_rebuild_staging WHERE kind = ?1 AND node_logical_id = ?2",
2211            rusqlite::params![kind, node_logical_id],
2212            |r| r.get(0),
2213        )?;
2214        Ok(count > 0)
2215    }
2216
2217    /// Return the FTS property schema for a single node kind, if registered.
2218    ///
2219    /// # Errors
2220    /// Returns [`EngineError`] if the database query fails.
2221    pub fn describe_fts_property_schema(
2222        &self,
2223        kind: &str,
2224    ) -> Result<Option<FtsPropertySchemaRecord>, EngineError> {
2225        let conn = self.connect()?;
2226        load_fts_property_schema_record(&conn, kind)
2227    }
2228
2229    /// Return all registered FTS property schemas.
2230    ///
2231    /// # Errors
2232    /// Returns [`EngineError`] if the database query fails.
2233    pub fn list_fts_property_schemas(&self) -> Result<Vec<FtsPropertySchemaRecord>, EngineError> {
2234        let conn = self.connect()?;
2235        let mut stmt = conn.prepare(
2236            "SELECT kind, property_paths_json, separator, format_version \
2237             FROM fts_property_schemas ORDER BY kind",
2238        )?;
2239        let records = stmt
2240            .query_map([], |row| {
2241                let kind: String = row.get(0)?;
2242                let paths_json: String = row.get(1)?;
2243                let separator: String = row.get(2)?;
2244                let format_version: i64 = row.get(3)?;
2245                Ok(build_fts_property_schema_record(
2246                    kind,
2247                    &paths_json,
2248                    separator,
2249                    format_version,
2250                ))
2251            })?
2252            .collect::<Result<Vec<_>, _>>()?;
2253        Ok(records)
2254    }
2255
2256    /// Remove the FTS property schema for a node kind.
2257    ///
2258    /// This does **not** delete existing FTS rows for this kind;
2259    /// call `rebuild_projections(Fts)` to clean up stale rows.
2260    ///
2261    /// # Errors
2262    /// Returns [`EngineError`] if the kind is not registered or the delete fails.
2263    pub fn remove_fts_property_schema(&self, kind: &str) -> Result<(), EngineError> {
2264        let mut conn = self.connect()?;
2265        let tx = conn.transaction_with_behavior(TransactionBehavior::Immediate)?;
2266        let deleted = tx.execute("DELETE FROM fts_property_schemas WHERE kind = ?1", [kind])?;
2267        if deleted == 0 {
2268            return Err(EngineError::InvalidWrite(format!(
2269                "FTS property schema for kind '{kind}' is not registered"
2270            )));
2271        }
2272        // Delete all FTS rows from the per-kind table (if it exists).
2273        let table = fathomdb_schema::fts_kind_table_name(kind);
2274        let table_exists: bool = tx
2275            .query_row(
2276                "SELECT count(*) FROM sqlite_master WHERE type='table' AND name = ?1 \
2277                 AND sql LIKE 'CREATE VIRTUAL TABLE%'",
2278                rusqlite::params![table],
2279                |r| r.get::<_, i64>(0),
2280            )
2281            .unwrap_or(0)
2282            > 0;
2283        if table_exists {
2284            tx.execute_batch(&format!("DELETE FROM {table}"))?;
2285        }
2286        persist_simple_provenance_event(&tx, "fts_property_schema_removed", kind, None)?;
2287        tx.commit()?;
2288        Ok(())
2289    }
2290
2291    /// Recreate enabled vector profiles from persisted `vector_profiles` metadata.
2292    ///
2293    /// # Errors
2294    /// Returns [`EngineError`] if the database connection fails, reading metadata fails,
2295    /// or sqlite-vec support is unavailable while enabled profiles are present.
2296    pub fn restore_vector_profiles(&self) -> Result<ProjectionRepairReport, EngineError> {
2297        let conn = self.connect()?;
2298        let profiles: Vec<(String, String, i64)> = {
2299            let mut stmt = conn.prepare(
2300                "SELECT profile, table_name, dimension \
2301                 FROM vector_profiles WHERE enabled = 1 ORDER BY profile",
2302            )?;
2303            stmt.query_map([], |row| {
2304                Ok((
2305                    row.get::<_, String>(0)?,
2306                    row.get::<_, String>(1)?,
2307                    row.get::<_, i64>(2)?,
2308                ))
2309            })?
2310            .collect::<Result<Vec<_>, _>>()?
2311        };
2312
2313        for (profile, table_name, dimension) in &profiles {
2314            let dimension = usize::try_from(*dimension).map_err(|_| {
2315                EngineError::Bridge(format!("invalid vector profile dimension: {dimension}"))
2316            })?;
2317            self.schema_manager
2318                .ensure_vector_profile(&conn, profile, table_name, dimension)?;
2319        }
2320
2321        Ok(ProjectionRepairReport {
2322            targets: vec![ProjectionTarget::Vec],
2323            rebuilt_rows: profiles.len(),
2324            notes: vec![],
2325        })
2326    }
2327
2328    /// Rebuild vector embeddings using an application-supplied regeneration
2329    /// contract and generator command.
2330    ///
2331    /// The config is persisted in `vector_embedding_contracts` so the metadata
2332    /// required for recovery survives future repair runs.
2333    ///
2334    /// Vector identity is stamped from [`QueryEmbedder::identity`] — the
2335    /// caller supplies the embedder and cannot override its identity. This
2336    /// makes drift between the read-path and write-path identity stories
2337    /// structurally impossible.
2338    ///
2339    /// # Errors
2340    /// Returns [`EngineError`] if the database connection fails, the config is
2341    /// invalid, the embedder fails, or the regenerated embeddings are
2342    /// malformed.
2343    #[allow(clippy::too_many_lines)]
2344    pub fn regenerate_vector_embeddings(
2345        &self,
2346        embedder: &dyn QueryEmbedder,
2347        config: &VectorRegenerationConfig,
2348    ) -> Result<VectorRegenerationReport, EngineError> {
2349        let conn = self.connect()?;
2350        let identity = embedder.identity();
2351        let config = validate_vector_regeneration_config(&conn, config, &identity)
2352            .map_err(|failure| failure.to_engine_error())?;
2353        let chunks = collect_regeneration_chunks(&conn)?;
2354        let payload = build_regeneration_input(&config, &identity, chunks.clone());
2355        let snapshot_hash = compute_snapshot_hash(&payload)?;
2356        let audit_metadata = VectorRegenerationAuditMetadata {
2357            profile: config.profile.clone(),
2358            model_identity: identity.model_identity.clone(),
2359            model_version: identity.model_version.clone(),
2360            chunk_count: chunks.len(),
2361            snapshot_hash: snapshot_hash.clone(),
2362            failure_class: None,
2363        };
2364        persist_vector_regeneration_event(
2365            &conn,
2366            "vector_regeneration_requested",
2367            &config.profile,
2368            &audit_metadata,
2369        )?;
2370        let notes = vec!["vector embeddings regenerated via configured embedder".to_owned()];
2371
2372        let mut embedding_map: std::collections::HashMap<String, Vec<u8>> =
2373            std::collections::HashMap::with_capacity(chunks.len());
2374        for chunk in &chunks {
2375            let vector = match embedder.embed_query(&chunk.text_content) {
2376                Ok(vector) => vector,
2377                Err(error) => {
2378                    let failure = VectorRegenerationFailure::new(
2379                        VectorRegenerationFailureClass::EmbedderFailure,
2380                        format!("embedder failed for chunk '{}': {error}", chunk.chunk_id),
2381                    );
2382                    self.persist_vector_regeneration_failure_best_effort(
2383                        &config.profile,
2384                        &audit_metadata,
2385                        &failure,
2386                    );
2387                    return Err(failure.to_engine_error());
2388                }
2389            };
2390            if vector.len() != identity.dimension {
2391                let failure = VectorRegenerationFailure::new(
2392                    VectorRegenerationFailureClass::InvalidEmbedderOutput,
2393                    format!(
2394                        "embedder produced {} values for chunk '{}', expected {}",
2395                        vector.len(),
2396                        chunk.chunk_id,
2397                        identity.dimension
2398                    ),
2399                );
2400                self.persist_vector_regeneration_failure_best_effort(
2401                    &config.profile,
2402                    &audit_metadata,
2403                    &failure,
2404                );
2405                return Err(failure.to_engine_error());
2406            }
2407            if vector.iter().any(|value| !value.is_finite()) {
2408                let failure = VectorRegenerationFailure::new(
2409                    VectorRegenerationFailureClass::InvalidEmbedderOutput,
2410                    format!(
2411                        "embedder returned non-finite values for chunk '{}'",
2412                        chunk.chunk_id
2413                    ),
2414                );
2415                self.persist_vector_regeneration_failure_best_effort(
2416                    &config.profile,
2417                    &audit_metadata,
2418                    &failure,
2419                );
2420                return Err(failure.to_engine_error());
2421            }
2422            let bytes: Vec<u8> = vector
2423                .iter()
2424                .flat_map(|value| value.to_le_bytes())
2425                .collect();
2426            embedding_map.insert(chunk.chunk_id.clone(), bytes);
2427        }
2428
2429        let table_name = fathomdb_schema::vec_kind_table_name(&config.kind);
2430        let mut conn = conn;
2431        let tx = conn.transaction_with_behavior(TransactionBehavior::Immediate)?;
2432        match self
2433            .schema_manager
2434            .ensure_vec_kind_profile(&tx, &config.kind, identity.dimension)
2435        {
2436            Ok(()) => {}
2437            Err(SchemaError::MissingCapability(message)) => {
2438                let failure = VectorRegenerationFailure::new(
2439                    VectorRegenerationFailureClass::UnsupportedVecCapability,
2440                    message,
2441                );
2442                drop(tx);
2443                self.persist_vector_regeneration_failure_best_effort(
2444                    &config.profile,
2445                    &audit_metadata,
2446                    &failure,
2447                );
2448                return Err(failure.to_engine_error());
2449            }
2450            Err(error) => return Err(EngineError::Schema(error)),
2451        }
2452        let apply_chunks = collect_regeneration_chunks(&tx)?;
2453        let apply_payload = build_regeneration_input(&config, &identity, apply_chunks.clone());
2454        let apply_hash = compute_snapshot_hash(&apply_payload)?;
2455        if apply_hash != snapshot_hash {
2456            let failure = VectorRegenerationFailure::new(
2457                VectorRegenerationFailureClass::SnapshotDrift,
2458                "chunk snapshot changed during generation; retry".to_owned(),
2459            );
2460            drop(tx);
2461            self.persist_vector_regeneration_failure_best_effort(
2462                &config.profile,
2463                &audit_metadata,
2464                &failure,
2465            );
2466            return Err(failure.to_engine_error());
2467        }
2468        persist_vector_contract(&tx, &config, &table_name, &identity, &snapshot_hash)?;
2469        tx.execute(&format!("DELETE FROM {table_name}"), [])?;
2470        let mut stmt = tx.prepare_cached(&format!(
2471            "INSERT INTO {table_name} (chunk_id, embedding) VALUES (?1, ?2)"
2472        ))?;
2473        let mut regenerated_rows = 0usize;
2474        for chunk in &apply_chunks {
2475            let Some(embedding) = embedding_map.remove(&chunk.chunk_id) else {
2476                drop(stmt);
2477                drop(tx);
2478                let failure = VectorRegenerationFailure::new(
2479                    VectorRegenerationFailureClass::InvalidEmbedderOutput,
2480                    format!(
2481                        "embedder did not produce a vector for chunk '{}'",
2482                        chunk.chunk_id
2483                    ),
2484                );
2485                self.persist_vector_regeneration_failure_best_effort(
2486                    &config.profile,
2487                    &audit_metadata,
2488                    &failure,
2489                );
2490                return Err(failure.to_engine_error());
2491            };
2492            stmt.execute(rusqlite::params![chunk.chunk_id.as_str(), embedding])?;
2493            regenerated_rows += 1;
2494        }
2495        drop(stmt);
2496        persist_vector_regeneration_event(
2497            &tx,
2498            "vector_regeneration_apply",
2499            &config.profile,
2500            &audit_metadata,
2501        )?;
2502        tx.commit()?;
2503
2504        Ok(VectorRegenerationReport {
2505            profile: config.profile.clone(),
2506            table_name,
2507            dimension: identity.dimension,
2508            total_chunks: chunks.len(),
2509            regenerated_rows,
2510            contract_persisted: true,
2511            notes,
2512        })
2513    }
2514
2515    /// Regenerate vector embeddings in-process using a [`BatchEmbedder`].
2516    ///
2517    /// Functionally equivalent to [`regenerate_vector_embeddings`] but uses
2518    /// `BatchEmbedder::batch_embed` to process all chunks in one call. This
2519    /// is the intended path for [`BuiltinBgeSmallEmbedder`] — it keeps the
2520    /// forward pass in-process without requiring an external subprocess.
2521    ///
2522    /// The subprocess-based path ([`regenerate_vector_embeddings`]) remains
2523    /// intact for callers who supply their own generator binary.
2524    ///
2525    /// # Errors
2526    /// Returns [`EngineError`] if the database connection fails, the config is
2527    /// invalid, the embedder fails, or the regenerated embeddings are malformed.
2528    #[allow(clippy::too_many_lines)]
2529    pub fn regenerate_vector_embeddings_in_process(
2530        &self,
2531        embedder: &dyn BatchEmbedder,
2532        config: &VectorRegenerationConfig,
2533    ) -> Result<VectorRegenerationReport, EngineError> {
2534        let conn = self.connect()?;
2535        let identity = embedder.identity();
2536        let config = validate_vector_regeneration_config(&conn, config, &identity)
2537            .map_err(|failure| failure.to_engine_error())?;
2538        let chunks = collect_regeneration_chunks(&conn)?;
2539        let payload = build_regeneration_input(&config, &identity, chunks.clone());
2540        let snapshot_hash = compute_snapshot_hash(&payload)?;
2541        let audit_metadata = VectorRegenerationAuditMetadata {
2542            profile: config.profile.clone(),
2543            model_identity: identity.model_identity.clone(),
2544            model_version: identity.model_version.clone(),
2545            chunk_count: chunks.len(),
2546            snapshot_hash: snapshot_hash.clone(),
2547            failure_class: None,
2548        };
2549        persist_vector_regeneration_event(
2550            &conn,
2551            "vector_regeneration_requested",
2552            &config.profile,
2553            &audit_metadata,
2554        )?;
2555        let notes = vec!["vector embeddings regenerated via in-process batch embedder".to_owned()];
2556
2557        // Collect texts and call batch_embed once for all chunks.
2558        let chunk_texts: Vec<String> = chunks.iter().map(|c| c.text_content.clone()).collect();
2559        let batch_vectors = match embedder.batch_embed(&chunk_texts) {
2560            Ok(vecs) => vecs,
2561            Err(error) => {
2562                let failure = VectorRegenerationFailure::new(
2563                    VectorRegenerationFailureClass::EmbedderFailure,
2564                    format!("batch embedder failed: {error}"),
2565                );
2566                self.persist_vector_regeneration_failure_best_effort(
2567                    &config.profile,
2568                    &audit_metadata,
2569                    &failure,
2570                );
2571                return Err(failure.to_engine_error());
2572            }
2573        };
2574        if batch_vectors.len() != chunks.len() {
2575            let failure = VectorRegenerationFailure::new(
2576                VectorRegenerationFailureClass::InvalidEmbedderOutput,
2577                format!(
2578                    "batch embedder returned {} vectors for {} chunks",
2579                    batch_vectors.len(),
2580                    chunks.len()
2581                ),
2582            );
2583            self.persist_vector_regeneration_failure_best_effort(
2584                &config.profile,
2585                &audit_metadata,
2586                &failure,
2587            );
2588            return Err(failure.to_engine_error());
2589        }
2590
2591        let mut embedding_map: std::collections::HashMap<String, Vec<u8>> =
2592            std::collections::HashMap::with_capacity(chunks.len());
2593        for (chunk, vector) in chunks.iter().zip(batch_vectors) {
2594            if vector.len() != identity.dimension {
2595                let failure = VectorRegenerationFailure::new(
2596                    VectorRegenerationFailureClass::InvalidEmbedderOutput,
2597                    format!(
2598                        "embedder produced {} values for chunk '{}', expected {}",
2599                        vector.len(),
2600                        chunk.chunk_id,
2601                        identity.dimension
2602                    ),
2603                );
2604                self.persist_vector_regeneration_failure_best_effort(
2605                    &config.profile,
2606                    &audit_metadata,
2607                    &failure,
2608                );
2609                return Err(failure.to_engine_error());
2610            }
2611            if vector.iter().any(|value| !value.is_finite()) {
2612                let failure = VectorRegenerationFailure::new(
2613                    VectorRegenerationFailureClass::InvalidEmbedderOutput,
2614                    format!(
2615                        "embedder returned non-finite values for chunk '{}'",
2616                        chunk.chunk_id
2617                    ),
2618                );
2619                self.persist_vector_regeneration_failure_best_effort(
2620                    &config.profile,
2621                    &audit_metadata,
2622                    &failure,
2623                );
2624                return Err(failure.to_engine_error());
2625            }
2626            let bytes: Vec<u8> = vector
2627                .iter()
2628                .flat_map(|value| value.to_le_bytes())
2629                .collect();
2630            embedding_map.insert(chunk.chunk_id.clone(), bytes);
2631        }
2632
2633        let mut conn = conn;
2634        let tx = conn.transaction_with_behavior(TransactionBehavior::Immediate)?;
2635        let table_name = fathomdb_schema::vec_kind_table_name(&config.kind);
2636        match self
2637            .schema_manager
2638            .ensure_vec_kind_profile(&tx, &config.kind, identity.dimension)
2639        {
2640            Ok(()) => {}
2641            Err(SchemaError::MissingCapability(message)) => {
2642                let failure = VectorRegenerationFailure::new(
2643                    VectorRegenerationFailureClass::UnsupportedVecCapability,
2644                    message,
2645                );
2646                drop(tx);
2647                self.persist_vector_regeneration_failure_best_effort(
2648                    &config.profile,
2649                    &audit_metadata,
2650                    &failure,
2651                );
2652                return Err(failure.to_engine_error());
2653            }
2654            Err(error) => return Err(EngineError::Schema(error)),
2655        }
2656        let apply_chunks = collect_regeneration_chunks(&tx)?;
2657        let apply_payload = build_regeneration_input(&config, &identity, apply_chunks.clone());
2658        let apply_hash = compute_snapshot_hash(&apply_payload)?;
2659        if apply_hash != snapshot_hash {
2660            let failure = VectorRegenerationFailure::new(
2661                VectorRegenerationFailureClass::SnapshotDrift,
2662                "chunk snapshot changed during generation; retry".to_owned(),
2663            );
2664            drop(tx);
2665            self.persist_vector_regeneration_failure_best_effort(
2666                &config.profile,
2667                &audit_metadata,
2668                &failure,
2669            );
2670            return Err(failure.to_engine_error());
2671        }
2672        persist_vector_contract(&tx, &config, &table_name, &identity, &snapshot_hash)?;
2673        tx.execute(&format!("DELETE FROM {table_name}"), [])?;
2674        let mut stmt = tx.prepare_cached(&format!(
2675            "INSERT INTO {table_name} (chunk_id, embedding) VALUES (?1, ?2)"
2676        ))?;
2677        let mut regenerated_rows = 0usize;
2678        for chunk in &apply_chunks {
2679            let Some(embedding) = embedding_map.remove(&chunk.chunk_id) else {
2680                drop(stmt);
2681                drop(tx);
2682                let failure = VectorRegenerationFailure::new(
2683                    VectorRegenerationFailureClass::InvalidEmbedderOutput,
2684                    format!(
2685                        "embedder did not produce a vector for chunk '{}'",
2686                        chunk.chunk_id
2687                    ),
2688                );
2689                self.persist_vector_regeneration_failure_best_effort(
2690                    &config.profile,
2691                    &audit_metadata,
2692                    &failure,
2693                );
2694                return Err(failure.to_engine_error());
2695            };
2696            stmt.execute(rusqlite::params![chunk.chunk_id.as_str(), embedding])?;
2697            regenerated_rows += 1;
2698        }
2699        drop(stmt);
2700        persist_vector_regeneration_event(
2701            &tx,
2702            "vector_regeneration_apply",
2703            &config.profile,
2704            &audit_metadata,
2705        )?;
2706        tx.commit()?;
2707
2708        Ok(VectorRegenerationReport {
2709            profile: config.profile.clone(),
2710            table_name,
2711            dimension: identity.dimension,
2712            total_chunks: chunks.len(),
2713            regenerated_rows,
2714            contract_persisted: true,
2715            notes,
2716        })
2717    }
2718
2719    fn persist_vector_regeneration_failure_best_effort(
2720        &self,
2721        profile: &str,
2722        metadata: &VectorRegenerationAuditMetadata,
2723        failure: &VectorRegenerationFailure,
2724    ) {
2725        let Ok(conn) = self.connect() else {
2726            return;
2727        };
2728        let failure_metadata = VectorRegenerationAuditMetadata {
2729            profile: metadata.profile.clone(),
2730            model_identity: metadata.model_identity.clone(),
2731            model_version: metadata.model_version.clone(),
2732            chunk_count: metadata.chunk_count,
2733            snapshot_hash: metadata.snapshot_hash.clone(),
2734            failure_class: Some(failure.failure_class_label().to_owned()),
2735        };
2736        let _ = persist_vector_regeneration_event(
2737            &conn,
2738            "vector_regeneration_failed",
2739            profile,
2740            &failure_metadata,
2741        );
2742    }
2743
2744    /// # Errors
2745    /// Returns [`EngineError`] if the database connection fails or any SQL query fails.
2746    pub fn trace_source(&self, source_ref: &str) -> Result<TraceReport, EngineError> {
2747        let conn = self.connect()?;
2748
2749        let node_logical_ids = collect_strings(
2750            &conn,
2751            "SELECT logical_id FROM nodes WHERE source_ref = ?1 ORDER BY created_at",
2752            source_ref,
2753        )?;
2754        let action_ids = collect_strings(
2755            &conn,
2756            "SELECT id FROM actions WHERE source_ref = ?1 ORDER BY created_at",
2757            source_ref,
2758        )?;
2759        let operational_mutation_ids = collect_strings(
2760            &conn,
2761            "SELECT id FROM operational_mutations WHERE source_ref = ?1 ORDER BY mutation_order",
2762            source_ref,
2763        )?;
2764
2765        Ok(TraceReport {
2766            source_ref: source_ref.to_owned(),
2767            node_rows: count_source_ref(&conn, "nodes", source_ref)?,
2768            edge_rows: count_source_ref(&conn, "edges", source_ref)?,
2769            action_rows: count_source_ref(&conn, "actions", source_ref)?,
2770            operational_mutation_rows: count_source_ref(
2771                &conn,
2772                "operational_mutations",
2773                source_ref,
2774            )?,
2775            node_logical_ids,
2776            action_ids,
2777            operational_mutation_ids,
2778        })
2779    }
2780
2781    /// # Errors
2782    /// Returns [`EngineError`] if the database connection fails, the transaction cannot be
2783    /// started, or lifecycle restoration prerequisites are missing.
2784    #[allow(clippy::too_many_lines)]
2785    pub fn restore_logical_id(
2786        &self,
2787        logical_id: &str,
2788    ) -> Result<LogicalRestoreReport, EngineError> {
2789        let mut conn = self.connect()?;
2790        let tx = conn.transaction_with_behavior(TransactionBehavior::Immediate)?;
2791
2792        let active_count: i64 = tx.query_row(
2793            "SELECT count(*) FROM nodes WHERE logical_id = ?1 AND superseded_at IS NULL",
2794            [logical_id],
2795            |row| row.get(0),
2796        )?;
2797        if active_count > 0 {
2798            return Ok(LogicalRestoreReport {
2799                logical_id: logical_id.to_owned(),
2800                was_noop: true,
2801                restored_node_rows: 0,
2802                restored_edge_rows: 0,
2803                restored_chunk_rows: 0,
2804                restored_fts_rows: 0,
2805                restored_property_fts_rows: 0,
2806                restored_vec_rows: 0,
2807                skipped_edges: Vec::new(),
2808                notes: vec!["logical_id already active".to_owned()],
2809            });
2810        }
2811
2812        let restored_node: Option<(String, String)> = tx
2813            .query_row(
2814                "SELECT row_id, kind FROM nodes \
2815                 WHERE logical_id = ?1 AND superseded_at IS NOT NULL \
2816                 ORDER BY superseded_at DESC, created_at DESC, rowid DESC LIMIT 1",
2817                [logical_id],
2818                |row| Ok((row.get(0)?, row.get(1)?)),
2819            )
2820            .optional()?;
2821        let (restored_node_row_id, restored_kind) = restored_node.ok_or_else(|| {
2822            EngineError::InvalidWrite(format!("logical_id '{logical_id}' is not retired"))
2823        })?;
2824
2825        tx.execute(
2826            "UPDATE nodes SET superseded_at = NULL WHERE row_id = ?1",
2827            [restored_node_row_id.as_str()],
2828        )?;
2829
2830        let retire_scope: Option<(i64, Option<String>, i64)> = tx
2831            .query_row(
2832                "SELECT rowid, source_ref, created_at FROM provenance_events \
2833                 WHERE event_type = 'node_retire' AND subject = ?1 \
2834                 ORDER BY created_at DESC, rowid DESC LIMIT 1",
2835                [logical_id],
2836                |row| Ok((row.get(0)?, row.get(1)?, row.get(2)?)),
2837            )
2838            .optional()?;
2839        let (restored_edge_rows, skipped_edges) = if let Some((
2840            retire_event_rowid,
2841            retire_source_ref,
2842            retire_created_at,
2843        )) = retire_scope
2844        {
2845            restore_validated_edges(
2846                &tx,
2847                logical_id,
2848                retire_source_ref.as_deref(),
2849                retire_created_at,
2850                retire_event_rowid,
2851            )?
2852        } else {
2853            (0, Vec::new())
2854        };
2855
2856        let restored_chunk_rows: usize = tx
2857            .query_row(
2858                "SELECT count(*) FROM chunks WHERE node_logical_id = ?1",
2859                [logical_id],
2860                |row| row.get::<_, i64>(0),
2861            )
2862            .map(i64_to_usize)?;
2863        tx.execute(
2864            "DELETE FROM fts_nodes WHERE node_logical_id = ?1",
2865            [logical_id],
2866        )?;
2867        let restored_fts_rows = tx.execute(
2868            "INSERT INTO fts_nodes (chunk_id, node_logical_id, kind, text_content) \
2869             SELECT id, node_logical_id, ?2, text_content \
2870             FROM chunks WHERE node_logical_id = ?1",
2871            rusqlite::params![logical_id, restored_kind],
2872        )?;
2873        let restored_vec_rows = count_vec_rows_for_logical_id(&tx, logical_id)?;
2874
2875        // Rebuild property FTS for the restored node.
2876        // Delete from the per-kind FTS table for this node (if the table exists).
2877        let table = fathomdb_schema::fts_kind_table_name(&restored_kind);
2878        let fts_table_exists: bool = tx
2879            .query_row(
2880                "SELECT count(*) FROM sqlite_master WHERE type='table' AND name = ?1 \
2881                 AND sql LIKE 'CREATE VIRTUAL TABLE%'",
2882                rusqlite::params![table],
2883                |r| r.get::<_, i64>(0),
2884            )
2885            .unwrap_or(0)
2886            > 0;
2887        if fts_table_exists {
2888            tx.execute(
2889                &format!("DELETE FROM {table} WHERE node_logical_id = ?1"),
2890                [logical_id],
2891            )?;
2892        }
2893        let restored_property_fts_rows =
2894            rebuild_single_node_property_fts(&tx, logical_id, &restored_kind)?;
2895
2896        persist_simple_provenance_event(
2897            &tx,
2898            "restore_logical_id",
2899            logical_id,
2900            Some(serde_json::json!({
2901                "restored_node_rows": 1,
2902                "restored_edge_rows": restored_edge_rows,
2903                "restored_chunk_rows": restored_chunk_rows,
2904                "restored_fts_rows": restored_fts_rows,
2905                "restored_property_fts_rows": restored_property_fts_rows,
2906                "restored_vec_rows": restored_vec_rows,
2907            })),
2908        )?;
2909        tx.commit()?;
2910
2911        Ok(LogicalRestoreReport {
2912            logical_id: logical_id.to_owned(),
2913            was_noop: false,
2914            restored_node_rows: 1,
2915            restored_edge_rows,
2916            restored_chunk_rows,
2917            restored_fts_rows,
2918            restored_property_fts_rows,
2919            restored_vec_rows,
2920            skipped_edges,
2921            notes: Vec::new(),
2922        })
2923    }
2924
2925    /// # Errors
2926    /// Returns [`EngineError`] if the database connection fails, the transaction cannot be
2927    /// started, or the purge mutation fails.
2928    pub fn purge_logical_id(&self, logical_id: &str) -> Result<LogicalPurgeReport, EngineError> {
2929        let mut conn = self.connect()?;
2930        let tx = conn.transaction_with_behavior(TransactionBehavior::Immediate)?;
2931
2932        let active_count: i64 = tx.query_row(
2933            "SELECT count(*) FROM nodes WHERE logical_id = ?1 AND superseded_at IS NULL",
2934            [logical_id],
2935            |row| row.get(0),
2936        )?;
2937        if active_count > 0 {
2938            return Ok(LogicalPurgeReport {
2939                logical_id: logical_id.to_owned(),
2940                was_noop: true,
2941                deleted_node_rows: 0,
2942                deleted_edge_rows: 0,
2943                deleted_chunk_rows: 0,
2944                deleted_fts_rows: 0,
2945                deleted_vec_rows: 0,
2946                notes: vec!["logical_id is active; purge skipped".to_owned()],
2947            });
2948        }
2949
2950        let node_rows: i64 = tx.query_row(
2951            "SELECT count(*) FROM nodes WHERE logical_id = ?1",
2952            [logical_id],
2953            |row| row.get(0),
2954        )?;
2955        if node_rows == 0 {
2956            return Err(EngineError::InvalidWrite(format!(
2957                "logical_id '{logical_id}' does not exist"
2958            )));
2959        }
2960
2961        let deleted_vec_rows = delete_vec_rows_for_logical_id(&tx, logical_id)?;
2962        let deleted_fts_rows = tx.execute(
2963            "DELETE FROM fts_nodes WHERE node_logical_id = ?1",
2964            [logical_id],
2965        )?;
2966        let deleted_edge_rows = tx.execute(
2967            "DELETE FROM edges WHERE source_logical_id = ?1 OR target_logical_id = ?1",
2968            [logical_id],
2969        )?;
2970        let deleted_chunk_rows = tx.execute(
2971            "DELETE FROM chunks WHERE node_logical_id = ?1",
2972            [logical_id],
2973        )?;
2974        let deleted_node_rows =
2975            tx.execute("DELETE FROM nodes WHERE logical_id = ?1", [logical_id])?;
2976        tx.execute(
2977            "DELETE FROM node_access_metadata WHERE logical_id = ?1",
2978            [logical_id],
2979        )?;
2980
2981        persist_simple_provenance_event(
2982            &tx,
2983            "purge_logical_id",
2984            logical_id,
2985            Some(serde_json::json!({
2986                "deleted_node_rows": deleted_node_rows,
2987                "deleted_edge_rows": deleted_edge_rows,
2988                "deleted_chunk_rows": deleted_chunk_rows,
2989                "deleted_fts_rows": deleted_fts_rows,
2990                "deleted_vec_rows": deleted_vec_rows,
2991            })),
2992        )?;
2993        tx.commit()?;
2994
2995        Ok(LogicalPurgeReport {
2996            logical_id: logical_id.to_owned(),
2997            was_noop: false,
2998            deleted_node_rows,
2999            deleted_edge_rows,
3000            deleted_chunk_rows,
3001            deleted_fts_rows,
3002            deleted_vec_rows,
3003            notes: Vec::new(),
3004        })
3005    }
3006
3007    /// Purge provenance events older than `before_timestamp`.
3008    ///
3009    /// By default, `excise` and `purge_logical_id` event types are preserved so that
3010    /// data-deletion audit trails survive. Pass an explicit
3011    /// `preserve_event_types` list to override this default.
3012    ///
3013    /// # Errors
3014    /// Returns [`EngineError`] if the database connection fails, the transaction
3015    /// cannot be started, or any SQL statement fails.
3016    pub fn purge_provenance_events(
3017        &self,
3018        before_timestamp: i64,
3019        options: &ProvenancePurgeOptions,
3020    ) -> Result<ProvenancePurgeReport, EngineError> {
3021        let mut conn = self.connect()?;
3022        let tx = conn.transaction_with_behavior(TransactionBehavior::Immediate)?;
3023
3024        let preserved_types: Vec<&str> = if options.preserve_event_types.is_empty() {
3025            vec!["excise", "purge_logical_id"]
3026        } else {
3027            options
3028                .preserve_event_types
3029                .iter()
3030                .map(String::as_str)
3031                .collect()
3032        };
3033
3034        // Build the NOT IN clause dynamically based on preserved types.
3035        let placeholders: String = (0..preserved_types.len())
3036            .map(|i| format!("?{}", i + 2))
3037            .collect::<Vec<_>>()
3038            .join(", ");
3039        let count_query = format!(
3040            "SELECT count(*) FROM provenance_events \
3041             WHERE created_at < ?1 AND event_type NOT IN ({placeholders})"
3042        );
3043        let delete_query = format!(
3044            "DELETE FROM provenance_events WHERE rowid IN (\
3045             SELECT rowid FROM provenance_events \
3046             WHERE created_at < ?1 AND event_type NOT IN ({placeholders}) \
3047             LIMIT 10000)"
3048        );
3049
3050        let bind_params = |stmt: &mut rusqlite::Statement<'_>| -> Result<(), rusqlite::Error> {
3051            stmt.raw_bind_parameter(1, before_timestamp)?;
3052            for (i, event_type) in preserved_types.iter().enumerate() {
3053                stmt.raw_bind_parameter(i + 2, *event_type)?;
3054            }
3055            Ok(())
3056        };
3057
3058        let events_deleted = if options.dry_run {
3059            let mut stmt = tx.prepare(&count_query)?;
3060            bind_params(&mut stmt)?;
3061            stmt.raw_query()
3062                .next()?
3063                .map_or(0, |row| row.get::<_, u64>(0).unwrap_or(0))
3064        } else {
3065            let mut total_deleted: u64 = 0;
3066            loop {
3067                let mut stmt = tx.prepare(&delete_query)?;
3068                bind_params(&mut stmt)?;
3069                let deleted = stmt.raw_execute()?;
3070                if deleted == 0 {
3071                    break;
3072                }
3073                total_deleted += deleted as u64;
3074            }
3075            total_deleted
3076        };
3077
3078        let total_after: u64 =
3079            tx.query_row("SELECT count(*) FROM provenance_events", [], |row| {
3080                row.get(0)
3081            })?;
3082
3083        let oldest_remaining: Option<i64> = tx
3084            .query_row("SELECT MIN(created_at) FROM provenance_events", [], |row| {
3085                row.get(0)
3086            })
3087            .optional()?
3088            .flatten();
3089
3090        if !options.dry_run {
3091            tx.commit()?;
3092        }
3093
3094        // In dry_run mode nothing was deleted, so total_after includes the
3095        // would-be-deleted rows; subtract to get the preserved count.
3096        let events_preserved = if options.dry_run {
3097            total_after - events_deleted
3098        } else {
3099            total_after
3100        };
3101
3102        Ok(ProvenancePurgeReport {
3103            events_deleted,
3104            events_preserved,
3105            oldest_remaining,
3106        })
3107    }
3108
3109    /// # Errors
3110    /// Returns [`EngineError`] if the database connection fails, the transaction cannot be
3111    /// started, or any SQL statement fails.
3112    #[allow(clippy::too_many_lines)]
3113    pub fn excise_source(&self, source_ref: &str) -> Result<TraceReport, EngineError> {
3114        let mut conn = self.connect()?;
3115
3116        let tx = conn.transaction_with_behavior(TransactionBehavior::Immediate)?;
3117        let affected_operational_collections = collect_strings_tx(
3118            &tx,
3119            "SELECT DISTINCT m.collection_name \
3120             FROM operational_mutations m \
3121             JOIN operational_collections c ON c.name = m.collection_name \
3122             WHERE m.source_ref = ?1 AND c.kind = 'latest_state' \
3123             ORDER BY m.collection_name",
3124            source_ref,
3125        )?;
3126
3127        // Collect (row_id, logical_id) for active rows that will be excised.
3128        let pairs: Vec<(String, String)> = {
3129            let mut stmt = tx.prepare(
3130                "SELECT row_id, logical_id FROM nodes \
3131                 WHERE source_ref = ?1 AND superseded_at IS NULL",
3132            )?;
3133            stmt.query_map([source_ref], |row| {
3134                Ok((row.get::<_, String>(0)?, row.get::<_, String>(1)?))
3135            })?
3136            .collect::<Result<Vec<_>, _>>()?
3137        };
3138        let affected_logical_ids: Vec<String> = pairs
3139            .iter()
3140            .map(|(_, logical_id)| logical_id.clone())
3141            .collect();
3142
3143        // Supersede bad rows in all tables.
3144        tx.execute(
3145            "UPDATE nodes SET superseded_at = unixepoch() \
3146             WHERE source_ref = ?1 AND superseded_at IS NULL",
3147            [source_ref],
3148        )?;
3149        tx.execute(
3150            "UPDATE edges SET superseded_at = unixepoch() \
3151             WHERE source_ref = ?1 AND superseded_at IS NULL",
3152            [source_ref],
3153        )?;
3154        tx.execute(
3155            "UPDATE actions SET superseded_at = unixepoch() \
3156             WHERE source_ref = ?1 AND superseded_at IS NULL",
3157            [source_ref],
3158        )?;
3159        clear_operational_current_rows(&tx, &affected_operational_collections)?;
3160        tx.execute(
3161            "DELETE FROM operational_mutations WHERE source_ref = ?1",
3162            [source_ref],
3163        )?;
3164        for logical_id in &affected_logical_ids {
3165            delete_vec_rows_for_logical_id(&tx, logical_id)?;
3166            tx.execute(
3167                "DELETE FROM chunks WHERE node_logical_id = ?1",
3168                [logical_id.as_str()],
3169            )?;
3170        }
3171
3172        // Restore the most recent prior version for each affected logical_id.
3173        for (excised_row_id, logical_id) in &pairs {
3174            let prior: Option<String> = tx
3175                .query_row(
3176                    "SELECT row_id FROM nodes \
3177                     WHERE logical_id = ?1 AND row_id != ?2 \
3178                     ORDER BY created_at DESC LIMIT 1",
3179                    [logical_id.as_str(), excised_row_id.as_str()],
3180                    |row| row.get(0),
3181                )
3182                .optional()?;
3183            if let Some(prior_id) = prior {
3184                tx.execute(
3185                    "UPDATE nodes SET superseded_at = NULL WHERE row_id = ?1",
3186                    [prior_id.as_str()],
3187                )?;
3188            }
3189        }
3190
3191        for logical_id in &affected_logical_ids {
3192            let has_active_node = tx
3193                .query_row(
3194                    "SELECT 1 FROM nodes WHERE logical_id = ?1 AND superseded_at IS NULL LIMIT 1",
3195                    [logical_id.as_str()],
3196                    |row| row.get::<_, i64>(0),
3197                )
3198                .optional()?
3199                .is_some();
3200            if !has_active_node {
3201                tx.execute(
3202                    "DELETE FROM node_access_metadata WHERE logical_id = ?1",
3203                    [logical_id.as_str()],
3204                )?;
3205            }
3206        }
3207
3208        rebuild_operational_current_rows(&tx, &affected_operational_collections)?;
3209
3210        // Rebuild FTS atomically within the same transaction so readers never
3211        // observe a post-excise node state with a stale FTS index.
3212        tx.execute("DELETE FROM fts_nodes", [])?;
3213        tx.execute(
3214            r"
3215            INSERT INTO fts_nodes (chunk_id, node_logical_id, kind, text_content)
3216            SELECT c.id, n.logical_id, n.kind, c.text_content
3217            FROM chunks c
3218            JOIN nodes n
3219              ON n.logical_id = c.node_logical_id
3220             AND n.superseded_at IS NULL
3221            ",
3222            [],
3223        )?;
3224
3225        // Rebuild property FTS in the same transaction.
3226        rebuild_property_fts_in_tx(&tx)?;
3227
3228        // Record the audit event inside the same transaction so the excision and its
3229        // audit record are committed atomically — no window where the excision is
3230        // durable but unaudited.
3231        tx.execute(
3232            "INSERT INTO provenance_events (id, event_type, subject, source_ref) \
3233             VALUES (?1, 'excise_source', ?2, ?2)",
3234            rusqlite::params![new_id(), source_ref],
3235        )?;
3236
3237        tx.commit()?;
3238
3239        self.trace_source(source_ref)
3240    }
3241
3242    /// # Errors
3243    /// Returns [`EngineError`] if the WAL checkpoint fails, the `SQLite` backup fails,
3244    /// the SHA-256 digest cannot be computed, or the manifest file cannot be written.
3245    pub fn safe_export(
3246        &self,
3247        destination_path: impl AsRef<Path>,
3248        options: SafeExportOptions,
3249    ) -> Result<SafeExportManifest, EngineError> {
3250        let destination_path = destination_path.as_ref();
3251
3252        // 1. Optionally checkpoint WAL before exporting. This keeps the on-disk file tidy for
3253        // callers that want a fully checkpointed export, but export correctness does not depend
3254        // on it because the backup API copies from the live SQLite connection state.
3255        let conn = self.connect()?;
3256
3257        if options.force_checkpoint {
3258            trace_info!("safe_export: wal checkpoint started");
3259            let (busy, log, checkpointed): (i64, i64, i64) =
3260                conn.query_row("PRAGMA wal_checkpoint(FULL)", [], |row| {
3261                    Ok((row.get(0)?, row.get(1)?, row.get(2)?))
3262                })?;
3263            if busy != 0 {
3264                trace_warn!(
3265                    busy,
3266                    log_frames = log,
3267                    checkpointed_frames = checkpointed,
3268                    "safe_export: wal checkpoint blocked by active readers"
3269                );
3270                return Err(EngineError::Bridge(format!(
3271                    "WAL checkpoint blocked: {busy} active reader(s) prevented a full checkpoint; \
3272                     log frames={log}, checkpointed={checkpointed}; \
3273                     retry export when no readers are active"
3274                )));
3275            }
3276            trace_info!(
3277                log_frames = log,
3278                checkpointed_frames = checkpointed,
3279                "safe_export: wal checkpoint completed"
3280            );
3281        }
3282
3283        let schema_version: u32 = conn
3284            .query_row(
3285                "SELECT COALESCE(MAX(version), 0) FROM fathom_schema_migrations",
3286                [],
3287                |row| row.get(0),
3288            )
3289            .unwrap_or(0);
3290
3291        // 2. Export the database through SQLite's online backup API so committed data in the WAL
3292        // is included even when `force_checkpoint` is false.
3293        if let Some(parent) = destination_path.parent() {
3294            fs::create_dir_all(parent)?;
3295        }
3296        conn.backup(DatabaseName::Main, destination_path, None)?;
3297
3298        drop(conn);
3299
3300        // 2b. Query page_count from the EXPORTED file so the manifest reflects what was
3301        // actually backed up, not the source (which may have changed between the PRAGMA
3302        // and the backup call).
3303        let page_count: u64 = {
3304            let export_conn = rusqlite::Connection::open_with_flags(
3305                destination_path,
3306                rusqlite::OpenFlags::SQLITE_OPEN_READ_ONLY
3307                    | rusqlite::OpenFlags::SQLITE_OPEN_NO_MUTEX,
3308            )?;
3309            export_conn.query_row("PRAGMA page_count", [], |row| row.get(0))?
3310        };
3311
3312        // 3. Compute SHA-256 of the exported file.
3313        // FIX(review): was fs::read loading entire DB into memory; use streaming hash.
3314        let sha256 = {
3315            let mut file = fs::File::open(destination_path)?;
3316            let mut hasher = Sha256::new();
3317            io::copy(&mut file, &mut hasher)?;
3318            format!("{:x}", hasher.finalize())
3319        };
3320
3321        // 4. Record when the export was created.
3322        let exported_at = SystemTime::now()
3323            .duration_since(SystemTime::UNIX_EPOCH)
3324            .map_err(|e| EngineError::Bridge(format!("system clock error: {e}")))?
3325            .as_secs();
3326
3327        let manifest = SafeExportManifest {
3328            exported_at,
3329            sha256,
3330            schema_version,
3331            protocol_version: EXPORT_PROTOCOL_VERSION,
3332            page_count,
3333        };
3334
3335        // 5. Write manifest alongside the exported file, using Path API for the name.
3336        let manifest_path = {
3337            let mut p = destination_path.to_path_buf();
3338            let stem = p
3339                .file_name()
3340                .map(|n| format!("{}.export-manifest.json", n.to_string_lossy()))
3341                .ok_or_else(|| {
3342                    EngineError::Bridge("destination path has no filename".to_owned())
3343                })?;
3344            p.set_file_name(stem);
3345            p
3346        };
3347        let manifest_json =
3348            serde_json::to_string(&manifest).map_err(|e| EngineError::Bridge(e.to_string()))?;
3349
3350        // Atomic manifest write: write to a temp file then rename so readers never
3351        // observe a partially-written manifest.
3352        let manifest_tmp = manifest_path.with_extension("json.tmp");
3353        if let Err(e) = fs::write(&manifest_tmp, &manifest_json)
3354            .and_then(|()| fs::rename(&manifest_tmp, &manifest_path))
3355        {
3356            let _ = fs::remove_file(&manifest_tmp);
3357            return Err(e.into());
3358        }
3359
3360        Ok(manifest)
3361    }
3362}
3363
/// One vector embedding contract record: the embedder identity (model, version,
/// dimension, normalization) and the chunking/preprocessing policies recorded for a
/// vector profile, plus bookkeeping about when and from which snapshot it was applied.
///
/// The field names match columns written to `vector_embedding_contracts` by
/// `persist_vector_contract`.
// NOTE(review): `dead_code` is allowed here, presumably because not every field is
// read after deserialization — confirm before removing the attribute.
#[allow(dead_code)]
#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
struct VectorEmbeddingContractRecord {
    /// Vector profile name the contract belongs to.
    profile: String,
    /// Name of the vector table associated with the profile.
    table_name: String,
    /// Identity string of the embedding model.
    model_identity: String,
    /// Version string of the embedding model.
    model_version: String,
    /// Embedding dimension.
    dimension: usize,
    /// Normalization policy label reported for the embedder.
    normalization_policy: String,
    /// Chunking policy label from the regeneration config.
    chunking_policy: String,
    /// Preprocessing policy label from the regeneration config.
    preprocessing_policy: String,
    /// JSON text describing the generator command (`persist_vector_contract` stores `"[]"`).
    generator_command_json: String,
    /// Time the contract was applied (`unixepoch()` when written by `persist_vector_contract`).
    applied_at: i64,
    /// Hash of the input snapshot the contract was applied against.
    snapshot_hash: String,
    /// Format version of the stored contract.
    contract_format_version: i64,
}
3380
/// A single chunk entry inside a [`VectorRegenerationInput`]. Serialize-only.
// NOTE(review): presumably mirrors one row of `chunks` joined with its node — the
// producing query is outside this part of the file; confirm there.
#[derive(Clone, Debug, PartialEq, Eq, Serialize)]
struct VectorRegenerationInputChunk {
    /// Identifier of the chunk.
    chunk_id: String,
    /// Logical id of the node the chunk belongs to.
    node_logical_id: String,
    /// Kind of the owning node.
    kind: String,
    /// Text of the chunk.
    text_content: String,
    /// Optional start offset of the chunk (bytes, going by the field name).
    byte_start: Option<i64>,
    /// Optional end offset of the chunk (bytes, going by the field name).
    byte_end: Option<i64>,
    /// Optional source reference recorded for the chunk.
    source_ref: Option<String>,
    /// Creation timestamp of the chunk.
    created_at: i64,
}
3392
/// Serializable description of one vector regeneration run: the profile and table,
/// the embedder identity and policies, and the chunks to embed. Serialize-only.
// NOTE(review): presumably this is what the snapshot hash is computed over, which
// would make field order significant — confirm before reordering fields.
#[derive(Clone, Debug, PartialEq, Eq, Serialize)]
struct VectorRegenerationInput {
    /// Vector profile name.
    profile: String,
    /// Name of the vector table for the profile.
    table_name: String,
    /// Identity string of the embedding model.
    model_identity: String,
    /// Version string of the embedding model.
    model_version: String,
    /// Embedding dimension.
    dimension: usize,
    /// Normalization policy label.
    normalization_policy: String,
    /// Chunking policy label.
    chunking_policy: String,
    /// Preprocessing policy label.
    preprocessing_policy: String,
    /// Chunks included in this regeneration input.
    chunks: Vec<VectorRegenerationInputChunk>,
}
3405
/// Coarse classification of why a vector regeneration attempt failed.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub(crate) enum VectorRegenerationFailureClass {
    InvalidContract,
    EmbedderFailure,
    InvalidEmbedderOutput,
    SnapshotDrift,
    UnsupportedVecCapability,
}

impl VectorRegenerationFailureClass {
    /// Human-readable, lower-case name of the class, used in error messages.
    fn label(self) -> &'static str {
        use VectorRegenerationFailureClass as Class;

        match self {
            Class::UnsupportedVecCapability => "unsupported vec capability",
            Class::SnapshotDrift => "snapshot drift",
            Class::InvalidEmbedderOutput => "invalid embedder output",
            Class::EmbedderFailure => "embedder failure",
            Class::InvalidContract => "invalid contract",
        }
    }

    /// Whether a failure of this class is marked retryable; only snapshot drift is.
    fn retryable(self) -> bool {
        match self {
            Self::SnapshotDrift => true,
            Self::InvalidContract
            | Self::EmbedderFailure
            | Self::InvalidEmbedderOutput
            | Self::UnsupportedVecCapability => false,
        }
    }
}
3430
3431#[derive(Clone, Debug, PartialEq, Eq)]
3432pub(crate) struct VectorRegenerationFailure {
3433    class: VectorRegenerationFailureClass,
3434    detail: String,
3435}
3436
3437impl VectorRegenerationFailure {
3438    pub(crate) fn new(class: VectorRegenerationFailureClass, detail: impl Into<String>) -> Self {
3439        Self {
3440            class,
3441            detail: detail.into(),
3442        }
3443    }
3444
3445    fn to_engine_error(&self) -> EngineError {
3446        let retry_suffix = if self.class.retryable() {
3447            " [retryable]"
3448        } else {
3449            ""
3450        };
3451        EngineError::Bridge(format!(
3452            "vector regeneration {}: {}{}",
3453            self.class.label(),
3454            self.detail,
3455            retry_suffix
3456        ))
3457    }
3458
3459    fn failure_class_label(&self) -> &'static str {
3460        self.class.label()
3461    }
3462}
3463
/// Audit payload attached to vector regeneration provenance events; it is serialized
/// into the event's `metadata_json` by `persist_vector_regeneration_event`.
#[derive(Clone, Debug, PartialEq, Eq, Serialize)]
struct VectorRegenerationAuditMetadata {
    /// Vector profile the regeneration ran for.
    profile: String,
    /// Identity string of the embedding model.
    model_identity: String,
    /// Version string of the embedding model.
    model_version: String,
    /// Number of chunks involved in the regeneration.
    chunk_count: usize,
    /// Hash of the input snapshot.
    snapshot_hash: String,
    /// Failure class label; left out of the serialized output entirely when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    failure_class: Option<String>,
}
3474
/// Retention policy, deserialized from an internally tagged object: the `mode` field
/// selects the variant using its `snake_case` name, and the variant's fields sit
/// beside it in the same object.
// NOTE(review): the exact retention semantics of each mode are implemented elsewhere
// in the file; the variant notes below describe only the serialized shape.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Deserialize)]
#[serde(tag = "mode", rename_all = "snake_case")]
enum OperationalRetentionPolicy {
    /// `{"mode": "keep_all"}`
    KeepAll,
    /// `{"mode": "purge_before_seconds", "max_age_seconds": <i64>}`
    PurgeBeforeSeconds { max_age_seconds: i64 },
    /// `{"mode": "keep_last", "max_rows": <usize>}`
    KeepLast { max_rows: usize },
}
3482
3483/// # Errors
3484/// Returns [`EngineError`] if the file cannot be read or the config is invalid.
3485pub fn load_vector_regeneration_config(
3486    path: impl AsRef<Path>,
3487) -> Result<VectorRegenerationConfig, EngineError> {
3488    let path = path.as_ref();
3489    let raw = fs::read_to_string(path)?;
3490    match path.extension().and_then(|ext| ext.to_str()) {
3491        Some("toml") => {
3492            toml::from_str(&raw).map_err(|error| EngineError::Bridge(error.to_string()))
3493        }
3494        Some("json") | None => {
3495            serde_json::from_str(&raw).map_err(|error| EngineError::Bridge(error.to_string()))
3496        }
3497        Some(other) => Err(EngineError::Bridge(format!(
3498            "unsupported vector regeneration config extension: {other}"
3499        ))),
3500    }
3501}
3502
3503fn validate_vector_regeneration_config(
3504    conn: &rusqlite::Connection,
3505    config: &VectorRegenerationConfig,
3506    identity: &QueryEmbedderIdentity,
3507) -> Result<VectorRegenerationConfig, VectorRegenerationFailure> {
3508    let kind = validate_bounded_text("kind", &config.kind, MAX_PROFILE_LEN)?;
3509    let profile = validate_bounded_text("profile", &config.profile, MAX_PROFILE_LEN)?;
3510    if identity.dimension == 0 {
3511        return Err(VectorRegenerationFailure::new(
3512            VectorRegenerationFailureClass::InvalidContract,
3513            "embedder reports dimension 0".to_owned(),
3514        ));
3515    }
3516    let chunking_policy =
3517        validate_bounded_text("chunking_policy", &config.chunking_policy, MAX_POLICY_LEN)?;
3518    let preprocessing_policy = validate_bounded_text(
3519        "preprocessing_policy",
3520        &config.preprocessing_policy,
3521        MAX_POLICY_LEN,
3522    )?;
3523
3524    if let Some(existing_dimension) = current_vector_profile_dimension(conn, &profile)?
3525        && existing_dimension != identity.dimension
3526    {
3527        return Err(VectorRegenerationFailure::new(
3528            VectorRegenerationFailureClass::InvalidContract,
3529            format!(
3530                "embedder dimension {} does not match existing vector profile dimension {}",
3531                identity.dimension, existing_dimension
3532            ),
3533        ));
3534    }
3535
3536    validate_existing_contract_version(conn, &profile)?;
3537
3538    let normalized = VectorRegenerationConfig {
3539        kind,
3540        profile,
3541        chunking_policy,
3542        preprocessing_policy,
3543    };
3544    let serialized = serde_json::to_vec(&normalized).map_err(|error| {
3545        VectorRegenerationFailure::new(
3546            VectorRegenerationFailureClass::InvalidContract,
3547            error.to_string(),
3548        )
3549    })?;
3550    if serialized.len() > MAX_CONTRACT_JSON_BYTES {
3551        return Err(VectorRegenerationFailure::new(
3552            VectorRegenerationFailureClass::InvalidContract,
3553            format!("serialized contract exceeds {MAX_CONTRACT_JSON_BYTES} bytes"),
3554        ));
3555    }
3556
3557    Ok(normalized)
3558}
3559
3560#[allow(clippy::cast_possible_wrap)]
3561fn persist_vector_contract(
3562    conn: &rusqlite::Connection,
3563    config: &VectorRegenerationConfig,
3564    table_name: &str,
3565    identity: &QueryEmbedderIdentity,
3566    snapshot_hash: &str,
3567) -> Result<(), EngineError> {
3568    conn.execute(
3569        r"
3570        INSERT OR REPLACE INTO vector_embedding_contracts (
3571            profile,
3572            table_name,
3573            model_identity,
3574            model_version,
3575            dimension,
3576            normalization_policy,
3577            chunking_policy,
3578            preprocessing_policy,
3579            generator_command_json,
3580            applied_at,
3581            snapshot_hash,
3582            contract_format_version,
3583            updated_at
3584        ) VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9, unixepoch(), ?10, ?11, unixepoch())
3585        ",
3586        rusqlite::params![
3587            config.profile.as_str(),
3588            table_name,
3589            identity.model_identity.as_str(),
3590            identity.model_version.as_str(),
3591            identity.dimension as i64,
3592            identity.normalization_policy.as_str(),
3593            config.chunking_policy.as_str(),
3594            config.preprocessing_policy.as_str(),
3595            "[]",
3596            snapshot_hash,
3597            CURRENT_VECTOR_CONTRACT_FORMAT_VERSION,
3598        ],
3599    )?;
3600    Ok(())
3601}
3602
3603fn persist_vector_regeneration_event(
3604    conn: &rusqlite::Connection,
3605    event_type: &str,
3606    subject: &str,
3607    metadata: &VectorRegenerationAuditMetadata,
3608) -> Result<(), EngineError> {
3609    let metadata_json = serialize_audit_metadata(metadata)?;
3610    conn.execute(
3611        "INSERT INTO provenance_events (id, event_type, subject, metadata_json) VALUES (?1, ?2, ?3, ?4)",
3612        rusqlite::params![new_id(), event_type, subject, metadata_json],
3613    )?;
3614    Ok(())
3615}
3616
3617fn persist_simple_provenance_event(
3618    conn: &rusqlite::Connection,
3619    event_type: &str,
3620    subject: &str,
3621    metadata: Option<serde_json::Value>,
3622) -> Result<(), EngineError> {
3623    let metadata_json = metadata.map(|value| value.to_string()).unwrap_or_default();
3624    conn.execute(
3625        "INSERT INTO provenance_events (id, event_type, subject, metadata_json) VALUES (?1, ?2, ?3, ?4)",
3626        rusqlite::params![new_id(), event_type, subject, metadata_json],
3627    )?;
3628    Ok(())
3629}
3630
3631/// Count per-kind FTS integrity issues across all registered per-kind tables.
3632/// Returns (stale, orphaned, `mismatched_kind`, duplicate) counts.
3633///
3634/// - Stale: rows in a per-kind table whose node is superseded or missing.
3635/// - Orphaned: rows in a per-kind table for a kind with no registered schema.
3636/// - Mismatched kind: impossible with per-kind tables (always 0).
3637/// - Duplicate: same `node_logical_id` appears more than once in any per-kind table.
3638fn count_per_kind_property_fts_issues(
3639    conn: &rusqlite::Connection,
3640) -> Result<(i64, i64, i64, i64), EngineError> {
3641    // Collect all per-kind virtual tables from sqlite_master.
3642    // Filter by sql LIKE 'CREATE VIRTUAL TABLE%' to exclude FTS5 shadow tables
3643    // (e.g. fts_props_goal_data, fts_props_goal_idx) which share the same prefix.
3644    let per_kind_tables: Vec<String> = {
3645        let mut stmt = conn.prepare(
3646            "SELECT name FROM sqlite_master \
3647             WHERE type='table' AND name LIKE 'fts_props_%' \
3648             AND sql LIKE 'CREATE VIRTUAL TABLE%'",
3649        )?;
3650        stmt.query_map([], |r| r.get::<_, String>(0))?
3651            .collect::<Result<Vec<_>, _>>()?
3652    };
3653
3654    let registered_kinds: std::collections::HashSet<String> = {
3655        let mut stmt = conn.prepare("SELECT kind FROM fts_property_schemas")?;
3656        stmt.query_map([], |r| r.get::<_, String>(0))?
3657            .collect::<Result<std::collections::HashSet<_>, _>>()?
3658    };
3659
3660    let mut stale = 0i64;
3661    let mut orphaned = 0i64;
3662    let mut duplicate = 0i64;
3663
3664    for table in &per_kind_tables {
3665        // Stale: rows whose node_logical_id has no active node.
3666        let kind_stale: i64 = conn.query_row(
3667            &format!(
3668                "SELECT count(*) FROM {table} fp \
3669                 WHERE NOT EXISTS (\
3670                     SELECT 1 FROM nodes n \
3671                     WHERE n.logical_id = fp.node_logical_id AND n.superseded_at IS NULL\
3672                 )"
3673            ),
3674            [],
3675            |r| r.get(0),
3676        )?;
3677        stale += kind_stale;
3678
3679        // Duplicate: same node_logical_id more than once.
3680        let kind_dup: i64 = conn.query_row(
3681            &format!(
3682                "SELECT count(*) FROM (\
3683                     SELECT node_logical_id FROM {table} \
3684                     GROUP BY node_logical_id HAVING count(*) > 1\
3685                 )"
3686            ),
3687            [],
3688            |r| r.get(0),
3689        )?;
3690        duplicate += kind_dup;
3691
3692        // Orphaned: this per-kind table has no corresponding schema.
3693        // Determine which kind this table corresponds to by checking all registered kinds.
3694        let table_has_schema = registered_kinds
3695            .iter()
3696            .any(|k| fathomdb_schema::fts_kind_table_name(k) == *table);
3697        if !table_has_schema {
3698            let table_rows: i64 =
3699                conn.query_row(&format!("SELECT count(*) FROM {table}"), [], |r| r.get(0))?;
3700            orphaned += table_rows;
3701        }
3702    }
3703
3704    // Mismatched kind is always 0 with per-kind tables.
3705    Ok((stale, orphaned, 0, duplicate))
3706}
3707
3708/// Count active nodes that should have a property FTS row (extraction yields a value)
3709/// but don't. Uses the same extraction logic as write/rebuild to avoid false positives
3710/// for nodes whose declared paths legitimately normalize to no values.
3711fn count_missing_property_fts_rows(conn: &rusqlite::Connection) -> Result<i64, EngineError> {
3712    let schemas = crate::writer::load_fts_property_schemas(conn)?;
3713    if schemas.is_empty() {
3714        return Ok(0);
3715    }
3716
3717    let mut missing = 0i64;
3718    for (kind, schema) in &schemas {
3719        let table = fathomdb_schema::fts_kind_table_name(kind);
3720        // If the per-kind table doesn't exist yet, all nodes with extractable values are missing.
3721        let table_exists: bool = conn
3722            .query_row(
3723                "SELECT count(*) FROM sqlite_master WHERE type='table' AND name = ?1",
3724                [table.as_str()],
3725                |r| r.get::<_, i64>(0),
3726            )
3727            .unwrap_or(0)
3728            > 0;
3729
3730        if table_exists {
3731            let mut stmt = conn.prepare(&format!(
3732                "SELECT n.logical_id, n.properties FROM nodes n \
3733                 WHERE n.kind = ?1 AND n.superseded_at IS NULL \
3734                   AND NOT EXISTS (SELECT 1 FROM {table} fp WHERE fp.node_logical_id = n.logical_id)"
3735            ))?;
3736            let rows = stmt.query_map([kind.as_str()], |row| {
3737                Ok((row.get::<_, String>(0)?, row.get::<_, String>(1)?))
3738            })?;
3739            for row in rows {
3740                let (_logical_id, properties_str) = row?;
3741                let props: serde_json::Value =
3742                    serde_json::from_str(&properties_str).unwrap_or_default();
3743                if crate::writer::extract_property_fts(&props, schema)
3744                    .0
3745                    .is_some()
3746                {
3747                    missing += 1;
3748                }
3749            }
3750        } else {
3751            // Per-kind table doesn't exist yet — count all nodes with extractable values.
3752            let mut stmt = conn.prepare(
3753                "SELECT n.logical_id, n.properties FROM nodes n \
3754                 WHERE n.kind = ?1 AND n.superseded_at IS NULL",
3755            )?;
3756            let rows = stmt.query_map([kind.as_str()], |row| {
3757                Ok((row.get::<_, String>(0)?, row.get::<_, String>(1)?))
3758            })?;
3759            for row in rows {
3760                let (_logical_id, properties_str) = row?;
3761                let props: serde_json::Value =
3762                    serde_json::from_str(&properties_str).unwrap_or_default();
3763                if crate::writer::extract_property_fts(&props, schema)
3764                    .0
3765                    .is_some()
3766                {
3767                    missing += 1;
3768                }
3769            }
3770        }
3771    }
3772    Ok(missing)
3773}
3774
3775/// Count property FTS rows whose `text_content` has drifted from the current canonical
3776/// value computed by `compute_property_fts_text(...)`. This catches:
3777/// - rows whose text no longer matches the current node properties and schema
3778/// - rows that should have been removed (extraction now yields no value)
3779fn count_drifted_property_fts_rows(conn: &rusqlite::Connection) -> Result<i64, EngineError> {
3780    let schemas = crate::writer::load_fts_property_schemas(conn)?;
3781    if schemas.is_empty() {
3782        return Ok(0);
3783    }
3784
3785    let mut drifted = 0i64;
3786    for (kind, schema) in &schemas {
3787        let table = fathomdb_schema::fts_kind_table_name(kind);
3788        // If the per-kind table doesn't exist, no rows to check.
3789        let table_exists: bool = conn
3790            .query_row(
3791                "SELECT count(*) FROM sqlite_master WHERE type='table' AND name = ?1",
3792                [table.as_str()],
3793                |r| r.get::<_, i64>(0),
3794            )
3795            .unwrap_or(0)
3796            > 0;
3797        if !table_exists {
3798            continue;
3799        }
3800        let mut stmt = conn.prepare(&format!(
3801            "SELECT fp.node_logical_id, fp.text_content, n.properties \
3802             FROM {table} fp \
3803             JOIN nodes n ON n.logical_id = fp.node_logical_id AND n.superseded_at IS NULL \
3804             WHERE n.kind = ?1"
3805        ))?;
3806        let rows = stmt.query_map([kind.as_str()], |row| {
3807            Ok((
3808                row.get::<_, String>(0)?,
3809                row.get::<_, String>(1)?,
3810                row.get::<_, String>(2)?,
3811            ))
3812        })?;
3813        for row in rows {
3814            let (_logical_id, stored_text, properties_str) = row?;
3815            let props: serde_json::Value =
3816                serde_json::from_str(&properties_str).unwrap_or_default();
3817            let (expected, _positions, _stats) =
3818                crate::writer::extract_property_fts(&props, schema);
3819            match expected {
3820                Some(text) if text == stored_text => {}
3821                _ => drifted += 1,
3822            }
3823        }
3824    }
3825    Ok(drifted)
3826}
3827
3828/// Rebuild property FTS rows from canonical state within an existing transaction.
3829fn rebuild_property_fts_in_tx(conn: &rusqlite::Connection) -> Result<usize, EngineError> {
3830    // Delete from ALL per-kind FTS virtual tables (including orphaned ones without schemas).
3831    // Filter by sql LIKE 'CREATE VIRTUAL TABLE%' to exclude FTS5 shadow tables.
3832    let all_per_kind_tables: Vec<String> = {
3833        let mut stmt = conn.prepare(
3834            "SELECT name FROM sqlite_master WHERE type='table' AND name LIKE 'fts_props_%' \
3835             AND sql LIKE 'CREATE VIRTUAL TABLE%'",
3836        )?;
3837        stmt.query_map([], |r| r.get::<_, String>(0))?
3838            .collect::<Result<Vec<_>, _>>()?
3839    };
3840    for table in &all_per_kind_tables {
3841        conn.execute_batch(&format!("DELETE FROM {table}"))?;
3842    }
3843    conn.execute("DELETE FROM fts_node_property_positions", [])?;
3844    let inserted = crate::projection::insert_property_fts_rows(
3845        conn,
3846        "SELECT logical_id, properties FROM nodes WHERE kind = ?1 AND superseded_at IS NULL",
3847    )?;
3848    Ok(inserted)
3849}
3850
3851/// Rebuild property FTS for a single node. Returns 1 if a row was inserted, 0 otherwise.
3852/// The caller must delete any existing per-kind FTS row for this node first.
3853fn rebuild_single_node_property_fts(
3854    conn: &rusqlite::Connection,
3855    logical_id: &str,
3856    kind: &str,
3857) -> Result<usize, EngineError> {
3858    let schema: Option<(String, String)> = conn
3859        .query_row(
3860            "SELECT property_paths_json, separator FROM fts_property_schemas WHERE kind = ?1",
3861            [kind],
3862            |row| {
3863                let paths_json: String = row.get(0)?;
3864                let separator: String = row.get(1)?;
3865                Ok((paths_json, separator))
3866            },
3867        )
3868        .optional()?;
3869    let Some((paths_json, separator)) = schema else {
3870        return Ok(0);
3871    };
3872    let parsed = crate::writer::parse_property_schema_json(&paths_json, &separator);
3873    let properties_str: Option<String> = conn
3874        .query_row(
3875            "SELECT properties FROM nodes WHERE logical_id = ?1 AND superseded_at IS NULL",
3876            [logical_id],
3877            |row| row.get(0),
3878        )
3879        .optional()?;
3880    let Some(properties_str) = properties_str else {
3881        return Ok(0);
3882    };
3883    let props: serde_json::Value = serde_json::from_str(&properties_str).unwrap_or_default();
3884    let (text, positions, _stats) = crate::writer::extract_property_fts(&props, &parsed);
3885    let Some(text) = text else {
3886        return Ok(0);
3887    };
3888    conn.execute(
3889        "DELETE FROM fts_node_property_positions WHERE node_logical_id = ?1",
3890        rusqlite::params![logical_id],
3891    )?;
3892    let table = fathomdb_schema::fts_kind_table_name(kind);
3893    let tok = fathomdb_schema::DEFAULT_FTS_TOKENIZER;
3894    conn.execute_batch(&format!(
3895        "CREATE VIRTUAL TABLE IF NOT EXISTS {table} \
3896         USING fts5(node_logical_id UNINDEXED, text_content, tokenize = '{tok}')"
3897    ))?;
3898    conn.execute(
3899        &format!("INSERT INTO {table} (node_logical_id, text_content) VALUES (?1, ?2)"),
3900        rusqlite::params![logical_id, text],
3901    )?;
3902    for pos in &positions {
3903        conn.execute(
3904            "INSERT INTO fts_node_property_positions \
3905             (node_logical_id, kind, start_offset, end_offset, leaf_path) \
3906             VALUES (?1, ?2, ?3, ?4, ?5)",
3907            rusqlite::params![
3908                logical_id,
3909                kind,
3910                i64::try_from(pos.start_offset).unwrap_or(i64::MAX),
3911                i64::try_from(pos.end_offset).unwrap_or(i64::MAX),
3912                pos.leaf_path,
3913            ],
3914        )?;
3915    }
3916    Ok(1)
3917}
3918
3919fn serialize_property_paths_json(
3920    entries: &[FtsPropertyPathSpec],
3921    exclude_paths: &[String],
3922) -> Result<String, EngineError> {
3923    // Scalar-only schemas with no exclude_paths and no weights are
3924    // serialised in the legacy shape (bare array of strings) for full
3925    // backwards compatibility with earlier schema versions.
3926    let all_scalar = entries
3927        .iter()
3928        .all(|e| e.mode == FtsPropertyPathMode::Scalar);
3929    let any_weight = entries.iter().any(|e| e.weight.is_some());
3930    if all_scalar && exclude_paths.is_empty() && !any_weight {
3931        let paths: Vec<&str> = entries.iter().map(|e| e.path.as_str()).collect();
3932        return serde_json::to_string(&paths).map_err(|e| {
3933            EngineError::InvalidWrite(format!("failed to serialize property paths: {e}"))
3934        });
3935    }
3936
3937    let mut obj = serde_json::Map::new();
3938    let paths_json: Vec<serde_json::Value> = entries
3939        .iter()
3940        .map(|e| {
3941            let mode_str = match e.mode {
3942                FtsPropertyPathMode::Scalar => "scalar",
3943                FtsPropertyPathMode::Recursive => "recursive",
3944            };
3945            let mut entry = serde_json::json!({ "path": e.path, "mode": mode_str });
3946            if let Some(w) = e.weight {
3947                entry["weight"] = serde_json::json!(w);
3948            }
3949            entry
3950        })
3951        .collect();
3952    obj.insert("paths".to_owned(), serde_json::Value::Array(paths_json));
3953    if !exclude_paths.is_empty() {
3954        obj.insert("exclude_paths".to_owned(), serde_json::json!(exclude_paths));
3955    }
3956    serde_json::to_string(&serde_json::Value::Object(obj))
3957        .map_err(|e| EngineError::InvalidWrite(format!("failed to serialize property paths: {e}")))
3958}
3959
3960/// Drop and recreate the per-kind FTS5 virtual table with one column per spec.
3961///
3962/// The tokenizer string is validated before interpolation into DDL to
3963/// prevent SQL injection.  If `specs` is empty a single `text_content`
3964/// column is used (matching the migration-21 baseline shape).
3965fn create_or_replace_fts_kind_table(
3966    conn: &rusqlite::Connection,
3967    kind: &str,
3968    specs: &[FtsPropertyPathSpec],
3969    tokenizer: &str,
3970) -> Result<(), EngineError> {
3971    let table = fathomdb_schema::fts_kind_table_name(kind);
3972
3973    // Validate tokenizer string: alphanumeric plus the set used by all known presets.
3974    // Must match the allowlist in `set_fts_profile` so that profiles written by one
3975    // function are accepted by the other.  The source-code preset
3976    // (`"unicode61 tokenchars '._-$@'"`) requires `.`, `-`, `$`, `@`.
3977    if !tokenizer
3978        .chars()
3979        .all(|c| c.is_alphanumeric() || "'._-$@ ".contains(c))
3980    {
3981        return Err(EngineError::Bridge(format!(
3982            "invalid tokenizer string: {tokenizer:?}"
3983        )));
3984    }
3985
3986    let cols: Vec<String> = if specs.is_empty() {
3987        vec![
3988            "node_logical_id UNINDEXED".to_owned(),
3989            "text_content".to_owned(),
3990        ]
3991    } else {
3992        std::iter::once("node_logical_id UNINDEXED".to_owned())
3993            .chain(specs.iter().map(|s| {
3994                let is_recursive = matches!(s.mode, FtsPropertyPathMode::Recursive);
3995                fathomdb_schema::fts_column_name(&s.path, is_recursive)
3996            }))
3997            .collect()
3998    };
3999
4000    // Escape inner apostrophes so the SQL single-quoted tokenize= clause is valid.
4001    // "unicode61 tokenchars '._-$@'" → "unicode61 tokenchars ''._-$@''"
4002    let tokenizer_sql = tokenizer.replace('\'', "''");
4003    conn.execute_batch(&format!(
4004        "DROP TABLE IF EXISTS {table}; \
4005         CREATE VIRTUAL TABLE {table} USING fts5({cols}, tokenize='{tokenizer_sql}');",
4006        cols = cols.join(", "),
4007    ))?;
4008
4009    Ok(())
4010}
4011
4012fn validate_fts_property_paths(paths: &[String]) -> Result<(), EngineError> {
4013    if paths.is_empty() {
4014        return Err(EngineError::InvalidWrite(
4015            "FTS property paths must not be empty".to_owned(),
4016        ));
4017    }
4018    let mut seen = std::collections::HashSet::new();
4019    for path in paths {
4020        if !path.starts_with("$.") {
4021            return Err(EngineError::InvalidWrite(format!(
4022                "FTS property path must start with '$.' but got: {path}"
4023            )));
4024        }
4025        let after_prefix = &path[2..]; // safe: already validated "$." prefix
4026        let segments: Vec<&str> = after_prefix.split('.').collect();
4027        if segments.is_empty() || segments.iter().any(|s| s.is_empty()) {
4028            return Err(EngineError::InvalidWrite(format!(
4029                "FTS property path has empty segment(s): {path}"
4030            )));
4031        }
4032        for seg in &segments {
4033            if !seg.chars().all(|c| c.is_alphanumeric() || c == '_') {
4034                return Err(EngineError::InvalidWrite(format!(
4035                    "FTS property path segment contains invalid characters: {path}"
4036                )));
4037            }
4038        }
4039        if !seen.insert(path) {
4040            return Err(EngineError::InvalidWrite(format!(
4041                "duplicate FTS property path: {path}"
4042            )));
4043        }
4044    }
4045    Ok(())
4046}
4047
4048fn load_fts_property_schema_record(
4049    conn: &rusqlite::Connection,
4050    kind: &str,
4051) -> Result<Option<FtsPropertySchemaRecord>, EngineError> {
4052    let row = conn
4053        .query_row(
4054            "SELECT kind, property_paths_json, separator, format_version \
4055             FROM fts_property_schemas WHERE kind = ?1",
4056            [kind],
4057            |row| {
4058                let kind: String = row.get(0)?;
4059                let paths_json: String = row.get(1)?;
4060                let separator: String = row.get(2)?;
4061                let format_version: i64 = row.get(3)?;
4062                Ok(build_fts_property_schema_record(
4063                    kind,
4064                    &paths_json,
4065                    separator,
4066                    format_version,
4067                ))
4068            },
4069        )
4070        .optional()?;
4071    Ok(row)
4072}
4073
4074/// Build an [`FtsPropertySchemaRecord`] from a raw
4075/// `fts_property_schemas` row. Delegates JSON parsing to
4076/// [`crate::writer::parse_property_schema_json`] — the same parser the
4077/// recursive walker uses at rebuild time — so both the legacy bare-array
4078/// shape and the Phase 4 object-shaped envelope round-trip correctly.
4079fn build_fts_property_schema_record(
4080    kind: String,
4081    paths_json: &str,
4082    separator: String,
4083    format_version: i64,
4084) -> FtsPropertySchemaRecord {
4085    let schema = crate::writer::parse_property_schema_json(paths_json, &separator);
4086    let entries: Vec<FtsPropertyPathSpec> = schema
4087        .paths
4088        .into_iter()
4089        .map(|entry| FtsPropertyPathSpec {
4090            path: entry.path,
4091            mode: match entry.mode {
4092                crate::writer::PropertyPathMode::Scalar => FtsPropertyPathMode::Scalar,
4093                crate::writer::PropertyPathMode::Recursive => FtsPropertyPathMode::Recursive,
4094            },
4095            weight: entry.weight,
4096        })
4097        .collect();
4098    let property_paths: Vec<String> = entries.iter().map(|e| e.path.clone()).collect();
4099    FtsPropertySchemaRecord {
4100        kind,
4101        property_paths,
4102        entries,
4103        exclude_paths: schema.exclude_paths,
4104        separator,
4105        format_version,
4106    }
4107}
4108
4109fn build_regeneration_input(
4110    config: &VectorRegenerationConfig,
4111    identity: &QueryEmbedderIdentity,
4112    chunks: Vec<VectorRegenerationInputChunk>,
4113) -> VectorRegenerationInput {
4114    VectorRegenerationInput {
4115        profile: config.profile.clone(),
4116        table_name: fathomdb_schema::vec_kind_table_name(&config.kind),
4117        model_identity: identity.model_identity.clone(),
4118        model_version: identity.model_version.clone(),
4119        dimension: identity.dimension,
4120        normalization_policy: identity.normalization_policy.clone(),
4121        chunking_policy: config.chunking_policy.clone(),
4122        preprocessing_policy: config.preprocessing_policy.clone(),
4123        chunks,
4124    }
4125}
4126
4127fn compute_snapshot_hash(payload: &VectorRegenerationInput) -> Result<String, EngineError> {
4128    let bytes =
4129        serde_json::to_vec(payload).map_err(|error| EngineError::Bridge(error.to_string()))?;
4130    let mut hasher = Sha256::new();
4131    hasher.update(bytes);
4132    Ok(format!("{:x}", hasher.finalize()))
4133}
4134
4135fn collect_regeneration_chunks(
4136    conn: &rusqlite::Connection,
4137) -> Result<Vec<VectorRegenerationInputChunk>, EngineError> {
4138    let mut stmt = conn.prepare(
4139        r"
4140        SELECT c.id, c.node_logical_id, n.kind, c.text_content, c.byte_start, c.byte_end, n.source_ref, c.created_at
4141        FROM chunks c
4142        JOIN nodes n
4143          ON n.logical_id = c.node_logical_id
4144         AND n.superseded_at IS NULL
4145        ORDER BY c.created_at, c.id
4146        ",
4147    )?;
4148    let chunks = stmt
4149        .query_map([], |row| {
4150            Ok(VectorRegenerationInputChunk {
4151                chunk_id: row.get(0)?,
4152                node_logical_id: row.get(1)?,
4153                kind: row.get(2)?,
4154                text_content: row.get(3)?,
4155                byte_start: row.get(4)?,
4156                byte_end: row.get(5)?,
4157                source_ref: row.get(6)?,
4158                created_at: row.get(7)?,
4159            })
4160        })?
4161        .collect::<Result<Vec<_>, _>>()?;
4162    Ok(chunks)
4163}
4164
4165fn validate_bounded_text(
4166    field: &str,
4167    value: &str,
4168    max_len: usize,
4169) -> Result<String, VectorRegenerationFailure> {
4170    let trimmed = value.trim();
4171    if trimmed.is_empty() {
4172        return Err(VectorRegenerationFailure::new(
4173            VectorRegenerationFailureClass::InvalidContract,
4174            format!("{field} must not be empty"),
4175        ));
4176    }
4177    if trimmed.len() > max_len {
4178        return Err(VectorRegenerationFailure::new(
4179            VectorRegenerationFailureClass::InvalidContract,
4180            format!("{field} exceeds max length {max_len}"),
4181        ));
4182    }
4183    Ok(trimmed.to_owned())
4184}
4185
4186fn current_vector_profile_dimension(
4187    conn: &rusqlite::Connection,
4188    profile: &str,
4189) -> Result<Option<usize>, VectorRegenerationFailure> {
4190    let dimension: Option<i64> = conn
4191        .query_row(
4192            "SELECT dimension FROM vector_profiles WHERE profile = ?1 AND enabled = 1",
4193            [profile],
4194            |row| row.get(0),
4195        )
4196        .optional()
4197        .map_err(|error| {
4198            VectorRegenerationFailure::new(
4199                VectorRegenerationFailureClass::InvalidContract,
4200                error.to_string(),
4201            )
4202        })?;
4203    dimension
4204        .map(|value| {
4205            usize::try_from(value).map_err(|_| {
4206                VectorRegenerationFailure::new(
4207                    VectorRegenerationFailureClass::InvalidContract,
4208                    format!("stored vector profile dimension is invalid: {value}"),
4209                )
4210            })
4211        })
4212        .transpose()
4213}
4214
4215fn validate_existing_contract_version(
4216    conn: &rusqlite::Connection,
4217    profile: &str,
4218) -> Result<(), VectorRegenerationFailure> {
4219    let version: Option<i64> = conn
4220        .query_row(
4221            "SELECT contract_format_version FROM vector_embedding_contracts WHERE profile = ?1",
4222            [profile],
4223            |row| row.get(0),
4224        )
4225        .optional()
4226        .map_err(|error| {
4227            VectorRegenerationFailure::new(
4228                VectorRegenerationFailureClass::InvalidContract,
4229                error.to_string(),
4230            )
4231        })?;
4232    if let Some(version) = version
4233        && version > CURRENT_VECTOR_CONTRACT_FORMAT_VERSION
4234    {
4235        return Err(VectorRegenerationFailure::new(
4236            VectorRegenerationFailureClass::InvalidContract,
4237            format!(
4238                "persisted contract format version {version} is unsupported; supported version is {CURRENT_VECTOR_CONTRACT_FORMAT_VERSION}"
4239            ),
4240        ));
4241    }
4242    Ok(())
4243}
4244
4245fn serialize_audit_metadata(
4246    metadata: &VectorRegenerationAuditMetadata,
4247) -> Result<String, EngineError> {
4248    let json =
4249        serde_json::to_string(metadata).map_err(|error| EngineError::Bridge(error.to_string()))?;
4250    if json.len() > MAX_AUDIT_METADATA_BYTES {
4251        return Err(VectorRegenerationFailure::new(
4252            VectorRegenerationFailureClass::InvalidContract,
4253            format!("audit metadata exceeds {MAX_AUDIT_METADATA_BYTES} bytes"),
4254        )
4255        .to_engine_error());
4256    }
4257    Ok(json)
4258}
4259
4260fn count_source_ref(
4261    conn: &rusqlite::Connection,
4262    table: &str,
4263    source_ref: &str,
4264) -> Result<usize, EngineError> {
4265    let sql = match table {
4266        "nodes" => "SELECT count(*) FROM nodes WHERE source_ref = ?1",
4267        "edges" => "SELECT count(*) FROM edges WHERE source_ref = ?1",
4268        "actions" => "SELECT count(*) FROM actions WHERE source_ref = ?1",
4269        "operational_mutations" => {
4270            "SELECT count(*) FROM operational_mutations WHERE source_ref = ?1"
4271        }
4272        other => return Err(EngineError::Bridge(format!("unknown table: {other}"))),
4273    };
4274    let count: i64 = conn.query_row(sql, [source_ref], |row| row.get(0))?;
4275    // FIX(review): was `count as usize` — unsound cast.
4276    // Chose option (C) here: propagate error since this is a user-facing helper.
4277    usize::try_from(count)
4278        .map_err(|_| EngineError::Bridge(format!("count overflow for table {table}: {count}")))
4279}
4280
4281fn rebuild_operational_current_rows(
4282    tx: &rusqlite::Transaction<'_>,
4283    collections: &[String],
4284) -> Result<usize, EngineError> {
4285    let mut rebuilt_rows = 0usize;
4286    clear_operational_current_rows(tx, collections)?;
4287    let mut ins_current = tx.prepare_cached(
4288        "INSERT INTO operational_current \
4289         (collection_name, record_key, payload_json, updated_at, last_mutation_id) \
4290         VALUES (?1, ?2, ?3, ?4, ?5)",
4291    )?;
4292
4293    for collection in collections {
4294        let mut stmt = tx.prepare(
4295            "SELECT id, collection_name, record_key, op_kind, payload_json, source_ref, created_at \
4296             FROM operational_mutations \
4297             WHERE collection_name = ?1 \
4298             ORDER BY record_key, mutation_order",
4299        )?;
4300        let mut latest_by_key: std::collections::HashMap<String, Option<(String, i64, String)>> =
4301            std::collections::HashMap::new();
4302        let rows = stmt.query_map([collection], map_operational_mutation_row)?;
4303        for row in rows {
4304            let mutation = row?;
4305            match mutation.op_kind.as_str() {
4306                "put" => {
4307                    latest_by_key.insert(
4308                        mutation.record_key,
4309                        Some((mutation.payload_json, mutation.created_at, mutation.id)),
4310                    );
4311                }
4312                "delete" => {
4313                    latest_by_key.insert(mutation.record_key, None);
4314                }
4315                _ => {}
4316            }
4317        }
4318
4319        for (record_key, state) in latest_by_key {
4320            if let Some((payload_json, updated_at, last_mutation_id)) = state {
4321                ins_current.execute(rusqlite::params![
4322                    collection,
4323                    record_key,
4324                    payload_json,
4325                    updated_at,
4326                    last_mutation_id,
4327                ])?;
4328                rebuilt_rows += 1;
4329            }
4330        }
4331    }
4332
4333    drop(ins_current);
4334    Ok(rebuilt_rows)
4335}
4336
4337fn clear_operational_current_rows(
4338    tx: &rusqlite::Transaction<'_>,
4339    collections: &[String],
4340) -> Result<(), EngineError> {
4341    let mut delete_current =
4342        tx.prepare_cached("DELETE FROM operational_current WHERE collection_name = ?1")?;
4343    let mut delete_secondary_current = tx.prepare_cached(
4344        "DELETE FROM operational_secondary_index_entries \
4345         WHERE collection_name = ?1 AND subject_kind = 'current'",
4346    )?;
4347    for collection in collections {
4348        delete_secondary_current.execute([collection])?;
4349        delete_current.execute([collection])?;
4350    }
4351    drop(delete_secondary_current);
4352    drop(delete_current);
4353    Ok(())
4354}
4355
4356fn clear_operational_secondary_index_entries(
4357    tx: &rusqlite::Transaction<'_>,
4358    collection_name: &str,
4359) -> Result<(), EngineError> {
4360    tx.execute(
4361        "DELETE FROM operational_secondary_index_entries WHERE collection_name = ?1",
4362        [collection_name],
4363    )?;
4364    Ok(())
4365}
4366
4367fn insert_operational_secondary_index_entry(
4368    tx: &rusqlite::Transaction<'_>,
4369    collection_name: &str,
4370    subject_kind: &str,
4371    mutation_id: &str,
4372    record_key: &str,
4373    entry: &crate::operational::OperationalSecondaryIndexEntry,
4374) -> Result<(), EngineError> {
4375    tx.execute(
4376        "INSERT INTO operational_secondary_index_entries \
4377         (collection_name, index_name, subject_kind, mutation_id, record_key, sort_timestamp, \
4378          slot1_text, slot1_integer, slot2_text, slot2_integer, slot3_text, slot3_integer) \
4379         VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9, ?10, ?11, ?12)",
4380        rusqlite::params![
4381            collection_name,
4382            entry.index_name,
4383            subject_kind,
4384            mutation_id,
4385            record_key,
4386            entry.sort_timestamp,
4387            entry.slot1_text,
4388            entry.slot1_integer,
4389            entry.slot2_text,
4390            entry.slot2_integer,
4391            entry.slot3_text,
4392            entry.slot3_integer,
4393        ],
4394    )?;
4395    Ok(())
4396}
4397
4398fn rebuild_operational_secondary_index_entries(
4399    tx: &rusqlite::Transaction<'_>,
4400    collection_name: &str,
4401    collection_kind: OperationalCollectionKind,
4402    indexes: &[OperationalSecondaryIndexDefinition],
4403) -> Result<(usize, usize), EngineError> {
4404    clear_operational_secondary_index_entries(tx, collection_name)?;
4405
4406    let mut mutation_entries_rebuilt = 0usize;
4407    if collection_kind == OperationalCollectionKind::AppendOnlyLog {
4408        let mut stmt = tx.prepare(
4409            "SELECT id, record_key, payload_json FROM operational_mutations \
4410             WHERE collection_name = ?1 ORDER BY mutation_order",
4411        )?;
4412        let rows = stmt
4413            .query_map([collection_name], |row| {
4414                Ok((
4415                    row.get::<_, String>(0)?,
4416                    row.get::<_, String>(1)?,
4417                    row.get::<_, String>(2)?,
4418                ))
4419            })?
4420            .collect::<Result<Vec<_>, _>>()?;
4421        drop(stmt);
4422        for (mutation_id, record_key, payload_json) in rows {
4423            for entry in extract_secondary_index_entries_for_mutation(indexes, &payload_json) {
4424                insert_operational_secondary_index_entry(
4425                    tx,
4426                    collection_name,
4427                    "mutation",
4428                    &mutation_id,
4429                    &record_key,
4430                    &entry,
4431                )?;
4432                mutation_entries_rebuilt += 1;
4433            }
4434        }
4435    }
4436
4437    let mut current_entries_rebuilt = 0usize;
4438    if collection_kind == OperationalCollectionKind::LatestState {
4439        let mut stmt = tx.prepare(
4440            "SELECT record_key, payload_json, updated_at, last_mutation_id FROM operational_current \
4441             WHERE collection_name = ?1 ORDER BY updated_at DESC, record_key",
4442        )?;
4443        let rows = stmt
4444            .query_map([collection_name], |row| {
4445                Ok((
4446                    row.get::<_, String>(0)?,
4447                    row.get::<_, String>(1)?,
4448                    row.get::<_, i64>(2)?,
4449                    row.get::<_, String>(3)?,
4450                ))
4451            })?
4452            .collect::<Result<Vec<_>, _>>()?;
4453        drop(stmt);
4454        for (record_key, payload_json, updated_at, last_mutation_id) in rows {
4455            for entry in
4456                extract_secondary_index_entries_for_current(indexes, &payload_json, updated_at)
4457            {
4458                insert_operational_secondary_index_entry(
4459                    tx,
4460                    collection_name,
4461                    "current",
4462                    &last_mutation_id,
4463                    &record_key,
4464                    &entry,
4465                )?;
4466                current_entries_rebuilt += 1;
4467            }
4468        }
4469    }
4470
4471    Ok((mutation_entries_rebuilt, current_entries_rebuilt))
4472}
4473
4474fn collect_strings_tx(
4475    tx: &rusqlite::Transaction<'_>,
4476    sql: &str,
4477    value: &str,
4478) -> Result<Vec<String>, EngineError> {
4479    let mut stmt = tx.prepare(sql)?;
4480    let rows = stmt.query_map([value], |row| row.get::<_, String>(0))?;
4481    rows.collect::<Result<Vec<_>, _>>()
4482        .map_err(EngineError::from)
4483}
4484
/// Convert an `i64` count to `usize`.
///
/// # Panics
///
/// Panics when `val` cannot be represented as `usize` (for example when it is
/// negative), which for a `count(*)` result would indicate data corruption.
#[allow(clippy::expect_used)]
fn i64_to_usize(val: i64) -> usize {
    let converted: Result<usize, _> = usize::try_from(val);
    converted.expect("count(*) must be non-negative")
}
4491
4492/// Runs a parameterized query and collects the first column as strings.
4493///
4494/// NOTE(review): sql parameter must be a hardcoded query string, never user input.
4495/// Options: (A) doc comment, (B) whitelist refactor like `count_source_ref`, (C) leave as-is.
4496/// Chose (A): function is private, only called with hardcoded SQL from `trace_source`.
4497/// Whitelist refactor not practical — queries have different SELECT/ORDER BY per table.
4498fn collect_strings(
4499    conn: &rusqlite::Connection,
4500    sql: &str,
4501    param: &str,
4502) -> Result<Vec<String>, EngineError> {
4503    let mut stmt = conn.prepare(sql)?;
4504    let values = stmt
4505        .query_map([param], |row| row.get::<_, String>(0))?
4506        .collect::<Result<Vec<_>, _>>()?;
4507    Ok(values)
4508}
4509
4510fn collect_edge_logical_ids_for_restore(
4511    tx: &rusqlite::Transaction<'_>,
4512    logical_id: &str,
4513    retire_source_ref: Option<&str>,
4514    retire_created_at: i64,
4515    retire_event_rowid: i64,
4516) -> Result<Vec<String>, EngineError> {
4517    let mut stmt = tx.prepare(
4518        "SELECT DISTINCT e.logical_id \
4519         FROM edges e \
4520         JOIN provenance_events p \
4521           ON p.subject = e.logical_id \
4522          AND p.event_type = 'edge_retire' \
4523          AND ( \
4524                p.created_at > ?3 \
4525                OR (p.created_at = ?3 AND p.rowid >= ?4) \
4526          ) \
4527          AND ((?2 IS NULL AND p.source_ref IS NULL) OR p.source_ref = ?2) \
4528         WHERE e.superseded_at IS NOT NULL \
4529           AND (e.source_logical_id = ?1 OR e.target_logical_id = ?1) \
4530           AND NOT EXISTS ( \
4531                SELECT 1 FROM edges active \
4532                WHERE active.logical_id = e.logical_id \
4533                  AND active.superseded_at IS NULL \
4534           ) \
4535         ORDER BY e.logical_id",
4536    )?;
4537    let edge_ids = stmt
4538        .query_map(
4539            rusqlite::params![
4540                logical_id,
4541                retire_source_ref,
4542                retire_created_at,
4543                retire_event_rowid
4544            ],
4545            |row| row.get::<_, String>(0),
4546        )?
4547        .collect::<Result<Vec<_>, _>>()?;
4548    Ok(edge_ids)
4549}
4550
4551/// Restores edges for a node being restored, skipping any whose counterpart
4552/// endpoint is not active (e.g. still retired or purged).
4553fn restore_validated_edges(
4554    tx: &rusqlite::Transaction<'_>,
4555    logical_id: &str,
4556    retire_source_ref: Option<&str>,
4557    retire_created_at: i64,
4558    retire_event_rowid: i64,
4559) -> Result<(usize, Vec<SkippedEdge>), EngineError> {
4560    let edge_logical_ids = collect_edge_logical_ids_for_restore(
4561        tx,
4562        logical_id,
4563        retire_source_ref,
4564        retire_created_at,
4565        retire_event_rowid,
4566    )?;
4567    let mut restored = 0usize;
4568    let mut skipped = Vec::new();
4569    for edge_logical_id in &edge_logical_ids {
4570        let edge_detail: Option<(String, String, String)> = tx
4571            .query_row(
4572                "SELECT row_id, source_logical_id, target_logical_id FROM edges \
4573                 WHERE logical_id = ?1 AND superseded_at IS NOT NULL \
4574                 ORDER BY superseded_at DESC, created_at DESC, rowid DESC LIMIT 1",
4575                [edge_logical_id.as_str()],
4576                |row| Ok((row.get(0)?, row.get(1)?, row.get(2)?)),
4577            )
4578            .optional()?;
4579        let Some((edge_row_id, source_lid, target_lid)) = edge_detail else {
4580            continue;
4581        };
4582        let other_endpoint = if source_lid == logical_id {
4583            &target_lid
4584        } else {
4585            &source_lid
4586        };
4587        let endpoint_active: bool = tx
4588            .query_row(
4589                "SELECT 1 FROM nodes WHERE logical_id = ?1 AND superseded_at IS NULL LIMIT 1",
4590                [other_endpoint.as_str()],
4591                |_| Ok(true),
4592            )
4593            .optional()?
4594            .unwrap_or(false);
4595        if !endpoint_active {
4596            skipped.push(SkippedEdge {
4597                edge_logical_id: edge_logical_id.clone(),
4598                missing_endpoint: other_endpoint.clone(),
4599            });
4600            continue;
4601        }
4602        restored += tx.execute(
4603            "UPDATE edges SET superseded_at = NULL WHERE row_id = ?1",
4604            [edge_row_id.as_str()],
4605        )?;
4606    }
4607    Ok((restored, skipped))
4608}
4609
/// Counts the rows in the per-kind vec table that belong to chunks of the
/// node `logical_id`.
///
/// Returns `Ok(0)` when no node row exists for `logical_id`, and also when
/// SQLite reports a failure whose message mentions the vec table name or
/// `no such module: vec0` (presumably the table was never created or the
/// extension is not loaded — confirm against the schema bootstrap).
///
/// # Errors
///
/// Any other SQLite failure is returned as `EngineError::Sqlite`.
#[cfg(feature = "sqlite-vec")]
fn count_vec_rows_for_logical_id(
    tx: &rusqlite::Transaction<'_>,
    logical_id: &str,
) -> Result<usize, EngineError> {
    // The vec table name is derived from the node's kind.
    let node_kind: Option<String> = tx
        .query_row(
            "SELECT kind FROM nodes WHERE logical_id = ?1 LIMIT 1",
            [logical_id],
            |row| row.get(0),
        )
        .optional()?;
    let vec_table = match node_kind {
        Some(kind) => fathomdb_schema::vec_kind_table_name(&kind),
        None => return Ok(0),
    };

    let sql = format!(
        "SELECT count(*) FROM {vec_table} v \
         JOIN chunks c ON c.id = v.chunk_id \
         WHERE c.node_logical_id = ?1"
    );
    tx.query_row(&sql, [logical_id], |row| row.get::<_, i64>(0))
        .map(i64_to_usize)
        .or_else(|error| {
            let vec_unavailable = matches!(
                &error,
                rusqlite::Error::SqliteFailure(_, Some(msg))
                    if msg.contains(&vec_table) || msg.contains("no such module: vec0")
            );
            if vec_unavailable {
                Ok(0)
            } else {
                Err(EngineError::Sqlite(error))
            }
        })
}
4645
/// Build variant used when the `sqlite-vec` feature is disabled: always
/// reports zero vec rows.
///
/// The `Result` return type is kept so the signature matches the
/// `sqlite-vec` variant, hence the `unnecessary_wraps` allowance.
#[cfg(not(feature = "sqlite-vec"))]
#[allow(clippy::unnecessary_wraps)]
fn count_vec_rows_for_logical_id(
    _tx: &rusqlite::Transaction<'_>,
    _logical_id: &str,
) -> Result<usize, EngineError> {
    Ok(0)
}
4654
/// Deletes the rows in the per-kind vec table that belong to chunks of the
/// node `logical_id`, returning how many rows were removed.
///
/// Returns `Ok(0)` when no node row exists for `logical_id`, and also when
/// SQLite reports a failure whose message mentions the vec table name or
/// `no such module: vec0` (presumably the table was never created or the
/// extension is not loaded — confirm against the schema bootstrap).
///
/// # Errors
///
/// Any other SQLite failure is returned as `EngineError::Sqlite`.
#[cfg(feature = "sqlite-vec")]
fn delete_vec_rows_for_logical_id(
    tx: &rusqlite::Transaction<'_>,
    logical_id: &str,
) -> Result<usize, EngineError> {
    // The vec table name is derived from the node's kind.
    let node_kind: Option<String> = tx
        .query_row(
            "SELECT kind FROM nodes WHERE logical_id = ?1 LIMIT 1",
            [logical_id],
            |row| row.get(0),
        )
        .optional()?;
    let vec_table = match node_kind {
        Some(kind) => fathomdb_schema::vec_kind_table_name(&kind),
        None => return Ok(0),
    };

    let sql = format!(
        "DELETE FROM {vec_table} WHERE chunk_id IN (SELECT id FROM chunks WHERE node_logical_id = ?1)"
    );
    tx.execute(&sql, [logical_id]).or_else(|error| {
        let vec_unavailable = matches!(
            &error,
            rusqlite::Error::SqliteFailure(_, Some(msg))
                if msg.contains(&vec_table) || msg.contains("no such module: vec0")
        );
        if vec_unavailable {
            Ok(0)
        } else {
            Err(EngineError::Sqlite(error))
        }
    })
}
4687
/// Build variant used when the `sqlite-vec` feature is disabled: deletes
/// nothing and always reports zero removed rows.
///
/// The `Result` return type is kept so the signature matches the
/// `sqlite-vec` variant, hence the `unnecessary_wraps` allowance.
#[cfg(not(feature = "sqlite-vec"))]
#[allow(clippy::unnecessary_wraps)]
fn delete_vec_rows_for_logical_id(
    _tx: &rusqlite::Transaction<'_>,
    _logical_id: &str,
) -> Result<usize, EngineError> {
    Ok(0)
}
4696
4697fn ensure_operational_collection_registered(
4698    conn: &rusqlite::Connection,
4699    collection_name: &str,
4700) -> Result<(), EngineError> {
4701    if load_operational_collection_record(conn, collection_name)?.is_none() {
4702        return Err(EngineError::InvalidWrite(format!(
4703            "operational collection '{collection_name}' is not registered"
4704        )));
4705    }
4706    Ok(())
4707}
4708
4709fn load_operational_collection_record(
4710    conn: &rusqlite::Connection,
4711    name: &str,
4712) -> Result<Option<OperationalCollectionRecord>, EngineError> {
4713    conn.query_row(
4714        "SELECT name, kind, schema_json, retention_json, filter_fields_json, validation_json, secondary_indexes_json, format_version, created_at, disabled_at \
4715         FROM operational_collections WHERE name = ?1",
4716        [name],
4717        map_operational_collection_row,
4718    )
4719    .optional()
4720    .map_err(EngineError::Sqlite)
4721}
4722
4723fn validate_append_only_operational_collection(
4724    record: &OperationalCollectionRecord,
4725    operation: &str,
4726) -> Result<(), EngineError> {
4727    if record.kind != OperationalCollectionKind::AppendOnlyLog {
4728        return Err(EngineError::InvalidWrite(format!(
4729            "operational collection '{}' must be append_only_log to {operation}",
4730            record.name
4731        )));
4732    }
4733    Ok(())
4734}
4735
/// A read filter clause after it has been checked against the collection's
/// declared filter fields (produced by `compile_operational_read_filters`).
#[derive(Clone, Debug, PartialEq, Eq)]
struct CompiledOperationalReadFilter {
    /// Name of the declared filter field the clause applies to.
    field: String,
    /// Typed comparison to perform on that field.
    condition: OperationalReadCondition,
}
4741
/// Outcome of pairing a set of compiled read filters with an
/// `AppendOnlyFieldTime` secondary index (built by
/// `match_append_only_secondary_index_read`).
#[derive(Clone, Debug)]
struct MatchedAppendOnlySecondaryIndexRead<'a> {
    /// Name of the secondary index that can serve the read.
    index_name: &'a str,
    /// The filter on the index's value field (exact or prefix match).
    value_filter: &'a CompiledOperationalReadFilter,
    /// Optional range filter on the index's time field.
    time_range: Option<&'a CompiledOperationalReadFilter>,
}
4748
/// The comparison a compiled read filter applies to its field.
#[derive(Clone, Debug, PartialEq, Eq)]
enum OperationalReadCondition {
    /// Equality against a string value.
    ExactString(String),
    /// Equality against an integer value (used for integer and timestamp fields).
    ExactInteger(i64),
    /// String prefix match; the read executors turn this into a SQL `GLOB` pattern.
    Prefix(String),
    /// Inclusive integer range (`>= lower`, `<= upper`); either bound may be absent.
    Range {
        lower: Option<i64>,
        upper: Option<i64>,
    },
}
4759
4760fn operational_read_limit(limit: Option<usize>) -> Result<usize, EngineError> {
4761    let applied_limit = limit.unwrap_or(DEFAULT_OPERATIONAL_READ_LIMIT);
4762    if applied_limit == 0 {
4763        return Err(EngineError::InvalidWrite(
4764            "operational read limit must be greater than zero".to_owned(),
4765        ));
4766    }
4767    Ok(applied_limit.min(MAX_OPERATIONAL_READ_LIMIT))
4768}
4769
4770fn parse_operational_filter_fields(
4771    filter_fields_json: &str,
4772) -> Result<Vec<OperationalFilterField>, String> {
4773    let fields: Vec<OperationalFilterField> = serde_json::from_str(filter_fields_json)
4774        .map_err(|error| format!("invalid filter_fields_json: {error}"))?;
4775    let mut seen = std::collections::HashSet::new();
4776    for field in &fields {
4777        if field.name.trim().is_empty() {
4778            return Err("filter_fields_json field names must not be empty".to_owned());
4779        }
4780        if !seen.insert(field.name.as_str()) {
4781            return Err(format!(
4782                "filter_fields_json contains duplicate field '{}'",
4783                field.name
4784            ));
4785        }
4786        if field.modes.is_empty() {
4787            return Err(format!(
4788                "filter_fields_json field '{}' must declare at least one mode",
4789                field.name
4790            ));
4791        }
4792        if field.modes.contains(&OperationalFilterMode::Prefix)
4793            && field.field_type != OperationalFilterFieldType::String
4794        {
4795            return Err(format!(
4796                "filter field '{}' only supports prefix for string types",
4797                field.name
4798            ));
4799        }
4800    }
4801    Ok(fields)
4802}
4803
4804fn compile_operational_read_filters(
4805    filters: &[OperationalFilterClause],
4806    declared_fields: &[OperationalFilterField],
4807) -> Result<Vec<CompiledOperationalReadFilter>, EngineError> {
4808    let field_map = declared_fields
4809        .iter()
4810        .map(|field| (field.name.as_str(), field))
4811        .collect::<std::collections::HashMap<_, _>>();
4812    filters
4813        .iter()
4814        .map(|filter| match filter {
4815            OperationalFilterClause::Exact { field, value } => {
4816                let declared = field_map.get(field.as_str()).ok_or_else(|| {
4817                    EngineError::InvalidWrite(format!(
4818                        "operational read filter uses undeclared field '{field}'"
4819                    ))
4820                })?;
4821                if !declared.modes.contains(&OperationalFilterMode::Exact) {
4822                    return Err(EngineError::InvalidWrite(format!(
4823                        "operational read field '{field}' does not allow exact filters"
4824                    )));
4825                }
4826                let condition = match (declared.field_type, value) {
4827                    (OperationalFilterFieldType::String, OperationalFilterValue::String(value)) => {
4828                        OperationalReadCondition::ExactString(value.clone())
4829                    }
4830                    (
4831                        OperationalFilterFieldType::Integer | OperationalFilterFieldType::Timestamp,
4832                        OperationalFilterValue::Integer(value),
4833                    ) => OperationalReadCondition::ExactInteger(*value),
4834                    _ => {
4835                        return Err(EngineError::InvalidWrite(format!(
4836                            "operational read field '{field}' received a value with the wrong type"
4837                        )));
4838                    }
4839                };
4840                Ok(CompiledOperationalReadFilter {
4841                    field: field.clone(),
4842                    condition,
4843                })
4844            }
4845            OperationalFilterClause::Prefix { field, value } => {
4846                let declared = field_map.get(field.as_str()).ok_or_else(|| {
4847                    EngineError::InvalidWrite(format!(
4848                        "operational read filter uses undeclared field '{field}'"
4849                    ))
4850                })?;
4851                if !declared.modes.contains(&OperationalFilterMode::Prefix) {
4852                    return Err(EngineError::InvalidWrite(format!(
4853                        "operational read field '{field}' does not allow prefix filters"
4854                    )));
4855                }
4856                if declared.field_type != OperationalFilterFieldType::String {
4857                    return Err(EngineError::InvalidWrite(format!(
4858                        "operational read field '{field}' only supports prefix filters for strings"
4859                    )));
4860                }
4861                Ok(CompiledOperationalReadFilter {
4862                    field: field.clone(),
4863                    condition: OperationalReadCondition::Prefix(value.clone()),
4864                })
4865            }
4866            OperationalFilterClause::Range {
4867                field,
4868                lower,
4869                upper,
4870            } => {
4871                let declared = field_map.get(field.as_str()).ok_or_else(|| {
4872                    EngineError::InvalidWrite(format!(
4873                        "operational read filter uses undeclared field '{field}'"
4874                    ))
4875                })?;
4876                if !declared.modes.contains(&OperationalFilterMode::Range) {
4877                    return Err(EngineError::InvalidWrite(format!(
4878                        "operational read field '{field}' does not allow range filters"
4879                    )));
4880                }
4881                if !matches!(
4882                    declared.field_type,
4883                    OperationalFilterFieldType::Integer | OperationalFilterFieldType::Timestamp
4884                ) {
4885                    return Err(EngineError::InvalidWrite(format!(
4886                        "operational read field '{field}' only supports range filters for integer/timestamp fields"
4887                    )));
4888                }
4889                if lower.is_none() && upper.is_none() {
4890                    return Err(EngineError::InvalidWrite(format!(
4891                        "operational read range filter for '{field}' must specify a lower or upper bound"
4892                    )));
4893                }
4894                Ok(CompiledOperationalReadFilter {
4895                    field: field.clone(),
4896                    condition: OperationalReadCondition::Range {
4897                        lower: *lower,
4898                        upper: *upper,
4899                    },
4900                })
4901            }
4902        })
4903        .collect()
4904}
4905
4906fn match_append_only_secondary_index_read<'a>(
4907    filters: &'a [CompiledOperationalReadFilter],
4908    indexes: &'a [OperationalSecondaryIndexDefinition],
4909) -> Option<MatchedAppendOnlySecondaryIndexRead<'a>> {
4910    indexes.iter().find_map(|index| {
4911        let OperationalSecondaryIndexDefinition::AppendOnlyFieldTime {
4912            name,
4913            field,
4914            value_type,
4915            time_field,
4916        } = index
4917        else {
4918            return None;
4919        };
4920        if !(1..=2).contains(&filters.len()) {
4921            return None;
4922        }
4923
4924        let mut value_filter = None;
4925        let mut time_range = None;
4926        for filter in filters {
4927            if filter.field == *field {
4928                let supported = matches!(
4929                    (&filter.condition, value_type),
4930                    (
4931                        OperationalReadCondition::ExactString(_)
4932                            | OperationalReadCondition::Prefix(_),
4933                        crate::operational::OperationalSecondaryIndexValueType::String
4934                    ) | (
4935                        OperationalReadCondition::ExactInteger(_),
4936                        crate::operational::OperationalSecondaryIndexValueType::Integer
4937                            | crate::operational::OperationalSecondaryIndexValueType::Timestamp
4938                    )
4939                );
4940                if !supported || value_filter.is_some() {
4941                    return None;
4942                }
4943                value_filter = Some(filter);
4944                continue;
4945            }
4946            if filter.field == *time_field {
4947                if !matches!(filter.condition, OperationalReadCondition::Range { .. })
4948                    || time_range.is_some()
4949                {
4950                    return None;
4951                }
4952                time_range = Some(filter);
4953                continue;
4954            }
4955            return None;
4956        }
4957
4958        value_filter.map(|value_filter| MatchedAppendOnlySecondaryIndexRead {
4959            index_name: name.as_str(),
4960            value_filter,
4961            time_range,
4962        })
4963    })
4964}
4965
/// Attempts to serve an operational read from a secondary index.
///
/// Returns `Ok(None)` when `match_append_only_secondary_index_read` finds no
/// index able to answer `filters`, so the caller can fall back to another
/// read path. Otherwise the matching mutations are read through
/// `operational_secondary_index_entries`, ordered by `sort_timestamp` and then
/// `mutation_order`, both descending, and returned in a report.
///
/// One row more than `applied_limit` is requested so that `was_limited` can
/// report whether additional rows existed; the extra row is dropped.
///
/// # Errors
///
/// Returns `EngineError::Bridge` if `applied_limit + 1` does not fit in an
/// `i64`, or an SQLite-derived error if preparing or running the query fails.
fn execute_operational_secondary_index_read(
    conn: &rusqlite::Connection,
    collection_name: &str,
    filters: &[CompiledOperationalReadFilter],
    indexes: &[OperationalSecondaryIndexDefinition],
    applied_limit: usize,
) -> Result<Option<OperationalReadReport>, EngineError> {
    use rusqlite::types::Value;

    let Some(matched) = match_append_only_secondary_index_read(filters, indexes) else {
        return Ok(None);
    };

    // Base query; ?1 and ?2 are the collection and index names pushed below.
    let mut sql = String::from(
        "SELECT m.id, m.collection_name, m.record_key, m.op_kind, m.payload_json, m.source_ref, m.created_at \
         FROM operational_secondary_index_entries s \
         JOIN operational_mutations m ON m.id = s.mutation_id \
         WHERE s.collection_name = ?1 AND s.index_name = ?2 AND s.subject_kind = 'mutation' ",
    );
    let mut params = vec![
        Value::from(collection_name.to_owned()),
        Value::from(matched.index_name.to_owned()),
    ];

    // Each appended clause uses the next placeholder number, `params.len() + 1`,
    // so the SQL text and the parameter list must be extended together.
    match &matched.value_filter.condition {
        OperationalReadCondition::ExactString(value) => {
            let _ = write!(sql, "AND s.slot1_text = ?{} ", params.len() + 1);
            params.push(Value::from(value.clone()));
        }
        OperationalReadCondition::Prefix(value) => {
            let _ = write!(sql, "AND s.slot1_text GLOB ?{} ", params.len() + 1);
            params.push(Value::from(glob_prefix_pattern(value)));
        }
        OperationalReadCondition::ExactInteger(value) => {
            let _ = write!(sql, "AND s.slot1_integer = ?{} ", params.len() + 1);
            params.push(Value::from(*value));
        }
        // The matcher never selects a range as the value filter; bail out to
        // the fallback path rather than build an unsupported query.
        OperationalReadCondition::Range { .. } => return Ok(None),
    }

    // Optional inclusive bounds on the index's sort timestamp.
    if let Some(time_range) = matched.time_range
        && let OperationalReadCondition::Range { lower, upper } = &time_range.condition
    {
        if let Some(lower) = lower {
            let _ = write!(sql, "AND s.sort_timestamp >= ?{} ", params.len() + 1);
            params.push(Value::from(*lower));
        }
        if let Some(upper) = upper {
            let _ = write!(sql, "AND s.sort_timestamp <= ?{} ", params.len() + 1);
            params.push(Value::from(*upper));
        }
    }

    let _ = write!(
        sql,
        "ORDER BY s.sort_timestamp DESC, m.mutation_order DESC LIMIT ?{}",
        params.len() + 1
    );
    // Ask for one extra row to detect whether the result was truncated.
    params.push(Value::from(i64::try_from(applied_limit + 1).map_err(
        |_| EngineError::Bridge("operational read limit overflow".to_owned()),
    )?));

    let mut stmt = conn.prepare(&sql)?;
    let mut rows = stmt
        .query_map(
            rusqlite::params_from_iter(params),
            map_operational_mutation_row,
        )?
        .collect::<Result<Vec<_>, _>>()?;
    let was_limited = rows.len() > applied_limit;
    if was_limited {
        rows.truncate(applied_limit);
    }

    Ok(Some(OperationalReadReport {
        collection_name: collection_name.to_owned(),
        row_count: rows.len(),
        applied_limit,
        was_limited,
        rows,
    }))
}
5048
/// Reads mutations of `collection_name` that satisfy every compiled filter,
/// using the `operational_filter_values` table.
///
/// The query joins `operational_filter_values` once per filter (aliases `f0`,
/// `f1`, ...), with the filter's field name and condition placed in the join's
/// `ON` clause. Results are ordered by `mutation_order` descending.
///
/// One row more than `applied_limit` is requested so that `was_limited` can
/// report whether additional rows existed; the extra row is dropped.
///
/// # Errors
///
/// Returns `EngineError::Bridge` if `applied_limit + 1` does not fit in an
/// `i64`, or an SQLite-derived error if preparing or running the query fails.
fn execute_operational_filtered_read(
    conn: &rusqlite::Connection,
    collection_name: &str,
    filters: &[CompiledOperationalReadFilter],
    applied_limit: usize,
) -> Result<OperationalReadReport, EngineError> {
    use rusqlite::types::Value;

    let mut sql = String::from(
        "SELECT m.id, m.collection_name, m.record_key, m.op_kind, m.payload_json, m.source_ref, m.created_at \
         FROM operational_mutations m ",
    );
    // ?1 is the collection name, referenced by the WHERE clause appended last.
    let mut params = vec![Value::from(collection_name.to_owned())];
    for (index, filter) in filters.iter().enumerate() {
        let _ = write!(
            sql,
            "JOIN operational_filter_values f{index} \
             ON f{index}.mutation_id = m.id \
            AND f{index}.collection_name = m.collection_name "
        );
        // Placeholder numbers are derived from `params.len()`, so each SQL
        // fragment must be written before its values are pushed.
        match &filter.condition {
            OperationalReadCondition::ExactString(value) => {
                let _ = write!(
                    sql,
                    "AND f{index}.field_name = ?{} AND f{index}.string_value = ?{} ",
                    params.len() + 1,
                    params.len() + 2
                );
                params.push(Value::from(filter.field.clone()));
                params.push(Value::from(value.clone()));
            }
            OperationalReadCondition::ExactInteger(value) => {
                let _ = write!(
                    sql,
                    "AND f{index}.field_name = ?{} AND f{index}.integer_value = ?{} ",
                    params.len() + 1,
                    params.len() + 2
                );
                params.push(Value::from(filter.field.clone()));
                params.push(Value::from(*value));
            }
            OperationalReadCondition::Prefix(value) => {
                let _ = write!(
                    sql,
                    "AND f{index}.field_name = ?{} AND f{index}.string_value GLOB ?{} ",
                    params.len() + 1,
                    params.len() + 2
                );
                params.push(Value::from(filter.field.clone()));
                params.push(Value::from(glob_prefix_pattern(value)));
            }
            OperationalReadCondition::Range { lower, upper } => {
                let _ = write!(sql, "AND f{index}.field_name = ?{} ", params.len() + 1);
                params.push(Value::from(filter.field.clone()));
                // Bounds are inclusive and each one is optional.
                if let Some(lower) = lower {
                    let _ = write!(sql, "AND f{index}.integer_value >= ?{} ", params.len() + 1);
                    params.push(Value::from(*lower));
                }
                if let Some(upper) = upper {
                    let _ = write!(sql, "AND f{index}.integer_value <= ?{} ", params.len() + 1);
                    params.push(Value::from(*upper));
                }
            }
        }
    }
    let _ = write!(
        sql,
        "WHERE m.collection_name = ?1 ORDER BY m.mutation_order DESC LIMIT ?{}",
        params.len() + 1
    );
    // Ask for one extra row to detect whether the result was truncated.
    params.push(Value::from(i64::try_from(applied_limit + 1).map_err(
        |_| EngineError::Bridge("operational read limit overflow".to_owned()),
    )?));

    let mut stmt = conn.prepare(&sql)?;
    let mut rows = stmt
        .query_map(
            rusqlite::params_from_iter(params),
            map_operational_mutation_row,
        )?
        .collect::<Result<Vec<_>, _>>()?;
    let was_limited = rows.len() > applied_limit;
    if was_limited {
        rows.truncate(applied_limit);
    }
    Ok(OperationalReadReport {
        collection_name: collection_name.to_owned(),
        row_count: rows.len(),
        applied_limit,
        was_limited,
        rows,
    })
}
5142
/// Builds a SQL `GLOB` pattern that matches any string starting with `value`.
///
/// The GLOB metacharacters `*`, `?` and `[` are wrapped in a bracket class
/// (`[*]`, `[?]`, `[[]`) so they match literally, and a trailing `*` is
/// appended to accept any suffix.
fn glob_prefix_pattern(value: &str) -> String {
    let mut escaped = value.chars().fold(
        String::with_capacity(value.len() + 1),
        |mut acc, symbol| {
            if matches!(symbol, '*' | '?' | '[') {
                acc.push('[');
                acc.push(symbol);
                acc.push(']');
            } else {
                acc.push(symbol);
            }
            acc
        },
    );
    escaped.push('*');
    escaped
}
5156
/// A single filterable value pulled out of a mutation payload by
/// `extract_operational_filter_values`.
///
/// That function fills exactly one of `string_value` / `integer_value`,
/// depending on the declared field type.
#[derive(Clone, Debug, PartialEq, Eq)]
struct ExtractedOperationalFilterValue {
    /// Name of the declared filter field the value belongs to.
    field_name: String,
    /// Set for string fields.
    string_value: Option<String>,
    /// Set for integer and timestamp fields.
    integer_value: Option<i64>,
}
5163
5164fn extract_operational_filter_values(
5165    filter_fields: &[OperationalFilterField],
5166    payload_json: &str,
5167) -> Vec<ExtractedOperationalFilterValue> {
5168    let Ok(parsed) = serde_json::from_str::<serde_json::Value>(payload_json) else {
5169        return Vec::new();
5170    };
5171    let Some(object) = parsed.as_object() else {
5172        return Vec::new();
5173    };
5174
5175    filter_fields
5176        .iter()
5177        .filter_map(|field| {
5178            let value = object.get(&field.name)?;
5179            match field.field_type {
5180                OperationalFilterFieldType::String => {
5181                    value
5182                        .as_str()
5183                        .map(|string_value| ExtractedOperationalFilterValue {
5184                            field_name: field.name.clone(),
5185                            string_value: Some(string_value.to_owned()),
5186                            integer_value: None,
5187                        })
5188                }
5189                OperationalFilterFieldType::Integer | OperationalFilterFieldType::Timestamp => {
5190                    value
5191                        .as_i64()
5192                        .map(|integer_value| ExtractedOperationalFilterValue {
5193                            field_name: field.name.clone(),
5194                            string_value: None,
5195                            integer_value: Some(integer_value),
5196                        })
5197                }
5198            }
5199        })
5200        .collect()
5201}
5202
5203fn operational_compaction_candidates(
5204    conn: &rusqlite::Connection,
5205    retention_json: &str,
5206    collection_name: &str,
5207) -> Result<(Vec<String>, Option<i64>), EngineError> {
5208    operational_compaction_candidates_at(
5209        conn,
5210        retention_json,
5211        collection_name,
5212        current_unix_timestamp()?,
5213    )
5214}
5215
5216fn operational_compaction_candidates_at(
5217    conn: &rusqlite::Connection,
5218    retention_json: &str,
5219    collection_name: &str,
5220    now_timestamp: i64,
5221) -> Result<(Vec<String>, Option<i64>), EngineError> {
5222    let policy = parse_operational_retention_policy(retention_json)?;
5223    match policy {
5224        OperationalRetentionPolicy::KeepAll => Ok((Vec::new(), None)),
5225        OperationalRetentionPolicy::PurgeBeforeSeconds { max_age_seconds } => {
5226            let before_timestamp = now_timestamp - max_age_seconds;
5227            let mut stmt = conn.prepare(
5228                "SELECT id FROM operational_mutations \
5229                 WHERE collection_name = ?1 AND created_at < ?2 \
5230                 ORDER BY mutation_order",
5231            )?;
5232            let mutation_ids = stmt
5233                .query_map(
5234                    rusqlite::params![collection_name, before_timestamp],
5235                    |row| row.get::<_, String>(0),
5236                )?
5237                .collect::<Result<Vec<_>, _>>()?;
5238            Ok((mutation_ids, Some(before_timestamp)))
5239        }
5240        OperationalRetentionPolicy::KeepLast { max_rows } => {
5241            let mut stmt = conn.prepare(
5242                "SELECT id FROM operational_mutations \
5243                 WHERE collection_name = ?1 \
5244                 ORDER BY mutation_order DESC",
5245            )?;
5246            let ordered_ids = stmt
5247                .query_map([collection_name], |row| row.get::<_, String>(0))?
5248                .collect::<Result<Vec<_>, _>>()?;
5249            Ok((ordered_ids.into_iter().skip(max_rows).collect(), None))
5250        }
5251    }
5252}
5253
5254fn parse_operational_retention_policy(
5255    retention_json: &str,
5256) -> Result<OperationalRetentionPolicy, EngineError> {
5257    let policy: OperationalRetentionPolicy = serde_json::from_str(retention_json)
5258        .map_err(|error| EngineError::InvalidWrite(format!("invalid retention_json: {error}")))?;
5259    match policy {
5260        OperationalRetentionPolicy::KeepAll => Ok(policy),
5261        OperationalRetentionPolicy::PurgeBeforeSeconds { max_age_seconds } => {
5262            if max_age_seconds <= 0 {
5263                return Err(EngineError::InvalidWrite(
5264                    "retention_json max_age_seconds must be greater than zero".to_owned(),
5265                ));
5266            }
5267            Ok(policy)
5268        }
5269        OperationalRetentionPolicy::KeepLast { max_rows } => {
5270            if max_rows == 0 {
5271                return Err(EngineError::InvalidWrite(
5272                    "retention_json max_rows must be greater than zero".to_owned(),
5273                ));
5274            }
5275            Ok(policy)
5276        }
5277    }
5278}
5279
5280fn load_operational_retention_records(
5281    conn: &rusqlite::Connection,
5282    collection_names: Option<&[String]>,
5283    max_collections: Option<usize>,
5284) -> Result<Vec<OperationalCollectionRecord>, EngineError> {
5285    let limit = max_collections.unwrap_or(usize::MAX);
5286    if limit == 0 {
5287        return Err(EngineError::InvalidWrite(
5288            "max_collections must be greater than zero".to_owned(),
5289        ));
5290    }
5291
5292    let mut records = Vec::new();
5293    if let Some(collection_names) = collection_names {
5294        for name in collection_names.iter().take(limit) {
5295            let record = load_operational_collection_record(conn, name)?.ok_or_else(|| {
5296                EngineError::InvalidWrite(format!(
5297                    "operational collection '{name}' is not registered"
5298                ))
5299            })?;
5300            records.push(record);
5301        }
5302        return Ok(records);
5303    }
5304
5305    let mut stmt = conn.prepare(
5306        "SELECT name, kind, schema_json, retention_json, filter_fields_json, validation_json, secondary_indexes_json, format_version, created_at, disabled_at \
5307         FROM operational_collections ORDER BY name",
5308    )?;
5309    let rows = stmt
5310        .query_map([], map_operational_collection_row)?
5311        .take(limit)
5312        .collect::<Result<Vec<_>, _>>()?;
5313    Ok(rows)
5314}
5315
5316fn last_operational_retention_run_at(
5317    conn: &rusqlite::Connection,
5318    collection_name: &str,
5319) -> Result<Option<i64>, EngineError> {
5320    conn.query_row(
5321        "SELECT MAX(executed_at) FROM operational_retention_runs WHERE collection_name = ?1",
5322        [collection_name],
5323        |row| row.get(0),
5324    )
5325    .optional()
5326    .map_err(EngineError::Sqlite)
5327    .map(Option::flatten)
5328}
5329
5330fn count_operational_mutations_for_collection(
5331    conn: &rusqlite::Connection,
5332    collection_name: &str,
5333) -> Result<usize, EngineError> {
5334    let count: i64 = conn.query_row(
5335        "SELECT count(*) FROM operational_mutations WHERE collection_name = ?1",
5336        [collection_name],
5337        |row| row.get(0),
5338    )?;
5339    usize::try_from(count).map_err(|_| {
5340        EngineError::Bridge(format!("count overflow for collection {collection_name}"))
5341    })
5342}
5343
5344fn retention_action_kind_and_limit(
5345    policy: &OperationalRetentionPolicy,
5346) -> (OperationalRetentionActionKind, Option<usize>) {
5347    match policy {
5348        OperationalRetentionPolicy::KeepAll => (OperationalRetentionActionKind::Noop, None),
5349        OperationalRetentionPolicy::PurgeBeforeSeconds { .. } => {
5350            (OperationalRetentionActionKind::PurgeBeforeSeconds, None)
5351        }
5352        OperationalRetentionPolicy::KeepLast { max_rows } => {
5353            (OperationalRetentionActionKind::KeepLast, Some(*max_rows))
5354        }
5355    }
5356}
5357
5358fn plan_operational_retention_item(
5359    conn: &rusqlite::Connection,
5360    record: &OperationalCollectionRecord,
5361    now_timestamp: i64,
5362) -> Result<OperationalRetentionPlanItem, EngineError> {
5363    let last_run_at = last_operational_retention_run_at(conn, &record.name)?;
5364    if record.kind != OperationalCollectionKind::AppendOnlyLog {
5365        return Ok(OperationalRetentionPlanItem {
5366            collection_name: record.name.clone(),
5367            action_kind: OperationalRetentionActionKind::Noop,
5368            candidate_deletions: 0,
5369            before_timestamp: None,
5370            max_rows: None,
5371            last_run_at,
5372        });
5373    }
5374    let policy = parse_operational_retention_policy(&record.retention_json)?;
5375    let (action_kind, max_rows) = retention_action_kind_and_limit(&policy);
5376    let (candidate_ids, before_timestamp) = operational_compaction_candidates_at(
5377        conn,
5378        &record.retention_json,
5379        &record.name,
5380        now_timestamp,
5381    )?;
5382    Ok(OperationalRetentionPlanItem {
5383        collection_name: record.name.clone(),
5384        action_kind,
5385        candidate_deletions: candidate_ids.len(),
5386        before_timestamp,
5387        max_rows,
5388        last_run_at,
5389    })
5390}
5391
5392fn run_operational_retention_item(
5393    tx: &rusqlite::Transaction<'_>,
5394    record: &OperationalCollectionRecord,
5395    now_timestamp: i64,
5396    dry_run: bool,
5397) -> Result<OperationalRetentionRunItem, EngineError> {
5398    let plan = plan_operational_retention_item(tx, record, now_timestamp)?;
5399    let mut deleted_mutations = 0usize;
5400    if record.kind == OperationalCollectionKind::AppendOnlyLog
5401        && plan.action_kind != OperationalRetentionActionKind::Noop
5402        && plan.candidate_deletions > 0
5403        && !dry_run
5404    {
5405        let (candidate_ids, _) = operational_compaction_candidates_at(
5406            tx,
5407            &record.retention_json,
5408            &record.name,
5409            now_timestamp,
5410        )?;
5411        let mut delete_stmt =
5412            tx.prepare_cached("DELETE FROM operational_mutations WHERE id = ?1")?;
5413        for mutation_id in &candidate_ids {
5414            delete_stmt.execute([mutation_id.as_str()])?;
5415            deleted_mutations += 1;
5416        }
5417        drop(delete_stmt);
5418
5419        persist_simple_provenance_event(
5420            tx,
5421            "operational_retention_run",
5422            &record.name,
5423            Some(serde_json::json!({
5424                "action_kind": plan.action_kind,
5425                "deleted_mutations": deleted_mutations,
5426                "before_timestamp": plan.before_timestamp,
5427                "max_rows": plan.max_rows,
5428                "executed_at": now_timestamp,
5429            })),
5430        )?;
5431    }
5432
5433    let live_rows_remaining = count_operational_mutations_for_collection(tx, &record.name)?;
5434    let effective_deleted_mutations = if dry_run {
5435        plan.candidate_deletions
5436    } else {
5437        deleted_mutations
5438    };
5439    let rows_remaining = if dry_run {
5440        live_rows_remaining.saturating_sub(effective_deleted_mutations)
5441    } else {
5442        live_rows_remaining
5443    };
5444    if !dry_run && plan.action_kind != OperationalRetentionActionKind::Noop {
5445        tx.execute(
5446            "INSERT INTO operational_retention_runs \
5447             (id, collection_name, executed_at, action_kind, dry_run, deleted_mutations, rows_remaining, metadata_json) \
5448             VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8)",
5449            rusqlite::params![
5450                new_id(),
5451                record.name,
5452                now_timestamp,
5453                serde_json::to_string(&plan.action_kind)
5454                    .unwrap_or_else(|_| "\"noop\"".to_owned())
5455                    .trim_matches('"')
5456                    .to_owned(),
5457                i32::from(dry_run),
5458                deleted_mutations,
5459                rows_remaining,
5460                serde_json::json!({
5461                    "before_timestamp": plan.before_timestamp,
5462                    "max_rows": plan.max_rows,
5463                })
5464                .to_string(),
5465            ],
5466        )?;
5467    }
5468
5469    Ok(OperationalRetentionRunItem {
5470        collection_name: plan.collection_name,
5471        action_kind: plan.action_kind,
5472        deleted_mutations: effective_deleted_mutations,
5473        before_timestamp: plan.before_timestamp,
5474        max_rows: plan.max_rows,
5475        rows_remaining,
5476    })
5477}
5478
5479fn current_unix_timestamp() -> Result<i64, EngineError> {
5480    let now = SystemTime::now()
5481        .duration_since(SystemTime::UNIX_EPOCH)
5482        .map_err(|error| EngineError::Bridge(format!("system clock error: {error}")))?;
5483    i64::try_from(now.as_secs())
5484        .map_err(|_| EngineError::Bridge("unix timestamp overflow".to_owned()))
5485}
5486
5487fn map_operational_collection_row(
5488    row: &rusqlite::Row<'_>,
5489) -> Result<OperationalCollectionRecord, rusqlite::Error> {
5490    let kind_text: String = row.get(1)?;
5491    let kind = OperationalCollectionKind::try_from(kind_text.as_str()).map_err(|message| {
5492        rusqlite::Error::FromSqlConversionFailure(
5493            1,
5494            rusqlite::types::Type::Text,
5495            Box::new(io::Error::new(io::ErrorKind::InvalidData, message)),
5496        )
5497    })?;
5498    Ok(OperationalCollectionRecord {
5499        name: row.get(0)?,
5500        kind,
5501        schema_json: row.get(2)?,
5502        retention_json: row.get(3)?,
5503        filter_fields_json: row.get(4)?,
5504        validation_json: row.get(5)?,
5505        secondary_indexes_json: row.get(6)?,
5506        format_version: row.get(7)?,
5507        created_at: row.get(8)?,
5508        disabled_at: row.get(9)?,
5509    })
5510}
5511
5512fn map_operational_mutation_row(
5513    row: &rusqlite::Row<'_>,
5514) -> Result<OperationalMutationRow, rusqlite::Error> {
5515    Ok(OperationalMutationRow {
5516        id: row.get(0)?,
5517        collection_name: row.get(1)?,
5518        record_key: row.get(2)?,
5519        op_kind: row.get(3)?,
5520        payload_json: row.get(4)?,
5521        source_ref: row.get(5)?,
5522        created_at: row.get(6)?,
5523    })
5524}
5525
5526fn map_operational_current_row(
5527    row: &rusqlite::Row<'_>,
5528) -> Result<OperationalCurrentRow, rusqlite::Error> {
5529    Ok(OperationalCurrentRow {
5530        collection_name: row.get(0)?,
5531        record_key: row.get(1)?,
5532        payload_json: row.get(2)?,
5533        updated_at: row.get(3)?,
5534        last_mutation_id: row.get(4)?,
5535    })
5536}
5537
5538#[cfg(test)]
5539#[allow(clippy::expect_used)]
5540mod tests {
5541    use std::fs;
5542    use std::sync::Arc;
5543
5544    use fathomdb_schema::SchemaManager;
5545    use tempfile::NamedTempFile;
5546
5547    use super::{
5548        AdminService, FtsPropertyPathMode, FtsPropertyPathSpec, SafeExportOptions,
5549        VectorRegenerationConfig,
5550    };
5551    use crate::embedder::{BatchEmbedder, EmbedderError, QueryEmbedder, QueryEmbedderIdentity};
5552    use crate::projection::ProjectionTarget;
5553    use crate::sqlite;
5554    use crate::{EngineError, OperationalCollectionKind, OperationalRegisterRequest};
5555
5556    #[cfg(feature = "sqlite-vec")]
5557    use crate::{ExecutionCoordinator, TelemetryCounters};
5558
5559    #[cfg(feature = "sqlite-vec")]
5560    use fathomdb_query::QueryBuilder;
5561
5562    #[cfg(feature = "sqlite-vec")]
5563    use super::load_vector_regeneration_config;
5564
5565    /// In-process embedder used by the regeneration test suite. The
5566    /// vector is parameterized so individual tests can distinguish which
5567    /// embedder produced which profile row.
5568    #[derive(Debug)]
5569    #[allow(dead_code)]
5570    struct TestEmbedder {
5571        identity: QueryEmbedderIdentity,
5572        vector: Vec<f32>,
5573    }
5574
5575    #[allow(dead_code)]
5576    impl TestEmbedder {
5577        fn new(model: &str, dimension: usize) -> Self {
5578            Self {
5579                identity: QueryEmbedderIdentity {
5580                    model_identity: model.to_owned(),
5581                    model_version: "1.0.0".to_owned(),
5582                    dimension,
5583                    normalization_policy: "l2".to_owned(),
5584                },
5585                vector: vec![1.0; dimension],
5586            }
5587        }
5588    }
5589
5590    impl QueryEmbedder for TestEmbedder {
5591        fn embed_query(&self, _text: &str) -> Result<Vec<f32>, EmbedderError> {
5592            Ok(self.vector.clone())
5593        }
5594        fn identity(&self) -> QueryEmbedderIdentity {
5595            self.identity.clone()
5596        }
5597        fn max_tokens(&self) -> usize {
5598            512
5599        }
5600    }
5601
5602    impl BatchEmbedder for TestEmbedder {
5603        fn batch_embed(&self, texts: &[String]) -> Result<Vec<Vec<f32>>, EmbedderError> {
5604            Ok(texts.iter().map(|_| self.vector.clone()).collect())
5605        }
5606        fn identity(&self) -> QueryEmbedderIdentity {
5607            self.identity.clone()
5608        }
5609        fn max_tokens(&self) -> usize {
5610            512
5611        }
5612    }
5613
5614    /// Embedder that always fails — used to exercise the post-request
5615    /// failure audit path without the complexity of subprocess machinery.
5616    #[derive(Debug)]
5617    #[allow(dead_code)]
5618    struct FailingEmbedder {
5619        identity: QueryEmbedderIdentity,
5620    }
5621
5622    impl QueryEmbedder for FailingEmbedder {
5623        fn embed_query(&self, _text: &str) -> Result<Vec<f32>, EmbedderError> {
5624            Err(EmbedderError::Failed("test failure".to_owned()))
5625        }
5626        fn identity(&self) -> QueryEmbedderIdentity {
5627            self.identity.clone()
5628        }
5629        fn max_tokens(&self) -> usize {
5630            512
5631        }
5632    }
5633
5634    #[allow(dead_code)]
5635    #[cfg(unix)]
5636    fn set_file_mode(path: &std::path::Path, mode: u32) {
5637        use std::os::unix::fs::PermissionsExt;
5638
5639        let mut permissions = fs::metadata(path).expect("script metadata").permissions();
5640        permissions.set_mode(mode);
5641        fs::set_permissions(path, permissions).expect("chmod");
5642    }
5643
5644    #[allow(dead_code)]
5645    #[cfg(not(unix))]
5646    fn set_file_mode(_path: &std::path::Path, _mode: u32) {}
5647
5648    fn setup() -> (NamedTempFile, AdminService) {
5649        let db = NamedTempFile::new().expect("temp file");
5650        let schema = Arc::new(SchemaManager::new());
5651        {
5652            let conn = sqlite::open_connection(db.path()).expect("connection");
5653            schema.bootstrap(&conn).expect("bootstrap");
5654        }
5655        let service = AdminService::new(db.path(), Arc::clone(&schema));
5656        (db, service)
5657    }
5658
5659    #[test]
5660    fn check_integrity_includes_active_uniqueness_count() {
5661        let (_db, service) = setup();
5662        let report = service.check_integrity().expect("integrity check");
5663        assert_eq!(report.duplicate_active_logical_ids, 0);
5664        assert_eq!(report.operational_missing_collections, 0);
5665        assert_eq!(report.operational_missing_last_mutations, 0);
5666    }
5667
5668    #[test]
5669    fn trace_source_returns_node_logical_ids() {
5670        let (db, service) = setup();
5671        {
5672            let conn = sqlite::open_connection(db.path()).expect("conn");
5673            conn.execute(
5674                "INSERT INTO nodes (row_id, logical_id, kind, properties, created_at, source_ref) \
5675                 VALUES ('r1', 'lg1', 'Meeting', '{}', 100, 'source-1')",
5676                [],
5677            )
5678            .expect("insert node");
5679        }
5680        let report = service.trace_source("source-1").expect("trace");
5681        assert_eq!(report.node_rows, 1);
5682        assert_eq!(report.node_logical_ids, vec!["lg1"]);
5683    }
5684
5685    #[test]
5686    fn trace_source_includes_operational_mutations() {
5687        let (db, service) = setup();
5688        {
5689            let conn = sqlite::open_connection(db.path()).expect("conn");
5690            conn.execute(
5691                "INSERT INTO operational_collections \
5692                 (name, kind, schema_json, retention_json, format_version, created_at) \
5693                 VALUES ('connector_health', 'latest_state', '{}', '{}', 1, 100)",
5694                [],
5695            )
5696            .expect("insert collection");
5697            conn.execute(
5698                "INSERT INTO operational_mutations \
5699                 (id, collection_name, record_key, op_kind, payload_json, source_ref, created_at, mutation_order) \
5700                 VALUES ('m1', 'connector_health', 'gmail', 'put', '{\"status\":\"ok\"}', 'source-1', 100, 1)",
5701                [],
5702            )
5703            .expect("insert mutation");
5704        }
5705
5706        let report = service.trace_source("source-1").expect("trace");
5707        assert_eq!(report.operational_mutation_rows, 1);
5708        assert_eq!(report.operational_mutation_ids, vec!["m1"]);
5709    }
5710
5711    #[test]
5712    fn excise_source_restores_prior_active_node() {
5713        let (db, service) = setup();
5714        {
5715            let conn = sqlite::open_connection(db.path()).expect("conn");
5716            conn.execute(
5717                "INSERT INTO nodes (row_id, logical_id, kind, properties, created_at, superseded_at, source_ref) \
5718                 VALUES ('r1', 'lg1', 'Meeting', '{}', 100, 200, 'source-1')",
5719                [],
5720            )
5721            .expect("insert v1 superseded");
5722            conn.execute(
5723                "INSERT INTO nodes (row_id, logical_id, kind, properties, created_at, source_ref) \
5724                 VALUES ('r2', 'lg1', 'Meeting', '{}', 200, 'source-2')",
5725                [],
5726            )
5727            .expect("insert v2 active");
5728        }
5729        service.excise_source("source-2").expect("excise");
5730        {
5731            let conn = sqlite::open_connection(db.path()).expect("conn");
5732            let active_row_id: String = conn
5733                .query_row(
5734                    "SELECT row_id FROM nodes WHERE logical_id = 'lg1' AND superseded_at IS NULL",
5735                    [],
5736                    |row| row.get(0),
5737                )
5738                .expect("active row exists after excise");
5739            assert_eq!(active_row_id, "r1");
5740        }
5741    }
5742
5743    #[test]
5744    fn excise_source_deletes_operational_mutations_and_repairs_latest_state_current() {
5745        let (db, service) = setup();
5746        {
5747            let conn = sqlite::open_connection(db.path()).expect("conn");
5748            conn.execute(
5749                "INSERT INTO operational_collections \
5750                 (name, kind, schema_json, retention_json, format_version, created_at) \
5751                 VALUES ('connector_health', 'latest_state', '{}', '{}', 1, 100)",
5752                [],
5753            )
5754            .expect("insert collection");
5755            conn.execute(
5756                "INSERT INTO operational_mutations \
5757                 (id, collection_name, record_key, op_kind, payload_json, source_ref, created_at, mutation_order) \
5758                 VALUES ('m1', 'connector_health', 'gmail', 'put', '{\"status\":\"old\"}', 'source-1', 100, 1)",
5759                [],
5760            )
5761            .expect("insert prior mutation");
5762            conn.execute(
5763                "INSERT INTO operational_mutations \
5764                 (id, collection_name, record_key, op_kind, payload_json, source_ref, created_at, mutation_order) \
5765                 VALUES ('m2', 'connector_health', 'gmail', 'put', '{\"status\":\"new\"}', 'source-2', 200, 2)",
5766                [],
5767            )
5768            .expect("insert excised mutation");
5769            conn.execute(
5770                "INSERT INTO operational_current \
5771                 (collection_name, record_key, payload_json, updated_at, last_mutation_id) \
5772                 VALUES ('connector_health', 'gmail', '{\"status\":\"new\"}', 200, 'm2')",
5773                [],
5774            )
5775            .expect("insert current row");
5776        }
5777
5778        let traced = service
5779            .trace_source("source-2")
5780            .expect("trace before excise");
5781        assert_eq!(traced.operational_mutation_rows, 1);
5782        assert_eq!(traced.operational_mutation_ids, vec!["m2"]);
5783
5784        let excised = service.excise_source("source-2").expect("excise");
5785        assert_eq!(excised.operational_mutation_rows, 0);
5786        assert!(excised.operational_mutation_ids.is_empty());
5787
5788        {
5789            let conn = sqlite::open_connection(db.path()).expect("conn");
5790            let remaining: i64 = conn
5791                .query_row(
5792                    "SELECT count(*) FROM operational_mutations WHERE source_ref = 'source-2'",
5793                    [],
5794                    |row| row.get(0),
5795                )
5796                .expect("remaining count");
5797            assert_eq!(remaining, 0);
5798
5799            let current: (String, String) = conn
5800                .query_row(
5801                    "SELECT payload_json, last_mutation_id FROM operational_current \
5802                     WHERE collection_name = 'connector_health' AND record_key = 'gmail'",
5803                    [],
5804                    |row| Ok((row.get(0)?, row.get(1)?)),
5805                )
5806                .expect("rebuilt current row");
5807            assert_eq!(current.0, "{\"status\":\"old\"}");
5808            assert_eq!(current.1, "m1");
5809        }
5810    }
5811
5812    #[test]
5813    fn restore_logical_id_reestablishes_last_pre_retire_content_and_attached_edges() {
5814        let (db, service) = setup();
5815        {
5816            let conn = sqlite::open_connection(db.path()).expect("conn");
5817            conn.execute(
5818                "INSERT INTO nodes (row_id, logical_id, kind, properties, created_at, source_ref) \
5819                 VALUES ('node-row-1', 'doc-1', 'Document', '{\"title\":\"Budget\"}', 100, 'seed')",
5820                [],
5821            )
5822            .expect("insert node");
5823            conn.execute(
5824                "INSERT INTO nodes (row_id, logical_id, kind, properties, created_at, source_ref) \
5825                 VALUES ('node-row-topic', 'topic-1', 'Topic', '{}', 100, 'seed')",
5826                [],
5827            )
5828            .expect("insert target node");
5829            conn.execute(
5830                "INSERT INTO chunks (id, node_logical_id, text_content, created_at) \
5831                 VALUES ('chunk-1', 'doc-1', 'budget narrative', 100)",
5832                [],
5833            )
5834            .expect("insert chunk");
5835            conn.execute(
5836                "INSERT INTO edges \
5837                 (row_id, logical_id, source_logical_id, target_logical_id, kind, properties, created_at, source_ref) \
5838                 VALUES ('edge-row-1', 'edge-1', 'doc-1', 'topic-1', 'TAGGED', '{}', 100, 'seed')",
5839                [],
5840            )
5841            .expect("insert edge");
5842            conn.execute(
5843                "INSERT INTO provenance_events (id, event_type, subject, source_ref, created_at, metadata_json) \
5844                 VALUES ('evt-node-retire', 'node_retire', 'doc-1', 'forget-1', 200, '')",
5845                [],
5846            )
5847            .expect("insert node retire event");
5848            conn.execute(
5849                "INSERT INTO provenance_events (id, event_type, subject, source_ref, created_at, metadata_json) \
5850                 VALUES ('evt-edge-retire', 'edge_retire', 'edge-1', 'forget-1', 200, '')",
5851                [],
5852            )
5853            .expect("insert edge retire event");
5854            conn.execute(
5855                "UPDATE nodes SET superseded_at = 200 WHERE logical_id = 'doc-1'",
5856                [],
5857            )
5858            .expect("retire node");
5859            conn.execute(
5860                "UPDATE edges SET superseded_at = 200 WHERE logical_id = 'edge-1'",
5861                [],
5862            )
5863            .expect("retire edge");
5864            conn.execute("DELETE FROM fts_nodes", [])
5865                .expect("clear fts");
5866        }
5867
5868        let report = service.restore_logical_id("doc-1").expect("restore");
5869        assert_eq!(report.logical_id, "doc-1");
5870        assert!(!report.was_noop);
5871        assert_eq!(report.restored_node_rows, 1);
5872        assert_eq!(report.restored_edge_rows, 1);
5873        assert_eq!(report.restored_chunk_rows, 1);
5874        assert_eq!(report.restored_fts_rows, 1);
5875
5876        let conn = sqlite::open_connection(db.path()).expect("conn");
5877        let active_node_count: i64 = conn
5878            .query_row(
5879                "SELECT count(*) FROM nodes WHERE logical_id = 'doc-1' AND superseded_at IS NULL",
5880                [],
5881                |row| row.get(0),
5882            )
5883            .expect("active node count");
5884        assert_eq!(active_node_count, 1);
5885        let active_edge_count: i64 = conn
5886            .query_row(
5887                "SELECT count(*) FROM edges WHERE logical_id = 'edge-1' AND superseded_at IS NULL",
5888                [],
5889                |row| row.get(0),
5890            )
5891            .expect("active edge count");
5892        assert_eq!(active_edge_count, 1);
5893        let fts_count: i64 = conn
5894            .query_row(
5895                "SELECT count(*) FROM fts_nodes WHERE chunk_id = 'chunk-1'",
5896                [],
5897                |row| row.get(0),
5898            )
5899            .expect("fts count");
5900        assert_eq!(fts_count, 1);
5901    }
5902
5903    #[test]
5904    fn restore_logical_id_restores_edges_retired_after_the_node_retire_event() {
5905        let (db, service) = setup();
5906        {
5907            let conn = sqlite::open_connection(db.path()).expect("conn");
5908            conn.execute(
5909                "INSERT INTO nodes (row_id, logical_id, kind, properties, created_at, source_ref) \
5910                 VALUES ('node-row-1', 'doc-1', 'Document', '{\"title\":\"Budget\"}', 100, 'seed')",
5911                [],
5912            )
5913            .expect("insert node");
5914            conn.execute(
5915                "INSERT INTO nodes (row_id, logical_id, kind, properties, created_at, source_ref) \
5916                 VALUES ('node-row-topic', 'topic-1', 'Topic', '{}', 100, 'seed')",
5917                [],
5918            )
5919            .expect("insert target node");
5920            conn.execute(
5921                "INSERT INTO edges \
5922                 (row_id, logical_id, source_logical_id, target_logical_id, kind, properties, created_at, source_ref) \
5923                 VALUES ('edge-row-1', 'edge-1', 'doc-1', 'topic-1', 'TAGGED', '{}', 100, 'seed')",
5924                [],
5925            )
5926            .expect("insert edge");
5927            conn.execute(
5928                "INSERT INTO provenance_events (id, event_type, subject, source_ref, created_at, metadata_json) \
5929                 VALUES ('evt-node-retire', 'node_retire', 'doc-1', 'forget-1', 200, '')",
5930                [],
5931            )
5932            .expect("insert node retire event");
5933            conn.execute(
5934                "INSERT INTO provenance_events (id, event_type, subject, source_ref, created_at, metadata_json) \
5935                 VALUES ('evt-edge-retire', 'edge_retire', 'edge-1', 'forget-1', 201, '')",
5936                [],
5937            )
5938            .expect("insert edge retire event");
5939            conn.execute(
5940                "UPDATE nodes SET superseded_at = 200 WHERE logical_id = 'doc-1'",
5941                [],
5942            )
5943            .expect("retire node");
5944            conn.execute(
5945                "UPDATE edges SET superseded_at = 201 WHERE logical_id = 'edge-1'",
5946                [],
5947            )
5948            .expect("retire edge");
5949        }
5950
5951        let report = service.restore_logical_id("doc-1").expect("restore");
5952        assert_eq!(report.restored_edge_rows, 1);
5953
5954        let conn = sqlite::open_connection(db.path()).expect("conn");
5955        let active_edge_count: i64 = conn
5956            .query_row(
5957                "SELECT count(*) FROM edges WHERE logical_id = 'edge-1' AND superseded_at IS NULL",
5958                [],
5959                |row| row.get(0),
5960            )
5961            .expect("active edge count");
5962        assert_eq!(active_edge_count, 1);
5963    }
5964
5965    #[test]
5966    fn restore_logical_id_prefers_latest_retired_revision_when_timestamps_tie() {
5967        let (db, service) = setup();
5968        {
5969            let conn = sqlite::open_connection(db.path()).expect("conn");
5970            conn.execute(
5971                "INSERT INTO nodes \
5972                 (row_id, logical_id, kind, properties, created_at, superseded_at, source_ref) \
5973                 VALUES ('node-row-older', 'doc-1', 'Document', '{\"title\":\"older\"}', 100, 200, 'forget-1')",
5974                [],
5975            )
5976            .expect("insert older retired node");
5977            conn.execute(
5978                "INSERT INTO nodes \
5979                 (row_id, logical_id, kind, properties, created_at, superseded_at, source_ref) \
5980                 VALUES ('node-row-newer', 'doc-1', 'Document', '{\"title\":\"newer\"}', 100, 200, 'forget-1')",
5981                [],
5982            )
5983            .expect("insert newer retired node");
5984            conn.execute(
5985                "INSERT INTO provenance_events (id, event_type, subject, source_ref, created_at, metadata_json) \
5986                 VALUES ('evt-retire-older', 'node_retire', 'doc-1', 'forget-1', 200, '')",
5987                [],
5988            )
5989            .expect("insert older retire event");
5990            conn.execute(
5991                "INSERT INTO provenance_events (id, event_type, subject, source_ref, created_at, metadata_json) \
5992                 VALUES ('evt-retire-newer', 'node_retire', 'doc-1', 'forget-1', 200, '')",
5993                [],
5994            )
5995            .expect("insert newer retire event");
5996        }
5997
5998        let report = service.restore_logical_id("doc-1").expect("restore");
5999
6000        assert!(!report.was_noop);
6001        let conn = sqlite::open_connection(db.path()).expect("conn");
6002        let active_row: (String, String) = conn
6003            .query_row(
6004                "SELECT row_id, properties FROM nodes \
6005                 WHERE logical_id = 'doc-1' AND superseded_at IS NULL",
6006                [],
6007                |row| Ok((row.get(0)?, row.get(1)?)),
6008            )
6009            .expect("restored active row");
6010        assert_eq!(active_row.0, "node-row-newer");
6011        assert_eq!(active_row.1, "{\"title\":\"newer\"}");
6012    }
6013
6014    #[test]
6015    fn purge_logical_id_removes_retired_content_and_records_tombstone() {
6016        let (db, service) = setup();
6017        {
6018            let conn = sqlite::open_connection(db.path()).expect("conn");
6019            conn.execute(
6020                "INSERT INTO nodes (row_id, logical_id, kind, properties, created_at, superseded_at, source_ref) \
6021                 VALUES ('node-row-1', 'doc-1', 'Document', '{\"title\":\"Budget\"}', 100, 200, 'seed')",
6022                [],
6023            )
6024            .expect("insert retired node");
6025            conn.execute(
6026                "INSERT INTO chunks (id, node_logical_id, text_content, created_at) \
6027                 VALUES ('chunk-1', 'doc-1', 'budget narrative', 100)",
6028                [],
6029            )
6030            .expect("insert chunk");
6031            conn.execute(
6032                "INSERT INTO edges \
6033                 (row_id, logical_id, source_logical_id, target_logical_id, kind, properties, created_at, superseded_at, source_ref) \
6034                 VALUES ('edge-row-1', 'edge-1', 'doc-1', 'topic-1', 'TAGGED', '{}', 100, 200, 'seed')",
6035                [],
6036            )
6037            .expect("insert retired edge");
6038            conn.execute(
6039                "INSERT INTO fts_nodes (chunk_id, node_logical_id, kind, text_content) \
6040                 VALUES ('chunk-1', 'doc-1', 'Document', 'budget narrative')",
6041                [],
6042            )
6043            .expect("insert fts");
6044        }
6045
6046        let report = service.purge_logical_id("doc-1").expect("purge");
6047        assert_eq!(report.logical_id, "doc-1");
6048        assert!(!report.was_noop);
6049        assert_eq!(report.deleted_node_rows, 1);
6050        assert_eq!(report.deleted_edge_rows, 1);
6051        assert_eq!(report.deleted_chunk_rows, 1);
6052        assert_eq!(report.deleted_fts_rows, 1);
6053
6054        let conn = sqlite::open_connection(db.path()).expect("conn");
6055        let remaining_nodes: i64 = conn
6056            .query_row(
6057                "SELECT count(*) FROM nodes WHERE logical_id = 'doc-1'",
6058                [],
6059                |row| row.get(0),
6060            )
6061            .expect("remaining nodes");
6062        assert_eq!(remaining_nodes, 0);
6063        let remaining_edges: i64 = conn
6064            .query_row(
6065                "SELECT count(*) FROM edges WHERE logical_id = 'edge-1'",
6066                [],
6067                |row| row.get(0),
6068            )
6069            .expect("remaining edges");
6070        assert_eq!(remaining_edges, 0);
6071        let remaining_chunks: i64 = conn
6072            .query_row(
6073                "SELECT count(*) FROM chunks WHERE id = 'chunk-1'",
6074                [],
6075                |row| row.get(0),
6076            )
6077            .expect("remaining chunks");
6078        assert_eq!(remaining_chunks, 0);
6079        let purge_events: i64 = conn
6080            .query_row(
6081                "SELECT count(*) FROM provenance_events WHERE event_type = 'purge_logical_id' AND subject = 'doc-1'",
6082                [],
6083                |row| row.get(0),
6084            )
6085            .expect("purge events");
6086        assert_eq!(purge_events, 1);
6087    }
6088
6089    #[test]
6090    fn check_semantics_accepts_preserved_retired_chunks() {
6091        let (db, service) = setup();
6092        {
6093            let conn = sqlite::open_connection(db.path()).expect("conn");
6094            conn.execute(
6095                "INSERT INTO nodes (row_id, logical_id, kind, properties, created_at, superseded_at, source_ref) \
6096                 VALUES ('node-row-1', 'doc-1', 'Document', '{}', 100, 200, 'seed')",
6097                [],
6098            )
6099            .expect("insert retired node");
6100            conn.execute(
6101                "INSERT INTO chunks (id, node_logical_id, text_content, created_at) \
6102                 VALUES ('chunk-1', 'doc-1', 'budget narrative', 100)",
6103                [],
6104            )
6105            .expect("insert chunk");
6106        }
6107
6108        let report = service.check_semantics().expect("semantics");
6109        assert_eq!(report.orphaned_chunks, 0);
6110    }
6111
6112    #[test]
6113    fn check_semantics_detects_missing_retired_node_history_for_preserved_chunks() {
6114        let (db, service) = setup();
6115        {
6116            let conn = sqlite::open_connection(db.path()).expect("conn");
6117            conn.execute(
6118                "INSERT INTO chunks (id, node_logical_id, text_content, created_at) \
6119                 VALUES ('chunk-1', 'ghost-doc', 'budget narrative', 100)",
6120                [],
6121            )
6122            .expect("insert orphaned chunk");
6123        }
6124
6125        let report = service.check_semantics().expect("semantics");
6126        assert_eq!(report.orphaned_chunks, 1);
6127    }
6128
6129    #[cfg(feature = "sqlite-vec")]
6130    #[test]
6131    fn check_semantics_detects_missing_retired_node_history_for_preserved_vec_rows() {
6132        let (db, service) = setup();
6133        {
6134            let conn = crate::sqlite::open_connection_with_vec(db.path()).expect("vec conn");
6135            service
6136                .schema_manager
6137                .ensure_vec_kind_profile(&conn, "Doc", 4)
6138                .expect("ensure vec kind profile");
6139            conn.execute(
6140                "INSERT INTO chunks (id, node_logical_id, text_content, created_at) \
6141                 VALUES ('chunk-1', 'ghost-doc', 'budget narrative', 100)",
6142                [],
6143            )
6144            .expect("insert orphaned chunk");
6145            conn.execute(
6146                "INSERT INTO vec_doc (chunk_id, embedding) VALUES ('chunk-1', zeroblob(16))",
6147                [],
6148            )
6149            .expect("insert vec row");
6150        }
6151
6152        let report = service.check_semantics().expect("semantics");
6153        assert_eq!(report.orphaned_chunks, 1);
6154        assert_eq!(report.vec_rows_for_superseded_nodes, 1);
6155    }
6156
6157    #[cfg(feature = "sqlite-vec")]
6158    #[test]
6159    fn restore_logical_id_reestablishes_vector_search_without_reingest() {
6160        let (db, service) = setup();
6161        {
6162            let conn = crate::sqlite::open_connection_with_vec(db.path()).expect("vec conn");
6163            service
6164                .schema_manager
6165                .ensure_vec_kind_profile(&conn, "Document", 4)
6166                .expect("ensure vec kind profile");
6167            conn.execute(
6168                "INSERT INTO nodes (row_id, logical_id, kind, properties, created_at, superseded_at, source_ref) \
6169                 VALUES ('node-row-1', 'doc-1', 'Document', '{\"title\":\"Budget\"}', 100, 200, 'seed')",
6170                [],
6171            )
6172            .expect("insert retired node");
6173            conn.execute(
6174                "INSERT INTO chunks (id, node_logical_id, text_content, created_at) \
6175                 VALUES ('chunk-1', 'doc-1', 'budget narrative', 100)",
6176                [],
6177            )
6178            .expect("insert chunk");
6179            conn.execute(
6180                "INSERT INTO vec_document (chunk_id, embedding) VALUES ('chunk-1', zeroblob(16))",
6181                [],
6182            )
6183            .expect("insert vec row");
6184            conn.execute(
6185                "INSERT INTO provenance_events (id, event_type, subject, source_ref, created_at, metadata_json) \
6186                 VALUES ('evt-node-retire', 'node_retire', 'doc-1', 'forget-1', 200, '')",
6187                [],
6188            )
6189            .expect("insert retire event");
6190        }
6191
6192        let report = service.restore_logical_id("doc-1").expect("restore");
6193        assert_eq!(report.restored_vec_rows, 1);
6194
6195        let coordinator = ExecutionCoordinator::open(
6196            db.path(),
6197            Arc::new(SchemaManager::new()),
6198            Some(4),
6199            1,
6200            Arc::new(TelemetryCounters::default()),
6201            None,
6202        )
6203        .expect("coordinator");
6204        let compiled = QueryBuilder::nodes("Document")
6205            .vector_search("[0.0, 0.0, 0.0, 0.0]", 5)
6206            .compile()
6207            .expect("compile");
6208        let rows = coordinator
6209            .execute_compiled_read(&compiled)
6210            .expect("vector read");
6211        assert!(
6212            rows.nodes.iter().any(|row| row.logical_id == "doc-1"),
6213            "restore should make the preserved vec row visible again without re-ingest"
6214        );
6215    }
6216
6217    #[cfg(feature = "sqlite-vec")]
6218    #[test]
6219    fn purge_logical_id_deletes_vec_rows_for_retired_content() {
6220        let (db, service) = setup();
6221        {
6222            let conn = crate::sqlite::open_connection_with_vec(db.path()).expect("vec conn");
6223            service
6224                .schema_manager
6225                .ensure_vec_kind_profile(&conn, "Document", 4)
6226                .expect("ensure vec kind profile");
6227            conn.execute(
6228                "INSERT INTO nodes (row_id, logical_id, kind, properties, created_at, superseded_at, source_ref) \
6229                 VALUES ('node-row-1', 'doc-1', 'Document', '{\"title\":\"Budget\"}', 100, 200, 'seed')",
6230                [],
6231            )
6232            .expect("insert retired node");
6233            conn.execute(
6234                "INSERT INTO chunks (id, node_logical_id, text_content, created_at) \
6235                 VALUES ('chunk-1', 'doc-1', 'budget narrative', 100)",
6236                [],
6237            )
6238            .expect("insert chunk");
6239            conn.execute(
6240                "INSERT INTO vec_document (chunk_id, embedding) VALUES ('chunk-1', zeroblob(16))",
6241                [],
6242            )
6243            .expect("insert vec row");
6244        }
6245
6246        let report = service.purge_logical_id("doc-1").expect("purge");
6247        assert_eq!(report.deleted_vec_rows, 1);
6248
6249        let conn = crate::sqlite::open_connection_with_vec(db.path()).expect("vec conn");
6250        let vec_count: i64 = conn
6251            .query_row("SELECT count(*) FROM vec_document", [], |row| row.get(0))
6252            .expect("vec count");
6253        assert_eq!(vec_count, 0);
6254    }
6255
6256    #[cfg(feature = "sqlite-vec")]
6257    #[test]
6258    fn restore_logical_id_restores_visibility_of_regenerated_vectors() {
6259        let (db, service) = setup();
6260
6261        {
6262            let conn = crate::sqlite::open_connection_with_vec(db.path()).expect("vec conn");
6263            conn.execute(
6264                "INSERT INTO nodes (row_id, logical_id, kind, properties, created_at, source_ref) \
6265                 VALUES ('node-row-1', 'doc-1', 'Document', '{\"title\":\"Budget\"}', 100, 'seed')",
6266                [],
6267            )
6268            .expect("insert node");
6269            conn.execute(
6270                "INSERT INTO chunks (id, node_logical_id, text_content, created_at) \
6271                 VALUES ('chunk-1', 'doc-1', 'budget narrative', 100)",
6272                [],
6273            )
6274            .expect("insert chunk");
6275        }
6276
6277        let embedder = TestEmbedder::new("test-model", 4);
6278        service
6279            .regenerate_vector_embeddings(
6280                &embedder,
6281                &VectorRegenerationConfig {
6282                    kind: "Document".to_owned(),
6283                    profile: "default".to_owned(),
6284                    chunking_policy: "per_chunk".to_owned(),
6285                    preprocessing_policy: "trim".to_owned(),
6286                },
6287            )
6288            .expect("regenerate");
6289
6290        {
6291            let conn = crate::sqlite::open_connection_with_vec(db.path()).expect("vec conn");
6292            conn.execute(
6293                "INSERT INTO provenance_events (id, event_type, subject, source_ref, created_at, metadata_json) \
6294                 VALUES ('evt-node-retire', 'node_retire', 'doc-1', 'forget-1', 200, '')",
6295                [],
6296            )
6297            .expect("insert retire event");
6298            conn.execute(
6299                "UPDATE nodes SET superseded_at = 200 WHERE logical_id = 'doc-1'",
6300                [],
6301            )
6302            .expect("retire node");
6303        }
6304
6305        let report = service.restore_logical_id("doc-1").expect("restore");
6306        assert_eq!(report.restored_vec_rows, 1);
6307
6308        let coordinator = ExecutionCoordinator::open(
6309            db.path(),
6310            Arc::new(SchemaManager::new()),
6311            Some(4),
6312            1,
6313            Arc::new(TelemetryCounters::default()),
6314            None,
6315        )
6316        .expect("coordinator");
6317        let compiled = QueryBuilder::nodes("Document")
6318            .vector_search("[0.0, 0.0, 0.0, 0.0]", 5)
6319            .compile()
6320            .expect("compile");
6321        let rows = coordinator
6322            .execute_compiled_read(&compiled)
6323            .expect("vector read");
6324        assert!(
6325            rows.nodes.iter().any(|row| row.logical_id == "doc-1"),
6326            "restored logical_id should become visible through regenerated vectors"
6327        );
6328    }
6329
6330    #[test]
6331    fn check_semantics_clean_db_returns_zeros() {
6332        let (_db, service) = setup();
6333        let report = service.check_semantics().expect("semantics check");
6334        assert_eq!(report.orphaned_chunks, 0);
6335        assert_eq!(report.null_source_ref_nodes, 0);
6336        assert_eq!(report.broken_step_fk, 0);
6337        assert_eq!(report.broken_action_fk, 0);
6338        assert_eq!(report.stale_fts_rows, 0);
6339        assert_eq!(report.fts_rows_for_superseded_nodes, 0);
6340        assert_eq!(report.dangling_edges, 0);
6341        assert_eq!(report.orphaned_supersession_chains, 0);
6342        assert_eq!(report.stale_vec_rows, 0);
6343        assert_eq!(report.vec_rows_for_superseded_nodes, 0);
6344        assert_eq!(report.missing_operational_current_rows, 0);
6345        assert_eq!(report.stale_operational_current_rows, 0);
6346        assert_eq!(report.disabled_collection_mutations, 0);
6347        assert_eq!(report.mismatched_kind_property_fts_rows, 0);
6348        assert_eq!(report.duplicate_property_fts_rows, 0);
6349        assert_eq!(report.drifted_property_fts_rows, 0);
6350        assert!(report.warnings.is_empty());
6351    }
6352
6353    #[test]
6354    fn register_operational_collection_persists_and_emits_provenance() {
6355        let (db, service) = setup();
6356        let record = service
6357            .register_operational_collection(&OperationalRegisterRequest {
6358                name: "connector_health".to_owned(),
6359                kind: OperationalCollectionKind::LatestState,
6360                schema_json: "{}".to_owned(),
6361                retention_json: "{}".to_owned(),
6362                filter_fields_json: "[]".to_owned(),
6363                validation_json: String::new(),
6364                secondary_indexes_json: "[]".to_owned(),
6365                format_version: 1,
6366            })
6367            .expect("register collection");
6368
6369        assert_eq!(record.name, "connector_health");
6370        assert_eq!(record.kind, OperationalCollectionKind::LatestState);
6371        assert_eq!(record.schema_json, "{}");
6372        assert_eq!(record.retention_json, "{}");
6373        assert_eq!(record.filter_fields_json, "[]");
6374        assert!(record.created_at > 0);
6375        assert_eq!(record.disabled_at, None);
6376
6377        let described = service
6378            .describe_operational_collection("connector_health")
6379            .expect("describe collection")
6380            .expect("collection exists");
6381        assert_eq!(described, record);
6382
6383        let conn = sqlite::open_connection(db.path()).expect("conn");
6384        let provenance_count: i64 = conn
6385            .query_row(
6386                "SELECT count(*) FROM provenance_events \
6387                 WHERE event_type = 'operational_collection_registered' AND subject = 'connector_health'",
6388                [],
6389                |row| row.get(0),
6390            )
6391            .expect("provenance count");
6392        assert_eq!(provenance_count, 1);
6393    }
6394
6395    #[test]
6396    fn register_and_update_operational_collection_validation_round_trip() {
6397        let (db, service) = setup();
6398        let record = service
6399            .register_operational_collection(&OperationalRegisterRequest {
6400                name: "connector_health".to_owned(),
6401                kind: OperationalCollectionKind::LatestState,
6402                schema_json: "{}".to_owned(),
6403                retention_json: "{}".to_owned(),
6404                filter_fields_json: "[]".to_owned(),
6405                validation_json: String::new(),
6406                secondary_indexes_json: "[]".to_owned(),
6407                format_version: 1,
6408            })
6409            .expect("register collection");
6410        assert_eq!(record.validation_json, "");
6411
6412        let validation_json = r#"{"format_version":1,"mode":"enforce","additional_properties":false,"fields":[{"name":"status","type":"string","required":true,"enum":["ok","failed"]}]}"#;
6413        let updated = service
6414            .update_operational_collection_validation("connector_health", validation_json)
6415            .expect("update validation");
6416        assert_eq!(updated.validation_json, validation_json);
6417
6418        let described = service
6419            .describe_operational_collection("connector_health")
6420            .expect("describe collection")
6421            .expect("collection exists");
6422        assert_eq!(described.validation_json, validation_json);
6423
6424        let conn = sqlite::open_connection(db.path()).expect("conn");
6425        let provenance_count: i64 = conn
6426            .query_row(
6427                "SELECT count(*) FROM provenance_events \
6428                 WHERE event_type = 'operational_collection_validation_updated' \
6429                   AND subject = 'connector_health'",
6430                [],
6431                |row| row.get(0),
6432            )
6433            .expect("provenance count");
6434        assert_eq!(provenance_count, 1);
6435    }
6436
6437    #[test]
6438    fn register_update_and_rebuild_operational_secondary_indexes_round_trip() {
6439        let (db, service) = setup();
6440        let record = service
6441            .register_operational_collection(&OperationalRegisterRequest {
6442                name: "audit_log".to_owned(),
6443                kind: OperationalCollectionKind::AppendOnlyLog,
6444                schema_json: "{}".to_owned(),
6445                retention_json: r#"{"mode":"keep_all"}"#.to_owned(),
6446                filter_fields_json: r#"[{"name":"actor","type":"string","modes":["exact","prefix"]},{"name":"ts","type":"timestamp","modes":["range"]}]"#.to_owned(),
6447                validation_json: String::new(),
6448                secondary_indexes_json: "[]".to_owned(),
6449                format_version: 1,
6450            })
6451            .expect("register collection");
6452        assert_eq!(record.secondary_indexes_json, "[]");
6453
6454        {
6455            let writer = crate::WriterActor::start(
6456                db.path(),
6457                Arc::new(SchemaManager::new()),
6458                crate::ProvenanceMode::Warn,
6459                Arc::new(crate::TelemetryCounters::default()),
6460            )
6461            .expect("writer");
6462            writer
6463                .submit(crate::WriteRequest {
6464                    label: "secondary-index-seed".to_owned(),
6465                    nodes: vec![],
6466                    node_retires: vec![],
6467                    edges: vec![],
6468                    edge_retires: vec![],
6469                    chunks: vec![],
6470                    runs: vec![],
6471                    steps: vec![],
6472                    actions: vec![],
6473                    optional_backfills: vec![],
6474                    vec_inserts: vec![],
6475                    operational_writes: vec![
6476                        crate::OperationalWrite::Append {
6477                            collection: "audit_log".to_owned(),
6478                            record_key: "evt-1".to_owned(),
6479                            payload_json: r#"{"actor":"alice","ts":100}"#.to_owned(),
6480                            source_ref: Some("src-1".to_owned()),
6481                        },
6482                        crate::OperationalWrite::Append {
6483                            collection: "audit_log".to_owned(),
6484                            record_key: "evt-2".to_owned(),
6485                            payload_json: r#"{"actor":"bob","ts":200}"#.to_owned(),
6486                            source_ref: Some("src-2".to_owned()),
6487                        },
6488                    ],
6489                })
6490                .expect("seed writes");
6491        }
6492
6493        let secondary_indexes_json = r#"[{"name":"actor_ts","kind":"append_only_field_time","field":"actor","value_type":"string","time_field":"ts"}]"#;
6494        let updated = service
6495            .update_operational_collection_secondary_indexes("audit_log", secondary_indexes_json)
6496            .expect("update secondary indexes");
6497        assert_eq!(updated.secondary_indexes_json, secondary_indexes_json);
6498
6499        let conn = sqlite::open_connection(db.path()).expect("conn");
6500        let entry_count: i64 = conn
6501            .query_row(
6502                "SELECT count(*) FROM operational_secondary_index_entries \
6503                 WHERE collection_name = 'audit_log' AND index_name = 'actor_ts'",
6504                [],
6505                |row| row.get(0),
6506            )
6507            .expect("secondary index count");
6508        assert_eq!(entry_count, 2);
6509        conn.execute(
6510            "DELETE FROM operational_secondary_index_entries WHERE collection_name = 'audit_log'",
6511            [],
6512        )
6513        .expect("clear index entries");
6514        drop(conn);
6515
6516        let rebuild = service
6517            .rebuild_operational_secondary_indexes("audit_log")
6518            .expect("rebuild secondary indexes");
6519        assert_eq!(rebuild.collection_name, "audit_log");
6520        assert_eq!(rebuild.mutation_entries_rebuilt, 2);
6521        assert_eq!(rebuild.current_entries_rebuilt, 0);
6522    }
6523
6524    #[test]
6525    fn register_operational_collection_rejects_invalid_validation_contract() {
6526        let (_db, service) = setup();
6527
6528        let error = service
6529            .register_operational_collection(&OperationalRegisterRequest {
6530                name: "connector_health".to_owned(),
6531                kind: OperationalCollectionKind::LatestState,
6532                schema_json: "{}".to_owned(),
6533                retention_json: "{}".to_owned(),
6534                filter_fields_json: "[]".to_owned(),
6535                validation_json: r#"{"format_version":1,"mode":"enforce","fields":[{"name":"status","type":"string","minimum":0}]}"#
6536                    .to_owned(),
6537                secondary_indexes_json: "[]".to_owned(),
6538                format_version: 1,
6539            })
6540            .expect_err("invalid validation contract should reject");
6541
6542        assert!(matches!(error, EngineError::InvalidWrite(_)));
6543        assert!(error.to_string().contains("minimum/maximum"));
6544    }
6545
6546    #[test]
6547    fn validate_operational_collection_history_reports_invalid_rows_without_mutation() {
6548        let (db, service) = setup();
6549        service
6550            .register_operational_collection(&OperationalRegisterRequest {
6551                name: "audit_log".to_owned(),
6552                kind: OperationalCollectionKind::AppendOnlyLog,
6553                schema_json: "{}".to_owned(),
6554                retention_json: r#"{"mode":"keep_all"}"#.to_owned(),
6555                filter_fields_json: "[]".to_owned(),
6556                validation_json: r#"{"format_version":1,"mode":"disabled","additional_properties":false,"fields":[{"name":"status","type":"string","required":true,"enum":["ok","failed"]}]}"#
6557                    .to_owned(),
6558                secondary_indexes_json: "[]".to_owned(),
6559                format_version: 1,
6560            })
6561            .expect("register collection");
6562        {
6563            let writer = crate::WriterActor::start(
6564                db.path(),
6565                Arc::new(SchemaManager::new()),
6566                crate::ProvenanceMode::Warn,
6567                Arc::new(crate::TelemetryCounters::default()),
6568            )
6569            .expect("writer");
6570            writer
6571                .submit(crate::WriteRequest {
6572                    label: "history-validation".to_owned(),
6573                    nodes: vec![],
6574                    node_retires: vec![],
6575                    edges: vec![],
6576                    edge_retires: vec![],
6577                    chunks: vec![],
6578                    runs: vec![],
6579                    steps: vec![],
6580                    actions: vec![],
6581                    optional_backfills: vec![],
6582                    vec_inserts: vec![],
6583                    operational_writes: vec![
6584                        crate::OperationalWrite::Append {
6585                            collection: "audit_log".to_owned(),
6586                            record_key: "evt-1".to_owned(),
6587                            payload_json: r#"{"status":"ok"}"#.to_owned(),
6588                            source_ref: Some("src-1".to_owned()),
6589                        },
6590                        crate::OperationalWrite::Append {
6591                            collection: "audit_log".to_owned(),
6592                            record_key: "evt-2".to_owned(),
6593                            payload_json: r#"{"status":"bogus"}"#.to_owned(),
6594                            source_ref: Some("src-2".to_owned()),
6595                        },
6596                    ],
6597                })
6598                .expect("write");
6599        }
6600
6601        let report = service
6602            .validate_operational_collection_history("audit_log")
6603            .expect("validate history");
6604        assert_eq!(report.collection_name, "audit_log");
6605        assert_eq!(report.checked_rows, 2);
6606        assert_eq!(report.invalid_row_count, 1);
6607        assert_eq!(report.issues.len(), 1);
6608        assert_eq!(report.issues[0].record_key, "evt-2");
6609        assert!(report.issues[0].message.contains("must be one of"));
6610
6611        let trace = service
6612            .trace_operational_collection("audit_log", None)
6613            .expect("trace");
6614        assert_eq!(trace.mutation_count, 2);
6615
6616        let conn = sqlite::open_connection(db.path()).expect("conn");
6617        let provenance_count: i64 = conn
6618            .query_row(
6619                "SELECT count(*) FROM provenance_events \
6620                 WHERE event_type = 'operational_collection_history_validated' \
6621                   AND subject = 'audit_log'",
6622                [],
6623                |row| row.get(0),
6624            )
6625            .expect("provenance count");
6626        assert_eq!(provenance_count, 0);
6627    }
6628
6629    #[test]
6630    fn trace_operational_collection_returns_mutations_and_current_rows() {
6631        let (db, service) = setup();
6632        service
6633            .register_operational_collection(&OperationalRegisterRequest {
6634                name: "connector_health".to_owned(),
6635                kind: OperationalCollectionKind::LatestState,
6636                schema_json: "{}".to_owned(),
6637                retention_json: "{}".to_owned(),
6638                filter_fields_json: "[]".to_owned(),
6639                validation_json: String::new(),
6640                secondary_indexes_json: "[]".to_owned(),
6641                format_version: 1,
6642            })
6643            .expect("register collection");
6644        {
6645            let writer = crate::WriterActor::start(
6646                db.path(),
6647                Arc::new(SchemaManager::new()),
6648                crate::ProvenanceMode::Warn,
6649                Arc::new(crate::TelemetryCounters::default()),
6650            )
6651            .expect("writer");
6652            writer
6653                .submit(crate::WriteRequest {
6654                    label: "operational".to_owned(),
6655                    nodes: vec![],
6656                    node_retires: vec![],
6657                    edges: vec![],
6658                    edge_retires: vec![],
6659                    chunks: vec![],
6660                    runs: vec![],
6661                    steps: vec![],
6662                    actions: vec![],
6663                    optional_backfills: vec![],
6664                    vec_inserts: vec![],
6665                    operational_writes: vec![crate::OperationalWrite::Put {
6666                        collection: "connector_health".to_owned(),
6667                        record_key: "gmail".to_owned(),
6668                        payload_json: r#"{"status":"ok"}"#.to_owned(),
6669                        source_ref: Some("src-1".to_owned()),
6670                    }],
6671                })
6672                .expect("write");
6673        }
6674
6675        let report = service
6676            .trace_operational_collection("connector_health", Some("gmail"))
6677            .expect("trace");
6678        assert_eq!(report.collection_name, "connector_health");
6679        assert_eq!(report.record_key.as_deref(), Some("gmail"));
6680        assert_eq!(report.mutation_count, 1);
6681        assert_eq!(report.current_count, 1);
6682        assert_eq!(report.mutations[0].op_kind, "put");
6683        assert_eq!(report.current_rows[0].payload_json, r#"{"status":"ok"}"#);
6684    }
6685
6686    #[test]
6687    fn trace_operational_collection_rejects_unknown_collection() {
6688        let (_db, service) = setup();
6689
6690        let error = service
6691            .trace_operational_collection("missing_collection", None)
6692            .expect_err("unknown collection should fail");
6693
6694        assert!(matches!(error, EngineError::InvalidWrite(_)));
6695        assert!(error.to_string().contains("is not registered"));
6696    }
6697
6698    #[test]
6699    fn rebuild_operational_current_repairs_missing_latest_state_rows() {
6700        let (db, service) = setup();
6701        service
6702            .register_operational_collection(&OperationalRegisterRequest {
6703                name: "connector_health".to_owned(),
6704                kind: OperationalCollectionKind::LatestState,
6705                schema_json: "{}".to_owned(),
6706                retention_json: "{}".to_owned(),
6707                filter_fields_json: "[]".to_owned(),
6708                validation_json: String::new(),
6709                secondary_indexes_json: "[]".to_owned(),
6710                format_version: 1,
6711            })
6712            .expect("register collection");
6713        {
6714            let writer = crate::WriterActor::start(
6715                db.path(),
6716                Arc::new(SchemaManager::new()),
6717                crate::ProvenanceMode::Warn,
6718                Arc::new(crate::TelemetryCounters::default()),
6719            )
6720            .expect("writer");
6721            writer
6722                .submit(crate::WriteRequest {
6723                    label: "operational".to_owned(),
6724                    nodes: vec![],
6725                    node_retires: vec![],
6726                    edges: vec![],
6727                    edge_retires: vec![],
6728                    chunks: vec![],
6729                    runs: vec![],
6730                    steps: vec![],
6731                    actions: vec![],
6732                    optional_backfills: vec![],
6733                    vec_inserts: vec![],
6734                    operational_writes: vec![crate::OperationalWrite::Put {
6735                        collection: "connector_health".to_owned(),
6736                        record_key: "gmail".to_owned(),
6737                        payload_json: r#"{"status":"ok"}"#.to_owned(),
6738                        source_ref: Some("src-1".to_owned()),
6739                    }],
6740                })
6741                .expect("write");
6742        }
6743        {
6744            let conn = sqlite::open_connection(db.path()).expect("conn");
6745            conn.execute(
6746                "DELETE FROM operational_current WHERE collection_name = 'connector_health' AND record_key = 'gmail'",
6747                [],
6748            )
6749            .expect("delete current row");
6750        }
6751
6752        let before = service.check_semantics().expect("semantics before rebuild");
6753        assert_eq!(before.missing_operational_current_rows, 1);
6754
6755        let repair = service
6756            .rebuild_operational_current(Some("connector_health"))
6757            .expect("rebuild current");
6758        assert_eq!(repair.collections_rebuilt, 1);
6759        assert_eq!(repair.current_rows_rebuilt, 1);
6760
6761        let after = service.check_semantics().expect("semantics after rebuild");
6762        assert_eq!(after.missing_operational_current_rows, 0);
6763
6764        let conn = sqlite::open_connection(db.path()).expect("conn");
6765        let payload: String = conn
6766            .query_row(
6767                "SELECT payload_json FROM operational_current \
6768                 WHERE collection_name = 'connector_health' AND record_key = 'gmail'",
6769                [],
6770                |row| row.get(0),
6771            )
6772            .expect("restored payload");
6773        assert_eq!(payload, r#"{"status":"ok"}"#);
6774    }
6775
6776    #[test]
6777    fn rebuild_operational_current_restores_latest_state_secondary_index_entries() {
6778        let (db, service) = setup();
6779        service
6780            .register_operational_collection(&OperationalRegisterRequest {
6781                name: "connector_health".to_owned(),
6782                kind: OperationalCollectionKind::LatestState,
6783                schema_json: "{}".to_owned(),
6784                retention_json: "{}".to_owned(),
6785                filter_fields_json: "[]".to_owned(),
6786                validation_json: String::new(),
6787                secondary_indexes_json: r#"[{"name":"status_current","kind":"latest_state_field","field":"status","value_type":"string"}]"#.to_owned(),
6788                format_version: 1,
6789            })
6790            .expect("register collection");
6791        {
6792            let writer = crate::WriterActor::start(
6793                db.path(),
6794                Arc::new(SchemaManager::new()),
6795                crate::ProvenanceMode::Warn,
6796                Arc::new(crate::TelemetryCounters::default()),
6797            )
6798            .expect("writer");
6799            writer
6800                .submit(crate::WriteRequest {
6801                    label: "operational".to_owned(),
6802                    nodes: vec![],
6803                    node_retires: vec![],
6804                    edges: vec![],
6805                    edge_retires: vec![],
6806                    chunks: vec![],
6807                    runs: vec![],
6808                    steps: vec![],
6809                    actions: vec![],
6810                    optional_backfills: vec![],
6811                    vec_inserts: vec![],
6812                    operational_writes: vec![crate::OperationalWrite::Put {
6813                        collection: "connector_health".to_owned(),
6814                        record_key: "gmail".to_owned(),
6815                        payload_json: r#"{"status":"ok"}"#.to_owned(),
6816                        source_ref: Some("src-1".to_owned()),
6817                    }],
6818                })
6819                .expect("write");
6820        }
6821        {
6822            let conn = sqlite::open_connection(db.path()).expect("conn");
6823            let entry_count: i64 = conn
6824                .query_row(
6825                    "SELECT count(*) FROM operational_secondary_index_entries \
6826                     WHERE collection_name = 'connector_health' AND subject_kind = 'current'",
6827                    [],
6828                    |row| row.get(0),
6829                )
6830                .expect("secondary index count before repair");
6831            assert_eq!(entry_count, 1);
6832            conn.execute(
6833                "DELETE FROM operational_current WHERE collection_name = 'connector_health' AND record_key = 'gmail'",
6834                [],
6835            )
6836            .expect("delete current row");
6837        }
6838
6839        service
6840            .rebuild_operational_current(Some("connector_health"))
6841            .expect("rebuild current");
6842
6843        let conn = sqlite::open_connection(db.path()).expect("conn");
6844        let entry_count: i64 = conn
6845            .query_row(
6846                "SELECT count(*) FROM operational_secondary_index_entries \
6847                 WHERE collection_name = 'connector_health' AND subject_kind = 'current'",
6848                [],
6849                |row| row.get(0),
6850            )
6851            .expect("secondary index count after repair");
6852        assert_eq!(entry_count, 1);
6853    }
6854
6855    #[test]
6856    fn operational_current_semantics_and_rebuild_follow_mutation_order() {
6857        let (db, service) = setup();
6858        {
6859            let conn = sqlite::open_connection(db.path()).expect("conn");
6860            conn.execute(
6861                "INSERT INTO operational_collections (name, kind, schema_json, retention_json, format_version, created_at) \
6862                 VALUES ('connector_health', 'latest_state', '{}', '{}', 1, 100)",
6863                [],
6864            )
6865            .expect("seed collection");
6866            conn.execute(
6867                "INSERT INTO operational_mutations \
6868                 (id, collection_name, record_key, op_kind, payload_json, source_ref, created_at, mutation_order) \
6869                 VALUES ('m3', 'connector_health', 'gmail', 'put', '{\"status\":\"old\"}', 'src-1', 100, 1)",
6870                [],
6871            )
6872            .expect("seed first put");
6873            conn.execute(
6874                "INSERT INTO operational_mutations \
6875                 (id, collection_name, record_key, op_kind, payload_json, source_ref, created_at, mutation_order) \
6876                 VALUES ('m2', 'connector_health', 'gmail', 'delete', '', 'src-2', 100, 2)",
6877                [],
6878            )
6879            .expect("seed delete");
6880            conn.execute(
6881                "INSERT INTO operational_mutations \
6882                 (id, collection_name, record_key, op_kind, payload_json, source_ref, created_at, mutation_order) \
6883                 VALUES ('m1', 'connector_health', 'gmail', 'put', '{\"status\":\"new\"}', 'src-3', 100, 3)",
6884                [],
6885            )
6886            .expect("seed final put");
6887            conn.execute(
6888                "INSERT INTO operational_current \
6889                 (collection_name, record_key, payload_json, updated_at, last_mutation_id) \
6890                 VALUES ('connector_health', 'gmail', '{\"status\":\"new\"}', 100, 'm1')",
6891                [],
6892            )
6893            .expect("seed current");
6894        }
6895
6896        let before = service.check_semantics().expect("semantics before rebuild");
6897        assert_eq!(before.missing_operational_current_rows, 0);
6898        assert_eq!(before.stale_operational_current_rows, 0);
6899
6900        {
6901            let conn = sqlite::open_connection(db.path()).expect("conn");
6902            conn.execute(
6903                "DELETE FROM operational_current WHERE collection_name = 'connector_health' AND record_key = 'gmail'",
6904                [],
6905            )
6906            .expect("delete current row");
6907        }
6908
6909        let missing = service.check_semantics().expect("semantics after delete");
6910        assert_eq!(missing.missing_operational_current_rows, 1);
6911        assert_eq!(missing.stale_operational_current_rows, 0);
6912
6913        service
6914            .rebuild_operational_current(Some("connector_health"))
6915            .expect("rebuild current");
6916
6917        let after = service.check_semantics().expect("semantics after rebuild");
6918        assert_eq!(after.missing_operational_current_rows, 0);
6919        assert_eq!(after.stale_operational_current_rows, 0);
6920
6921        let conn = sqlite::open_connection(db.path()).expect("conn");
6922        let payload: String = conn
6923            .query_row(
6924                "SELECT payload_json FROM operational_current \
6925                 WHERE collection_name = 'connector_health' AND record_key = 'gmail'",
6926                [],
6927                |row| row.get(0),
6928            )
6929            .expect("restored payload");
6930        assert_eq!(payload, r#"{"status":"new"}"#);
6931    }
6932
6933    #[test]
6934    fn disable_operational_collection_sets_disabled_at_and_emits_provenance() {
6935        let (db, service) = setup();
6936        service
6937            .register_operational_collection(&OperationalRegisterRequest {
6938                name: "audit_log".to_owned(),
6939                kind: OperationalCollectionKind::AppendOnlyLog,
6940                schema_json: "{}".to_owned(),
6941                retention_json: r#"{"mode":"keep_all"}"#.to_owned(),
6942                filter_fields_json: "[]".to_owned(),
6943                validation_json: String::new(),
6944                secondary_indexes_json: "[]".to_owned(),
6945                format_version: 1,
6946            })
6947            .expect("register collection");
6948
6949        let record = service
6950            .disable_operational_collection("audit_log")
6951            .expect("disable collection");
6952        assert_eq!(record.name, "audit_log");
6953        assert!(record.disabled_at.is_some());
6954
6955        let disabled_at = record.disabled_at.expect("disabled_at");
6956        let described = service
6957            .describe_operational_collection("audit_log")
6958            .expect("describe collection")
6959            .expect("collection exists");
6960        assert_eq!(described.disabled_at, Some(disabled_at));
6961
6962        let writer = crate::WriterActor::start(
6963            db.path(),
6964            Arc::new(SchemaManager::new()),
6965            crate::ProvenanceMode::Warn,
6966            Arc::new(crate::TelemetryCounters::default()),
6967        )
6968        .expect("writer");
6969        let error = writer
6970            .submit(crate::WriteRequest {
6971                label: "disabled-operational".to_owned(),
6972                nodes: vec![],
6973                node_retires: vec![],
6974                edges: vec![],
6975                edge_retires: vec![],
6976                chunks: vec![],
6977                runs: vec![],
6978                steps: vec![],
6979                actions: vec![],
6980                optional_backfills: vec![],
6981                vec_inserts: vec![],
6982                operational_writes: vec![crate::OperationalWrite::Append {
6983                    collection: "audit_log".to_owned(),
6984                    record_key: "evt-1".to_owned(),
6985                    payload_json: r#"{"type":"sync"}"#.to_owned(),
6986                    source_ref: Some("src-1".to_owned()),
6987                }],
6988            })
6989            .expect_err("disabled collection should reject writes");
6990        assert!(matches!(error, EngineError::InvalidWrite(_)));
6991        assert!(error.to_string().contains("is disabled"));
6992
6993        let conn = sqlite::open_connection(db.path()).expect("conn");
6994        let provenance_count: i64 = conn
6995            .query_row(
6996                "SELECT count(*) FROM provenance_events \
6997                 WHERE event_type = 'operational_collection_disabled' AND subject = 'audit_log'",
6998                [],
6999                |row| row.get(0),
7000            )
7001            .expect("provenance count");
7002        assert_eq!(provenance_count, 1);
7003    }
7004
7005    #[test]
7006    fn purge_operational_collection_deletes_append_only_rows_before_cutoff() {
7007        let (db, service) = setup();
7008        {
7009            let conn = sqlite::open_connection(db.path()).expect("conn");
7010            conn.execute(
7011                "INSERT INTO operational_collections (name, kind, schema_json, retention_json, format_version, created_at) \
7012                 VALUES ('audit_log', 'append_only_log', '{}', '{\"mode\":\"keep_all\"}', 1, 100)",
7013                [],
7014            )
7015            .expect("seed collection");
7016            conn.execute(
7017                "INSERT INTO operational_mutations \
7018                 (id, collection_name, record_key, op_kind, payload_json, source_ref, created_at, mutation_order) \
7019                 VALUES ('evt-1', 'audit_log', 'evt-1', 'append', '{\"seq\":1}', 'src-1', 100, 1)",
7020                [],
7021            )
7022            .expect("seed event 1");
7023            conn.execute(
7024                "INSERT INTO operational_mutations \
7025                 (id, collection_name, record_key, op_kind, payload_json, source_ref, created_at, mutation_order) \
7026                 VALUES ('evt-2', 'audit_log', 'evt-2', 'append', '{\"seq\":2}', 'src-2', 200, 2)",
7027                [],
7028            )
7029            .expect("seed event 2");
7030            conn.execute(
7031                "INSERT INTO operational_mutations \
7032                 (id, collection_name, record_key, op_kind, payload_json, source_ref, created_at, mutation_order) \
7033                 VALUES ('evt-3', 'audit_log', 'evt-3', 'append', '{\"seq\":3}', 'src-3', 300, 3)",
7034                [],
7035            )
7036            .expect("seed event 3");
7037        }
7038
7039        let report = service
7040            .purge_operational_collection("audit_log", 250)
7041            .expect("purge collection");
7042        assert_eq!(report.collection_name, "audit_log");
7043        assert_eq!(report.deleted_mutations, 2);
7044        assert_eq!(report.before_timestamp, 250);
7045
7046        let conn = sqlite::open_connection(db.path()).expect("conn");
7047        let remaining: Vec<String> = {
7048            let mut stmt = conn
7049                .prepare(
7050                    "SELECT id FROM operational_mutations \
7051                     WHERE collection_name = 'audit_log' ORDER BY mutation_order",
7052                )
7053                .expect("stmt");
7054            stmt.query_map([], |row| row.get(0))
7055                .expect("rows")
7056                .collect::<Result<_, _>>()
7057                .expect("collect")
7058        };
7059        assert_eq!(remaining, vec!["evt-3".to_owned()]);
7060        let provenance_count: i64 = conn
7061            .query_row(
7062                "SELECT count(*) FROM provenance_events \
7063                 WHERE event_type = 'operational_collection_purged' AND subject = 'audit_log'",
7064                [],
7065                |row| row.get(0),
7066            )
7067            .expect("provenance count");
7068        assert_eq!(provenance_count, 1);
7069    }
7070
7071    #[test]
7072    fn compact_operational_collection_dry_run_reports_without_mutation() {
7073        let (db, service) = setup();
7074        {
7075            let conn = sqlite::open_connection(db.path()).expect("conn");
7076            conn.execute(
7077                "INSERT INTO operational_collections (name, kind, schema_json, retention_json, format_version, created_at) \
7078                 VALUES ('audit_log', 'append_only_log', '{}', '{\"mode\":\"keep_last\",\"max_rows\":2}', 1, 100)",
7079                [],
7080            )
7081            .expect("seed collection");
7082            for (index, created_at) in [(1_i64, 100_i64), (2, 200), (3, 300)] {
7083                conn.execute(
7084                    "INSERT INTO operational_mutations \
7085                     (id, collection_name, record_key, op_kind, payload_json, source_ref, created_at, mutation_order) \
7086                     VALUES (?1, 'audit_log', ?1, 'append', ?2, 'src', ?3, ?4)",
7087                    rusqlite::params![
7088                        format!("evt-{index}"),
7089                        format!("{{\"seq\":{index}}}"),
7090                        created_at,
7091                        index,
7092                    ],
7093                )
7094                .expect("seed event");
7095            }
7096        }
7097
7098        let report = service
7099            .compact_operational_collection("audit_log", true)
7100            .expect("compact collection");
7101        assert_eq!(report.collection_name, "audit_log");
7102        assert_eq!(report.deleted_mutations, 1);
7103        assert!(report.dry_run);
7104        assert_eq!(report.before_timestamp, None);
7105
7106        let conn = sqlite::open_connection(db.path()).expect("conn");
7107        let remaining_count: i64 = conn
7108            .query_row(
7109                "SELECT count(*) FROM operational_mutations WHERE collection_name = 'audit_log'",
7110                [],
7111                |row| row.get(0),
7112            )
7113            .expect("remaining count");
7114        assert_eq!(remaining_count, 3);
7115        let provenance_count: i64 = conn
7116            .query_row(
7117                "SELECT count(*) FROM provenance_events \
7118                 WHERE event_type = 'operational_collection_compacted' AND subject = 'audit_log'",
7119                [],
7120                |row| row.get(0),
7121            )
7122            .expect("provenance count");
7123        assert_eq!(provenance_count, 0);
7124    }
7125
7126    #[test]
7127    fn compact_operational_collection_keep_last_deletes_oldest_rows() {
7128        let (db, service) = setup();
7129        {
7130            let conn = sqlite::open_connection(db.path()).expect("conn");
7131            conn.execute(
7132                "INSERT INTO operational_collections (name, kind, schema_json, retention_json, format_version, created_at) \
7133                 VALUES ('audit_log', 'append_only_log', '{}', '{\"mode\":\"keep_last\",\"max_rows\":2}', 1, 100)",
7134                [],
7135            )
7136            .expect("seed collection");
7137            for (index, created_at) in [(1_i64, 100_i64), (2, 200), (3, 300)] {
7138                conn.execute(
7139                    "INSERT INTO operational_mutations \
7140                     (id, collection_name, record_key, op_kind, payload_json, source_ref, created_at, mutation_order) \
7141                     VALUES (?1, 'audit_log', ?1, 'append', ?2, 'src', ?3, ?4)",
7142                    rusqlite::params![
7143                        format!("evt-{index}"),
7144                        format!("{{\"seq\":{index}}}"),
7145                        created_at,
7146                        index,
7147                    ],
7148                )
7149                .expect("seed event");
7150            }
7151        }
7152
7153        let report = service
7154            .compact_operational_collection("audit_log", false)
7155            .expect("compact collection");
7156        assert_eq!(report.deleted_mutations, 1);
7157        assert!(!report.dry_run);
7158
7159        let conn = sqlite::open_connection(db.path()).expect("conn");
7160        let remaining: Vec<String> = {
7161            let mut stmt = conn
7162                .prepare(
7163                    "SELECT id FROM operational_mutations \
7164                     WHERE collection_name = 'audit_log' ORDER BY mutation_order",
7165                )
7166                .expect("stmt");
7167            stmt.query_map([], |row| row.get(0))
7168                .expect("rows")
7169                .collect::<Result<_, _>>()
7170                .expect("collect")
7171        };
7172        assert_eq!(remaining, vec!["evt-2".to_owned(), "evt-3".to_owned()]);
7173        let provenance_count: i64 = conn
7174            .query_row(
7175                "SELECT count(*) FROM provenance_events \
7176                 WHERE event_type = 'operational_collection_compacted' AND subject = 'audit_log'",
7177                [],
7178                |row| row.get(0),
7179            )
7180            .expect("provenance count");
7181        assert_eq!(provenance_count, 1);
7182    }
7183
7184    #[test]
7185    fn plan_and_run_operational_retention_keep_last() {
7186        let (db, service) = setup();
7187        {
7188            let conn = sqlite::open_connection(db.path()).expect("conn");
7189            conn.execute(
7190                "INSERT INTO operational_collections (name, kind, schema_json, retention_json, format_version, created_at) \
7191                 VALUES ('audit_log', 'append_only_log', '{}', '{\"mode\":\"keep_last\",\"max_rows\":2}', 1, 100)",
7192                [],
7193            )
7194            .expect("seed collection");
7195            for (index, created_at) in [(1_i64, 100_i64), (2, 200), (3, 300)] {
7196                conn.execute(
7197                    "INSERT INTO operational_mutations \
7198                     (id, collection_name, record_key, op_kind, payload_json, source_ref, created_at, mutation_order) \
7199                     VALUES (?1, 'audit_log', ?1, 'append', ?2, 'src', ?3, ?4)",
7200                    rusqlite::params![
7201                        format!("evt-{index}"),
7202                        format!("{{\"seq\":{index}}}"),
7203                        created_at,
7204                        index,
7205                    ],
7206                )
7207                .expect("seed event");
7208            }
7209        }
7210
7211        let plan = service
7212            .plan_operational_retention(1_000, None, Some(10))
7213            .expect("plan retention");
7214        assert_eq!(plan.collections_examined, 1);
7215        assert_eq!(plan.items[0].collection_name, "audit_log");
7216        assert_eq!(
7217            plan.items[0].action_kind,
7218            crate::operational::OperationalRetentionActionKind::KeepLast
7219        );
7220        assert_eq!(plan.items[0].candidate_deletions, 1);
7221        assert_eq!(plan.items[0].max_rows, Some(2));
7222        assert_eq!(plan.items[0].last_run_at, None);
7223
7224        let dry_run = service
7225            .run_operational_retention(1_000, None, Some(10), true)
7226            .expect("dry-run retention");
7227        assert!(dry_run.dry_run);
7228        assert_eq!(dry_run.collections_acted_on, 1);
7229        assert_eq!(dry_run.items[0].deleted_mutations, 1);
7230        assert_eq!(dry_run.items[0].rows_remaining, 2);
7231
7232        let conn = sqlite::open_connection(db.path()).expect("conn");
7233        let remaining_count: i64 = conn
7234            .query_row(
7235                "SELECT count(*) FROM operational_mutations WHERE collection_name = 'audit_log'",
7236                [],
7237                |row| row.get(0),
7238            )
7239            .expect("remaining count after dry run");
7240        assert_eq!(remaining_count, 3);
7241        let retention_run_count: i64 = conn
7242            .query_row(
7243                "SELECT count(*) FROM operational_retention_runs WHERE collection_name = 'audit_log'",
7244                [],
7245                |row| row.get(0),
7246            )
7247            .expect("retention run count");
7248        assert_eq!(retention_run_count, 0);
7249        drop(conn);
7250
7251        let executed = service
7252            .run_operational_retention(1_000, None, Some(10), false)
7253            .expect("execute retention");
7254        assert_eq!(executed.collections_acted_on, 1);
7255        assert_eq!(executed.items[0].deleted_mutations, 1);
7256        assert_eq!(executed.items[0].rows_remaining, 2);
7257
7258        let conn = sqlite::open_connection(db.path()).expect("conn");
7259        let remaining: Vec<String> = {
7260            let mut stmt = conn
7261                .prepare(
7262                    "SELECT id FROM operational_mutations \
7263                     WHERE collection_name = 'audit_log' ORDER BY mutation_order",
7264                )
7265                .expect("stmt");
7266            stmt.query_map([], |row| row.get(0))
7267                .expect("rows")
7268                .collect::<Result<_, _>>()
7269                .expect("collect")
7270        };
7271        assert_eq!(remaining, vec!["evt-2".to_owned(), "evt-3".to_owned()]);
7272        let last_run_at: i64 = conn
7273            .query_row(
7274                "SELECT executed_at FROM operational_retention_runs \
7275                 WHERE collection_name = 'audit_log' ORDER BY executed_at DESC LIMIT 1",
7276                [],
7277                |row| row.get(0),
7278            )
7279            .expect("last run at");
7280        assert_eq!(last_run_at, 1_000);
7281    }
7282
7283    #[test]
7284    fn dry_run_operational_retention_does_not_mark_noop_collection_as_acted_on() {
7285        let (db, service) = setup();
7286        let conn = sqlite::open_connection(db.path()).expect("conn");
7287        conn.execute(
7288            "INSERT INTO operational_collections (name, kind, schema_json, retention_json, format_version, created_at) \
7289             VALUES ('audit_log', 'append_only_log', '{}', '{\"mode\":\"keep_last\",\"max_rows\":2}', 1, 100)",
7290            [],
7291        )
7292        .expect("seed collection");
7293        for (index, created_at) in [(1_i64, 100_i64), (2, 200)] {
7294            conn.execute(
7295                "INSERT INTO operational_mutations \
7296                 (id, collection_name, record_key, op_kind, payload_json, source_ref, created_at, mutation_order) \
7297                 VALUES (?1, 'audit_log', ?1, 'append', ?2, 'src', ?3, ?4)",
7298                rusqlite::params![
7299                    format!("evt-{index}"),
7300                    format!("{{\"seq\":{index}}}"),
7301                    created_at,
7302                    index,
7303                ],
7304            )
7305            .expect("seed event");
7306        }
7307        drop(conn);
7308
7309        let dry_run = service
7310            .run_operational_retention(1_000, None, Some(10), true)
7311            .expect("dry-run retention");
7312        assert!(dry_run.dry_run);
7313        assert_eq!(dry_run.collections_acted_on, 0);
7314        assert_eq!(dry_run.items[0].deleted_mutations, 0);
7315        assert_eq!(dry_run.items[0].rows_remaining, 2);
7316    }
7317
7318    #[test]
7319    fn compact_operational_collection_rejects_latest_state() {
7320        let (_db, service) = setup();
7321        service
7322            .register_operational_collection(&OperationalRegisterRequest {
7323                name: "connector_health".to_owned(),
7324                kind: OperationalCollectionKind::LatestState,
7325                schema_json: "{}".to_owned(),
7326                retention_json: r#"{"mode":"keep_all"}"#.to_owned(),
7327                filter_fields_json: "[]".to_owned(),
7328                validation_json: String::new(),
7329                secondary_indexes_json: "[]".to_owned(),
7330                format_version: 1,
7331            })
7332            .expect("register collection");
7333
7334        let error = service
7335            .compact_operational_collection("connector_health", false)
7336            .expect_err("latest_state compaction should be rejected");
7337        assert!(matches!(error, EngineError::InvalidWrite(_)));
7338        assert!(error.to_string().contains("append_only_log"));
7339    }
7340
7341    #[test]
7342    fn register_operational_collection_persists_filter_fields_json() {
7343        let (_db, service) = setup();
7344
7345        let record = service
7346            .register_operational_collection(&OperationalRegisterRequest {
7347                name: "audit_log".to_owned(),
7348                kind: OperationalCollectionKind::AppendOnlyLog,
7349                schema_json: "{}".to_owned(),
7350                retention_json: r#"{"mode":"keep_all"}"#.to_owned(),
7351                filter_fields_json: r#"[{"name":"actor","type":"string","modes":["exact","prefix"]},{"name":"ts","type":"timestamp","modes":["range"]}]"#.to_owned(),
7352                validation_json: String::new(),
7353                secondary_indexes_json: "[]".to_owned(),
7354                format_version: 1,
7355            })
7356            .expect("register collection");
7357
7358        assert_eq!(
7359            record.filter_fields_json,
7360            r#"[{"name":"actor","type":"string","modes":["exact","prefix"]},{"name":"ts","type":"timestamp","modes":["range"]}]"#
7361        );
7362    }
7363
7364    #[test]
7365    fn read_operational_collection_filters_append_only_rows_by_declared_fields() {
7366        let (db, service) = setup();
7367        service
7368            .register_operational_collection(&OperationalRegisterRequest {
7369                name: "audit_log".to_owned(),
7370                kind: OperationalCollectionKind::AppendOnlyLog,
7371                schema_json: "{}".to_owned(),
7372                retention_json: r#"{"mode":"keep_all"}"#.to_owned(),
7373                filter_fields_json: r#"[{"name":"actor","type":"string","modes":["exact","prefix"]},{"name":"seq","type":"integer","modes":["exact","range"]},{"name":"ts","type":"timestamp","modes":["exact","range"]}]"#.to_owned(),
7374                validation_json: String::new(),
7375                secondary_indexes_json: "[]".to_owned(),
7376                format_version: 1,
7377            })
7378            .expect("register collection");
7379        {
7380            let writer = crate::WriterActor::start(
7381                db.path(),
7382                Arc::new(SchemaManager::new()),
7383                crate::ProvenanceMode::Warn,
7384                Arc::new(crate::TelemetryCounters::default()),
7385            )
7386            .expect("writer");
7387            writer
7388                .submit(crate::WriteRequest {
7389                    label: "operational".to_owned(),
7390                    nodes: vec![],
7391                    node_retires: vec![],
7392                    edges: vec![],
7393                    edge_retires: vec![],
7394                    chunks: vec![],
7395                    runs: vec![],
7396                    steps: vec![],
7397                    actions: vec![],
7398                    optional_backfills: vec![],
7399                    vec_inserts: vec![],
7400                    operational_writes: vec![
7401                        crate::OperationalWrite::Append {
7402                            collection: "audit_log".to_owned(),
7403                            record_key: "evt-1".to_owned(),
7404                            payload_json: r#"{"actor":"alice","seq":1,"ts":100}"#.to_owned(),
7405                            source_ref: Some("src-1".to_owned()),
7406                        },
7407                        crate::OperationalWrite::Append {
7408                            collection: "audit_log".to_owned(),
7409                            record_key: "evt-2".to_owned(),
7410                            payload_json: r#"{"actor":"alice-admin","seq":2,"ts":200}"#.to_owned(),
7411                            source_ref: Some("src-2".to_owned()),
7412                        },
7413                        crate::OperationalWrite::Append {
7414                            collection: "audit_log".to_owned(),
7415                            record_key: "evt-3".to_owned(),
7416                            payload_json: r#"{"actor":"bob","seq":3,"ts":300}"#.to_owned(),
7417                            source_ref: Some("src-3".to_owned()),
7418                        },
7419                    ],
7420                })
7421                .expect("write");
7422        }
7423
7424        let report = service
7425            .read_operational_collection(&crate::operational::OperationalReadRequest {
7426                collection_name: "audit_log".to_owned(),
7427                filters: vec![
7428                    crate::operational::OperationalFilterClause::Prefix {
7429                        field: "actor".to_owned(),
7430                        value: "alice".to_owned(),
7431                    },
7432                    crate::operational::OperationalFilterClause::Range {
7433                        field: "ts".to_owned(),
7434                        lower: Some(150),
7435                        upper: Some(250),
7436                    },
7437                ],
7438                limit: Some(10),
7439            })
7440            .expect("filtered read");
7441
7442        assert_eq!(report.collection_name, "audit_log");
7443        assert_eq!(report.row_count, 1);
7444        assert!(!report.was_limited);
7445        assert_eq!(report.rows.len(), 1);
7446        assert_eq!(report.rows[0].record_key, "evt-2");
7447        assert_eq!(
7448            report.rows[0].payload_json,
7449            r#"{"actor":"alice-admin","seq":2,"ts":200}"#
7450        );
7451    }
7452
7453    #[test]
7454    fn read_operational_collection_uses_secondary_index_when_filter_values_are_missing() {
7455        let (db, service) = setup();
7456        service
7457            .register_operational_collection(&OperationalRegisterRequest {
7458                name: "audit_log".to_owned(),
7459                kind: OperationalCollectionKind::AppendOnlyLog,
7460                schema_json: "{}".to_owned(),
7461                retention_json: r#"{"mode":"keep_all"}"#.to_owned(),
7462                filter_fields_json: r#"[{"name":"actor","type":"string","modes":["exact","prefix"]},{"name":"ts","type":"timestamp","modes":["range"]}]"#.to_owned(),
7463                validation_json: String::new(),
7464                secondary_indexes_json: r#"[{"name":"actor_ts","kind":"append_only_field_time","field":"actor","value_type":"string","time_field":"ts"}]"#.to_owned(),
7465                format_version: 1,
7466            })
7467            .expect("register collection");
7468        {
7469            let writer = crate::WriterActor::start(
7470                db.path(),
7471                Arc::new(SchemaManager::new()),
7472                crate::ProvenanceMode::Warn,
7473                Arc::new(crate::TelemetryCounters::default()),
7474            )
7475            .expect("writer");
7476            writer
7477                .submit(crate::WriteRequest {
7478                    label: "operational".to_owned(),
7479                    nodes: vec![],
7480                    node_retires: vec![],
7481                    edges: vec![],
7482                    edge_retires: vec![],
7483                    chunks: vec![],
7484                    runs: vec![],
7485                    steps: vec![],
7486                    actions: vec![],
7487                    optional_backfills: vec![],
7488                    vec_inserts: vec![],
7489                    operational_writes: vec![
7490                        crate::OperationalWrite::Append {
7491                            collection: "audit_log".to_owned(),
7492                            record_key: "evt-1".to_owned(),
7493                            payload_json: r#"{"actor":"alice","ts":100}"#.to_owned(),
7494                            source_ref: Some("src-1".to_owned()),
7495                        },
7496                        crate::OperationalWrite::Append {
7497                            collection: "audit_log".to_owned(),
7498                            record_key: "evt-2".to_owned(),
7499                            payload_json: r#"{"actor":"alice-admin","ts":200}"#.to_owned(),
7500                            source_ref: Some("src-2".to_owned()),
7501                        },
7502                    ],
7503                })
7504                .expect("write");
7505        }
7506        let conn = sqlite::open_connection(db.path()).expect("conn");
7507        conn.execute(
7508            "DELETE FROM operational_filter_values WHERE collection_name = 'audit_log'",
7509            [],
7510        )
7511        .expect("clear filter values");
7512        drop(conn);
7513
7514        let report = service
7515            .read_operational_collection(&crate::operational::OperationalReadRequest {
7516                collection_name: "audit_log".to_owned(),
7517                filters: vec![
7518                    crate::operational::OperationalFilterClause::Prefix {
7519                        field: "actor".to_owned(),
7520                        value: "alice".to_owned(),
7521                    },
7522                    crate::operational::OperationalFilterClause::Range {
7523                        field: "ts".to_owned(),
7524                        lower: Some(150),
7525                        upper: Some(250),
7526                    },
7527                ],
7528                limit: Some(10),
7529            })
7530            .expect("secondary-index read");
7531
7532        assert_eq!(report.row_count, 1);
7533        assert_eq!(report.rows[0].record_key, "evt-2");
7534    }
7535
7536    #[test]
7537    fn read_operational_collection_rejects_undeclared_fields_and_latest_state_collections() {
7538        let (_db, service) = setup();
7539        service
7540            .register_operational_collection(&OperationalRegisterRequest {
7541                name: "connector_health".to_owned(),
7542                kind: OperationalCollectionKind::LatestState,
7543                schema_json: "{}".to_owned(),
7544                retention_json: "{}".to_owned(),
7545                filter_fields_json: r#"[{"name":"status","type":"string","modes":["exact"]}]"#
7546                    .to_owned(),
7547                validation_json: String::new(),
7548                secondary_indexes_json: "[]".to_owned(),
7549                format_version: 1,
7550            })
7551            .expect("register collection");
7552
7553        let latest_state_error = service
7554            .read_operational_collection(&crate::operational::OperationalReadRequest {
7555                collection_name: "connector_health".to_owned(),
7556                filters: vec![crate::operational::OperationalFilterClause::Exact {
7557                    field: "status".to_owned(),
7558                    value: crate::operational::OperationalFilterValue::String("ok".to_owned()),
7559                }],
7560                limit: Some(10),
7561            })
7562            .expect_err("latest_state filtered reads should be rejected");
7563        assert!(latest_state_error.to_string().contains("append_only_log"));
7564
7565        service
7566            .register_operational_collection(&OperationalRegisterRequest {
7567                name: "audit_log".to_owned(),
7568                kind: OperationalCollectionKind::AppendOnlyLog,
7569                schema_json: "{}".to_owned(),
7570                retention_json: r#"{"mode":"keep_all"}"#.to_owned(),
7571                filter_fields_json: r#"[{"name":"actor","type":"string","modes":["exact"]}]"#
7572                    .to_owned(),
7573                validation_json: String::new(),
7574                secondary_indexes_json: "[]".to_owned(),
7575                format_version: 1,
7576            })
7577            .expect("register append-only collection");
7578
7579        let undeclared_error = service
7580            .read_operational_collection(&crate::operational::OperationalReadRequest {
7581                collection_name: "audit_log".to_owned(),
7582                filters: vec![crate::operational::OperationalFilterClause::Exact {
7583                    field: "missing".to_owned(),
7584                    value: crate::operational::OperationalFilterValue::String("x".to_owned()),
7585                }],
7586                limit: Some(10),
7587            })
7588            .expect_err("undeclared field should be rejected");
7589        assert!(undeclared_error.to_string().contains("undeclared"));
7590    }
7591
7592    #[test]
7593    fn read_operational_collection_applies_limit_and_reports_truncation() {
7594        let (db, service) = setup();
7595        service
7596            .register_operational_collection(&OperationalRegisterRequest {
7597                name: "audit_log".to_owned(),
7598                kind: OperationalCollectionKind::AppendOnlyLog,
7599                schema_json: "{}".to_owned(),
7600                retention_json: r#"{"mode":"keep_all"}"#.to_owned(),
7601                filter_fields_json: r#"[{"name":"actor","type":"string","modes":["prefix"]}]"#
7602                    .to_owned(),
7603                validation_json: String::new(),
7604                secondary_indexes_json: "[]".to_owned(),
7605                format_version: 1,
7606            })
7607            .expect("register collection");
7608        {
7609            let writer = crate::WriterActor::start(
7610                db.path(),
7611                Arc::new(SchemaManager::new()),
7612                crate::ProvenanceMode::Warn,
7613                Arc::new(crate::TelemetryCounters::default()),
7614            )
7615            .expect("writer");
7616            writer
7617                .submit(crate::WriteRequest {
7618                    label: "operational".to_owned(),
7619                    nodes: vec![],
7620                    node_retires: vec![],
7621                    edges: vec![],
7622                    edge_retires: vec![],
7623                    chunks: vec![],
7624                    runs: vec![],
7625                    steps: vec![],
7626                    actions: vec![],
7627                    optional_backfills: vec![],
7628                    vec_inserts: vec![],
7629                    operational_writes: vec![
7630                        crate::OperationalWrite::Append {
7631                            collection: "audit_log".to_owned(),
7632                            record_key: "evt-1".to_owned(),
7633                            payload_json: r#"{"actor":"alice-1"}"#.to_owned(),
7634                            source_ref: Some("src-1".to_owned()),
7635                        },
7636                        crate::OperationalWrite::Append {
7637                            collection: "audit_log".to_owned(),
7638                            record_key: "evt-2".to_owned(),
7639                            payload_json: r#"{"actor":"alice-2"}"#.to_owned(),
7640                            source_ref: Some("src-2".to_owned()),
7641                        },
7642                    ],
7643                })
7644                .expect("write");
7645        }
7646
7647        let report = service
7648            .read_operational_collection(&crate::operational::OperationalReadRequest {
7649                collection_name: "audit_log".to_owned(),
7650                filters: vec![crate::operational::OperationalFilterClause::Prefix {
7651                    field: "actor".to_owned(),
7652                    value: "alice".to_owned(),
7653                }],
7654                limit: Some(1),
7655            })
7656            .expect("limited read");
7657
7658        assert_eq!(report.row_count, 1);
7659        assert_eq!(report.applied_limit, 1);
7660        assert!(report.was_limited);
7661        assert_eq!(report.rows[0].record_key, "evt-2");
7662    }
7663
7664    #[test]
7665    fn preexisting_operational_collection_can_gain_filter_contract_after_upgrade() {
7666        let db = NamedTempFile::new().expect("temp db");
7667        let conn = sqlite::open_connection(db.path()).expect("conn");
7668        conn.execute_batch(
7669            r#"
7670            CREATE TABLE operational_collections (
7671                name TEXT PRIMARY KEY,
7672                kind TEXT NOT NULL,
7673                schema_json TEXT NOT NULL,
7674                retention_json TEXT NOT NULL,
7675                format_version INTEGER NOT NULL DEFAULT 1,
7676                created_at INTEGER NOT NULL DEFAULT 100,
7677                disabled_at INTEGER
7678            );
7679            CREATE TABLE operational_mutations (
7680                id TEXT PRIMARY KEY,
7681                collection_name TEXT NOT NULL,
7682                record_key TEXT NOT NULL,
7683                op_kind TEXT NOT NULL,
7684                payload_json TEXT NOT NULL,
7685                source_ref TEXT,
7686                created_at INTEGER NOT NULL DEFAULT 100,
7687                mutation_order INTEGER NOT NULL DEFAULT 1
7688            );
7689            INSERT INTO operational_collections (name, kind, schema_json, retention_json, format_version, created_at)
7690            VALUES ('audit_log', 'append_only_log', '{}', '{"mode":"keep_all"}', 1, 100);
7691            INSERT INTO operational_mutations
7692                (id, collection_name, record_key, op_kind, payload_json, source_ref, created_at, mutation_order)
7693            VALUES
7694                ('evt-1', 'audit_log', 'evt-1', 'append', '{"actor":"alice","ts":0}', 'src-1', 100, 1);
7695            "#,
7696        )
7697        .expect("seed pre-v10 schema");
7698        drop(conn);
7699
7700        let service = AdminService::new(db.path(), Arc::new(SchemaManager::new()));
7701        let pre_update = service
7702            .read_operational_collection(&crate::operational::OperationalReadRequest {
7703                collection_name: "audit_log".to_owned(),
7704                filters: vec![crate::operational::OperationalFilterClause::Exact {
7705                    field: "actor".to_owned(),
7706                    value: crate::operational::OperationalFilterValue::String("alice".to_owned()),
7707                }],
7708                limit: Some(10),
7709            })
7710            .expect_err("read should reject undeclared fields before migration update");
7711        assert!(pre_update.to_string().contains("undeclared"));
7712
7713        let updated = service
7714            .update_operational_collection_filters(
7715                "audit_log",
7716                r#"[{"name":"actor","type":"string","modes":["exact"]},{"name":"ts","type":"timestamp","modes":["range"]}]"#,
7717            )
7718            .expect("update filter contract");
7719        assert!(updated.filter_fields_json.contains("\"actor\""));
7720
7721        let report = service
7722            .read_operational_collection(&crate::operational::OperationalReadRequest {
7723                collection_name: "audit_log".to_owned(),
7724                filters: vec![crate::operational::OperationalFilterClause::Range {
7725                    field: "ts".to_owned(),
7726                    lower: Some(0),
7727                    upper: Some(0),
7728                }],
7729                limit: Some(10),
7730            })
7731            .expect("read after explicit filter update");
7732        assert_eq!(report.row_count, 1);
7733        assert_eq!(report.rows[0].record_key, "evt-1");
7734    }
7735
7736    #[cfg(feature = "sqlite-vec")]
7737    #[test]
7738    fn check_semantics_detects_stale_vec_rows() {
7739        use crate::sqlite::open_connection_with_vec;
7740
7741        let db = NamedTempFile::new().expect("temp file");
7742        let schema = Arc::new(SchemaManager::new());
7743        {
7744            let conn = open_connection_with_vec(db.path()).expect("vec conn");
7745            schema.bootstrap(&conn).expect("bootstrap");
7746            schema
7747                .ensure_vec_kind_profile(&conn, "Doc", 3)
7748                .expect("vec kind profile");
7749            // Insert a vec row whose chunk does not exist.
7750            let bytes: Vec<u8> = [0.1f32, 0.2f32, 0.3f32]
7751                .iter()
7752                .flat_map(|f| f.to_le_bytes())
7753                .collect();
7754            conn.execute(
7755                "INSERT INTO vec_doc (chunk_id, embedding) VALUES ('ghost-chunk', ?1)",
7756                rusqlite::params![bytes],
7757            )
7758            .expect("insert stale vec row");
7759        }
7760        let service = AdminService::new(db.path(), Arc::clone(&schema));
7761        let report = service.check_semantics().expect("semantics check");
7762        assert_eq!(report.stale_vec_rows, 1);
7763        assert!(
7764            report.warnings.iter().any(|w| w.contains("stale vec")),
7765            "warning must mention stale vec"
7766        );
7767    }
7768
7769    #[cfg(feature = "sqlite-vec")]
7770    #[test]
7771    fn restore_vector_profiles_recreates_vec_table_from_metadata() {
7772        let db = NamedTempFile::new().expect("temp file");
7773        let schema = Arc::new(SchemaManager::new());
7774        {
7775            let conn = crate::sqlite::open_connection_with_vec(db.path()).expect("vec conn");
7776            schema.bootstrap(&conn).expect("bootstrap");
7777            conn.execute(
7778                "INSERT INTO vector_profiles (profile, table_name, dimension, enabled) \
7779                 VALUES ('default', 'vec_nodes_active', 3, 1)",
7780                [],
7781            )
7782            .expect("insert vector profile");
7783        }
7784
7785        let service = AdminService::new(db.path(), Arc::clone(&schema));
7786        let report = service
7787            .restore_vector_profiles()
7788            .expect("restore vector profiles");
7789        assert_eq!(
7790            report.targets,
7791            vec![crate::projection::ProjectionTarget::Vec]
7792        );
7793        assert_eq!(report.rebuilt_rows, 1);
7794
7795        let conn = crate::sqlite::open_connection_with_vec(db.path()).expect("vec conn");
7796        let count: i64 = conn
7797            .query_row(
7798                "SELECT count(*) FROM sqlite_schema WHERE name = 'vec_nodes_active'",
7799                [],
7800                |row| row.get(0),
7801            )
7802            .expect("vec schema count");
7803        assert_eq!(count, 1, "vec table should exist after restore");
7804    }
7805
7806    #[cfg(feature = "sqlite-vec")]
7807    #[test]
7808    fn load_vector_regeneration_config_supports_json_and_toml() {
7809        let dir = tempfile::tempdir().expect("temp dir");
7810        let json_path = dir.path().join("regen.json");
7811        let toml_path = dir.path().join("regen.toml");
7812
7813        let config = VectorRegenerationConfig {
7814            kind: "Document".to_owned(),
7815            profile: "default".to_owned(),
7816            chunking_policy: "per_chunk".to_owned(),
7817            preprocessing_policy: "trim".to_owned(),
7818        };
7819
7820        fs::write(&json_path, serde_json::to_string(&config).expect("json")).expect("write json");
7821        fs::write(&toml_path, toml::to_string(&config).expect("toml")).expect("write toml");
7822
7823        let parsed_json = load_vector_regeneration_config(&json_path).expect("json parse");
7824        let parsed_toml = load_vector_regeneration_config(&toml_path).expect("toml parse");
7825
7826        assert_eq!(parsed_json, config);
7827        assert_eq!(parsed_toml, config);
7828    }
7829
7830    /// The 0.4.0 rewrite removed the identity fields from the config.
7831    /// Any client that still serializes the pre-0.4 fields must be
7832    /// rejected AT THE SERDE BOUNDARY with a clear error — never
7833    /// silently accepted.
7834    #[test]
7835    fn regenerate_vector_embeddings_config_rejects_old_identity_fields() {
7836        // Pre-0.5.0 configs that include old fields (table_name, model_identity, etc.)
7837        // must be rejected at the serde boundary due to deny_unknown_fields.
7838        let legacy_json = r#"{
7839            "kind": "Document",
7840            "profile": "default",
7841            "table_name": "vec_nodes_active",
7842            "model_identity": "old-model",
7843            "model_version": "1.0",
7844            "dimension": 4,
7845            "normalization_policy": "l2",
7846            "chunking_policy": "per_chunk",
7847            "preprocessing_policy": "trim",
7848            "generator_command": ["/bin/echo"]
7849        }"#;
7850        let result: Result<VectorRegenerationConfig, _> = serde_json::from_str(legacy_json);
7851        assert!(
7852            result.is_err(),
7853            "legacy identity fields must be rejected at deserialization"
7854        );
7855    }
7856
7857    #[cfg(all(not(feature = "sqlite-vec"), unix))]
7858    #[test]
7859    fn regenerate_vector_embeddings_unsupported_vec_capability_writes_request_and_failed_audit() {
7860        let db = NamedTempFile::new().expect("temp file");
7861        let schema = Arc::new(SchemaManager::new());
7862
7863        {
7864            let conn = sqlite::open_connection(db.path()).expect("connection");
7865            schema.bootstrap(&conn).expect("bootstrap");
7866            conn.execute(
7867                "INSERT INTO nodes (row_id, logical_id, kind, properties, created_at, source_ref) \
7868                 VALUES ('row-1', 'doc-1', 'Document', '{}', 100, 'source-1')",
7869                [],
7870            )
7871            .expect("insert node");
7872            conn.execute(
7873                "INSERT INTO chunks (id, node_logical_id, text_content, created_at) \
7874                 VALUES ('chunk-1', 'doc-1', 'budget discussion', 100)",
7875                [],
7876            )
7877            .expect("insert chunk");
7878        }
7879
7880        let service = AdminService::new(db.path(), Arc::clone(&schema));
7881        let embedder = TestEmbedder::new("test-model", 4);
7882        let error = service
7883            .regenerate_vector_embeddings(
7884                &embedder,
7885                &VectorRegenerationConfig {
7886                    kind: "Document".to_owned(),
7887                    profile: "default".to_owned(),
7888                    chunking_policy: "per_chunk".to_owned(),
7889                    preprocessing_policy: "trim".to_owned(),
7890                },
7891            )
7892            .expect_err("sqlite-vec capability should be required");
7893
7894        assert!(error.to_string().contains("unsupported vec capability"));
7895
7896        let conn = sqlite::open_connection(db.path()).expect("connection");
7897        let request_count: i64 = conn
7898            .query_row(
7899                "SELECT count(*) FROM provenance_events WHERE event_type = 'vector_regeneration_requested' AND subject = 'default'",
7900                [],
7901                |row| row.get(0),
7902            )
7903            .expect("request count");
7904        assert_eq!(request_count, 1);
7905        let failed_count: i64 = conn
7906            .query_row(
7907                "SELECT count(*) FROM provenance_events WHERE event_type = 'vector_regeneration_failed' AND subject = 'default'",
7908                [],
7909                |row| row.get(0),
7910            )
7911            .expect("failed count");
7912        assert_eq!(failed_count, 1);
7913        let metadata_json: String = conn
7914            .query_row(
7915                "SELECT metadata_json FROM provenance_events WHERE event_type = 'vector_regeneration_failed' AND subject = 'default'",
7916                [],
7917                |row| row.get(0),
7918            )
7919            .expect("failed metadata");
7920        assert!(metadata_json.contains("\"failure_class\":\"unsupported vec capability\""));
7921    }
7922
7923    #[cfg(feature = "sqlite-vec")]
7924    #[test]
7925    #[allow(clippy::too_many_lines)]
7926    fn regenerate_vector_embeddings_rebuilds_embeddings_via_embedder() {
7927        let db = NamedTempFile::new().expect("temp file");
7928        let schema = Arc::new(SchemaManager::new());
7929
7930        {
7931            let conn = crate::sqlite::open_connection_with_vec(db.path()).expect("vec conn");
7932            schema.bootstrap(&conn).expect("bootstrap");
7933            conn.execute(
7934                "INSERT INTO nodes (row_id, logical_id, kind, properties, created_at, source_ref) \
7935                 VALUES ('row-1', 'doc-1', 'Document', '{}', 100, 'source-1')",
7936                [],
7937            )
7938            .expect("insert node");
7939            conn.execute(
7940                "INSERT INTO chunks (id, node_logical_id, text_content, created_at) \
7941                 VALUES ('chunk-1', 'doc-1', 'budget discussion', 100)",
7942                [],
7943            )
7944            .expect("insert chunk 1");
7945            conn.execute(
7946                "INSERT INTO chunks (id, node_logical_id, text_content, created_at) \
7947                 VALUES ('chunk-2', 'doc-1', 'travel plan', 101)",
7948                [],
7949            )
7950            .expect("insert chunk 2");
7951        }
7952
7953        let service = AdminService::new(db.path(), Arc::clone(&schema));
7954        let embedder = TestEmbedder::new("test-model", 4);
7955        let report = service
7956            .regenerate_vector_embeddings(
7957                &embedder,
7958                &VectorRegenerationConfig {
7959                    kind: "Document".to_owned(),
7960                    profile: "default".to_owned(),
7961                    chunking_policy: "per_chunk".to_owned(),
7962                    preprocessing_policy: "trim".to_owned(),
7963                },
7964            )
7965            .expect("regenerate vectors");
7966
7967        assert_eq!(report.profile, "default");
7968        assert_eq!(report.table_name, "vec_document");
7969        assert_eq!(report.dimension, 4);
7970        assert_eq!(report.total_chunks, 2);
7971        assert_eq!(report.regenerated_rows, 2);
7972        assert!(report.contract_persisted);
7973
7974        let conn = crate::sqlite::open_connection_with_vec(db.path()).expect("vec conn");
7975        let vec_count: i64 = conn
7976            .query_row("SELECT count(*) FROM vec_document", [], |row| row.get(0))
7977            .expect("vec count");
7978        assert_eq!(vec_count, 2);
7979
7980        // The persisted vector contract must reflect the embedder
7981        // identity — not any string the caller passed in, because the
7982        // caller never passes one.
7983        let (model_identity, model_version, dimension, normalization_policy): (
7984            String,
7985            String,
7986            i64,
7987            String,
7988        ) = conn
7989            .query_row(
7990                "SELECT model_identity, model_version, dimension, normalization_policy \
7991                 FROM vector_embedding_contracts WHERE profile = 'default'",
7992                [],
7993                |row| Ok((row.get(0)?, row.get(1)?, row.get(2)?, row.get(3)?)),
7994            )
7995            .expect("contract row");
7996        assert_eq!(model_identity, "test-model");
7997        assert_eq!(model_version, "1.0.0");
7998        assert_eq!(dimension, 4);
7999        assert_eq!(normalization_policy, "l2");
8000
8001        let contract_format_version: i64 = conn
8002            .query_row(
8003                "SELECT contract_format_version FROM vector_embedding_contracts WHERE profile = 'default'",
8004                [],
8005                |row| row.get(0),
8006            )
8007            .expect("contract_format_version");
8008        assert_eq!(contract_format_version, 1);
8009        let request_count: i64 = conn
8010            .query_row(
8011                "SELECT count(*) FROM provenance_events WHERE event_type = 'vector_regeneration_requested' AND subject = 'default'",
8012                [],
8013                |row| row.get(0),
8014            )
8015            .expect("request audit count");
8016        assert_eq!(request_count, 1);
8017        let apply_count: i64 = conn
8018            .query_row(
8019                "SELECT count(*) FROM provenance_events WHERE event_type = 'vector_regeneration_apply' AND subject = 'default'",
8020                [],
8021                |row| row.get(0),
8022            )
8023            .expect("apply audit count");
8024        assert_eq!(apply_count, 1);
8025        let apply_metadata: String = conn
8026            .query_row(
8027                "SELECT metadata_json FROM provenance_events WHERE event_type = 'vector_regeneration_apply' AND subject = 'default'",
8028                [],
8029                |row| row.get(0),
8030            )
8031            .expect("apply metadata");
8032        assert!(apply_metadata.contains("\"profile\":\"default\""));
8033        assert!(apply_metadata.contains("\"snapshot_hash\":"));
8034        assert!(apply_metadata.contains("\"model_identity\":\"test-model\""));
8035    }
8036
8037    #[cfg(feature = "sqlite-vec")]
8038    #[test]
8039    #[allow(clippy::too_many_lines)]
8040    fn regenerate_vector_embeddings_embedder_failure_leaves_contract_and_vec_rows_unchanged() {
8041        let db = NamedTempFile::new().expect("temp file");
8042        let schema = Arc::new(SchemaManager::new());
8043
8044        {
8045            let conn = crate::sqlite::open_connection_with_vec(db.path()).expect("vec conn");
8046            schema.bootstrap(&conn).expect("bootstrap");
8047            conn.execute(
8048                "INSERT INTO nodes (row_id, logical_id, kind, properties, created_at, source_ref) \
8049                 VALUES ('row-1', 'doc-1', 'Document', '{}', 100, 'source-1')",
8050                [],
8051            )
8052            .expect("insert node");
8053            conn.execute(
8054                "INSERT INTO chunks (id, node_logical_id, text_content, created_at) \
8055                 VALUES ('chunk-1', 'doc-1', 'budget discussion', 100)",
8056                [],
8057            )
8058            .expect("insert chunk");
8059            schema
8060                .ensure_vec_kind_profile(&conn, "Document", 4)
8061                .expect("ensure vec kind profile");
8062            conn.execute(
8063                r"
8064                INSERT INTO vector_embedding_contracts (
8065                    profile,
8066                    table_name,
8067                    model_identity,
8068                    model_version,
8069                    dimension,
8070                    normalization_policy,
8071                    chunking_policy,
8072                    preprocessing_policy,
8073                    generator_command_json,
8074                    applied_at,
8075                    snapshot_hash
8076                ) VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9, ?10, ?11)
8077                ",
8078                rusqlite::params![
8079                    "default",
8080                    "vec_document",
8081                    "old-model",
8082                    "0.9.0",
8083                    4,
8084                    "l2",
8085                    "per_chunk",
8086                    "trim",
8087                    "[]",
8088                    111,
8089                    "old-snapshot"
8090                ],
8091            )
8092            .expect("seed contract");
8093            conn.execute(
8094                "INSERT INTO vec_document (chunk_id, embedding) VALUES ('chunk-1', zeroblob(16))",
8095                [],
8096            )
8097            .expect("seed vec row");
8098        }
8099
8100        let service = AdminService::new(db.path(), Arc::clone(&schema));
8101        let failing = FailingEmbedder {
8102            identity: QueryEmbedderIdentity {
8103                model_identity: "new-model".to_owned(),
8104                model_version: "1.0.0".to_owned(),
8105                dimension: 4,
8106                normalization_policy: "l2".to_owned(),
8107            },
8108        };
8109        let error = service
8110            .regenerate_vector_embeddings(
8111                &failing,
8112                &VectorRegenerationConfig {
8113                    kind: "Document".to_owned(),
8114                    profile: "default".to_owned(),
8115                    chunking_policy: "per_chunk".to_owned(),
8116                    preprocessing_policy: "trim".to_owned(),
8117                },
8118            )
8119            .expect_err("embedder should fail");
8120
8121        assert!(error.to_string().contains("embedder failure"));
8122
8123        let conn = crate::sqlite::open_connection_with_vec(db.path()).expect("vec conn");
8124        let model_identity: String = conn
8125            .query_row(
8126                "SELECT model_identity FROM vector_embedding_contracts WHERE profile = 'default'",
8127                [],
8128                |row| row.get(0),
8129            )
8130            .expect("model identity");
8131        assert_eq!(model_identity, "old-model");
8132        let snapshot_hash: String = conn
8133            .query_row(
8134                "SELECT snapshot_hash FROM vector_embedding_contracts WHERE profile = 'default'",
8135                [],
8136                |row| row.get(0),
8137            )
8138            .expect("snapshot hash");
8139        assert_eq!(snapshot_hash, "old-snapshot");
8140        let vec_count: i64 = conn
8141            .query_row("SELECT count(*) FROM vec_document", [], |row| row.get(0))
8142            .expect("vec count");
8143        assert_eq!(vec_count, 1);
8144        let failure_count: i64 = conn
8145            .query_row(
8146                "SELECT count(*) FROM provenance_events WHERE event_type = 'vector_regeneration_failed' AND subject = 'default'",
8147                [],
8148                |row| row.get(0),
8149            )
8150            .expect("failure count");
8151        assert_eq!(failure_count, 1);
8152        let failure_metadata: String = conn
8153            .query_row(
8154                "SELECT metadata_json FROM provenance_events WHERE event_type = 'vector_regeneration_failed' AND subject = 'default'",
8155                [],
8156                |row| row.get(0),
8157            )
8158            .expect("failure metadata");
8159        assert!(failure_metadata.contains("\"failure_class\":\"embedder failure\""));
8160    }
8161
8162    // Subprocess generator tests (snapshot-drift-via-concurrent-writer,
8163    // timeout, stdout/stderr overflow, oversized input, excessive chunk
8164    // count, malformed JSON, world-writable executable, disallowed
8165    // executable root, environment preservation) were removed in 0.4.0
8166    // along with the subprocess generator pattern itself. The failure
8167    // modes they exercised belong to the deleted
8168    // `run_vector_generator_bounded` pipeline and have no equivalent in
8169    // the direct-embedder path. See
8170    // `.claude/memory/project_vector_identity_invariant.md`.
8171
8172    #[cfg(feature = "sqlite-vec")]
8173    #[test]
8174    fn regenerate_vector_embeddings_rejects_whitespace_only_profile_before_mutation() {
8175        let db = NamedTempFile::new().expect("temp file");
8176        let schema = Arc::new(SchemaManager::new());
8177        {
8178            let conn = crate::sqlite::open_connection_with_vec(db.path()).expect("vec conn");
8179            schema.bootstrap(&conn).expect("bootstrap");
8180            conn.execute(
8181                "INSERT INTO nodes (row_id, logical_id, kind, properties, created_at, source_ref) \
8182                 VALUES ('row-1', 'doc-1', 'Document', '{}', 100, 'source-1')",
8183                [],
8184            )
8185            .expect("insert node");
8186            conn.execute(
8187                "INSERT INTO chunks (id, node_logical_id, text_content, created_at) \
8188                 VALUES ('chunk-1', 'doc-1', 'budget discussion', 100)",
8189                [],
8190            )
8191            .expect("insert chunk");
8192        }
8193
8194        let service = AdminService::new(db.path(), Arc::clone(&schema));
8195        let embedder = TestEmbedder::new("test-model", 4);
8196        let error = service
8197            .regenerate_vector_embeddings(
8198                &embedder,
8199                &VectorRegenerationConfig {
8200                    kind: "Document".to_owned(),
8201                    profile: "   ".to_owned(),
8202                    chunking_policy: "per_chunk".to_owned(),
8203                    preprocessing_policy: "trim".to_owned(),
8204                },
8205            )
8206            .expect_err("whitespace profile should be rejected");
8207
8208        assert!(error.to_string().contains("invalid contract"));
8209        let conn = crate::sqlite::open_connection_with_vec(db.path()).expect("vec conn");
8210        let contract_count: i64 = conn
8211            .query_row(
8212                "SELECT count(*) FROM vector_embedding_contracts",
8213                [],
8214                |row| row.get(0),
8215            )
8216            .expect("contract count");
8217        assert_eq!(contract_count, 0);
8218        let provenance_count: i64 = conn
8219            .query_row("SELECT count(*) FROM provenance_events", [], |row| {
8220                row.get(0)
8221            })
8222            .expect("provenance count");
8223        assert_eq!(provenance_count, 0);
8224    }
8225
8226    #[cfg(feature = "sqlite-vec")]
8227    #[test]
8228    fn regenerate_vector_embeddings_rejects_future_contract_format_version() {
8229        let db = NamedTempFile::new().expect("temp file");
8230        let schema = Arc::new(SchemaManager::new());
8231        {
8232            let conn = crate::sqlite::open_connection_with_vec(db.path()).expect("vec conn");
8233            schema.bootstrap(&conn).expect("bootstrap");
8234            conn.execute(
8235                "INSERT INTO nodes (row_id, logical_id, kind, properties, created_at, source_ref) \
8236                 VALUES ('row-1', 'doc-1', 'Document', '{}', 100, 'source-1')",
8237                [],
8238            )
8239            .expect("insert node");
8240            conn.execute(
8241                "INSERT INTO chunks (id, node_logical_id, text_content, created_at) \
8242                 VALUES ('chunk-1', 'doc-1', 'budget discussion', 100)",
8243                [],
8244            )
8245            .expect("insert chunk");
8246            conn.execute(
8247                r"
8248                INSERT INTO vector_embedding_contracts (
8249                    profile,
8250                    table_name,
8251                    model_identity,
8252                    model_version,
8253                    dimension,
8254                    normalization_policy,
8255                    chunking_policy,
8256                    preprocessing_policy,
8257                    generator_command_json,
8258                    applied_at,
8259                    snapshot_hash,
8260                    contract_format_version,
8261                    updated_at
8262                ) VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9, ?10, ?11, ?12, ?13)
8263                ",
8264                rusqlite::params![
8265                    "default",
8266                    "vec_nodes_active",
8267                    "old-model",
8268                    "0.9.0",
8269                    4,
8270                    "l2",
8271                    "per_chunk",
8272                    "trim",
8273                    "[]",
8274                    111,
8275                    "old-snapshot",
8276                    99,
8277                    111,
8278                ],
8279            )
8280            .expect("seed future contract");
8281        }
8282
8283        let service = AdminService::new(db.path(), Arc::clone(&schema));
8284        let embedder = TestEmbedder::new("test-model", 4);
8285        let error = service
8286            .regenerate_vector_embeddings(
8287                &embedder,
8288                &VectorRegenerationConfig {
8289                    kind: "Document".to_owned(),
8290                    profile: "default".to_owned(),
8291                    chunking_policy: "per_chunk".to_owned(),
8292                    preprocessing_policy: "trim".to_owned(),
8293                },
8294            )
8295            .expect_err("future contract version should be rejected");
8296
8297        assert!(error.to_string().contains("unsupported"));
8298        assert!(error.to_string().contains("format version"));
8299    }
8300
8301    #[test]
8302    fn check_semantics_detects_orphaned_chunk() {
8303        let (db, service) = setup();
8304        {
8305            // Open without FK enforcement to insert chunk with no active node.
8306            let conn = sqlite::open_connection(db.path()).expect("conn");
8307            conn.execute(
8308                "INSERT INTO chunks (id, node_logical_id, text_content, created_at) \
8309                 VALUES ('c1', 'ghost-node', 'text', 100)",
8310                [],
8311            )
8312            .expect("insert orphaned chunk");
8313        }
8314        let report = service.check_semantics().expect("semantics check");
8315        assert_eq!(report.orphaned_chunks, 1);
8316    }
8317
8318    #[test]
8319    fn check_semantics_detects_null_source_ref() {
8320        let (db, service) = setup();
8321        {
8322            let conn = sqlite::open_connection(db.path()).expect("conn");
8323            conn.execute(
8324                "INSERT INTO nodes (row_id, logical_id, kind, properties, created_at) \
8325                 VALUES ('r1', 'lg1', 'Meeting', '{}', 100)",
8326                [],
8327            )
8328            .expect("insert node with null source_ref");
8329        }
8330        let report = service.check_semantics().expect("semantics check");
8331        assert_eq!(report.null_source_ref_nodes, 1);
8332    }
8333
8334    #[test]
8335    fn check_semantics_detects_broken_step_fk() {
8336        let (db, service) = setup();
8337        {
8338            // Explicitly disable FK enforcement for this connection so we can insert
8339            // an orphaned step (ghost run_id) to simulate a partial-write failure.
8340            let conn = sqlite::open_connection(db.path()).expect("conn");
8341            conn.execute_batch("PRAGMA foreign_keys = OFF;")
8342                .expect("disable FK");
8343            conn.execute(
8344                "INSERT INTO steps (id, run_id, kind, status, properties, created_at) \
8345                 VALUES ('s1', 'ghost-run', 'llm', 'completed', '{}', 100)",
8346                [],
8347            )
8348            .expect("insert step with ghost run_id");
8349        }
8350        let report = service.check_semantics().expect("semantics check");
8351        assert_eq!(report.broken_step_fk, 1);
8352    }
8353
8354    #[test]
8355    fn check_semantics_detects_broken_action_fk() {
8356        let (db, service) = setup();
8357        {
8358            let conn = sqlite::open_connection(db.path()).expect("conn");
8359            conn.execute_batch("PRAGMA foreign_keys = OFF;")
8360                .expect("disable FK");
8361            conn.execute(
8362                "INSERT INTO actions (id, step_id, kind, status, properties, created_at) \
8363                 VALUES ('a1', 'ghost-step', 'emit', 'completed', '{}', 100)",
8364                [],
8365            )
8366            .expect("insert action with ghost step_id");
8367        }
8368        let report = service.check_semantics().expect("semantics check");
8369        assert_eq!(report.broken_action_fk, 1);
8370    }
8371
8372    #[test]
8373    fn check_semantics_detects_stale_fts_rows() {
8374        let (db, service) = setup();
8375        {
8376            let conn = sqlite::open_connection(db.path()).expect("conn");
8377            // FTS virtual tables have no FK constraints; insert a row referencing
8378            // a chunk_id that does not exist in the chunks table.
8379            conn.execute(
8380                "INSERT INTO fts_nodes (chunk_id, node_logical_id, kind, text_content) \
8381                 VALUES ('ghost-chunk', 'any-node', 'Meeting', 'stale content')",
8382                [],
8383            )
8384            .expect("insert stale FTS row");
8385        }
8386        let report = service.check_semantics().expect("semantics check");
8387        assert_eq!(report.stale_fts_rows, 1);
8388    }
8389
8390    #[test]
8391    fn check_semantics_detects_fts_rows_for_superseded_nodes() {
8392        let (db, service) = setup();
8393        {
8394            let conn = sqlite::open_connection(db.path()).expect("conn");
8395            // Insert a node that has been fully superseded (superseded_at IS NOT NULL).
8396            conn.execute(
8397                "INSERT INTO nodes (row_id, logical_id, kind, properties, created_at, superseded_at, source_ref) \
8398                 VALUES ('r1', 'lg-sup', 'Meeting', '{}', 100, 200, 'src-1')",
8399                [],
8400            )
8401            .expect("insert superseded node");
8402            // Insert an FTS row for the superseded node's logical_id.
8403            conn.execute(
8404                "INSERT INTO fts_nodes (chunk_id, node_logical_id, kind, text_content) \
8405                 VALUES ('ck-x', 'lg-sup', 'Meeting', 'superseded content')",
8406                [],
8407            )
8408            .expect("insert FTS row for superseded node");
8409        }
8410        let report = service.check_semantics().expect("semantics check");
8411        assert_eq!(report.fts_rows_for_superseded_nodes, 1);
8412    }
8413
8414    #[test]
8415    fn check_semantics_detects_dangling_edges() {
8416        let (db, service) = setup();
8417        {
8418            let conn = sqlite::open_connection(db.path()).expect("conn");
8419            conn.execute_batch("PRAGMA foreign_keys = OFF;")
8420                .expect("disable FK");
8421            // One active node as source; target does not exist — edge is dangling.
8422            conn.execute(
8423                "INSERT INTO nodes (row_id, logical_id, kind, properties, created_at, source_ref) \
8424                 VALUES ('r1', 'lg-src', 'Meeting', '{}', 100, 'src-1')",
8425                [],
8426            )
8427            .expect("insert source node");
8428            conn.execute(
8429                "INSERT INTO edges \
8430                 (row_id, logical_id, source_logical_id, target_logical_id, kind, properties, created_at, source_ref) \
8431                 VALUES ('e1', 'edge-1', 'lg-src', 'ghost-target', 'LINKS', '{}', 100, 'src-1')",
8432                [],
8433            )
8434            .expect("insert dangling edge");
8435        }
8436        let report = service.check_semantics().expect("semantics check");
8437        assert_eq!(report.dangling_edges, 1);
8438    }
8439
8440    #[test]
8441    fn check_semantics_detects_orphaned_supersession_chains() {
8442        let (db, service) = setup();
8443        {
8444            let conn = sqlite::open_connection(db.path()).expect("conn");
8445            // Every version of this logical_id is superseded — no active row remains.
8446            conn.execute(
8447                "INSERT INTO nodes (row_id, logical_id, kind, properties, created_at, superseded_at, source_ref) \
8448                 VALUES ('r1', 'lg-orphaned', 'Meeting', '{}', 100, 200, 'src-1')",
8449                [],
8450            )
8451            .expect("insert fully superseded node");
8452        }
8453        let report = service.check_semantics().expect("semantics check");
8454        assert_eq!(report.orphaned_supersession_chains, 1);
8455    }
8456
8457    #[test]
8458    fn check_semantics_detects_mismatched_kind_property_fts_rows() {
8459        // With per-kind tables, mismatched_kind is always 0 — rows in fts_props_<kind>
8460        // must belong to that kind by construction. However, orphaned rows (per-kind table
8461        // with no registered schema) serve as the equivalent signal and are tested via
8462        // check_semantics_detects_fts_rows_for_superseded_nodes. This test verifies
8463        // mismatched_kind is 0 even when per-kind table rows exist for a node.
8464        let (db, service) = setup();
8465        {
8466            let conn = sqlite::open_connection(db.path()).expect("conn");
8467            conn.execute(
8468                "INSERT INTO fts_property_schemas (kind, property_paths_json, separator) \
8469                 VALUES ('Goal', '[\"$.name\"]', ' ')",
8470                [],
8471            )
8472            .expect("register schema");
8473            conn.execute(
8474                "INSERT INTO nodes (row_id, logical_id, kind, properties, created_at, source_ref) \
8475                 VALUES ('r1', 'goal-1', 'Goal', '{\"name\":\"Ship v2\"}', 100, 'src-1')",
8476                [],
8477            )
8478            .expect("insert node");
8479            // Create the per-kind table and insert a correctly-kind row.
8480            let table = fathomdb_schema::fts_kind_table_name("Goal");
8481            conn.execute_batch(&format!(
8482                "CREATE VIRTUAL TABLE IF NOT EXISTS {table} \
8483                 USING fts5(node_logical_id UNINDEXED, text_content, tokenize = 'porter unicode61 remove_diacritics 2')"
8484            ))
8485            .expect("create per-kind table");
8486            conn.execute(
8487                &format!("INSERT INTO {table} (node_logical_id, text_content) VALUES ('goal-1', 'Ship v2')"),
8488                [],
8489            )
8490            .expect("insert per-kind FTS row");
8491        }
8492        let report = service.check_semantics().expect("semantics check");
8493        // Per-kind tables make mismatched_kind impossible — always 0.
8494        assert_eq!(report.mismatched_kind_property_fts_rows, 0);
8495    }
8496
8497    #[test]
8498    fn check_semantics_detects_duplicate_property_fts_rows() {
8499        let (db, service) = setup();
8500        {
8501            let conn = sqlite::open_connection(db.path()).expect("conn");
8502            conn.execute(
8503                "INSERT INTO nodes (row_id, logical_id, kind, properties, created_at, source_ref) \
8504                 VALUES ('r1', 'goal-1', 'Goal', '{\"name\":\"Ship v2\"}', 100, 'src-1')",
8505                [],
8506            )
8507            .expect("insert node");
8508            // Create the per-kind table and insert two rows for the same logical ID.
8509            let table = fathomdb_schema::fts_kind_table_name("Goal");
8510            conn.execute_batch(&format!(
8511                "CREATE VIRTUAL TABLE IF NOT EXISTS {table} \
8512                 USING fts5(node_logical_id UNINDEXED, text_content, tokenize = 'porter unicode61 remove_diacritics 2')"
8513            ))
8514            .expect("create per-kind table");
8515            conn.execute(
8516                &format!("INSERT INTO {table} (node_logical_id, text_content) VALUES ('goal-1', 'Ship v2')"),
8517                [],
8518            )
8519            .expect("insert first property FTS row");
8520            conn.execute(
8521                &format!("INSERT INTO {table} (node_logical_id, text_content) VALUES ('goal-1', 'Ship v2 duplicate')"),
8522                [],
8523            )
8524            .expect("insert duplicate property FTS row");
8525        }
8526        let report = service.check_semantics().expect("semantics check");
8527        assert_eq!(report.duplicate_property_fts_rows, 1);
8528    }
8529
8530    #[test]
8531    fn check_semantics_detects_drifted_property_fts_text() {
8532        let (db, service) = setup();
8533        {
8534            let conn = sqlite::open_connection(db.path()).expect("conn");
8535            conn.execute(
8536                "INSERT INTO fts_property_schemas (kind, property_paths_json, separator) \
8537                 VALUES ('Goal', '[\"$.name\"]', ' ')",
8538                [],
8539            )
8540            .expect("register schema");
8541            conn.execute(
8542                "INSERT INTO nodes (row_id, logical_id, kind, properties, created_at, source_ref) \
8543                 VALUES ('r1', 'goal-1', 'Goal', '{\"name\":\"Current name\"}', 100, 'src-1')",
8544                [],
8545            )
8546            .expect("insert node");
8547            // Create per-kind table and insert a row with outdated text content.
8548            let table = fathomdb_schema::fts_kind_table_name("Goal");
8549            conn.execute_batch(&format!(
8550                "CREATE VIRTUAL TABLE IF NOT EXISTS {table} \
8551                 USING fts5(node_logical_id UNINDEXED, text_content, tokenize = 'porter unicode61 remove_diacritics 2')"
8552            ))
8553            .expect("create per-kind table");
8554            conn.execute(
8555                &format!("INSERT INTO {table} (node_logical_id, text_content) VALUES ('goal-1', 'Old stale name')"),
8556                [],
8557            )
8558            .expect("insert stale property FTS row");
8559        }
8560        let report = service.check_semantics().expect("semantics check");
8561        assert_eq!(report.drifted_property_fts_rows, 1);
8562    }
8563
8564    #[test]
8565    fn check_semantics_detects_property_fts_row_that_should_not_exist() {
8566        let (db, service) = setup();
8567        {
8568            let conn = sqlite::open_connection(db.path()).expect("conn");
8569            conn.execute(
8570                "INSERT INTO fts_property_schemas (kind, property_paths_json, separator) \
8571                 VALUES ('Goal', '[\"$.searchable\"]', ' ')",
8572                [],
8573            )
8574            .expect("register schema");
8575            // Node does NOT have $.searchable — extraction yields no value.
8576            conn.execute(
8577                "INSERT INTO nodes (row_id, logical_id, kind, properties, created_at, source_ref) \
8578                 VALUES ('r1', 'goal-1', 'Goal', '{\"other\":\"field\"}', 100, 'src-1')",
8579                [],
8580            )
8581            .expect("insert node");
8582            // Create per-kind table and insert a phantom row that should not exist.
8583            let table = fathomdb_schema::fts_kind_table_name("Goal");
8584            conn.execute_batch(&format!(
8585                "CREATE VIRTUAL TABLE IF NOT EXISTS {table} \
8586                 USING fts5(node_logical_id UNINDEXED, text_content, tokenize = 'porter unicode61 remove_diacritics 2')"
8587            ))
8588            .expect("create per-kind table");
8589            conn.execute(
8590                &format!("INSERT INTO {table} (node_logical_id, text_content) VALUES ('goal-1', 'phantom text')"),
8591                [],
8592            )
8593            .expect("insert phantom property FTS row");
8594        }
8595        let report = service.check_semantics().expect("semantics check");
8596        assert_eq!(
8597            report.drifted_property_fts_rows, 1,
8598            "row that should not exist must be counted as drifted"
8599        );
8600    }
8601
8602    #[test]
8603    fn safe_export_writes_manifest_with_sha256() {
8604        let (_db, service) = setup();
8605        let export_dir = tempfile::TempDir::new().expect("temp dir");
8606        let export_path = export_dir.path().join("backup.db");
8607
8608        let manifest = service
8609            .safe_export(
8610                &export_path,
8611                SafeExportOptions {
8612                    force_checkpoint: false,
8613                },
8614            )
8615            .expect("export");
8616
8617        assert!(export_path.exists(), "exported db should exist");
8618        let manifest_path = export_dir.path().join("backup.db.export-manifest.json");
8619        assert!(
8620            manifest_path.exists(),
8621            "manifest file should exist at {}",
8622            manifest_path.display()
8623        );
8624        assert_eq!(manifest.sha256.len(), 64, "sha256 should be 64 hex chars");
8625        assert!(
8626            manifest.exported_at > 0,
8627            "exported_at should be a unix timestamp"
8628        );
8629        assert_eq!(
8630            manifest.schema_version,
8631            SchemaManager::new().current_version().0,
8632            "schema_version should match the live schema version"
8633        );
8634        assert_eq!(manifest.protocol_version, 1, "protocol_version should be 1");
8635        assert!(manifest.page_count > 0, "page_count should be positive");
8636    }
8637
8638    #[test]
8639    fn safe_export_preserves_operational_validation_contracts() {
8640        let (_db, service) = setup();
8641        let validation_json = r#"{"format_version":1,"mode":"enforce","additional_properties":false,"fields":[{"name":"status","type":"string","required":true,"enum":["ok","failed"]}]}"#;
8642        service
8643            .register_operational_collection(&OperationalRegisterRequest {
8644                name: "connector_health".to_owned(),
8645                kind: OperationalCollectionKind::LatestState,
8646                schema_json: "{}".to_owned(),
8647                retention_json: "{}".to_owned(),
8648                filter_fields_json: "[]".to_owned(),
8649                validation_json: validation_json.to_owned(),
8650                secondary_indexes_json: "[]".to_owned(),
8651                format_version: 1,
8652            })
8653            .expect("register collection");
8654
8655        let export_dir = tempfile::TempDir::new().expect("temp dir");
8656        let export_path = export_dir.path().join("backup.db");
8657        service
8658            .safe_export(
8659                &export_path,
8660                SafeExportOptions {
8661                    force_checkpoint: false,
8662                },
8663            )
8664            .expect("export");
8665
8666        let exported = sqlite::open_connection(&export_path).expect("exported conn");
8667        let exported_validation_json: String = exported
8668            .query_row(
8669                "SELECT validation_json FROM operational_collections WHERE name = 'connector_health'",
8670                [],
8671                |row| row.get(0),
8672            )
8673            .expect("validation_json");
8674        assert_eq!(exported_validation_json, validation_json);
8675    }
8676
8677    #[test]
8678    fn safe_export_force_checkpoint_false_skips_wal_pragma() {
8679        let (_db, service) = setup();
8680        let export_dir = tempfile::TempDir::new().expect("temp dir");
8681        let export_path = export_dir.path().join("no-wal.db");
8682
8683        // force_checkpoint: false must not error even on a non-WAL database
8684        let manifest = service
8685            .safe_export(
8686                &export_path,
8687                SafeExportOptions {
8688                    force_checkpoint: false,
8689                },
8690            )
8691            .expect("export with no checkpoint");
8692
8693        assert!(
8694            manifest.page_count > 0,
8695            "page_count must be populated regardless of checkpoint mode"
8696        );
8697        assert_eq!(
8698            manifest.schema_version,
8699            SchemaManager::new().current_version().0
8700        );
8701        assert_eq!(manifest.protocol_version, 1);
8702    }
8703
8704    #[test]
8705    fn safe_export_force_checkpoint_false_still_captures_wal_backed_changes() {
8706        let (db, service) = setup();
8707        let conn = sqlite::open_connection(db.path()).expect("conn");
8708        let journal_mode: String = conn
8709            .query_row("PRAGMA journal_mode=WAL", [], |row| row.get(0))
8710            .expect("enable wal");
8711        assert_eq!(journal_mode.to_lowercase(), "wal");
8712        let auto_checkpoint_pages: i64 = conn
8713            .query_row("PRAGMA wal_autocheckpoint=0", [], |row| row.get(0))
8714            .expect("disable auto checkpoint");
8715        assert_eq!(auto_checkpoint_pages, 0);
8716        conn.execute(
8717            "INSERT INTO nodes (row_id, logical_id, kind, properties, created_at, source_ref) \
8718             VALUES ('r-wal', 'lg-wal', 'Meeting', '{}', 100, 'src-wal')",
8719            [],
8720        )
8721        .expect("insert wal-backed node");
8722
8723        let export_dir = tempfile::TempDir::new().expect("temp dir");
8724        let export_path = export_dir.path().join("wal-backed.db");
8725        service
8726            .safe_export(
8727                &export_path,
8728                SafeExportOptions {
8729                    force_checkpoint: false,
8730                },
8731            )
8732            .expect("export wal-backed db");
8733
8734        let exported = sqlite::open_connection(&export_path).expect("open exported db");
8735        let exported_count: i64 = exported
8736            .query_row(
8737                "SELECT count(*) FROM nodes WHERE logical_id = 'lg-wal'",
8738                [],
8739                |row| row.get(0),
8740            )
8741            .expect("count exported nodes");
8742        assert_eq!(
8743            exported_count, 1,
8744            "safe_export must include committed rows that are still resident in the WAL"
8745        );
8746    }
8747
8748    #[test]
8749    fn excise_source_removes_searchable_content_after_excision() {
8750        let (db, service) = setup();
8751        {
8752            let conn = sqlite::open_connection(db.path()).expect("conn");
8753            conn.execute(
8754                "INSERT INTO nodes (row_id, logical_id, kind, properties, created_at, superseded_at, source_ref) \
8755                 VALUES ('r1', 'lg1', 'Meeting', '{}', 100, 200, 'source-1')",
8756                [],
8757            )
8758            .expect("insert v1");
8759            conn.execute(
8760                "INSERT INTO nodes (row_id, logical_id, kind, properties, created_at, source_ref) \
8761                 VALUES ('r2', 'lg1', 'Meeting', '{}', 200, 'source-2')",
8762                [],
8763            )
8764            .expect("insert v2");
8765            conn.execute(
8766                "INSERT INTO chunks (id, node_logical_id, text_content, created_at) \
8767                 VALUES ('ck1', 'lg1', 'hello world', 100)",
8768                [],
8769            )
8770            .expect("insert chunk");
8771        }
8772        service.excise_source("source-2").expect("excise");
8773        {
8774            let conn = sqlite::open_connection(db.path()).expect("conn");
8775            let fts_count: i64 = conn
8776                .query_row(
8777                    "SELECT count(*) FROM fts_nodes WHERE chunk_id = 'ck1'",
8778                    [],
8779                    |row| row.get(0),
8780                )
8781                .expect("fts count");
8782            assert_eq!(
8783                fts_count, 0,
8784                "excised content should not remain searchable after excise"
8785            );
8786        }
8787    }
8788
8789    #[cfg(feature = "sqlite-vec")]
8790    #[test]
8791    fn excise_source_cleans_chunks_and_vec_rows_for_excised_version() {
8792        let (db, service) = setup();
8793        {
8794            let conn = crate::sqlite::open_connection_with_vec(db.path()).expect("vec conn");
8795            service
8796                .schema_manager
8797                .ensure_vec_kind_profile(&conn, "Meeting", 4)
8798                .expect("ensure vec kind profile");
8799            conn.execute(
8800                "INSERT INTO nodes (row_id, logical_id, kind, properties, created_at, superseded_at, source_ref) \
8801                 VALUES ('r1', 'lg1', 'Meeting', '{}', 100, 200, 'source-1')",
8802                [],
8803            )
8804            .expect("insert v1");
8805            conn.execute(
8806                "INSERT INTO nodes (row_id, logical_id, kind, properties, created_at, source_ref) \
8807                 VALUES ('r2', 'lg1', 'Meeting', '{}', 200, 'source-2')",
8808                [],
8809            )
8810            .expect("insert v2");
8811            conn.execute(
8812                "INSERT INTO chunks (id, node_logical_id, text_content, created_at) \
8813                 VALUES ('ck1', 'lg1', 'new content', 200)",
8814                [],
8815            )
8816            .expect("insert chunk");
8817            conn.execute(
8818                "INSERT INTO vec_meeting (chunk_id, embedding) VALUES ('ck1', zeroblob(16))",
8819                [],
8820            )
8821            .expect("insert vec row");
8822        }
8823
8824        service.excise_source("source-2").expect("excise");
8825
8826        let conn = crate::sqlite::open_connection_with_vec(db.path()).expect("vec conn");
8827        let active_row: String = conn
8828            .query_row(
8829                "SELECT row_id FROM nodes WHERE logical_id = 'lg1' AND superseded_at IS NULL",
8830                [],
8831                |row| row.get(0),
8832            )
8833            .expect("restored active row");
8834        assert_eq!(active_row, "r1");
8835        let chunk_count: i64 = conn
8836            .query_row(
8837                "SELECT count(*) FROM chunks WHERE node_logical_id = 'lg1'",
8838                [],
8839                |row| row.get(0),
8840            )
8841            .expect("chunk count");
8842        assert_eq!(
8843            chunk_count, 0,
8844            "excised source content must not survive as chunks"
8845        );
8846        let vec_count: i64 = conn
8847            .query_row("SELECT count(*) FROM vec_meeting", [], |row| row.get(0))
8848            .expect("vec count");
8849        assert_eq!(vec_count, 0, "excised source vec rows must be removed");
8850        let fts_count: i64 = conn
8851            .query_row(
8852                "SELECT count(*) FROM fts_nodes WHERE node_logical_id = 'lg1'",
8853                [],
8854                |row| row.get(0),
8855            )
8856            .expect("fts count");
8857        assert_eq!(
8858            fts_count, 0,
8859            "excised source content must not remain searchable"
8860        );
8861    }
8862
8863    #[test]
8864    fn export_page_count_matches_exported_file() {
8865        let (_db, service) = setup();
8866        let export_dir = tempfile::TempDir::new().expect("temp dir");
8867        let export_path = export_dir.path().join("page-count.db");
8868
8869        let manifest = service
8870            .safe_export(
8871                &export_path,
8872                SafeExportOptions {
8873                    force_checkpoint: false,
8874                },
8875            )
8876            .expect("export");
8877
8878        let exported = sqlite::open_connection(&export_path).expect("open exported db");
8879        let actual_page_count: u64 = exported
8880            .query_row("PRAGMA page_count", [], |row| row.get(0))
8881            .expect("page_count from exported file");
8882
8883        assert_eq!(
8884            manifest.page_count, actual_page_count,
8885            "manifest page_count must match the exported file's PRAGMA page_count"
8886        );
8887    }
8888
8889    #[test]
8890    fn no_temp_file_after_successful_export() {
8891        let (_db, service) = setup();
8892        let export_dir = tempfile::TempDir::new().expect("temp dir");
8893        let export_path = export_dir.path().join("no-tmp.db");
8894
8895        service
8896            .safe_export(
8897                &export_path,
8898                SafeExportOptions {
8899                    force_checkpoint: false,
8900                },
8901            )
8902            .expect("export");
8903
8904        let tmp_files: Vec<_> = fs::read_dir(export_dir.path())
8905            .expect("read export dir")
8906            .filter_map(Result::ok)
8907            .filter(|e| e.path().extension().is_some_and(|ext| ext == "tmp"))
8908            .collect();
8909
8910        assert!(
8911            tmp_files.is_empty(),
8912            "no .tmp files should remain after a successful export, found: {tmp_files:?}"
8913        );
8914    }
8915
8916    #[test]
8917    fn export_manifest_is_valid_json() {
8918        let (_db, service) = setup();
8919        let export_dir = tempfile::TempDir::new().expect("temp dir");
8920        let export_path = export_dir.path().join("valid-json.db");
8921
8922        service
8923            .safe_export(
8924                &export_path,
8925                SafeExportOptions {
8926                    force_checkpoint: false,
8927                },
8928            )
8929            .expect("export");
8930
8931        let manifest_path = export_dir.path().join("valid-json.db.export-manifest.json");
8932        let manifest_contents = fs::read_to_string(&manifest_path).expect("read manifest");
8933        let parsed: serde_json::Value =
8934            serde_json::from_str(&manifest_contents).expect("manifest must be valid JSON");
8935
8936        assert!(
8937            parsed.get("exported_at").is_some(),
8938            "manifest must contain exported_at"
8939        );
8940        assert!(
8941            parsed.get("sha256").is_some(),
8942            "manifest must contain sha256"
8943        );
8944        assert!(
8945            parsed.get("schema_version").is_some(),
8946            "manifest must contain schema_version"
8947        );
8948        assert!(
8949            parsed.get("protocol_version").is_some(),
8950            "manifest must contain protocol_version"
8951        );
8952        assert!(
8953            parsed.get("page_count").is_some(),
8954            "manifest must contain page_count"
8955        );
8956    }
8957
8958    #[test]
8959    fn provenance_purge_dry_run_reports_counts() {
8960        let (db, service) = setup();
8961        {
8962            let conn = sqlite::open_connection(db.path()).expect("conn");
8963            conn.execute(
8964                "INSERT INTO provenance_events (id, event_type, subject, source_ref, created_at) \
8965                 VALUES ('p1', 'node_insert', 'lg1', 'src-1', 100)",
8966                [],
8967            )
8968            .expect("insert p1");
8969            conn.execute(
8970                "INSERT INTO provenance_events (id, event_type, subject, source_ref, created_at) \
8971                 VALUES ('p2', 'node_insert', 'lg2', 'src-1', 200)",
8972                [],
8973            )
8974            .expect("insert p2");
8975            conn.execute(
8976                "INSERT INTO provenance_events (id, event_type, subject, source_ref, created_at) \
8977                 VALUES ('p3', 'excise', 'lg3', 'src-1', 300)",
8978                [],
8979            )
8980            .expect("insert p3");
8981        }
8982
8983        let options = super::ProvenancePurgeOptions {
8984            dry_run: true,
8985            preserve_event_types: Vec::new(),
8986        };
8987        let report = service
8988            .purge_provenance_events(250, &options)
8989            .expect("dry run purge");
8990
8991        assert_eq!(report.events_deleted, 2);
8992        assert_eq!(report.events_preserved, 1);
8993        assert!(report.oldest_remaining.is_some());
8994
8995        let conn = sqlite::open_connection(db.path()).expect("conn");
8996        let total: i64 = conn
8997            .query_row("SELECT count(*) FROM provenance_events", [], |row| {
8998                row.get(0)
8999            })
9000            .expect("count");
9001        assert_eq!(total, 3, "dry_run must not delete any events");
9002    }
9003
9004    #[test]
9005    fn provenance_purge_deletes_old_events() {
9006        let (db, service) = setup();
9007        {
9008            let conn = sqlite::open_connection(db.path()).expect("conn");
9009            conn.execute(
9010                "INSERT INTO provenance_events (id, event_type, subject, source_ref, created_at) \
9011                 VALUES ('p1', 'node_insert', 'lg1', 'src-1', 100)",
9012                [],
9013            )
9014            .expect("insert p1");
9015            conn.execute(
9016                "INSERT INTO provenance_events (id, event_type, subject, source_ref, created_at) \
9017                 VALUES ('p2', 'node_insert', 'lg2', 'src-1', 200)",
9018                [],
9019            )
9020            .expect("insert p2");
9021        }
9022
9023        let options = super::ProvenancePurgeOptions {
9024            dry_run: false,
9025            preserve_event_types: Vec::new(),
9026        };
9027        let report = service
9028            .purge_provenance_events(150, &options)
9029            .expect("purge");
9030
9031        assert_eq!(report.events_deleted, 1);
9032        assert_eq!(report.events_preserved, 1);
9033        assert_eq!(report.oldest_remaining, Some(200));
9034
9035        let conn = sqlite::open_connection(db.path()).expect("conn");
9036        let remaining: i64 = conn
9037            .query_row("SELECT count(*) FROM provenance_events", [], |row| {
9038                row.get(0)
9039            })
9040            .expect("count");
9041        assert_eq!(remaining, 1);
9042    }
9043
9044    #[test]
9045    fn provenance_purge_preserves_specified_types() {
9046        let (db, service) = setup();
9047        {
9048            let conn = sqlite::open_connection(db.path()).expect("conn");
9049            conn.execute(
9050                "INSERT INTO provenance_events (id, event_type, subject, source_ref, created_at) \
9051                 VALUES ('p1', 'excise', 'lg1', 'src-1', 100)",
9052                [],
9053            )
9054            .expect("insert p1");
9055            conn.execute(
9056                "INSERT INTO provenance_events (id, event_type, subject, source_ref, created_at) \
9057                 VALUES ('p2', 'node_insert', 'lg2', 'src-1', 100)",
9058                [],
9059            )
9060            .expect("insert p2");
9061            conn.execute(
9062                "INSERT INTO provenance_events (id, event_type, subject, source_ref, created_at) \
9063                 VALUES ('p3', 'node_insert', 'lg3', 'src-1', 100)",
9064                [],
9065            )
9066            .expect("insert p3");
9067        }
9068
9069        let options = super::ProvenancePurgeOptions {
9070            dry_run: false,
9071            preserve_event_types: Vec::new(),
9072        };
9073        let report = service
9074            .purge_provenance_events(500, &options)
9075            .expect("purge");
9076
9077        assert_eq!(report.events_deleted, 2);
9078        assert_eq!(report.events_preserved, 1);
9079
9080        let conn = sqlite::open_connection(db.path()).expect("conn");
9081        let remaining_type: String = conn
9082            .query_row("SELECT event_type FROM provenance_events", [], |row| {
9083                row.get(0)
9084            })
9085            .expect("remaining event type");
9086        assert_eq!(remaining_type, "excise");
9087    }
9088
9089    #[test]
9090    fn provenance_purge_noop_with_zero_timestamp() {
9091        let (db, service) = setup();
9092        {
9093            let conn = sqlite::open_connection(db.path()).expect("conn");
9094            conn.execute(
9095                "INSERT INTO provenance_events (id, event_type, subject, source_ref, created_at) \
9096                 VALUES ('p1', 'node_insert', 'lg1', 'src-1', 100)",
9097                [],
9098            )
9099            .expect("insert p1");
9100        }
9101
9102        let options = super::ProvenancePurgeOptions {
9103            dry_run: false,
9104            preserve_event_types: Vec::new(),
9105        };
9106        let report = service.purge_provenance_events(0, &options).expect("purge");
9107
9108        assert_eq!(report.events_deleted, 0);
9109        assert_eq!(report.events_preserved, 1);
9110        assert_eq!(report.oldest_remaining, Some(100));
9111    }
9112
9113    #[test]
9114    fn restore_skips_edge_when_counterpart_purged() {
9115        let (db, service) = setup();
9116        {
9117            let conn = sqlite::open_connection(db.path()).expect("conn");
9118            // Create node A (doc-1) and node B (doc-2)
9119            conn.execute(
9120                "INSERT INTO nodes (row_id, logical_id, kind, properties, created_at, source_ref) \
9121                 VALUES ('node-row-a', 'doc-1', 'Document', '{}', 100, 'seed')",
9122                [],
9123            )
9124            .expect("insert node A");
9125            conn.execute(
9126                "INSERT INTO nodes (row_id, logical_id, kind, properties, created_at, source_ref) \
9127                 VALUES ('node-row-b', 'doc-2', 'Document', '{}', 100, 'seed')",
9128                [],
9129            )
9130            .expect("insert node B");
9131            // Create edge between A and B
9132            conn.execute(
9133                "INSERT INTO edges \
9134                 (row_id, logical_id, source_logical_id, target_logical_id, kind, properties, created_at, source_ref) \
9135                 VALUES ('edge-row-1', 'edge-1', 'doc-1', 'doc-2', 'RELATED', '{}', 100, 'seed')",
9136                [],
9137            )
9138            .expect("insert edge");
9139            // Retire both A and B, and the edge
9140            conn.execute(
9141                "INSERT INTO provenance_events (id, event_type, subject, source_ref, created_at, metadata_json) \
9142                 VALUES ('evt-retire-a', 'node_retire', 'doc-1', 'forget-1', 200, '')",
9143                [],
9144            )
9145            .expect("insert retire event A");
9146            conn.execute(
9147                "INSERT INTO provenance_events (id, event_type, subject, source_ref, created_at, metadata_json) \
9148                 VALUES ('evt-edge-retire', 'edge_retire', 'edge-1', 'forget-1', 200, '')",
9149                [],
9150            )
9151            .expect("insert edge retire event");
9152            conn.execute(
9153                "UPDATE nodes SET superseded_at = 200 WHERE logical_id = 'doc-1'",
9154                [],
9155            )
9156            .expect("retire node A");
9157            conn.execute(
9158                "UPDATE nodes SET superseded_at = 200 WHERE logical_id = 'doc-2'",
9159                [],
9160            )
9161            .expect("retire node B");
9162            conn.execute(
9163                "UPDATE edges SET superseded_at = 200 WHERE logical_id = 'edge-1'",
9164                [],
9165            )
9166            .expect("retire edge");
9167            // Simulate purge of B: delete node rows but leave the edge intact
9168            // to reproduce the dangling-edge scenario the validation guards against.
9169            conn.execute("DELETE FROM nodes WHERE logical_id = 'doc-2'", [])
9170                .expect("purge node B rows");
9171        }
9172
9173        // Restore A — the edge should be skipped because B has no active node
9174        let report = service.restore_logical_id("doc-1").expect("restore A");
9175        assert!(!report.was_noop);
9176        assert_eq!(report.restored_node_rows, 1);
9177        assert_eq!(report.restored_edge_rows, 0, "edge should not be restored");
9178        assert_eq!(report.skipped_edges.len(), 1);
9179        assert_eq!(report.skipped_edges[0].edge_logical_id, "edge-1");
9180        assert_eq!(report.skipped_edges[0].missing_endpoint, "doc-2");
9181
9182        // Verify the edge is still retired in the database
9183        let conn = sqlite::open_connection(db.path()).expect("conn");
9184        let active_edge_count: i64 = conn
9185            .query_row(
9186                "SELECT count(*) FROM edges WHERE logical_id = 'edge-1' AND superseded_at IS NULL",
9187                [],
9188                |row| row.get(0),
9189            )
9190            .expect("active edge count");
9191        assert_eq!(active_edge_count, 0, "edge must remain retired");
9192    }
9193
9194    #[test]
9195    fn restore_restores_edges_to_active_nodes() {
9196        let (db, service) = setup();
9197        {
9198            let conn = sqlite::open_connection(db.path()).expect("conn");
9199            // Create node A and node B (B stays active)
9200            conn.execute(
9201                "INSERT INTO nodes (row_id, logical_id, kind, properties, created_at, source_ref) \
9202                 VALUES ('node-row-a', 'doc-1', 'Document', '{}', 100, 'seed')",
9203                [],
9204            )
9205            .expect("insert node A");
9206            conn.execute(
9207                "INSERT INTO nodes (row_id, logical_id, kind, properties, created_at, source_ref) \
9208                 VALUES ('node-row-b', 'doc-2', 'Document', '{}', 100, 'seed')",
9209                [],
9210            )
9211            .expect("insert node B");
9212            // Create edge between A and B
9213            conn.execute(
9214                "INSERT INTO edges \
9215                 (row_id, logical_id, source_logical_id, target_logical_id, kind, properties, created_at, source_ref) \
9216                 VALUES ('edge-row-1', 'edge-1', 'doc-1', 'doc-2', 'RELATED', '{}', 100, 'seed')",
9217                [],
9218            )
9219            .expect("insert edge");
9220            // Retire only A
9221            conn.execute(
9222                "INSERT INTO provenance_events (id, event_type, subject, source_ref, created_at, metadata_json) \
9223                 VALUES ('evt-retire-a', 'node_retire', 'doc-1', 'forget-1', 200, '')",
9224                [],
9225            )
9226            .expect("insert retire event A");
9227            conn.execute(
9228                "INSERT INTO provenance_events (id, event_type, subject, source_ref, created_at, metadata_json) \
9229                 VALUES ('evt-edge-retire', 'edge_retire', 'edge-1', 'forget-1', 200, '')",
9230                [],
9231            )
9232            .expect("insert edge retire event");
9233            conn.execute(
9234                "UPDATE nodes SET superseded_at = 200 WHERE logical_id = 'doc-1'",
9235                [],
9236            )
9237            .expect("retire node A");
9238            conn.execute(
9239                "UPDATE edges SET superseded_at = 200 WHERE logical_id = 'edge-1'",
9240                [],
9241            )
9242            .expect("retire edge");
9243        }
9244
9245        // Restore A — B is active, so the edge should be restored normally
9246        let report = service.restore_logical_id("doc-1").expect("restore A");
9247        assert!(!report.was_noop);
9248        assert_eq!(report.restored_node_rows, 1);
9249        assert!(report.restored_edge_rows > 0, "edge should be restored");
9250        assert!(
9251            report.skipped_edges.is_empty(),
9252            "no edges should be skipped"
9253        );
9254
9255        let conn = sqlite::open_connection(db.path()).expect("conn");
9256        let active_edge_count: i64 = conn
9257            .query_row(
9258                "SELECT count(*) FROM edges WHERE logical_id = 'edge-1' AND superseded_at IS NULL",
9259                [],
9260                |row| row.get(0),
9261            )
9262            .expect("active edge count");
9263        assert_eq!(active_edge_count, 1, "edge must be active");
9264    }
9265
9266    #[test]
9267    fn restore_restores_edges_when_both_restored() {
9268        let (db, service) = setup();
9269        {
9270            let conn = sqlite::open_connection(db.path()).expect("conn");
9271            // Create node A and node B
9272            conn.execute(
9273                "INSERT INTO nodes (row_id, logical_id, kind, properties, created_at, source_ref) \
9274                 VALUES ('node-row-a', 'doc-1', 'Document', '{}', 100, 'seed')",
9275                [],
9276            )
9277            .expect("insert node A");
9278            conn.execute(
9279                "INSERT INTO nodes (row_id, logical_id, kind, properties, created_at, source_ref) \
9280                 VALUES ('node-row-b', 'doc-2', 'Document', '{}', 100, 'seed')",
9281                [],
9282            )
9283            .expect("insert node B");
9284            // Create edge between A and B
9285            conn.execute(
9286                "INSERT INTO edges \
9287                 (row_id, logical_id, source_logical_id, target_logical_id, kind, properties, created_at, source_ref) \
9288                 VALUES ('edge-row-1', 'edge-1', 'doc-1', 'doc-2', 'RELATED', '{}', 100, 'seed')",
9289                [],
9290            )
9291            .expect("insert edge");
9292            // Retire both A and B
9293            conn.execute(
9294                "INSERT INTO provenance_events (id, event_type, subject, source_ref, created_at, metadata_json) \
9295                 VALUES ('evt-retire-a', 'node_retire', 'doc-1', 'forget-1', 200, '')",
9296                [],
9297            )
9298            .expect("insert retire event A");
9299            conn.execute(
9300                "INSERT INTO provenance_events (id, event_type, subject, source_ref, created_at, metadata_json) \
9301                 VALUES ('evt-retire-b', 'node_retire', 'doc-2', 'forget-1', 200, '')",
9302                [],
9303            )
9304            .expect("insert retire event B");
9305            conn.execute(
9306                "INSERT INTO provenance_events (id, event_type, subject, source_ref, created_at, metadata_json) \
9307                 VALUES ('evt-edge-retire', 'edge_retire', 'edge-1', 'forget-1', 200, '')",
9308                [],
9309            )
9310            .expect("insert edge retire event");
9311            conn.execute(
9312                "UPDATE nodes SET superseded_at = 200 WHERE logical_id = 'doc-1'",
9313                [],
9314            )
9315            .expect("retire node A");
9316            conn.execute(
9317                "UPDATE nodes SET superseded_at = 200 WHERE logical_id = 'doc-2'",
9318                [],
9319            )
9320            .expect("retire node B");
9321            conn.execute(
9322                "UPDATE edges SET superseded_at = 200 WHERE logical_id = 'edge-1'",
9323                [],
9324            )
9325            .expect("retire edge");
9326        }
9327
9328        // Restore B first — edge is skipped because A is still retired
9329        let report_b = service.restore_logical_id("doc-2").expect("restore B");
9330        assert!(!report_b.was_noop);
9331
9332        // Restore A — B is now active, so the edge should be restored
9333        let report_a = service.restore_logical_id("doc-1").expect("restore A");
9334        assert!(!report_a.was_noop);
9335        assert_eq!(report_a.restored_node_rows, 1);
9336        assert!(
9337            report_a.restored_edge_rows > 0,
9338            "edge should be restored when both endpoints active"
9339        );
9340        assert!(
9341            report_a.skipped_edges.is_empty(),
9342            "no edges should be skipped"
9343        );
9344
9345        let conn = sqlite::open_connection(db.path()).expect("conn");
9346        let active_edge_count: i64 = conn
9347            .query_row(
9348                "SELECT count(*) FROM edges WHERE logical_id = 'edge-1' AND superseded_at IS NULL",
9349                [],
9350                |row| row.get(0),
9351            )
9352            .expect("active edge count");
9353        assert_eq!(
9354            active_edge_count, 1,
9355            "edge must be active after both endpoints restored"
9356        );
9357    }
9358
9359    // ── FTS property schema end-to-end tests ──────────────────────────
9360
9361    #[test]
9362    fn fts_property_schema_crud_round_trip() {
9363        let (_db, service) = setup();
9364
9365        // Register
9366        let record = service
9367            .register_fts_property_schema(
9368                "Meeting",
9369                &["$.title".to_owned(), "$.summary".to_owned()],
9370                None,
9371            )
9372            .expect("register");
9373        assert_eq!(record.kind, "Meeting");
9374        assert_eq!(record.property_paths, vec!["$.title", "$.summary"]);
9375        assert_eq!(record.separator, " ");
9376        assert_eq!(record.format_version, 1);
9377
9378        // Describe
9379        let described = service
9380            .describe_fts_property_schema("Meeting")
9381            .expect("describe")
9382            .expect("should exist");
9383        assert_eq!(described, record);
9384
9385        // Describe missing kind
9386        let missing = service
9387            .describe_fts_property_schema("NoSuchKind")
9388            .expect("describe missing");
9389        assert!(missing.is_none());
9390
9391        // List
9392        let list = service.list_fts_property_schemas().expect("list");
9393        assert_eq!(list.len(), 1);
9394        assert_eq!(list[0].kind, "Meeting");
9395
9396        // Update (idempotent upsert)
9397        let updated = service
9398            .register_fts_property_schema(
9399                "Meeting",
9400                &["$.title".to_owned(), "$.notes".to_owned()],
9401                Some("\n"),
9402            )
9403            .expect("update");
9404        assert_eq!(updated.property_paths, vec!["$.title", "$.notes"]);
9405        assert_eq!(updated.separator, "\n");
9406
9407        // Remove
9408        service
9409            .remove_fts_property_schema("Meeting")
9410            .expect("remove");
9411        let after_remove = service
9412            .describe_fts_property_schema("Meeting")
9413            .expect("describe after remove");
9414        assert!(after_remove.is_none());
9415
9416        // Remove non-existent is an error
9417        let err = service.remove_fts_property_schema("Meeting");
9418        assert!(err.is_err());
9419    }
9420
9421    #[test]
9422    fn describe_fts_property_schema_round_trips_recursive_entries() {
9423        let (_db, service) = setup();
9424
9425        let entries = vec![
9426            FtsPropertyPathSpec::scalar("$.title"),
9427            FtsPropertyPathSpec::recursive("$.payload"),
9428        ];
9429        let exclude = vec!["$.payload.private".to_owned()];
9430        let registered = service
9431            .register_fts_property_schema_with_entries(
9432                "KnowledgeItem",
9433                &entries,
9434                Some(" "),
9435                &exclude,
9436                crate::rebuild_actor::RebuildMode::Eager,
9437            )
9438            .expect("register recursive");
9439
9440        // The register entry point now echoes back the fully-populated
9441        // record via the same load helper used by describe/list.
9442        assert_eq!(registered.entries, entries);
9443        assert_eq!(registered.exclude_paths, exclude);
9444        assert_eq!(registered.property_paths, vec!["$.title", "$.payload"]);
9445
9446        let described = service
9447            .describe_fts_property_schema("KnowledgeItem")
9448            .expect("describe")
9449            .expect("should exist");
9450        assert_eq!(described.kind, "KnowledgeItem");
9451        assert_eq!(described.entries, entries);
9452        assert_eq!(described.exclude_paths, exclude);
9453        assert_eq!(described.property_paths, vec!["$.title", "$.payload"]);
9454        assert_eq!(described.separator, " ");
9455        assert_eq!(described.format_version, 1);
9456    }
9457
9458    #[test]
9459    fn list_fts_property_schemas_round_trips_recursive_entries() {
9460        let (_db, service) = setup();
9461
9462        let entries = vec![
9463            FtsPropertyPathSpec::scalar("$.title"),
9464            FtsPropertyPathSpec::recursive("$.payload"),
9465        ];
9466        let exclude = vec!["$.payload.secret".to_owned()];
9467        service
9468            .register_fts_property_schema_with_entries(
9469                "KnowledgeItem",
9470                &entries,
9471                Some(" "),
9472                &exclude,
9473                crate::rebuild_actor::RebuildMode::Eager,
9474            )
9475            .expect("register recursive");
9476
9477        let listed = service.list_fts_property_schemas().expect("list");
9478        assert_eq!(listed.len(), 1);
9479        let record = &listed[0];
9480        assert_eq!(record.kind, "KnowledgeItem");
9481        assert_eq!(record.entries, entries);
9482        assert_eq!(record.exclude_paths, exclude);
9483        assert_eq!(record.property_paths, vec!["$.title", "$.payload"]);
9484    }
9485
9486    #[test]
9487    fn describe_fts_property_schema_round_trips_scalar_only_entries() {
9488        let (_db, service) = setup();
9489
9490        service
9491            .register_fts_property_schema(
9492                "Meeting",
9493                &["$.title".to_owned(), "$.summary".to_owned()],
9494                None,
9495            )
9496            .expect("register scalar");
9497
9498        let described = service
9499            .describe_fts_property_schema("Meeting")
9500            .expect("describe")
9501            .expect("should exist");
9502        assert_eq!(described.property_paths, vec!["$.title", "$.summary"]);
9503        assert_eq!(described.entries.len(), 2);
9504        for entry in &described.entries {
9505            assert_eq!(
9506                entry.mode,
9507                FtsPropertyPathMode::Scalar,
9508                "scalar-only schema should deserialize every entry as Scalar"
9509            );
9510        }
9511        assert!(described.exclude_paths.is_empty());
9512    }
9513
9514    #[test]
9515    fn restore_reestablishes_property_fts_visibility() {
9516        let (db, service) = setup();
9517        let doc_table = fathomdb_schema::fts_kind_table_name("Document");
9518        {
9519            let conn = sqlite::open_connection(db.path()).expect("conn");
9520            // Register a property schema for Document kind.
9521            conn.execute(
9522                "INSERT INTO fts_property_schemas (kind, property_paths_json, separator) \
9523                 VALUES ('Document', '[\"$.title\", \"$.body\"]', ' ')",
9524                [],
9525            )
9526            .expect("register schema");
9527            // Create the per-kind FTS table.
9528            conn.execute_batch(&format!(
9529                "CREATE VIRTUAL TABLE IF NOT EXISTS {doc_table} USING fts5(\
9530                    node_logical_id UNINDEXED, text_content, \
9531                    tokenize = 'porter unicode61 remove_diacritics 2'\
9532                )"
9533            ))
9534            .expect("create per-kind table");
9535            // Insert an active node with extractable properties.
9536            conn.execute(
9537                "INSERT INTO nodes (row_id, logical_id, kind, properties, created_at, source_ref) \
9538                 VALUES ('row-1', 'doc-1', 'Document', '{\"title\":\"Budget\",\"body\":\"Q3 forecast\"}', 100, 'seed')",
9539                [],
9540            )
9541            .expect("insert node");
9542            // Insert a chunk so restore has something to work with for FTS.
9543            conn.execute(
9544                "INSERT INTO chunks (id, node_logical_id, text_content, created_at) \
9545                 VALUES ('chunk-1', 'doc-1', 'budget text', 100)",
9546                [],
9547            )
9548            .expect("insert chunk");
9549            // Insert property FTS row into per-kind table (as write path would).
9550            conn.execute(
9551                &format!(
9552                    "INSERT INTO {doc_table} (node_logical_id, text_content) \
9553                     VALUES ('doc-1', 'Budget Q3 forecast')"
9554                ),
9555                [],
9556            )
9557            .expect("insert property fts");
9558            // Simulate retire: supersede node, clear FTS.
9559            conn.execute(
9560                "INSERT INTO provenance_events (id, event_type, subject, source_ref, created_at, metadata_json) \
9561                 VALUES ('evt-retire', 'node_retire', 'doc-1', 'forget-1', 200, '')",
9562                [],
9563            )
9564            .expect("retire event");
9565            conn.execute(
9566                "UPDATE nodes SET superseded_at = 200 WHERE logical_id = 'doc-1'",
9567                [],
9568            )
9569            .expect("supersede");
9570            conn.execute("DELETE FROM fts_nodes", [])
9571                .expect("clear chunk fts");
9572            conn.execute(&format!("DELETE FROM {doc_table}"), [])
9573                .expect("clear property fts");
9574        }
9575
9576        let report = service.restore_logical_id("doc-1").expect("restore");
9577        assert_eq!(report.restored_property_fts_rows, 1);
9578
9579        // Verify the property FTS row was recreated in the per-kind table.
9580        let conn = sqlite::open_connection(db.path()).expect("conn");
9581        let prop_fts_count: i64 = conn
9582            .query_row(
9583                &format!("SELECT count(*) FROM {doc_table} WHERE node_logical_id = 'doc-1'"),
9584                [],
9585                |row| row.get(0),
9586            )
9587            .expect("prop fts count");
9588        assert_eq!(prop_fts_count, 1, "property FTS must be restored");
9589
9590        let text: String = conn
9591            .query_row(
9592                &format!("SELECT text_content FROM {doc_table} WHERE node_logical_id = 'doc-1'"),
9593                [],
9594                |row| row.get(0),
9595            )
9596            .expect("prop fts text");
9597        assert_eq!(text, "Budget Q3 forecast");
9598    }
9599
9600    #[test]
9601    fn safe_export_preserves_fts_property_schemas() {
9602        let (_db, service) = setup();
9603        service
9604            .register_fts_property_schema(
9605                "Goal",
9606                &["$.name".to_owned(), "$.rationale".to_owned()],
9607                None,
9608            )
9609            .expect("register schema");
9610
9611        let export_dir = tempfile::TempDir::new().expect("temp dir");
9612        let export_path = export_dir.path().join("backup.db");
9613        service
9614            .safe_export(
9615                &export_path,
9616                SafeExportOptions {
9617                    force_checkpoint: false,
9618                },
9619            )
9620            .expect("export");
9621
9622        // Open the exported DB and verify the schema survived.
9623        let exported_conn = rusqlite::Connection::open(&export_path).expect("open exported db");
9624        let kind: String = exported_conn
9625            .query_row(
9626                "SELECT kind FROM fts_property_schemas WHERE kind = 'Goal'",
9627                [],
9628                |row| row.get(0),
9629            )
9630            .expect("schema must exist in export");
9631        assert_eq!(kind, "Goal");
9632        let paths_json: String = exported_conn
9633            .query_row(
9634                "SELECT property_paths_json FROM fts_property_schemas WHERE kind = 'Goal'",
9635                [],
9636                |row| row.get(0),
9637            )
9638            .expect("paths must exist");
9639        let paths: Vec<String> = serde_json::from_str(&paths_json).expect("valid json");
9640        assert_eq!(paths, vec!["$.name", "$.rationale"]);
9641    }
9642
9643    #[test]
9644    #[allow(clippy::too_many_lines)]
9645    fn export_recovery_rebuilds_property_fts_from_canonical_state() {
9646        let (db, service) = setup();
9647        let goal_table = fathomdb_schema::fts_kind_table_name("Goal");
9648        // Register a schema and insert two nodes with extractable properties.
9649        service
9650            .register_fts_property_schema("Goal", &["$.name".to_owned()], None)
9651            .expect("register");
9652        {
9653            let conn = sqlite::open_connection(db.path()).expect("conn");
9654            conn.execute(
9655                "INSERT INTO nodes (row_id, logical_id, kind, properties, created_at, source_ref) \
9656                 VALUES ('row-1', 'goal-1', 'Goal', '{\"name\":\"Ship v2\"}', 100, 'seed')",
9657                [],
9658            )
9659            .expect("insert node 1");
9660            conn.execute(
9661                &format!(
9662                    "INSERT INTO {goal_table} (node_logical_id, text_content) \
9663                     VALUES ('goal-1', 'Ship v2')"
9664                ),
9665                [],
9666            )
9667            .expect("insert property FTS row 1");
9668            conn.execute(
9669                "INSERT INTO nodes (row_id, logical_id, kind, properties, created_at, source_ref) \
9670                 VALUES ('row-2', 'goal-2', 'Goal', '{\"name\":\"Launch redesign\"}', 100, 'seed')",
9671                [],
9672            )
9673            .expect("insert node 2");
9674            conn.execute(
9675                &format!(
9676                    "INSERT INTO {goal_table} (node_logical_id, text_content) \
9677                     VALUES ('goal-2', 'Launch redesign')"
9678                ),
9679                [],
9680            )
9681            .expect("insert property FTS row 2");
9682        }
9683
9684        // Export.
9685        let export_dir = tempfile::TempDir::new().expect("temp dir");
9686        let export_path = export_dir.path().join("backup.db");
9687        service
9688            .safe_export(
9689                &export_path,
9690                SafeExportOptions {
9691                    force_checkpoint: false,
9692                },
9693            )
9694            .expect("export");
9695
9696        // Corrupt the derived rows: replace correct text with wrong text for
9697        // goal-1, and delete the row for goal-2 entirely. This exercises both
9698        // corrupted-but-present rows and missing rows in the same recovery.
9699        {
9700            let conn = rusqlite::Connection::open(&export_path).expect("open export");
9701            // Bootstrap the exported DB to get per-kind tables.
9702            SchemaManager::new()
9703                .bootstrap(&conn)
9704                .expect("bootstrap export");
9705            conn.execute(
9706                &format!("DELETE FROM {goal_table} WHERE node_logical_id = 'goal-1'"),
9707                [],
9708            )
9709            .expect("delete old row");
9710            conn.execute(
9711                &format!(
9712                    "INSERT INTO {goal_table} (node_logical_id, text_content) \
9713                     VALUES ('goal-1', 'completely wrong stale text')"
9714                ),
9715                [],
9716            )
9717            .expect("insert corrupted row");
9718            conn.execute(
9719                &format!("DELETE FROM {goal_table} WHERE node_logical_id = 'goal-2'"),
9720                [],
9721            )
9722            .expect("delete goal-2 row");
9723        }
9724
9725        // Open the exported DB and rebuild projections from canonical state.
9726        let schema = Arc::new(SchemaManager::new());
9727        let exported_service = AdminService::new(&export_path, Arc::clone(&schema));
9728        exported_service
9729            .rebuild_projections(ProjectionTarget::Fts)
9730            .expect("rebuild");
9731
9732        // Verify the per-kind table has the correct rows after recovery.
9733        let conn = rusqlite::Connection::open(&export_path).expect("open export for verify");
9734        let goal1_text: String = conn
9735            .query_row(
9736                &format!("SELECT text_content FROM {goal_table} WHERE node_logical_id = 'goal-1'"),
9737                [],
9738                |r| r.get(0),
9739            )
9740            .expect("goal-1 text after rebuild");
9741        assert_eq!(
9742            goal1_text, "Ship v2",
9743            "goal-1 text must be corrected by rebuild"
9744        );
9745
9746        let goal2_count: i64 = conn
9747            .query_row(
9748                &format!("SELECT count(*) FROM {goal_table} WHERE node_logical_id = 'goal-2'"),
9749                [],
9750                |r| r.get(0),
9751            )
9752            .expect("goal-2 count");
9753        assert_eq!(goal2_count, 1, "goal-2 row must be restored by rebuild");
9754
9755        let stale_count: i64 = conn
9756            .query_row(
9757                &format!("SELECT count(*) FROM {goal_table} WHERE text_content = 'completely wrong stale text'"),
9758                [],
9759                |r| r.get(0),
9760            )
9761            .expect("stale count");
9762        assert_eq!(stale_count, 0, "corrupted text must be gone after rebuild");
9763
9764        // Verify integrity and semantics are clean after recovery.
9765        let integrity = exported_service.check_integrity().expect("integrity");
9766        assert_eq!(integrity.missing_property_fts_rows, 0);
9767        let semantics = exported_service.check_semantics().expect("semantics");
9768        assert_eq!(semantics.drifted_property_fts_rows, 0);
9769        assert_eq!(semantics.orphaned_property_fts_rows, 0);
9770        assert_eq!(semantics.duplicate_property_fts_rows, 0);
9771    }
9772
9773    #[test]
9774    fn check_integrity_no_false_positives_for_empty_extraction() {
9775        let (db, service) = setup();
9776        {
9777            let conn = sqlite::open_connection(db.path()).expect("conn");
9778            // Register a schema that looks for $.searchable
9779            conn.execute(
9780                "INSERT INTO fts_property_schemas (kind, property_paths_json, separator) \
9781                 VALUES ('Ticket', '[\"$.searchable\"]', ' ')",
9782                [],
9783            )
9784            .expect("register schema");
9785            // Insert a node whose properties do NOT contain $.searchable —
9786            // correctly has no property FTS row.
9787            conn.execute(
9788                "INSERT INTO nodes (row_id, logical_id, kind, properties, created_at, source_ref) \
9789                 VALUES ('row-1', 'ticket-1', 'Ticket', '{\"status\":\"open\"}', 100, 'seed')",
9790                [],
9791            )
9792            .expect("insert node");
9793        }
9794
9795        let report = service.check_integrity().expect("integrity");
9796        assert_eq!(
9797            report.missing_property_fts_rows, 0,
9798            "node with no extractable values must not be counted as missing"
9799        );
9800    }
9801
9802    #[test]
9803    fn check_integrity_detects_genuinely_missing_property_fts_rows() {
9804        let (db, service) = setup();
9805        {
9806            let conn = sqlite::open_connection(db.path()).expect("conn");
9807            conn.execute(
9808                "INSERT INTO fts_property_schemas (kind, property_paths_json, separator) \
9809                 VALUES ('Ticket', '[\"$.title\"]', ' ')",
9810                [],
9811            )
9812            .expect("register schema");
9813            // Insert a node WITH an extractable $.title but no property FTS row.
9814            conn.execute(
9815                "INSERT INTO nodes (row_id, logical_id, kind, properties, created_at, source_ref) \
9816                 VALUES ('row-1', 'ticket-1', 'Ticket', '{\"title\":\"fix login bug\"}', 100, 'seed')",
9817                [],
9818            )
9819            .expect("insert node");
9820        }
9821
9822        let report = service.check_integrity().expect("integrity");
9823        assert_eq!(
9824            report.missing_property_fts_rows, 1,
9825            "node with extractable values but no property FTS row must be detected"
9826        );
9827    }
9828
9829    #[test]
9830    fn rebuild_projections_fts_restores_missing_property_fts_rows() {
9831        let (db, service) = setup();
9832        let goal_table = fathomdb_schema::fts_kind_table_name("Goal");
9833        {
9834            let conn = sqlite::open_connection(db.path()).expect("conn");
9835            conn.execute(
9836                "INSERT INTO fts_property_schemas (kind, property_paths_json, separator) \
9837                 VALUES ('Goal', '[\"$.name\"]', ' ')",
9838                [],
9839            )
9840            .expect("register schema");
9841            conn.execute(
9842                "INSERT INTO nodes (row_id, logical_id, kind, properties, created_at, source_ref) \
9843                 VALUES ('row-1', 'goal-1', 'Goal', '{\"name\":\"Ship v2\"}', 100, 'seed')",
9844                [],
9845            )
9846            .expect("insert node");
9847            // Deliberately do NOT insert a property FTS row.
9848        }
9849
9850        let report = service
9851            .rebuild_projections(ProjectionTarget::Fts)
9852            .expect("rebuild");
9853        assert!(
9854            report.rebuilt_rows >= 1,
9855            "rebuild must insert at least one property FTS row"
9856        );
9857
9858        let conn = sqlite::open_connection(db.path()).expect("conn");
9859        let text: String = conn
9860            .query_row(
9861                &format!("SELECT text_content FROM {goal_table} WHERE node_logical_id = 'goal-1'"),
9862                [],
9863                |row| row.get(0),
9864            )
9865            .expect("property FTS row must exist after rebuild");
9866        assert_eq!(text, "Ship v2");
9867    }
9868
9869    #[test]
9870    fn rebuild_missing_projections_fills_gap_for_deleted_property_fts_row() {
9871        let (db, service) = setup();
9872        let goal_table = fathomdb_schema::fts_kind_table_name("Goal");
9873        {
9874            let conn = sqlite::open_connection(db.path()).expect("conn");
9875            conn.execute(
9876                "INSERT INTO fts_property_schemas (kind, property_paths_json, separator) \
9877                 VALUES ('Goal', '[\"$.name\"]', ' ')",
9878                [],
9879            )
9880            .expect("register schema");
9881            conn.execute(
9882                "INSERT INTO nodes (row_id, logical_id, kind, properties, created_at, source_ref) \
9883                 VALUES ('row-1', 'goal-1', 'Goal', '{\"name\":\"Ship v2\"}', 100, 'seed')",
9884                [],
9885            )
9886            .expect("insert node");
9887            // Create per-kind table and insert then delete to simulate corruption.
9888            conn.execute_batch(&format!(
9889                "CREATE VIRTUAL TABLE IF NOT EXISTS {goal_table} USING fts5(\
9890                    node_logical_id UNINDEXED, text_content, \
9891                    tokenize = 'porter unicode61 remove_diacritics 2'\
9892                )"
9893            ))
9894            .expect("create per-kind table");
9895            conn.execute(
9896                &format!(
9897                    "INSERT INTO {goal_table} (node_logical_id, text_content) \
9898                     VALUES ('goal-1', 'Ship v2')"
9899                ),
9900                [],
9901            )
9902            .expect("insert property fts");
9903            conn.execute(
9904                &format!("DELETE FROM {goal_table} WHERE node_logical_id = 'goal-1'"),
9905                [],
9906            )
9907            .expect("delete property fts");
9908        }
9909
9910        let report = service
9911            .rebuild_missing_projections()
9912            .expect("rebuild missing");
9913        assert!(
9914            report.rebuilt_rows >= 1,
9915            "missing rebuild must insert the gap-fill row"
9916        );
9917
9918        let conn = sqlite::open_connection(db.path()).expect("conn");
9919        let count: i64 = conn
9920            .query_row(
9921                &format!("SELECT count(*) FROM {goal_table} WHERE node_logical_id = 'goal-1'"),
9922                [],
9923                |row| row.get(0),
9924            )
9925            .expect("count");
9926        assert_eq!(
9927            count, 1,
9928            "gap-fill must restore exactly one property FTS row"
9929        );
9930    }
9931
9932    #[test]
9933    fn remove_schema_then_rebuild_cleans_stale_property_fts_rows() {
9934        // This test verifies that a full FTS rebuild clears per-kind tables whose
9935        // schema has been removed (orphaned state). We create the orphaned state
9936        // directly via SQL (bypassing the service API, which now eagerly deletes rows
9937        // on schema removal) to simulate a table that was left populated from a
9938        // previous registration cycle.
9939        let (db, service) = setup();
9940        let goal_table = fathomdb_schema::fts_kind_table_name("Goal");
9941        {
9942            let conn = sqlite::open_connection(db.path()).expect("conn");
9943            conn.execute(
9944                "INSERT INTO nodes (row_id, logical_id, kind, properties, created_at, source_ref) \
9945                 VALUES ('row-1', 'goal-1', 'Goal', '{\"name\":\"Ship v2\"}', 100, 'seed')",
9946                [],
9947            )
9948            .expect("insert node");
9949            // Create per-kind table WITHOUT registering a schema — simulates orphaned rows
9950            // that remain after schema removal (or pre-existing table from a previous cycle).
9951            conn.execute_batch(&format!(
9952                "CREATE VIRTUAL TABLE IF NOT EXISTS {goal_table} \
9953                 USING fts5(node_logical_id UNINDEXED, text_content, tokenize = 'porter unicode61 remove_diacritics 2')"
9954            ))
9955            .expect("create per-kind table");
9956            conn.execute(
9957                &format!(
9958                    "INSERT INTO {goal_table} (node_logical_id, text_content) \
9959                     VALUES ('goal-1', 'Ship v2')"
9960                ),
9961                [],
9962            )
9963            .expect("insert property fts");
9964        }
9965
9966        // No schema registered — per-kind table has orphaned rows.
9967        let semantics = service.check_semantics().expect("semantics");
9968        assert_eq!(
9969            semantics.orphaned_property_fts_rows, 1,
9970            "orphaned property FTS rows must be detected with no registered schema"
9971        );
9972
9973        // Full rebuild should clean them (no schema means nothing to rebuild).
9974        service
9975            .rebuild_projections(ProjectionTarget::Fts)
9976            .expect("rebuild");
9977
9978        let conn = sqlite::open_connection(db.path()).expect("conn");
9979        let count: i64 = conn
9980            .query_row(
9981                &format!("SELECT count(*) FROM {goal_table} WHERE node_logical_id = 'goal-1'"),
9982                [],
9983                |row| row.get(0),
9984            )
9985            .expect("count");
9986        assert_eq!(
9987            count, 0,
9988            "rebuild must delete rows from per-kind tables with no registered schema"
9989        );
9990    }
9991
9992    mod validate_fts_property_paths_tests {
9993        use super::super::validate_fts_property_paths;
9994
9995        #[test]
9996        fn valid_simple_path() {
9997            assert!(validate_fts_property_paths(&["$.name".to_owned()]).is_ok());
9998        }
9999
10000        #[test]
10001        fn valid_nested_path() {
10002            assert!(validate_fts_property_paths(&["$.address.city".to_owned()]).is_ok());
10003        }
10004
10005        #[test]
10006        fn valid_underscore_segment() {
10007            assert!(validate_fts_property_paths(&["$.a_b".to_owned()]).is_ok());
10008        }
10009
10010        #[test]
10011        fn rejects_bare_prefix() {
10012            let result = validate_fts_property_paths(&["$.".to_owned()]);
10013            assert!(result.is_err(), "path '$.' must be rejected");
10014        }
10015
10016        #[test]
10017        fn rejects_double_dot() {
10018            let result = validate_fts_property_paths(&["$..x".to_owned()]);
10019            assert!(result.is_err(), "path '$..x' must be rejected");
10020        }
10021
10022        #[test]
10023        fn rejects_trailing_dot() {
10024            let result = validate_fts_property_paths(&["$.foo.".to_owned()]);
10025            assert!(result.is_err(), "path '$.foo.' must be rejected");
10026        }
10027
10028        #[test]
10029        fn rejects_space_in_segment() {
10030            let result = validate_fts_property_paths(&["$.foo bar".to_owned()]);
10031            assert!(result.is_err(), "path '$.foo bar' must be rejected");
10032        }
10033
10034        #[test]
10035        fn rejects_bracket_syntax() {
10036            let result = validate_fts_property_paths(&["$.foo[0]".to_owned()]);
10037            assert!(result.is_err(), "path '$.foo[0]' must be rejected");
10038        }
10039
10040        #[test]
10041        fn rejects_duplicates() {
10042            let result = validate_fts_property_paths(&["$.name".to_owned(), "$.name".to_owned()]);
10043            assert!(result.is_err(), "duplicate paths must be rejected");
10044        }
10045
10046        #[test]
10047        fn rejects_empty_list() {
10048            let result = validate_fts_property_paths(&[]);
10049            assert!(result.is_err(), "empty path list must be rejected");
10050        }
10051    }
10052
10053    // --- A-6: per-kind FTS table tests ---
10054
10055    #[test]
10056    fn register_fts_schema_writes_to_per_kind_table() {
10057        // After A-6: register_fts_property_schema writes rows to fts_props_<kind>,
10058        // NOT to fts_node_properties.
10059        let (db, service) = setup();
10060        {
10061            let conn = sqlite::open_connection(db.path()).expect("conn");
10062            // Insert a node before registering the schema.
10063            conn.execute(
10064                "INSERT INTO nodes (row_id, logical_id, kind, properties, created_at, source_ref) \
10065                 VALUES ('row-1', 'goal-1', 'Goal', '{\"name\":\"Ship v2\"}', 100, 'seed')",
10066                [],
10067            )
10068            .expect("insert node");
10069        }
10070
10071        // Register schema — this triggers eager rebuild which writes to per-kind table.
10072        service
10073            .register_fts_property_schema("Goal", &["$.name".to_owned()], None)
10074            .expect("register schema");
10075
10076        let conn = sqlite::open_connection(db.path()).expect("conn");
10077        let table = fathomdb_schema::fts_kind_table_name("Goal");
10078        // Per-kind table must have the row.
10079        let per_kind_count: i64 = conn
10080            .query_row(
10081                &format!("SELECT count(*) FROM {table} WHERE node_logical_id = 'goal-1'"),
10082                [],
10083                |row| row.get(0),
10084            )
10085            .expect("per-kind count");
10086        assert_eq!(
10087            per_kind_count, 1,
10088            "per-kind table must have the row after registration"
10089        );
10090    }
10091
10092    #[test]
10093    fn remove_fts_schema_deletes_from_per_kind_table() {
10094        // After A-6: remove_fts_property_schema deletes rows from fts_props_<kind>.
10095        let (db, service) = setup();
10096        {
10097            let conn = sqlite::open_connection(db.path()).expect("conn");
10098            conn.execute(
10099                "INSERT INTO nodes (row_id, logical_id, kind, properties, created_at, source_ref) \
10100                 VALUES ('row-1', 'goal-1', 'Goal', '{\"name\":\"Ship v2\"}', 100, 'seed')",
10101                [],
10102            )
10103            .expect("insert node");
10104        }
10105
10106        service
10107            .register_fts_property_schema("Goal", &["$.name".to_owned()], None)
10108            .expect("register schema");
10109        service
10110            .remove_fts_property_schema("Goal")
10111            .expect("remove schema");
10112
10113        let conn = sqlite::open_connection(db.path()).expect("conn");
10114        let table = fathomdb_schema::fts_kind_table_name("Goal");
10115        let per_kind_count: i64 = conn
10116            .query_row(
10117                &format!("SELECT count(*) FROM {table} WHERE node_logical_id = 'goal-1'"),
10118                [],
10119                |row| row.get(0),
10120            )
10121            .expect("per-kind count");
10122        assert_eq!(
10123            per_kind_count, 0,
10124            "per-kind table must be empty after schema removal"
10125        );
10126    }
10127
10128    // --- B-1: weight field tests ---
10129
10130    #[test]
10131    fn fts_path_spec_with_weight_builder() {
10132        let spec = FtsPropertyPathSpec::scalar("$.title").with_weight(5.0);
10133        assert_eq!(spec.weight, Some(5.0));
10134        assert_eq!(spec.path, "$.title");
10135        assert_eq!(spec.mode, FtsPropertyPathMode::Scalar);
10136    }
10137
10138    #[test]
10139    fn fts_path_spec_serialize_with_weight() {
10140        use super::serialize_property_paths_json;
10141        let entries = vec![
10142            FtsPropertyPathSpec::scalar("$.title").with_weight(2.0),
10143            FtsPropertyPathSpec::scalar("$.body"),
10144        ];
10145        let json = serialize_property_paths_json(&entries, &[]).expect("serialize");
10146        // Must use rich object format because a weight is present
10147        let v: serde_json::Value = serde_json::from_str(&json).expect("parse");
10148        let paths = v
10149            .get("paths")
10150            .expect("paths key")
10151            .as_array()
10152            .expect("array");
10153        assert_eq!(paths.len(), 2);
10154        // First entry has weight
10155        assert_eq!(
10156            paths[0].get("path").and_then(serde_json::Value::as_str),
10157            Some("$.title")
10158        );
10159        assert_eq!(
10160            paths[0].get("weight").and_then(serde_json::Value::as_f64),
10161            Some(2.0)
10162        );
10163        // Second entry has no weight field
10164        assert!(
10165            paths[1].get("weight").is_none(),
10166            "unweighted spec must omit weight field"
10167        );
10168    }
10169
10170    #[test]
10171    fn fts_path_spec_serialize_no_weights() {
10172        use super::serialize_property_paths_json;
10173        let entries = vec![
10174            FtsPropertyPathSpec::scalar("$.title"),
10175            FtsPropertyPathSpec::scalar("$.payload"),
10176        ];
10177        let json = serialize_property_paths_json(&entries, &[]).expect("serialize");
10178        // Must use bare string array (backward compat)
10179        let v: serde_json::Value = serde_json::from_str(&json).expect("parse");
10180        assert!(
10181            v.is_array(),
10182            "all-scalar no-weight schema must serialize as bare string array"
10183        );
10184        let arr = v.as_array().expect("array");
10185        assert_eq!(arr.len(), 2);
10186        assert_eq!(arr[0].as_str(), Some("$.title"));
10187        assert_eq!(arr[1].as_str(), Some("$.payload"));
10188    }
10189
10190    #[test]
10191    fn fts_weight_validation_out_of_range() {
10192        let (_db, service) = setup();
10193        // weight = 0.0 must be rejected
10194        let entries_zero = vec![FtsPropertyPathSpec::scalar("$.title").with_weight(0.0)];
10195        let result = service.register_fts_property_schema_with_entries(
10196            "Article",
10197            &entries_zero,
10198            None,
10199            &[],
10200            crate::rebuild_actor::RebuildMode::Eager,
10201        );
10202        assert!(result.is_err(), "weight 0.0 must be rejected");
10203        let err_msg = result.expect_err("weight 0.0 must be rejected").to_string();
10204        assert!(
10205            err_msg.contains("weight"),
10206            "error must mention weight: {err_msg}"
10207        );
10208
10209        // weight = 1001.0 must be rejected
10210        let entries_big = vec![FtsPropertyPathSpec::scalar("$.title").with_weight(1001.0)];
10211        let result = service.register_fts_property_schema_with_entries(
10212            "Article",
10213            &entries_big,
10214            None,
10215            &[],
10216            crate::rebuild_actor::RebuildMode::Eager,
10217        );
10218        assert!(result.is_err(), "weight 1001.0 must be rejected");
10219    }
10220
10221    #[test]
10222    fn fts_weight_validation_valid() {
10223        let (_db, service) = setup();
10224        let entries = vec![FtsPropertyPathSpec::scalar("$.title").with_weight(10.0)];
10225        let result = service.register_fts_property_schema_with_entries(
10226            "Article",
10227            &entries,
10228            None,
10229            &[],
10230            crate::rebuild_actor::RebuildMode::Eager,
10231        );
10232        assert!(
10233            result.is_ok(),
10234            "weight 10.0 must be accepted: {:?}",
10235            result.err()
10236        );
10237    }
10238
10239    // --- B-2: create_or_replace_fts_kind_table tests ---
10240
10241    #[test]
10242    fn create_or_replace_creates_multi_column_table() {
10243        use super::create_or_replace_fts_kind_table;
10244        let (db, _service) = setup();
10245        let conn = sqlite::open_connection(db.path()).expect("conn");
10246        let specs = vec![
10247            FtsPropertyPathSpec::scalar("$.title"),
10248            FtsPropertyPathSpec::recursive("$.payload"),
10249        ];
10250        create_or_replace_fts_kind_table(
10251            &conn,
10252            "Article",
10253            &specs,
10254            fathomdb_schema::DEFAULT_FTS_TOKENIZER,
10255        )
10256        .expect("create table");
10257
10258        // Verify table exists and has the expected columns.
10259        let table = fathomdb_schema::fts_kind_table_name("Article");
10260        // node_logical_id column
10261        let count: i64 = conn
10262            .query_row(&format!("SELECT count(*) FROM {table}"), [], |r| r.get(0))
10263            .expect("count");
10264        assert_eq!(count, 0, "new table must be empty");
10265
10266        // Verify columns exist by inserting a row with named columns
10267        let title_col = fathomdb_schema::fts_column_name("$.title", false);
10268        let payload_col = fathomdb_schema::fts_column_name("$.payload", true);
10269        conn.execute(
10270            &format!(
10271                "INSERT INTO {table} (node_logical_id, {title_col}, {payload_col}) VALUES ('id1', 'hello', 'world')"
10272            ),
10273            [],
10274        )
10275        .expect("insert with per-spec columns must succeed");
10276    }
10277
10278    #[test]
10279    fn create_or_replace_drops_and_recreates() {
10280        use super::create_or_replace_fts_kind_table;
10281        let (db, _service) = setup();
10282        let conn = sqlite::open_connection(db.path()).expect("conn");
10283
10284        // First call: 1 spec
10285        let specs_v1 = vec![FtsPropertyPathSpec::scalar("$.title")];
10286        create_or_replace_fts_kind_table(
10287            &conn,
10288            "Post",
10289            &specs_v1,
10290            fathomdb_schema::DEFAULT_FTS_TOKENIZER,
10291        )
10292        .expect("create v1");
10293
10294        // Second call: 2 specs (different layout)
10295        let specs_v2 = vec![
10296            FtsPropertyPathSpec::scalar("$.title"),
10297            FtsPropertyPathSpec::scalar("$.summary"),
10298        ];
10299        create_or_replace_fts_kind_table(
10300            &conn,
10301            "Post",
10302            &specs_v2,
10303            fathomdb_schema::DEFAULT_FTS_TOKENIZER,
10304        )
10305        .expect("create v2");
10306
10307        // Verify new layout: summary column must exist
10308        let table = fathomdb_schema::fts_kind_table_name("Post");
10309        let summary_col = fathomdb_schema::fts_column_name("$.summary", false);
10310        conn.execute(
10311            &format!("INSERT INTO {table} (node_logical_id, {summary_col}) VALUES ('id1', 'text')"),
10312            [],
10313        )
10314        .expect("second layout must allow summary column");
10315    }
10316
10317    #[test]
10318    fn create_or_replace_invalid_tokenizer() {
10319        use super::create_or_replace_fts_kind_table;
10320        let (db, _service) = setup();
10321        let conn = sqlite::open_connection(db.path()).expect("conn");
10322        let specs = vec![FtsPropertyPathSpec::scalar("$.title")];
10323        let result = create_or_replace_fts_kind_table(&conn, "Post", &specs, "'; DROP TABLE --");
10324        assert!(result.is_err(), "invalid tokenizer must be rejected");
10325        let err_msg = result
10326            .expect_err("invalid tokenizer must be rejected")
10327            .to_string();
10328        assert!(
10329            err_msg.contains("tokenizer"),
10330            "error must mention tokenizer: {err_msg}"
10331        );
10332    }
10333
/// Registering an FTS schema in which one entry carries a weight must leave
/// a per-kind table that exposes one column per registered path.
///
/// The columns are probed indirectly: an INSERT naming both columns can only
/// succeed if they exist.
#[test]
fn register_with_weights_creates_per_column_table() {
    let (db, service) = setup();

    let weighted_specs = vec![
        FtsPropertyPathSpec::scalar("$.title").with_weight(2.0),
        FtsPropertyPathSpec::scalar("$.body"),
    ];
    let registration = service.register_fts_property_schema_with_entries(
        "Article",
        &weighted_specs,
        None,
        &[],
        crate::rebuild_actor::RebuildMode::Eager,
    );
    registration.expect("register");

    let conn = sqlite::open_connection(db.path()).expect("conn");

    // Physical names are derived by the schema crate, not hard-coded here.
    let table = fathomdb_schema::fts_kind_table_name("Article");
    let title_col = fathomdb_schema::fts_column_name("$.title", false);
    let body_col = fathomdb_schema::fts_column_name("$.body", false);
    let probe_sql = format!(
        "INSERT INTO {table} (node_logical_id, {title_col}, {body_col}) VALUES ('art-1', 'hello', 'world')"
    );

    conn.execute(&probe_sql, [])
        .expect("per-spec columns must exist after registration with weights");
}
10365
/// Re-registering a kind WITHOUT weights after a weighted registration must
/// leave its FTS table with a `text_content` column.
///
/// The layout is probed with an INSERT that names `text_content`. If the
/// INSERT fails, the underlying SQLite error is included in the assertion
/// message so the failure can be diagnosed from the test output alone.
#[test]
fn weighted_to_unweighted_downgrade_recreates_table() {
    let (db, service) = setup();

    // First register with weights (creates per-spec column layout).
    let weighted_entries = vec![
        FtsPropertyPathSpec::scalar("$.title").with_weight(2.0),
        FtsPropertyPathSpec::scalar("$.body"),
    ];
    service
        .register_fts_property_schema_with_entries(
            "Article",
            &weighted_entries,
            None,
            &[],
            crate::rebuild_actor::RebuildMode::Eager,
        )
        .expect("register weighted");

    // Re-register the same kind WITHOUT weights.
    let unweighted_entries = vec![
        FtsPropertyPathSpec::scalar("$.title"),
        FtsPropertyPathSpec::scalar("$.body"),
    ];
    service
        .register_fts_property_schema_with_entries(
            "Article",
            &unweighted_entries,
            None,
            &[],
            crate::rebuild_actor::RebuildMode::Eager,
        )
        .expect("re-register unweighted");

    // After downgrade, the table must have the text_content column
    // (legacy single-column layout), not the per-spec columns.
    let conn = sqlite::open_connection(db.path()).expect("conn");
    let table = fathomdb_schema::fts_kind_table_name("Article");
    let result = conn.execute(
        &format!("INSERT INTO {table} (node_logical_id, text_content) VALUES ('art-1', 'hello world')"),
        [],
    );
    // The message arguments are only evaluated on failure, so consuming
    // `result` via `err()` here is fine.
    assert!(
        result.is_ok(),
        "text_content column must exist after weighted-to-unweighted downgrade: {:?}",
        result.err()
    );
}
10413
10414    // --- Pack A+G: profile CRUD + tokenizer presets ---
10415
/// A profile written with `set_fts_profile` must be returned unchanged, both
/// from the setter itself and from a later `get_fts_profile` lookup.
#[test]
fn set_get_fts_profile_roundtrip() {
    let (_db, service) = setup();

    // The setter echoes back what it stored.
    let stored = service.set_fts_profile("book", "unicode61");
    let stored = stored.expect("set_fts_profile");
    assert_eq!(stored.kind, "book");
    assert_eq!(stored.tokenizer, "unicode61");

    // A fresh read must observe the same values.
    let lookup = service.get_fts_profile("book").expect("get_fts_profile");
    let fetched = lookup.expect("should be Some");
    assert_eq!(fetched.kind, "book");
    assert_eq!(fetched.tokenizer, "unicode61");
}
10432
/// Setting a profile twice for the same kind must overwrite: a subsequent
/// read returns the tokenizer from the second call.
#[test]
fn fts_profile_upsert() {
    let (_db, service) = setup();

    let first_write = service.set_fts_profile("article", "unicode61");
    first_write.expect("first set");

    let second_write = service.set_fts_profile("article", "porter unicode61 remove_diacritics 2");
    second_write.expect("second set");

    let lookup = service.get_fts_profile("article").expect("get");
    let current = lookup.expect("Some");
    assert_eq!(current.tokenizer, "porter unicode61 remove_diacritics 2");
}
10448
/// `set_fts_profile` must refuse an injection-shaped tokenizer string and
/// report an error that mentions either "tokenizer" or "invalid".
#[test]
fn invalid_tokenizer_rejected() {
    let (_db, service) = setup();

    let outcome = service.set_fts_profile("book", "'; DROP TABLE nodes --");
    assert!(outcome.is_err(), "invalid tokenizer must be rejected");

    let msg = outcome.expect_err("must be Err").to_string();
    let mentions_cause = ["tokenizer", "invalid"]
        .iter()
        .any(|needle| msg.contains(*needle));
    assert!(
        mentions_cause,
        "error must mention tokenizer or invalid: {msg}"
    );
}
10460
/// The `recall-optimized-english` preset must resolve to the porter-stemmed
/// unicode61 tokenizer string.
#[test]
fn preset_recall_optimized_english() {
    let resolved = super::resolve_tokenizer_preset("recall-optimized-english");
    assert_eq!(resolved, "porter unicode61 remove_diacritics 2");
}
10468
/// The `precision-optimized` preset must resolve to the unicode61 tokenizer
/// string without the porter stemmer.
#[test]
fn preset_precision_optimized() {
    let resolved = super::resolve_tokenizer_preset("precision-optimized");
    assert_eq!(resolved, "unicode61 remove_diacritics 2");
}
10476
/// The `global-cjk` preset must resolve to the `icu` tokenizer.
#[test]
fn preset_global_cjk() {
    let resolved = super::resolve_tokenizer_preset("global-cjk");
    assert_eq!(resolved, "icu");
}
10481
/// The `substring-trigram` preset must resolve to the `trigram` tokenizer.
#[test]
fn preset_substring_trigram() {
    let resolved = super::resolve_tokenizer_preset("substring-trigram");
    assert_eq!(resolved, "trigram");
}
10489
/// The `source-code` preset must resolve to unicode61 with the extra token
/// characters `._-$@`.
#[test]
fn preset_source_code() {
    let resolved = super::resolve_tokenizer_preset("source-code");
    assert_eq!(resolved, "unicode61 tokenchars '._-$@'");
}
10497
/// `preview_projection_impact` for the `fts` facet must count only the live
/// `book` nodes: five active rows are inserted plus one row with
/// `superseded_at` set, and the expected count is five.
#[test]
fn preview_fts_row_count() {
    let (db, service) = setup();

    // Seed the fixture in its own scope so the connection is closed before
    // the service is queried.
    {
        let conn = sqlite::open_connection(db.path()).expect("conn");

        (0..5u32).for_each(|i| {
            conn.execute(
                "INSERT INTO nodes (row_id, logical_id, kind, properties, created_at, source_ref) \
                 VALUES (?1, ?2, 'book', '{}', 100, 'src')",
                rusqlite::params![format!("r{i}"), format!("lg{i}")],
            )
            .expect("insert node");
        });

        // One superseded node that must NOT be counted.
        conn.execute(
            "INSERT INTO nodes (row_id, logical_id, kind, properties, created_at, source_ref, superseded_at) \
             VALUES ('r99', 'lg99', 'book', '{}', 100, 'src', 200)",
            [],
        )
        .expect("insert superseded");
    }

    let preview = service.preview_projection_impact("book", "fts");
    let impact = preview.expect("preview");
    assert_eq!(impact.rows_to_rebuild, 5);
}
10524
/// After a profile is stored for a kind, the impact preview must report that
/// tokenizer as `current_tokenizer` and leave `target_tokenizer` unset.
#[test]
fn preview_populates_current_tokenizer() {
    let (_db, service) = setup();

    let stored = service.set_fts_profile("doc", "trigram");
    stored.expect("set profile");

    let preview = service.preview_projection_impact("doc", "fts");
    let impact = preview.expect("preview");

    let expected_current = Some(String::from("trigram"));
    assert_eq!(impact.current_tokenizer, expected_current);
    assert_eq!(impact.target_tokenizer, None);
}
10537
10538    // --- Review fix: tokenizer allowlist alignment ---
10539
/// The tokenizer string used by the `source-code` preset contains `.`, `-`,
/// `$` and `@`; `create_or_replace_fts_kind_table` must accept it rather than
/// rejecting those characters.
#[test]
fn create_or_replace_source_code_tokenizer_is_accepted() {
    use super::create_or_replace_fts_kind_table;

    let (db, _service) = setup();
    let connection = sqlite::open_connection(db.path()).expect("conn");
    let symbol_specs = vec![FtsPropertyPathSpec::scalar("$.symbol")];

    // Same literal that the `preset_source_code` test expects from the
    // preset resolver.
    let source_code_tokenizer = "unicode61 tokenchars '._-$@'";

    let outcome = create_or_replace_fts_kind_table(
        &connection,
        "Symbol",
        &symbol_specs,
        source_code_tokenizer,
    );
    assert!(
        outcome.is_ok(),
        "source-code tokenizer string must be accepted by create_or_replace_fts_kind_table: {:?}",
        outcome.err()
    );
}
10558
/// End-to-end check: storing the `source-code` preset with `set_fts_profile`
/// and then calling `register_fts_property_schema` for the same kind must
/// succeed.
///
/// This guards against the two code paths disagreeing about which tokenizer
/// characters are acceptable.
#[test]
fn source_code_profile_round_trip_through_register_fts_schema() {
    let db = tempfile::NamedTempFile::new().expect("temp file");
    let schema = Arc::new(fathomdb_schema::SchemaManager::new());

    // Open a coordinator once and drop it immediately: it is only needed to
    // bootstrap the database schema before the admin service is used.
    let telemetry = Arc::new(crate::TelemetryCounters::default());
    let bootstrap_coordinator = crate::ExecutionCoordinator::open(
        db.path(),
        Arc::clone(&schema),
        None,
        1,
        telemetry,
        None,
    )
    .expect("coordinator opens for bootstrap");
    drop(bootstrap_coordinator);

    let service = AdminService::new(db.path(), Arc::clone(&schema));

    // Store the profile by preset name.
    let profile_write = service.set_fts_profile("Symbol", "source-code");
    profile_write.expect("set_fts_profile with source-code preset must succeed");

    // Registering a schema for the same kind must now go through.
    let property_paths = ["$.name".to_owned()];
    let registration = service.register_fts_property_schema("Symbol", &property_paths, None);
    assert!(
        registration.is_ok(),
        "register_fts_property_schema must succeed when source-code profile is active: {:?}",
        registration.err()
    );
}
10597
10598    // --- 0.5.0 item 5: max_tokens() capacity ---
10599
/// A stub embedder constructed with `max_tokens = 8192` is used to regenerate
/// embeddings for a single stored chunk whose text is 600 words long.
///
/// The test asserts that the report shows exactly one chunk and one
/// regenerated row, and that the embedder advertises 8192 through
/// `max_tokens()`. (It presumes regeneration does not re-chunk stored text;
/// splitting is expected to happen when chunks are written.)
#[cfg(feature = "sqlite-vec")]
#[test]
fn embedder_max_tokens_8192_handles_chunk_exceeding_512_words() {
    // 600 space-separated words: "word0 word1 ... word599".
    let words: Vec<String> = (0..600u32).map(|i| format!("word{i}")).collect();
    let long_text = words.join(" ");

    let db = NamedTempFile::new().expect("temp file");
    let schema = Arc::new(SchemaManager::new());

    // Seed one node and one long chunk, then close the connection.
    {
        let conn = crate::sqlite::open_connection_with_vec(db.path()).expect("vec conn");
        schema.bootstrap(&conn).expect("bootstrap");

        let node_insert = conn.execute(
            "INSERT INTO nodes (row_id, logical_id, kind, properties, created_at, source_ref) \
             VALUES ('row-1', 'doc-1', 'Document', '{}', 100, 'src-1')",
            [],
        );
        node_insert.expect("insert node");

        let chunk_insert = conn.execute(
            "INSERT INTO chunks (id, node_logical_id, text_content, created_at) \
             VALUES (?1, 'doc-1', ?2, 100)",
            rusqlite::params!["chunk-long", long_text],
        );
        chunk_insert.expect("insert long chunk");
    }

    let embedder = LargeContextTestEmbedder::new("long-context-model", 4, 8192);
    let service = AdminService::new(db.path(), Arc::clone(&schema));
    let regen_config = VectorRegenerationConfig {
        kind: "Document".to_owned(),
        profile: "default".to_owned(),
        chunking_policy: "per_chunk".to_owned(),
        preprocessing_policy: "trim".to_owned(),
    };
    let report = service
        .regenerate_vector_embeddings(&embedder, &regen_config)
        .expect("regenerate with long-context embedder");

    assert_eq!(
        report.total_chunks, 1,
        "600-word text pre-written as one chunk must result in exactly one embedded row"
    );
    assert_eq!(report.regenerated_rows, 1);
    assert_eq!(
        embedder.max_tokens(),
        8192,
        "embedder must advertise 8192 token capacity"
    );
}
10659
/// Test-only embedder whose advertised `max_tokens` is chosen by the caller,
/// used by the long-context tests.
#[cfg(feature = "sqlite-vec")]
#[derive(Debug)]
struct LargeContextTestEmbedder {
    /// Identity handed back from `identity()`.
    identity: QueryEmbedderIdentity,
    /// Fixed vector returned for every query.
    vector: Vec<f32>,
    /// Capacity reported from `max_tokens()`.
    max_tokens: usize,
}

#[cfg(feature = "sqlite-vec")]
impl LargeContextTestEmbedder {
    /// Builds a stub for `model` that produces an all-ones vector of length
    /// `dimension` and advertises `max_tokens`.
    ///
    /// The model version is fixed to `"1.0.0"` and the normalization policy
    /// to `"l2"`.
    fn new(model: &str, dimension: usize, max_tokens: usize) -> Self {
        let identity = QueryEmbedderIdentity {
            model_identity: String::from(model),
            model_version: String::from("1.0.0"),
            dimension,
            normalization_policy: String::from("l2"),
        };
        let vector: Vec<f32> = std::iter::repeat(1.0_f32).take(dimension).collect();
        Self {
            identity,
            vector,
            max_tokens,
        }
    }
}
10684
#[cfg(feature = "sqlite-vec")]
impl QueryEmbedder for LargeContextTestEmbedder {
    /// Ignores the query text and returns a copy of the stub's fixed vector.
    fn embed_query(&self, _text: &str) -> Result<Vec<f32>, EmbedderError> {
        let fixed = self.vector.to_vec();
        Ok(fixed)
    }

    /// Returns a copy of the identity the stub was constructed with.
    fn identity(&self) -> QueryEmbedderIdentity {
        let Self { identity, .. } = self;
        identity.clone()
    }

    /// Returns the capacity the stub was constructed with.
    fn max_tokens(&self) -> usize {
        let Self { max_tokens, .. } = self;
        *max_tokens
    }
}
10697
/// Item 7 integration test: bootstrap the schema, insert three `Doc` nodes
/// with one chunk each, call `regenerate_vector_embeddings_in_process`, then
/// verify the report, the number of rows in `vec_doc`, and the model identity
/// stored in `vector_embedding_contracts` for the `default` profile.
#[cfg(feature = "sqlite-vec")]
#[test]
#[allow(clippy::too_many_lines)]
fn regenerate_vector_embeddings_in_process_writes_contract_and_vec_rows() {
    let db = NamedTempFile::new().expect("temp file");
    let schema = Arc::new(SchemaManager::new());

    // Seed the fixture in its own scope so the connection is closed before
    // the service runs.
    {
        let conn = crate::sqlite::open_connection_with_vec(db.path()).expect("vec conn");
        schema.bootstrap(&conn).expect("bootstrap");

        let node_rows = [
            ("r1", "node-1", 100, "src1"),
            ("r2", "node-2", 101, "src2"),
            ("r3", "node-3", 102, "src3"),
        ];
        for (row_id, logical_id, created_at, src) in node_rows {
            let inserted = conn.execute(
                "INSERT INTO nodes (row_id, logical_id, kind, properties, created_at, source_ref) \
                 VALUES (?1, ?2, 'Doc', '{}', ?3, ?4)",
                rusqlite::params![row_id, logical_id, created_at, src],
            );
            inserted.expect("insert node");
        }

        let chunk_rows = [
            ("c1", "node-1", "first document text", 100),
            ("c2", "node-2", "second document text", 101),
            ("c3", "node-3", "third document text", 102),
        ];
        for (chunk_id, node_id, text, created_at) in chunk_rows {
            let inserted = conn.execute(
                "INSERT INTO chunks (id, node_logical_id, text_content, created_at) \
                 VALUES (?1, ?2, ?3, ?4)",
                rusqlite::params![chunk_id, node_id, text, created_at],
            );
            inserted.expect("insert chunk");
        }
    }

    let service = AdminService::new(db.path(), Arc::clone(&schema));
    let embedder = TestEmbedder::new("batch-test-model", 4);
    let config = VectorRegenerationConfig {
        kind: "Doc".to_owned(),
        profile: "default".to_owned(),
        chunking_policy: "per_chunk".to_owned(),
        preprocessing_policy: "trim".to_owned(),
    };
    let regen = service.regenerate_vector_embeddings_in_process(&embedder, &config);
    let report = regen.expect("in-process regen must succeed");

    assert_eq!(report.total_chunks, 3);
    assert_eq!(report.regenerated_rows, 3);
    assert!(report.contract_persisted);

    // Inspect the database directly through a fresh connection.
    let conn = crate::sqlite::open_connection_with_vec(db.path()).expect("vec conn");

    let vec_count: i64 = conn
        .query_row("SELECT count(*) FROM vec_doc", [], |row| row.get(0))
        .expect("vec_doc count");
    assert_eq!(vec_count, 3, "one vec row per chunk");

    let contract_sql =
        "SELECT model_identity FROM vector_embedding_contracts WHERE profile = 'default'";
    let model_identity: String = conn
        .query_row(contract_sql, [], |row| row.get(0))
        .expect("contract row");
    assert_eq!(model_identity, "batch-test-model");
}
10768
10769    // --- 0.5.0 item 6: per-kind vec regeneration ---
10770
/// Regenerating embeddings for kind `Document` must report and populate the
/// per-kind table `vec_document`, and must not create a `vec_nodes_active`
/// table.
#[cfg(feature = "sqlite-vec")]
#[test]
#[allow(clippy::too_many_lines)]
fn regenerate_vector_embeddings_targets_per_kind_table() {
    let db = NamedTempFile::new().expect("temp file");
    let schema = Arc::new(SchemaManager::new());

    // Seed a single node with a single chunk, then close the connection.
    {
        let conn = crate::sqlite::open_connection_with_vec(db.path()).expect("vec conn");
        schema.bootstrap(&conn).expect("bootstrap");

        let node_insert = conn.execute(
            "INSERT INTO nodes (row_id, logical_id, kind, properties, created_at, source_ref) \
             VALUES ('row-1', 'doc-1', 'Document', '{}', 100, 'source-1')",
            [],
        );
        node_insert.expect("insert node");

        let chunk_insert = conn.execute(
            "INSERT INTO chunks (id, node_logical_id, text_content, created_at) \
             VALUES ('chunk-1', 'doc-1', 'budget discussion', 100)",
            [],
        );
        chunk_insert.expect("insert chunk");
    }

    let service = AdminService::new(db.path(), Arc::clone(&schema));
    let embedder = TestEmbedder::new("test-model", 4);
    let regen_config = VectorRegenerationConfig {
        kind: "Document".to_owned(),
        profile: "default".to_owned(),
        chunking_policy: "per_chunk".to_owned(),
        preprocessing_policy: "trim".to_owned(),
    };
    let report = service
        .regenerate_vector_embeddings(&embedder, &regen_config)
        .expect("regenerate vectors");

    assert_eq!(report.table_name, "vec_document");
    assert_eq!(report.regenerated_rows, 1);

    // Inspect the database directly through a fresh connection.
    let conn = crate::sqlite::open_connection_with_vec(db.path()).expect("vec conn");

    let vec_count: i64 = conn
        .query_row("SELECT count(*) FROM vec_document", [], |row| row.get(0))
        .expect("vec_document count");
    assert_eq!(vec_count, 1, "rows must be in vec_document");

    let legacy_table_sql =
        "SELECT count(*) FROM sqlite_master WHERE type='table' AND name='vec_nodes_active'";
    let old_count: i64 = conn
        .query_row(legacy_table_sql, [], |r| r.get(0))
        .expect("sqlite_master check");
    assert_eq!(
        old_count, 0,
        "vec_nodes_active must NOT be created for per-kind regen"
    );
}
10830
10831    // --- 0.5.0 item 6 step 5: get_vec_profile reads per-kind key ---
10832
/// With no vec profile registered, `get_vec_profile` must succeed and return
/// `None`.
#[test]
fn get_vec_profile_returns_none_when_no_profile_exists() {
    // `_db` is bound (not discarded) so the temporary database stays alive
    // until the end of the test, matching the other `setup()`-based tests.
    let (_db, service) = setup();
    let result = service.get_vec_profile("MyKind").expect("should not error");
    assert!(
        result.is_none(),
        "must return None when no profile registered"
    );
}
10843
/// After `ensure_vec_kind_profile` registers `MyKind` with 128 dimensions,
/// `get_vec_profile("MyKind")` must return a profile reporting 128
/// dimensions.
#[cfg(feature = "sqlite-vec")]
#[test]
fn get_vec_profile_returns_profile_for_registered_kind() {
    let db = NamedTempFile::new().expect("temp file");
    let schema = Arc::new(SchemaManager::new());
    {
        let conn = crate::sqlite::open_connection_with_vec(db.path()).expect("vec conn");
        schema.bootstrap(&conn).expect("bootstrap");
        schema
            .ensure_vec_kind_profile(&conn, "MyKind", 128)
            .expect("ensure_vec_kind_profile");
    }

    let service = AdminService::new(db.path(), Arc::clone(&schema));
    // Unwrap both layers with descriptive messages instead of asserting
    // `is_some()` and then calling `unwrap()`.
    let profile = service
        .get_vec_profile("MyKind")
        .expect("should not error")
        .expect("must return profile after registration");
    assert_eq!(profile.dimensions, 128);
}
10862
/// A `projection_profiles` row keyed by kind `'*'` and facet `'vec'` must not
/// be returned when a specific kind is looked up with `get_vec_profile`.
#[test]
fn get_vec_profile_does_not_return_global_sentinel_row() {
    let (db, service) = setup();

    // Plant the global row directly, then close the connection before
    // querying through the service.
    {
        let conn = sqlite::open_connection(db.path()).expect("conn");
        let sentinel_insert = conn.execute(
            "INSERT INTO projection_profiles (kind, facet, config_json, active_at, created_at) \
             VALUES ('*', 'vec', '{\"model_identity\":\"old-model\",\"dimensions\":384}', 0, 0)",
            [],
        );
        sentinel_insert.expect("insert global sentinel");
    }

    let lookup = service.get_vec_profile("SomeKind");
    let result = lookup.expect("should not error");
    assert!(
        result.is_none(),
        "per-kind query must not return global ('*', 'vec') row"
    );
}
10883}