1use std::fmt::Write as _;
2use std::fs;
3use std::io::{self, Read, Write};
4use std::path::{Path, PathBuf};
5use std::process::{Command, Stdio};
6use std::sync::Arc;
7use std::sync::mpsc;
8use std::thread;
9use std::time::{Duration, Instant, SystemTime};
10
11use fathomdb_schema::{SchemaError, SchemaManager};
12use rusqlite::{DatabaseName, OptionalExtension, TransactionBehavior};
13use serde::{Deserialize, Serialize};
14use sha2::{Digest, Sha256};
15
16use crate::{
17 EngineError, ProjectionRepairReport, ProjectionService, executable_trust,
18 ids::new_id,
19 operational::{
20 OperationalCollectionKind, OperationalCollectionRecord, OperationalCompactionReport,
21 OperationalCurrentRow, OperationalFilterClause, OperationalFilterField,
22 OperationalFilterFieldType, OperationalFilterMode, OperationalFilterValue,
23 OperationalHistoryValidationIssue, OperationalHistoryValidationReport,
24 OperationalMutationRow, OperationalPurgeReport, OperationalReadReport,
25 OperationalReadRequest, OperationalRegisterRequest, OperationalRepairReport,
26 OperationalRetentionActionKind, OperationalRetentionPlanItem,
27 OperationalRetentionPlanReport, OperationalRetentionRunItem, OperationalRetentionRunReport,
28 OperationalSecondaryIndexDefinition, OperationalSecondaryIndexRebuildReport,
29 OperationalTraceReport, extract_secondary_index_entries_for_current,
30 extract_secondary_index_entries_for_mutation, parse_operational_secondary_indexes_json,
31 parse_operational_validation_contract, validate_operational_payload_against_contract,
32 },
33 projection::ProjectionTarget,
34 sqlite,
35};
36
37#[derive(Clone, Debug, PartialEq, Eq, Serialize)]
39pub struct IntegrityReport {
40 pub physical_ok: bool,
41 pub foreign_keys_ok: bool,
42 pub missing_fts_rows: usize,
43 pub missing_property_fts_rows: usize,
44 pub duplicate_active_logical_ids: usize,
45 pub operational_missing_collections: usize,
46 pub operational_missing_last_mutations: usize,
47 pub warnings: Vec<String>,
48}
49
50#[derive(Clone, Debug, PartialEq, Eq, Serialize)]
52pub struct FtsPropertySchemaRecord {
53 pub kind: String,
55 pub property_paths: Vec<String>,
57 pub separator: String,
59 pub format_version: i64,
61}
62
/// Options for a safe export.
///
/// NOTE(review): the export routine that reads these options is outside the visible part
/// of the file; confirm exactly how `force_checkpoint` is applied.
#[derive(Clone, Copy, Debug)]
pub struct SafeExportOptions {
    /// Whether the export should force a checkpoint (enabled by default).
    pub force_checkpoint: bool,
}

impl Default for SafeExportOptions {
    /// Returns options with `force_checkpoint` switched on.
    fn default() -> Self {
        Self {
            force_checkpoint: true,
        }
    }
}
79
/// Version number of the export protocol (its consumer is outside this excerpt).
const EXPORT_PROTOCOL_VERSION: u32 = 1;
82
83#[derive(Clone, Debug, Serialize)]
85pub struct SafeExportManifest {
86 pub exported_at: u64,
88 pub sha256: String,
90 pub schema_version: u32,
92 pub protocol_version: u32,
94 pub page_count: u64,
96}
97
98#[derive(Clone, Debug, PartialEq, Eq, Serialize)]
100pub struct TraceReport {
101 pub source_ref: String,
102 pub node_rows: usize,
103 pub edge_rows: usize,
104 pub action_rows: usize,
105 pub operational_mutation_rows: usize,
106 pub node_logical_ids: Vec<String>,
107 pub action_ids: Vec<String>,
108 pub operational_mutation_ids: Vec<String>,
109}
110
111#[derive(Clone, Debug, PartialEq, Eq, Serialize)]
113pub struct SkippedEdge {
114 pub edge_logical_id: String,
115 pub missing_endpoint: String,
116}
117
118#[derive(Clone, Debug, PartialEq, Eq, Serialize)]
120pub struct LogicalRestoreReport {
121 pub logical_id: String,
122 pub was_noop: bool,
123 pub restored_node_rows: usize,
124 pub restored_edge_rows: usize,
125 pub restored_chunk_rows: usize,
126 pub restored_fts_rows: usize,
127 pub restored_property_fts_rows: usize,
128 pub restored_vec_rows: usize,
129 pub skipped_edges: Vec<SkippedEdge>,
130 pub notes: Vec<String>,
131}
132
133#[derive(Clone, Debug, PartialEq, Eq, Serialize)]
135pub struct LogicalPurgeReport {
136 pub logical_id: String,
137 pub was_noop: bool,
138 pub deleted_node_rows: usize,
139 pub deleted_edge_rows: usize,
140 pub deleted_chunk_rows: usize,
141 pub deleted_fts_rows: usize,
142 pub deleted_vec_rows: usize,
143 pub notes: Vec<String>,
144}
145
146#[derive(Clone, Debug, Serialize, Deserialize)]
148pub struct ProvenancePurgeOptions {
149 pub dry_run: bool,
150 #[serde(default)]
151 pub preserve_event_types: Vec<String>,
152}
153
154#[derive(Clone, Debug, Serialize)]
156pub struct ProvenancePurgeReport {
157 pub events_deleted: u64,
158 pub events_preserved: u64,
159 pub oldest_remaining: Option<i64>,
160}
161
162#[derive(Debug)]
164pub struct AdminService {
165 database_path: PathBuf,
166 schema_manager: Arc<SchemaManager>,
167 projections: ProjectionService,
168}
169
170#[derive(Clone, Debug, PartialEq, Eq, Serialize)]
172pub struct SemanticReport {
173 pub orphaned_chunks: usize,
175 pub null_source_ref_nodes: usize,
177 pub broken_step_fk: usize,
179 pub broken_action_fk: usize,
181 pub stale_fts_rows: usize,
183 pub fts_rows_for_superseded_nodes: usize,
185 pub stale_property_fts_rows: usize,
187 pub orphaned_property_fts_rows: usize,
189 pub mismatched_kind_property_fts_rows: usize,
191 pub duplicate_property_fts_rows: usize,
193 pub drifted_property_fts_rows: usize,
195 pub dangling_edges: usize,
197 pub orphaned_supersession_chains: usize,
199 pub stale_vec_rows: usize,
201 pub vec_rows_for_superseded_nodes: usize,
203 pub missing_operational_current_rows: usize,
205 pub stale_operational_current_rows: usize,
207 pub disabled_collection_mutations: usize,
209 pub orphaned_last_access_metadata_rows: usize,
211 pub warnings: Vec<String>,
212}
213
214#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
216#[serde(rename_all = "snake_case")]
217pub struct VectorRegenerationConfig {
218 pub profile: String,
219 pub table_name: String,
220 pub model_identity: String,
221 pub model_version: String,
222 pub dimension: usize,
223 pub normalization_policy: String,
224 pub chunking_policy: String,
225 pub preprocessing_policy: String,
226 pub generator_command: Vec<String>,
227}
228
229#[derive(Clone, Debug, PartialEq, Eq, Serialize)]
231pub struct VectorRegenerationReport {
232 pub profile: String,
233 pub table_name: String,
234 pub dimension: usize,
235 pub total_chunks: usize,
236 pub regenerated_rows: usize,
237 pub contract_persisted: bool,
238 pub notes: Vec<String>,
239}
240
241#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
243#[serde(rename_all = "snake_case")]
244pub struct VectorGeneratorPolicy {
245 pub timeout_ms: u64,
246 pub max_stdout_bytes: usize,
247 pub max_stderr_bytes: usize,
248 pub max_input_bytes: usize,
249 pub max_chunks: usize,
250 #[serde(default = "default_require_absolute_executable")]
251 pub require_absolute_executable: bool,
252 #[serde(default = "default_reject_world_writable_executable")]
253 pub reject_world_writable_executable: bool,
254 #[serde(default)]
255 pub allowed_executable_roots: Vec<String>,
256 #[serde(default)]
257 pub preserve_env_vars: Vec<String>,
258}
259
260impl Default for VectorGeneratorPolicy {
261 fn default() -> Self {
262 Self {
263 timeout_ms: 300_000,
264 max_stdout_bytes: 64 * 1024 * 1024,
265 max_stderr_bytes: 1024 * 1024,
266 max_input_bytes: 64 * 1024 * 1024,
267 max_chunks: 1_000_000,
268 require_absolute_executable: true,
269 reject_world_writable_executable: true,
270 allowed_executable_roots: vec![],
271 preserve_env_vars: vec![],
272 }
273 }
274}
275
/// Serde default for `VectorGeneratorPolicy::require_absolute_executable`: always `true`.
const fn default_require_absolute_executable() -> bool {
    true
}
279
/// Serde default for `VectorGeneratorPolicy::reject_world_writable_executable`: always `true`.
const fn default_reject_world_writable_executable() -> bool {
    true
}
283
// Limits and format versions. The code that enforces them is outside the visible part of
// the file, so the notes below only restate what each name and value says.

/// Current format version of the vector contract.
const CURRENT_VECTOR_CONTRACT_FORMAT_VERSION: i64 = 1;
/// Maximum length of a profile name.
const MAX_PROFILE_LEN: usize = 128;
/// Maximum length of a model identity.
const MAX_MODEL_IDENTITY_LEN: usize = 256;
/// Maximum length of a model version.
const MAX_MODEL_VERSION_LEN: usize = 128;
/// Maximum length of a policy name.
const MAX_POLICY_LEN: usize = 128;
/// Maximum length of one generator command argument.
const MAX_GENERATOR_COMMAND_ARG_LEN: usize = 4096;
/// Maximum combined length of the generator command (16 KiB).
const MAX_GENERATOR_COMMAND_TOTAL_LEN: usize = 16 * 1024;
/// Maximum size of the contract JSON, in bytes (32 KiB).
const MAX_CONTRACT_JSON_BYTES: usize = 32 * 1024;
/// Maximum size of audit metadata, in bytes.
const MAX_AUDIT_METADATA_BYTES: usize = 2048;
/// Default row limit for operational reads.
const DEFAULT_OPERATIONAL_READ_LIMIT: usize = 100;
/// Maximum row limit for operational reads.
const MAX_OPERATIONAL_READ_LIMIT: usize = 1000;
295
296#[derive(Clone, Debug)]
298pub struct AdminHandle {
299 inner: Arc<AdminService>,
300}
301
302impl AdminHandle {
303 #[must_use]
305 pub fn new(service: AdminService) -> Self {
306 Self {
307 inner: Arc::new(service),
308 }
309 }
310
311 #[must_use]
313 pub fn service(&self) -> Arc<AdminService> {
314 Arc::clone(&self.inner)
315 }
316}
317
318impl AdminService {
319 #[must_use]
321 pub fn new(path: impl AsRef<Path>, schema_manager: Arc<SchemaManager>) -> Self {
322 let database_path = path.as_ref().to_path_buf();
323 let projections = ProjectionService::new(&database_path, Arc::clone(&schema_manager));
324 Self {
325 database_path,
326 schema_manager,
327 projections,
328 }
329 }
330
331 fn connect(&self) -> Result<rusqlite::Connection, EngineError> {
332 #[cfg(feature = "sqlite-vec")]
333 let conn = sqlite::open_connection_with_vec(&self.database_path)?;
334 #[cfg(not(feature = "sqlite-vec"))]
335 let conn = sqlite::open_connection(&self.database_path)?;
336 self.schema_manager.bootstrap(&conn)?;
337 Ok(conn)
338 }
339
340 pub fn check_integrity(&self) -> Result<IntegrityReport, EngineError> {
343 let conn = self.connect()?;
344
345 let physical_result: String =
346 conn.query_row("PRAGMA integrity_check", [], |row| row.get(0))?;
347 let foreign_key_count: i64 =
348 conn.query_row("SELECT count(*) FROM pragma_foreign_key_check", [], |row| {
349 row.get(0)
350 })?;
351 let missing_fts_rows: i64 = conn.query_row(
352 r"
353 SELECT count(*)
354 FROM chunks c
355 JOIN nodes n
356 ON n.logical_id = c.node_logical_id
357 AND n.superseded_at IS NULL
358 WHERE NOT EXISTS (
359 SELECT 1
360 FROM fts_nodes f
361 WHERE f.chunk_id = c.id
362 )
363 ",
364 [],
365 |row| row.get(0),
366 )?;
367 let duplicate_active: i64 = conn.query_row(
368 r"
369 SELECT count(*)
370 FROM (
371 SELECT logical_id
372 FROM nodes
373 WHERE superseded_at IS NULL
374 GROUP BY logical_id
375 HAVING count(*) > 1
376 )
377 ",
378 [],
379 |row| row.get(0),
380 )?;
381 let operational_missing_collections: i64 = conn.query_row(
382 r"
383 SELECT (
384 SELECT count(*)
385 FROM operational_mutations m
386 LEFT JOIN operational_collections c ON c.name = m.collection_name
387 WHERE c.name IS NULL
388 ) + (
389 SELECT count(*)
390 FROM operational_current oc
391 LEFT JOIN operational_collections c ON c.name = oc.collection_name
392 WHERE c.name IS NULL
393 )
394 ",
395 [],
396 |row| row.get(0),
397 )?;
398 let operational_missing_last_mutations: i64 = conn.query_row(
399 r"
400 SELECT count(*)
401 FROM operational_current oc
402 LEFT JOIN operational_mutations m ON m.id = oc.last_mutation_id
403 WHERE m.id IS NULL
404 ",
405 [],
406 |row| row.get(0),
407 )?;
408
409 let missing_property_fts_rows = count_missing_property_fts_rows(&conn)?;
413
414 let mut warnings = Vec::new();
415 if missing_fts_rows > 0 {
416 warnings.push("missing FTS projections detected".to_owned());
417 }
418 if missing_property_fts_rows > 0 {
419 warnings.push("missing property FTS projections detected".to_owned());
420 }
421 if duplicate_active > 0 {
422 warnings.push("duplicate active logical_ids detected".to_owned());
423 }
424 if operational_missing_collections > 0 {
425 warnings.push("operational rows reference missing collections".to_owned());
426 }
427 if operational_missing_last_mutations > 0 {
428 warnings.push("operational current rows reference missing last mutations".to_owned());
429 }
430
431 Ok(IntegrityReport {
436 physical_ok: physical_result == "ok",
437 foreign_keys_ok: foreign_key_count == 0,
438 missing_fts_rows: i64_to_usize(missing_fts_rows),
439 missing_property_fts_rows: i64_to_usize(missing_property_fts_rows),
440 duplicate_active_logical_ids: i64_to_usize(duplicate_active),
441 operational_missing_collections: i64_to_usize(operational_missing_collections),
442 operational_missing_last_mutations: i64_to_usize(operational_missing_last_mutations),
443 warnings,
444 })
445 }
446
447 #[allow(clippy::too_many_lines)]
450 pub fn check_semantics(&self) -> Result<SemanticReport, EngineError> {
451 let conn = self.connect()?;
452
453 let orphaned_chunks: i64 = conn.query_row(
454 r"
455 SELECT count(*)
456 FROM chunks c
457 WHERE NOT EXISTS (
458 SELECT 1 FROM nodes n
459 WHERE n.logical_id = c.node_logical_id
460 )
461 ",
462 [],
463 |row| row.get(0),
464 )?;
465
466 let null_source_ref_nodes: i64 = conn.query_row(
467 "SELECT count(*) FROM nodes WHERE source_ref IS NULL AND superseded_at IS NULL",
468 [],
469 |row| row.get(0),
470 )?;
471
472 let broken_step_fk: i64 = conn.query_row(
473 r"
474 SELECT count(*) FROM steps s
475 WHERE NOT EXISTS (SELECT 1 FROM runs r WHERE r.id = s.run_id)
476 ",
477 [],
478 |row| row.get(0),
479 )?;
480
481 let broken_action_fk: i64 = conn.query_row(
482 r"
483 SELECT count(*) FROM actions a
484 WHERE NOT EXISTS (SELECT 1 FROM steps s WHERE s.id = a.step_id)
485 ",
486 [],
487 |row| row.get(0),
488 )?;
489
490 let stale_fts_rows: i64 = conn.query_row(
491 r"
492 SELECT count(*) FROM fts_nodes f
493 WHERE NOT EXISTS (SELECT 1 FROM chunks c WHERE c.id = f.chunk_id)
494 ",
495 [],
496 |row| row.get(0),
497 )?;
498
499 let fts_rows_for_superseded_nodes: i64 = conn.query_row(
500 r"
501 SELECT count(*) FROM fts_nodes f
502 WHERE NOT EXISTS (
503 SELECT 1 FROM nodes n
504 WHERE n.logical_id = f.node_logical_id AND n.superseded_at IS NULL
505 )
506 ",
507 [],
508 |row| row.get(0),
509 )?;
510
511 let stale_property_fts_rows: i64 = conn.query_row(
512 r"
513 SELECT count(*) FROM fts_node_properties fp
514 WHERE NOT EXISTS (
515 SELECT 1 FROM nodes n
516 WHERE n.logical_id = fp.node_logical_id AND n.superseded_at IS NULL
517 )
518 ",
519 [],
520 |row| row.get(0),
521 )?;
522
523 let orphaned_property_fts_rows: i64 = conn.query_row(
524 r"
525 SELECT count(*) FROM fts_node_properties fp
526 WHERE NOT EXISTS (
527 SELECT 1 FROM fts_property_schemas s WHERE s.kind = fp.kind
528 )
529 ",
530 [],
531 |row| row.get(0),
532 )?;
533
534 let mismatched_kind_property_fts_rows: i64 = conn.query_row(
535 r"
536 SELECT count(*) FROM fts_node_properties fp
537 JOIN nodes n ON n.logical_id = fp.node_logical_id AND n.superseded_at IS NULL
538 WHERE n.kind != fp.kind
539 ",
540 [],
541 |row| row.get(0),
542 )?;
543
544 let duplicate_property_fts_rows: i64 = conn.query_row(
545 r"
546 SELECT count(*) FROM (
547 SELECT node_logical_id FROM fts_node_properties
548 GROUP BY node_logical_id
549 HAVING count(*) > 1
550 )
551 ",
552 [],
553 |row| row.get(0),
554 )?;
555
556 let drifted_property_fts_rows = count_drifted_property_fts_rows(&conn)?;
557
558 let dangling_edges: i64 = conn.query_row(
559 r"
560 SELECT count(*) FROM edges e
561 WHERE e.superseded_at IS NULL AND (
562 NOT EXISTS (SELECT 1 FROM nodes n WHERE n.logical_id = e.source_logical_id AND n.superseded_at IS NULL)
563 OR
564 NOT EXISTS (SELECT 1 FROM nodes n WHERE n.logical_id = e.target_logical_id AND n.superseded_at IS NULL)
565 )
566 ",
567 [],
568 |row| row.get(0),
569 )?;
570
571 let orphaned_supersession_chains: i64 = conn.query_row(
572 r"
573 SELECT count(*) FROM (
574 SELECT logical_id FROM nodes
575 GROUP BY logical_id
576 HAVING count(*) > 0 AND sum(CASE WHEN superseded_at IS NULL THEN 1 ELSE 0 END) = 0
577 )
578 ",
579 [],
580 |row| row.get(0),
581 )?;
582
583 #[cfg(feature = "sqlite-vec")]
585 let stale_vec_rows: i64 = match conn.query_row(
586 r"
587 SELECT count(*) FROM vec_nodes_active v
588 WHERE NOT EXISTS (SELECT 1 FROM chunks c WHERE c.id = v.chunk_id)
589 ",
590 [],
591 |row| row.get(0),
592 ) {
593 Ok(n) => n,
594 Err(rusqlite::Error::SqliteFailure(_, Some(ref msg)))
595 if msg.contains("vec_nodes_active") || msg.contains("no such module: vec0") =>
596 {
597 0
598 }
599 Err(e) => return Err(EngineError::Sqlite(e)),
600 };
601 #[cfg(not(feature = "sqlite-vec"))]
602 let stale_vec_rows: i64 = 0;
603
604 #[cfg(feature = "sqlite-vec")]
605 let vec_rows_for_superseded_nodes: i64 = match conn.query_row(
606 r"
607 SELECT count(*) FROM vec_nodes_active v
608 JOIN chunks c ON c.id = v.chunk_id
609 WHERE NOT EXISTS (
610 SELECT 1 FROM nodes n
611 WHERE n.logical_id = c.node_logical_id
612 )
613 ",
614 [],
615 |row| row.get(0),
616 ) {
617 Ok(n) => n,
618 Err(rusqlite::Error::SqliteFailure(_, Some(ref msg)))
619 if msg.contains("vec_nodes_active") || msg.contains("no such module: vec0") =>
620 {
621 0
622 }
623 Err(e) => return Err(EngineError::Sqlite(e)),
624 };
625 #[cfg(not(feature = "sqlite-vec"))]
626 let vec_rows_for_superseded_nodes: i64 = 0;
627 let missing_operational_current_rows: i64 = conn.query_row(
628 r"
629 SELECT count(*)
630 FROM operational_mutations m
631 JOIN operational_collections c
632 ON c.name = m.collection_name
633 AND c.kind = 'latest_state'
634 WHERE m.op_kind = 'put'
635 AND NOT EXISTS (
636 SELECT 1
637 FROM operational_mutations newer
638 WHERE newer.collection_name = m.collection_name
639 AND newer.record_key = m.record_key
640 AND newer.mutation_order > m.mutation_order
641 )
642 AND NOT EXISTS (
643 SELECT 1
644 FROM operational_current oc
645 WHERE oc.collection_name = m.collection_name
646 AND oc.record_key = m.record_key
647 )
648 ",
649 [],
650 |row| row.get(0),
651 )?;
652 let stale_operational_current_rows: i64 = conn.query_row(
653 r"
654 SELECT count(*)
655 FROM operational_current oc
656 JOIN operational_collections c
657 ON c.name = oc.collection_name
658 AND c.kind = 'latest_state'
659 LEFT JOIN operational_mutations m ON m.id = oc.last_mutation_id
660 WHERE m.id IS NULL
661 OR m.collection_name != oc.collection_name
662 OR m.record_key != oc.record_key
663 OR m.op_kind != 'put'
664 OR m.payload_json != oc.payload_json
665 OR EXISTS (
666 SELECT 1
667 FROM operational_mutations newer
668 WHERE newer.collection_name = oc.collection_name
669 AND newer.record_key = oc.record_key
670 AND newer.mutation_order > m.mutation_order
671 )
672 ",
673 [],
674 |row| row.get(0),
675 )?;
676 let disabled_collection_mutations: i64 = conn.query_row(
677 r"
678 SELECT count(*)
679 FROM operational_mutations m
680 JOIN operational_collections c ON c.name = m.collection_name
681 WHERE c.disabled_at IS NOT NULL AND m.created_at > c.disabled_at
682 ",
683 [],
684 |row| row.get(0),
685 )?;
686 let orphaned_last_access_metadata_rows: i64 = conn.query_row(
687 r"
688 SELECT count(*)
689 FROM node_access_metadata am
690 WHERE NOT EXISTS (
691 SELECT 1 FROM nodes n WHERE n.logical_id = am.logical_id
692 )
693 ",
694 [],
695 |row| row.get(0),
696 )?;
697
698 let mut warnings = Vec::new();
699 if orphaned_chunks > 0 {
700 warnings.push(format!(
701 "{orphaned_chunks} orphaned chunk(s) with no surviving node history"
702 ));
703 }
704 if null_source_ref_nodes > 0 {
705 warnings.push(format!(
706 "{null_source_ref_nodes} active node(s) with null source_ref"
707 ));
708 }
709 if broken_step_fk > 0 {
710 warnings.push(format!(
711 "{broken_step_fk} step(s) referencing non-existent run"
712 ));
713 }
714 if broken_action_fk > 0 {
715 warnings.push(format!(
716 "{broken_action_fk} action(s) referencing non-existent step"
717 ));
718 }
719 if stale_fts_rows > 0 {
720 warnings.push(format!(
721 "{stale_fts_rows} stale FTS row(s) referencing missing chunk"
722 ));
723 }
724 if fts_rows_for_superseded_nodes > 0 {
725 warnings.push(format!(
726 "{fts_rows_for_superseded_nodes} FTS row(s) for superseded node(s)"
727 ));
728 }
729 if stale_property_fts_rows > 0 {
730 warnings.push(format!(
731 "{stale_property_fts_rows} stale property FTS row(s) for superseded/missing node(s)"
732 ));
733 }
734 if orphaned_property_fts_rows > 0 {
735 warnings.push(format!(
736 "{orphaned_property_fts_rows} orphaned property FTS row(s) for unregistered kind(s)"
737 ));
738 }
739 if mismatched_kind_property_fts_rows > 0 {
740 warnings.push(format!(
741 "{mismatched_kind_property_fts_rows} property FTS row(s) whose kind does not match the active node"
742 ));
743 }
744 if duplicate_property_fts_rows > 0 {
745 warnings.push(format!(
746 "{duplicate_property_fts_rows} active logical ID(s) with duplicate property FTS rows"
747 ));
748 }
749 if drifted_property_fts_rows > 0 {
750 warnings.push(format!(
751 "{drifted_property_fts_rows} property FTS row(s) with stale text_content"
752 ));
753 }
754 if dangling_edges > 0 {
755 warnings.push(format!(
756 "{dangling_edges} active edge(s) with missing endpoint node"
757 ));
758 }
759 if orphaned_supersession_chains > 0 {
760 warnings.push(format!(
761 "{orphaned_supersession_chains} logical_id(s) with all versions superseded"
762 ));
763 }
764 if stale_vec_rows > 0 {
765 warnings.push(format!(
766 "{stale_vec_rows} stale vec row(s) referencing missing chunk"
767 ));
768 }
769 if vec_rows_for_superseded_nodes > 0 {
770 warnings.push(format!(
771 "{vec_rows_for_superseded_nodes} vec row(s) whose node history is missing"
772 ));
773 }
774 if missing_operational_current_rows > 0 {
775 warnings.push(format!(
776 "{missing_operational_current_rows} latest-state key(s) missing operational_current rows"
777 ));
778 }
779 if stale_operational_current_rows > 0 {
780 warnings.push(format!(
781 "{stale_operational_current_rows} stale operational_current row(s)"
782 ));
783 }
784 if disabled_collection_mutations > 0 {
785 warnings.push(format!(
786 "{disabled_collection_mutations} mutation(s) were written after collection disable"
787 ));
788 }
789 if orphaned_last_access_metadata_rows > 0 {
790 warnings.push(format!(
791 "{orphaned_last_access_metadata_rows} last_access metadata row(s) reference missing node history"
792 ));
793 }
794
795 Ok(SemanticReport {
796 orphaned_chunks: i64_to_usize(orphaned_chunks),
797 null_source_ref_nodes: i64_to_usize(null_source_ref_nodes),
798 broken_step_fk: i64_to_usize(broken_step_fk),
799 broken_action_fk: i64_to_usize(broken_action_fk),
800 stale_fts_rows: i64_to_usize(stale_fts_rows),
801 fts_rows_for_superseded_nodes: i64_to_usize(fts_rows_for_superseded_nodes),
802 stale_property_fts_rows: i64_to_usize(stale_property_fts_rows),
803 orphaned_property_fts_rows: i64_to_usize(orphaned_property_fts_rows),
804 mismatched_kind_property_fts_rows: i64_to_usize(mismatched_kind_property_fts_rows),
805 duplicate_property_fts_rows: i64_to_usize(duplicate_property_fts_rows),
806 drifted_property_fts_rows: i64_to_usize(drifted_property_fts_rows),
807 dangling_edges: i64_to_usize(dangling_edges),
808 orphaned_supersession_chains: i64_to_usize(orphaned_supersession_chains),
809 stale_vec_rows: i64_to_usize(stale_vec_rows),
810 vec_rows_for_superseded_nodes: i64_to_usize(vec_rows_for_superseded_nodes),
811 missing_operational_current_rows: i64_to_usize(missing_operational_current_rows),
812 stale_operational_current_rows: i64_to_usize(stale_operational_current_rows),
813 disabled_collection_mutations: i64_to_usize(disabled_collection_mutations),
814 orphaned_last_access_metadata_rows: i64_to_usize(orphaned_last_access_metadata_rows),
815 warnings,
816 })
817 }
818
819 pub fn register_operational_collection(
822 &self,
823 request: &OperationalRegisterRequest,
824 ) -> Result<OperationalCollectionRecord, EngineError> {
825 if request.name.trim().is_empty() {
826 return Err(EngineError::InvalidWrite(
827 "operational collection name must not be empty".to_owned(),
828 ));
829 }
830 if request.schema_json.is_empty() {
831 return Err(EngineError::InvalidWrite(
832 "operational collection schema_json must not be empty".to_owned(),
833 ));
834 }
835 if request.retention_json.is_empty() {
836 return Err(EngineError::InvalidWrite(
837 "operational collection retention_json must not be empty".to_owned(),
838 ));
839 }
840 if request.filter_fields_json.is_empty() {
841 return Err(EngineError::InvalidWrite(
842 "operational collection filter_fields_json must not be empty".to_owned(),
843 ));
844 }
845 parse_operational_validation_contract(&request.validation_json)
846 .map_err(EngineError::InvalidWrite)?;
847 parse_operational_secondary_indexes_json(&request.secondary_indexes_json, request.kind)
848 .map_err(EngineError::InvalidWrite)?;
849 if request.format_version <= 0 {
850 return Err(EngineError::InvalidWrite(
851 "operational collection format_version must be positive".to_owned(),
852 ));
853 }
854 parse_operational_filter_fields(&request.filter_fields_json)
855 .map_err(EngineError::InvalidWrite)?;
856
857 let mut conn = self.connect()?;
858 let tx = conn.transaction_with_behavior(TransactionBehavior::Immediate)?;
859 tx.execute(
860 "INSERT INTO operational_collections \
861 (name, kind, schema_json, retention_json, filter_fields_json, validation_json, secondary_indexes_json, format_version, created_at) \
862 VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, unixepoch())",
863 rusqlite::params![
864 request.name.as_str(),
865 request.kind.as_str(),
866 request.schema_json.as_str(),
867 request.retention_json.as_str(),
868 request.filter_fields_json.as_str(),
869 request.validation_json.as_str(),
870 request.secondary_indexes_json.as_str(),
871 request.format_version,
872 ],
873 )?;
874 persist_simple_provenance_event(
875 &tx,
876 "operational_collection_registered",
877 request.name.as_str(),
878 Some(serde_json::json!({
879 "kind": request.kind.as_str(),
880 "format_version": request.format_version,
881 })),
882 )?;
883 tx.commit()?;
884
885 self.describe_operational_collection(&request.name)?
886 .ok_or_else(|| {
887 EngineError::Bridge("registered collection missing after commit".to_owned())
888 })
889 }
890
891 pub fn describe_operational_collection(
894 &self,
895 name: &str,
896 ) -> Result<Option<OperationalCollectionRecord>, EngineError> {
897 let conn = self.connect()?;
898 load_operational_collection_record(&conn, name)
899 }
900
901 pub fn update_operational_collection_filters(
905 &self,
906 name: &str,
907 filter_fields_json: &str,
908 ) -> Result<OperationalCollectionRecord, EngineError> {
909 if filter_fields_json.is_empty() {
910 return Err(EngineError::InvalidWrite(
911 "operational collection filter_fields_json must not be empty".to_owned(),
912 ));
913 }
914 let declared_fields = parse_operational_filter_fields(filter_fields_json)
915 .map_err(EngineError::InvalidWrite)?;
916
917 let mut conn = self.connect()?;
918 let tx = conn.transaction_with_behavior(TransactionBehavior::Immediate)?;
919 load_operational_collection_record(&tx, name)?.ok_or_else(|| {
920 EngineError::InvalidWrite(format!("operational collection '{name}' is not registered"))
921 })?;
922 tx.execute(
923 "UPDATE operational_collections SET filter_fields_json = ?2 WHERE name = ?1",
924 rusqlite::params![name, filter_fields_json],
925 )?;
926 tx.execute(
927 "DELETE FROM operational_filter_values WHERE collection_name = ?1",
928 [name],
929 )?;
930
931 let mut mutation_stmt = tx.prepare(
932 "SELECT id, payload_json FROM operational_mutations \
933 WHERE collection_name = ?1 ORDER BY mutation_order",
934 )?;
935 let mutations = mutation_stmt
936 .query_map([name], |row| {
937 Ok((row.get::<_, String>(0)?, row.get::<_, String>(1)?))
938 })?
939 .collect::<Result<Vec<_>, _>>()?;
940 drop(mutation_stmt);
941
942 let mut insert_filter_value = tx.prepare_cached(
943 "INSERT INTO operational_filter_values \
944 (mutation_id, collection_name, field_name, string_value, integer_value) \
945 VALUES (?1, ?2, ?3, ?4, ?5)",
946 )?;
947 let mut inserted_values = 0usize;
948 for (mutation_id, payload_json) in &mutations {
949 for filter_value in
950 extract_operational_filter_values(&declared_fields, payload_json.as_str())
951 {
952 insert_filter_value.execute(rusqlite::params![
953 mutation_id,
954 name,
955 filter_value.field_name,
956 filter_value.string_value,
957 filter_value.integer_value,
958 ])?;
959 inserted_values += 1;
960 }
961 }
962 drop(insert_filter_value);
963
964 persist_simple_provenance_event(
965 &tx,
966 "operational_collection_filter_fields_updated",
967 name,
968 Some(serde_json::json!({
969 "field_count": declared_fields.len(),
970 "mutations_backfilled": mutations.len(),
971 "inserted_filter_values": inserted_values,
972 })),
973 )?;
974 let updated = load_operational_collection_record(&tx, name)?.ok_or_else(|| {
975 EngineError::Bridge("operational collection missing after filter update".to_owned())
976 })?;
977 tx.commit()?;
978 Ok(updated)
979 }
980
981 pub fn update_operational_collection_validation(
984 &self,
985 name: &str,
986 validation_json: &str,
987 ) -> Result<OperationalCollectionRecord, EngineError> {
988 parse_operational_validation_contract(validation_json)
989 .map_err(EngineError::InvalidWrite)?;
990
991 let mut conn = self.connect()?;
992 let tx = conn.transaction_with_behavior(TransactionBehavior::Immediate)?;
993 load_operational_collection_record(&tx, name)?.ok_or_else(|| {
994 EngineError::InvalidWrite(format!("operational collection '{name}' is not registered"))
995 })?;
996 tx.execute(
997 "UPDATE operational_collections SET validation_json = ?2 WHERE name = ?1",
998 rusqlite::params![name, validation_json],
999 )?;
1000 persist_simple_provenance_event(
1001 &tx,
1002 "operational_collection_validation_updated",
1003 name,
1004 Some(serde_json::json!({
1005 "has_validation": !validation_json.is_empty(),
1006 })),
1007 )?;
1008 let updated = load_operational_collection_record(&tx, name)?.ok_or_else(|| {
1009 EngineError::Bridge("operational collection missing after validation update".to_owned())
1010 })?;
1011 tx.commit()?;
1012 Ok(updated)
1013 }
1014
1015 pub fn update_operational_collection_secondary_indexes(
1019 &self,
1020 name: &str,
1021 secondary_indexes_json: &str,
1022 ) -> Result<OperationalCollectionRecord, EngineError> {
1023 let mut conn = self.connect()?;
1024 let tx = conn.transaction_with_behavior(TransactionBehavior::Immediate)?;
1025 let record = load_operational_collection_record(&tx, name)?.ok_or_else(|| {
1026 EngineError::InvalidWrite(format!("operational collection '{name}' is not registered"))
1027 })?;
1028 let indexes = parse_operational_secondary_indexes_json(secondary_indexes_json, record.kind)
1029 .map_err(EngineError::InvalidWrite)?;
1030 tx.execute(
1031 "UPDATE operational_collections SET secondary_indexes_json = ?2 WHERE name = ?1",
1032 rusqlite::params![name, secondary_indexes_json],
1033 )?;
1034 let (mutation_entries_rebuilt, current_entries_rebuilt) =
1035 rebuild_operational_secondary_index_entries(&tx, &record.name, record.kind, &indexes)?;
1036 persist_simple_provenance_event(
1037 &tx,
1038 "operational_collection_secondary_indexes_updated",
1039 name,
1040 Some(serde_json::json!({
1041 "index_count": indexes.len(),
1042 "mutation_entries_rebuilt": mutation_entries_rebuilt,
1043 "current_entries_rebuilt": current_entries_rebuilt,
1044 })),
1045 )?;
1046 let updated = load_operational_collection_record(&tx, name)?.ok_or_else(|| {
1047 EngineError::Bridge(
1048 "operational collection missing after secondary index update".to_owned(),
1049 )
1050 })?;
1051 tx.commit()?;
1052 Ok(updated)
1053 }
1054
1055 pub fn rebuild_operational_secondary_indexes(
1058 &self,
1059 name: &str,
1060 ) -> Result<OperationalSecondaryIndexRebuildReport, EngineError> {
1061 let mut conn = self.connect()?;
1062 let tx = conn.transaction_with_behavior(TransactionBehavior::Immediate)?;
1063 let record = load_operational_collection_record(&tx, name)?.ok_or_else(|| {
1064 EngineError::InvalidWrite(format!("operational collection '{name}' is not registered"))
1065 })?;
1066 let indexes =
1067 parse_operational_secondary_indexes_json(&record.secondary_indexes_json, record.kind)
1068 .map_err(EngineError::InvalidWrite)?;
1069 let (mutation_entries_rebuilt, current_entries_rebuilt) =
1070 rebuild_operational_secondary_index_entries(&tx, &record.name, record.kind, &indexes)?;
1071 persist_simple_provenance_event(
1072 &tx,
1073 "operational_secondary_indexes_rebuilt",
1074 name,
1075 Some(serde_json::json!({
1076 "index_count": indexes.len(),
1077 "mutation_entries_rebuilt": mutation_entries_rebuilt,
1078 "current_entries_rebuilt": current_entries_rebuilt,
1079 })),
1080 )?;
1081 tx.commit()?;
1082 Ok(OperationalSecondaryIndexRebuildReport {
1083 collection_name: name.to_owned(),
1084 mutation_entries_rebuilt,
1085 current_entries_rebuilt,
1086 })
1087 }
1088
1089 pub fn validate_operational_collection_history(
1092 &self,
1093 name: &str,
1094 ) -> Result<OperationalHistoryValidationReport, EngineError> {
1095 let conn = self.connect()?;
1096 let record = load_operational_collection_record(&conn, name)?.ok_or_else(|| {
1097 EngineError::InvalidWrite(format!("operational collection '{name}' is not registered"))
1098 })?;
1099 let Some(contract) = parse_operational_validation_contract(&record.validation_json)
1100 .map_err(EngineError::InvalidWrite)?
1101 else {
1102 return Err(EngineError::InvalidWrite(format!(
1103 "operational collection '{name}' has no validation_json configured"
1104 )));
1105 };
1106
1107 let mut stmt = conn.prepare(
1108 "SELECT id, record_key, op_kind, payload_json FROM operational_mutations \
1109 WHERE collection_name = ?1 ORDER BY mutation_order",
1110 )?;
1111 let rows = stmt
1112 .query_map([name], |row| {
1113 Ok((
1114 row.get::<_, String>(0)?,
1115 row.get::<_, String>(1)?,
1116 row.get::<_, String>(2)?,
1117 row.get::<_, String>(3)?,
1118 ))
1119 })?
1120 .collect::<Result<Vec<_>, _>>()?;
1121 drop(stmt);
1122
1123 let mut checked_rows = 0usize;
1124 let mut issues = Vec::new();
1125 for (mutation_id, record_key, op_kind, payload_json) in rows {
1126 if op_kind == "delete" {
1127 continue;
1128 }
1129 checked_rows += 1;
1130 if let Err(message) =
1131 validate_operational_payload_against_contract(&contract, payload_json.as_str())
1132 {
1133 issues.push(OperationalHistoryValidationIssue {
1134 mutation_id,
1135 record_key,
1136 op_kind,
1137 message,
1138 });
1139 }
1140 }
1141
1142 Ok(OperationalHistoryValidationReport {
1143 collection_name: name.to_owned(),
1144 checked_rows,
1145 invalid_row_count: issues.len(),
1146 issues,
1147 })
1148 }
1149
1150 pub fn disable_operational_collection(
1153 &self,
1154 name: &str,
1155 ) -> Result<OperationalCollectionRecord, EngineError> {
1156 let mut conn = self.connect()?;
1157 let tx = conn.transaction_with_behavior(TransactionBehavior::Immediate)?;
1158 let record = load_operational_collection_record(&tx, name)?.ok_or_else(|| {
1159 EngineError::InvalidWrite(format!("operational collection '{name}' is not registered"))
1160 })?;
1161 let changed = if record.disabled_at.is_none() {
1162 tx.execute(
1163 "UPDATE operational_collections SET disabled_at = unixepoch() WHERE name = ?1",
1164 [name],
1165 )?;
1166 true
1167 } else {
1168 false
1169 };
1170 let record = load_operational_collection_record(&tx, name)?.ok_or_else(|| {
1171 EngineError::Bridge("operational collection missing after disable".to_owned())
1172 })?;
1173 persist_simple_provenance_event(
1174 &tx,
1175 "operational_collection_disabled",
1176 name,
1177 Some(serde_json::json!({
1178 "disabled_at": record.disabled_at,
1179 "changed": changed,
1180 })),
1181 )?;
1182 tx.commit()?;
1183 Ok(record)
1184 }
1185
1186 pub fn compact_operational_collection(
1189 &self,
1190 name: &str,
1191 dry_run: bool,
1192 ) -> Result<OperationalCompactionReport, EngineError> {
1193 let mut conn = self.connect()?;
1194 let tx = conn.transaction_with_behavior(TransactionBehavior::Immediate)?;
1195 let collection = load_operational_collection_record(&tx, name)?.ok_or_else(|| {
1196 EngineError::InvalidWrite(format!("operational collection '{name}' is not registered"))
1197 })?;
1198 validate_append_only_operational_collection(&collection, "compact")?;
1199 let (mutation_ids, before_timestamp) =
1200 operational_compaction_candidates(&tx, &collection.retention_json, name)?;
1201 if dry_run {
1202 drop(tx);
1203 return Ok(OperationalCompactionReport {
1204 collection_name: name.to_owned(),
1205 deleted_mutations: mutation_ids.len(),
1206 dry_run: true,
1207 before_timestamp,
1208 });
1209 }
1210 let mut delete_stmt =
1211 tx.prepare_cached("DELETE FROM operational_mutations WHERE id = ?1")?;
1212 for mutation_id in &mutation_ids {
1213 delete_stmt.execute([mutation_id.as_str()])?;
1214 }
1215 drop(delete_stmt);
1216 persist_simple_provenance_event(
1217 &tx,
1218 "operational_collection_compacted",
1219 name,
1220 Some(serde_json::json!({
1221 "deleted_mutations": mutation_ids.len(),
1222 "before_timestamp": before_timestamp,
1223 })),
1224 )?;
1225 tx.commit()?;
1226 Ok(OperationalCompactionReport {
1227 collection_name: name.to_owned(),
1228 deleted_mutations: mutation_ids.len(),
1229 dry_run: false,
1230 before_timestamp,
1231 })
1232 }
1233
1234 pub fn purge_operational_collection(
1237 &self,
1238 name: &str,
1239 before_timestamp: i64,
1240 ) -> Result<OperationalPurgeReport, EngineError> {
1241 let mut conn = self.connect()?;
1242 let tx = conn.transaction_with_behavior(TransactionBehavior::Immediate)?;
1243 let collection = load_operational_collection_record(&tx, name)?.ok_or_else(|| {
1244 EngineError::InvalidWrite(format!("operational collection '{name}' is not registered"))
1245 })?;
1246 validate_append_only_operational_collection(&collection, "purge")?;
1247 let deleted_mutations = tx.execute(
1248 "DELETE FROM operational_mutations WHERE collection_name = ?1 AND created_at < ?2",
1249 rusqlite::params![name, before_timestamp],
1250 )?;
1251 persist_simple_provenance_event(
1252 &tx,
1253 "operational_collection_purged",
1254 name,
1255 Some(serde_json::json!({
1256 "deleted_mutations": deleted_mutations,
1257 "before_timestamp": before_timestamp,
1258 })),
1259 )?;
1260 tx.commit()?;
1261 Ok(OperationalPurgeReport {
1262 collection_name: name.to_owned(),
1263 deleted_mutations,
1264 before_timestamp,
1265 })
1266 }
1267
1268 pub fn plan_operational_retention(
1271 &self,
1272 now_timestamp: i64,
1273 collection_names: Option<&[String]>,
1274 max_collections: Option<usize>,
1275 ) -> Result<OperationalRetentionPlanReport, EngineError> {
1276 let conn = self.connect()?;
1277 let records = load_operational_retention_records(&conn, collection_names, max_collections)?;
1278 let mut items = Vec::with_capacity(records.len());
1279 for record in records {
1280 items.push(plan_operational_retention_item(
1281 &conn,
1282 &record,
1283 now_timestamp,
1284 )?);
1285 }
1286 Ok(OperationalRetentionPlanReport {
1287 planned_at: now_timestamp,
1288 collections_examined: items.len(),
1289 items,
1290 })
1291 }
1292
1293 pub fn run_operational_retention(
1296 &self,
1297 now_timestamp: i64,
1298 collection_names: Option<&[String]>,
1299 max_collections: Option<usize>,
1300 dry_run: bool,
1301 ) -> Result<OperationalRetentionRunReport, EngineError> {
1302 let mut conn = self.connect()?;
1303 let records = load_operational_retention_records(&conn, collection_names, max_collections)?;
1304 let mut items = Vec::with_capacity(records.len());
1305 let mut collections_acted_on = 0usize;
1306
1307 for record in records {
1308 let tx = conn.transaction_with_behavior(TransactionBehavior::Immediate)?;
1309 let item = run_operational_retention_item(&tx, &record, now_timestamp, dry_run)?;
1310 if item.deleted_mutations > 0 {
1311 collections_acted_on += 1;
1312 }
1313 if dry_run || item.action_kind == OperationalRetentionActionKind::Noop {
1314 drop(tx);
1315 } else {
1316 tx.commit()?;
1317 }
1318 items.push(item);
1319 }
1320
1321 Ok(OperationalRetentionRunReport {
1322 executed_at: now_timestamp,
1323 collections_examined: items.len(),
1324 collections_acted_on,
1325 dry_run,
1326 items,
1327 })
1328 }
1329
1330 pub fn trace_operational_collection(
1333 &self,
1334 collection_name: &str,
1335 record_key: Option<&str>,
1336 ) -> Result<OperationalTraceReport, EngineError> {
1337 let conn = self.connect()?;
1338 ensure_operational_collection_registered(&conn, collection_name)?;
1339 let mutations = if let Some(record_key) = record_key {
1340 let mut stmt = conn.prepare(
1341 "SELECT id, collection_name, record_key, op_kind, payload_json, source_ref, created_at \
1342 FROM operational_mutations \
1343 WHERE collection_name = ?1 AND record_key = ?2 \
1344 ORDER BY mutation_order",
1345 )?;
1346 stmt.query_map([collection_name, record_key], map_operational_mutation_row)?
1347 .collect::<Result<Vec<_>, _>>()?
1348 } else {
1349 let mut stmt = conn.prepare(
1350 "SELECT id, collection_name, record_key, op_kind, payload_json, source_ref, created_at \
1351 FROM operational_mutations \
1352 WHERE collection_name = ?1 \
1353 ORDER BY mutation_order",
1354 )?;
1355 stmt.query_map([collection_name], map_operational_mutation_row)?
1356 .collect::<Result<Vec<_>, _>>()?
1357 };
1358 let current_rows = if let Some(record_key) = record_key {
1359 let mut stmt = conn.prepare(
1360 "SELECT collection_name, record_key, payload_json, updated_at, last_mutation_id \
1361 FROM operational_current \
1362 WHERE collection_name = ?1 AND record_key = ?2 \
1363 ORDER BY updated_at, record_key",
1364 )?;
1365 stmt.query_map([collection_name, record_key], map_operational_current_row)?
1366 .collect::<Result<Vec<_>, _>>()?
1367 } else {
1368 let mut stmt = conn.prepare(
1369 "SELECT collection_name, record_key, payload_json, updated_at, last_mutation_id \
1370 FROM operational_current \
1371 WHERE collection_name = ?1 \
1372 ORDER BY updated_at, record_key",
1373 )?;
1374 stmt.query_map([collection_name], map_operational_current_row)?
1375 .collect::<Result<Vec<_>, _>>()?
1376 };
1377
1378 Ok(OperationalTraceReport {
1379 collection_name: collection_name.to_owned(),
1380 record_key: record_key.map(str::to_owned),
1381 mutation_count: mutations.len(),
1382 current_count: current_rows.len(),
1383 mutations,
1384 current_rows,
1385 })
1386 }
1387
1388 pub fn read_operational_collection(
1391 &self,
1392 request: &OperationalReadRequest,
1393 ) -> Result<OperationalReadReport, EngineError> {
1394 if request.collection_name.trim().is_empty() {
1395 return Err(EngineError::InvalidWrite(
1396 "operational read collection_name must not be empty".to_owned(),
1397 ));
1398 }
1399 if request.filters.is_empty() {
1400 return Err(EngineError::InvalidWrite(
1401 "operational read requires at least one filter clause".to_owned(),
1402 ));
1403 }
1404
1405 let conn = self.connect()?;
1406 let record = load_operational_collection_record(&conn, &request.collection_name)?
1407 .ok_or_else(|| {
1408 EngineError::InvalidWrite(format!(
1409 "operational collection '{}' is not registered",
1410 request.collection_name
1411 ))
1412 })?;
1413 validate_append_only_operational_collection(&record, "read")?;
1414 let declared_fields = parse_operational_filter_fields(&record.filter_fields_json)
1415 .map_err(EngineError::InvalidWrite)?;
1416 let secondary_indexes =
1417 parse_operational_secondary_indexes_json(&record.secondary_indexes_json, record.kind)
1418 .map_err(EngineError::InvalidWrite)?;
1419 let applied_limit = operational_read_limit(request.limit)?;
1420 let filters = compile_operational_read_filters(&request.filters, &declared_fields)?;
1421 if let Some(report) = execute_operational_secondary_index_read(
1422 &conn,
1423 &request.collection_name,
1424 &filters,
1425 &secondary_indexes,
1426 applied_limit,
1427 )? {
1428 return Ok(report);
1429 }
1430 execute_operational_filtered_read(&conn, &request.collection_name, &filters, applied_limit)
1431 }
1432
1433 pub fn rebuild_operational_current(
1436 &self,
1437 collection_name: Option<&str>,
1438 ) -> Result<OperationalRepairReport, EngineError> {
1439 let mut conn = self.connect()?;
1440 let tx = conn.transaction_with_behavior(TransactionBehavior::Immediate)?;
1441 let collections = if let Some(name) = collection_name {
1442 let maybe_kind: Option<String> = tx
1443 .query_row(
1444 "SELECT kind FROM operational_collections WHERE name = ?1",
1445 [name],
1446 |row| row.get(0),
1447 )
1448 .optional()?;
1449 let Some(kind) = maybe_kind else {
1450 return Err(EngineError::InvalidWrite(format!(
1451 "operational collection '{name}' is not registered"
1452 )));
1453 };
1454 if kind != OperationalCollectionKind::LatestState.as_str() {
1455 return Err(EngineError::InvalidWrite(format!(
1456 "operational collection '{name}' is not latest_state"
1457 )));
1458 }
1459 vec![name.to_owned()]
1460 } else {
1461 let mut stmt = tx.prepare(
1462 "SELECT name FROM operational_collections WHERE kind = 'latest_state' ORDER BY name",
1463 )?;
1464 stmt.query_map([], |row| row.get::<_, String>(0))?
1465 .collect::<Result<Vec<_>, _>>()?
1466 };
1467
1468 let rebuilt_rows = rebuild_operational_current_rows(&tx, &collections)?;
1469 for collection in &collections {
1470 let record = load_operational_collection_record(&tx, collection)?.ok_or_else(|| {
1471 EngineError::Bridge(format!(
1472 "operational collection '{collection}' missing during current rebuild"
1473 ))
1474 })?;
1475 let indexes = parse_operational_secondary_indexes_json(
1476 &record.secondary_indexes_json,
1477 record.kind,
1478 )
1479 .map_err(EngineError::InvalidWrite)?;
1480 if !indexes.is_empty() {
1481 rebuild_operational_secondary_index_entries(
1482 &tx,
1483 &record.name,
1484 record.kind,
1485 &indexes,
1486 )?;
1487 }
1488 }
1489
1490 persist_simple_provenance_event(
1491 &tx,
1492 "operational_current_rebuilt",
1493 collection_name.unwrap_or("*"),
1494 Some(serde_json::json!({
1495 "collections_rebuilt": collections.len(),
1496 "current_rows_rebuilt": rebuilt_rows,
1497 })),
1498 )?;
1499 tx.commit()?;
1500
1501 Ok(OperationalRepairReport {
1502 collections_rebuilt: collections.len(),
1503 current_rows_rebuilt: rebuilt_rows,
1504 })
1505 }
1506
1507 pub fn rebuild_projections(
1510 &self,
1511 target: ProjectionTarget,
1512 ) -> Result<ProjectionRepairReport, EngineError> {
1513 self.projections.rebuild_projections(target)
1514 }
1515
1516 pub fn rebuild_missing_projections(&self) -> Result<ProjectionRepairReport, EngineError> {
1519 self.projections.rebuild_missing_projections()
1520 }
1521
1522 pub fn register_fts_property_schema(
1531 &self,
1532 kind: &str,
1533 property_paths: &[String],
1534 separator: Option<&str>,
1535 ) -> Result<FtsPropertySchemaRecord, EngineError> {
1536 validate_fts_property_paths(property_paths)?;
1537 let separator = separator.unwrap_or(" ");
1538 let paths_json = serde_json::to_string(property_paths).map_err(|e| {
1539 EngineError::InvalidWrite(format!("failed to serialize property paths: {e}"))
1540 })?;
1541
1542 let mut conn = self.connect()?;
1543 let tx = conn.transaction_with_behavior(TransactionBehavior::Immediate)?;
1544 tx.execute(
1545 "INSERT INTO fts_property_schemas (kind, property_paths_json, separator) \
1546 VALUES (?1, ?2, ?3) \
1547 ON CONFLICT(kind) DO UPDATE SET property_paths_json = ?2, separator = ?3",
1548 rusqlite::params![kind, paths_json, separator],
1549 )?;
1550 persist_simple_provenance_event(
1551 &tx,
1552 "fts_property_schema_registered",
1553 kind,
1554 Some(serde_json::json!({
1555 "property_paths": property_paths,
1556 "separator": separator,
1557 })),
1558 )?;
1559 tx.commit()?;
1560
1561 self.describe_fts_property_schema(kind)?.ok_or_else(|| {
1562 EngineError::Bridge("registered FTS property schema missing after commit".to_owned())
1563 })
1564 }
1565
1566 pub fn describe_fts_property_schema(
1571 &self,
1572 kind: &str,
1573 ) -> Result<Option<FtsPropertySchemaRecord>, EngineError> {
1574 let conn = self.connect()?;
1575 load_fts_property_schema_record(&conn, kind)
1576 }
1577
1578 pub fn list_fts_property_schemas(&self) -> Result<Vec<FtsPropertySchemaRecord>, EngineError> {
1583 let conn = self.connect()?;
1584 let mut stmt = conn.prepare(
1585 "SELECT kind, property_paths_json, separator, format_version \
1586 FROM fts_property_schemas ORDER BY kind",
1587 )?;
1588 let records = stmt
1589 .query_map([], |row| {
1590 let paths_json: String = row.get(1)?;
1591 let paths: Vec<String> = serde_json::from_str(&paths_json).unwrap_or_default();
1592 Ok(FtsPropertySchemaRecord {
1593 kind: row.get(0)?,
1594 property_paths: paths,
1595 separator: row.get(2)?,
1596 format_version: row.get(3)?,
1597 })
1598 })?
1599 .collect::<Result<Vec<_>, _>>()?;
1600 Ok(records)
1601 }
1602
1603 pub fn remove_fts_property_schema(&self, kind: &str) -> Result<(), EngineError> {
1611 let mut conn = self.connect()?;
1612 let tx = conn.transaction_with_behavior(TransactionBehavior::Immediate)?;
1613 let deleted = tx.execute("DELETE FROM fts_property_schemas WHERE kind = ?1", [kind])?;
1614 if deleted == 0 {
1615 return Err(EngineError::InvalidWrite(format!(
1616 "FTS property schema for kind '{kind}' is not registered"
1617 )));
1618 }
1619 persist_simple_provenance_event(&tx, "fts_property_schema_removed", kind, None)?;
1620 tx.commit()?;
1621 Ok(())
1622 }
1623
1624 pub fn restore_vector_profiles(&self) -> Result<ProjectionRepairReport, EngineError> {
1630 let conn = self.connect()?;
1631 let profiles: Vec<(String, String, i64)> = {
1632 let mut stmt = conn.prepare(
1633 "SELECT profile, table_name, dimension \
1634 FROM vector_profiles WHERE enabled = 1 ORDER BY profile",
1635 )?;
1636 stmt.query_map([], |row| {
1637 Ok((
1638 row.get::<_, String>(0)?,
1639 row.get::<_, String>(1)?,
1640 row.get::<_, i64>(2)?,
1641 ))
1642 })?
1643 .collect::<Result<Vec<_>, _>>()?
1644 };
1645
1646 for (profile, table_name, dimension) in &profiles {
1647 let dimension = usize::try_from(*dimension).map_err(|_| {
1648 EngineError::Bridge(format!("invalid vector profile dimension: {dimension}"))
1649 })?;
1650 self.schema_manager
1651 .ensure_vector_profile(&conn, profile, table_name, dimension)?;
1652 }
1653
1654 Ok(ProjectionRepairReport {
1655 targets: vec![ProjectionTarget::Vec],
1656 rebuilt_rows: profiles.len(),
1657 notes: vec![],
1658 })
1659 }
1660
1661 #[allow(clippy::too_many_lines)]
1672 pub fn regenerate_vector_embeddings(
1673 &self,
1674 config: &VectorRegenerationConfig,
1675 ) -> Result<VectorRegenerationReport, EngineError> {
1676 self.regenerate_vector_embeddings_with_policy(config, &VectorGeneratorPolicy::default())
1677 }
1678
1679 #[allow(clippy::too_many_lines)]
1684 pub fn regenerate_vector_embeddings_with_policy(
1685 &self,
1686 config: &VectorRegenerationConfig,
1687 policy: &VectorGeneratorPolicy,
1688 ) -> Result<VectorRegenerationReport, EngineError> {
1689 let conn = self.connect()?;
1690 let config = validate_vector_regeneration_config(&conn, config, policy)
1691 .map_err(|failure| failure.to_engine_error())?;
1692 let chunks = collect_regeneration_chunks(&conn)?;
1693 let payload = build_regeneration_input(&config, chunks.clone());
1694 let snapshot_hash = compute_snapshot_hash(&payload)?;
1695 let audit_metadata = VectorRegenerationAuditMetadata {
1696 profile: config.profile.clone(),
1697 model_identity: config.model_identity.clone(),
1698 model_version: config.model_version.clone(),
1699 chunk_count: chunks.len(),
1700 snapshot_hash: snapshot_hash.clone(),
1701 failure_class: None,
1702 };
1703 persist_vector_regeneration_event(
1704 &conn,
1705 "vector_regeneration_requested",
1706 &config.profile,
1707 &audit_metadata,
1708 )?;
1709 let notes = generator_policy_notes(policy);
1710 let generated = match run_vector_generator_bounded(&config, &payload, policy) {
1711 Ok(generated) => generated,
1712 Err(failure) => {
1713 self.persist_vector_regeneration_failure_best_effort(
1714 &config.profile,
1715 &audit_metadata,
1716 &failure,
1717 );
1718 return Err(failure.to_engine_error());
1719 }
1720 };
1721 let mut embedding_map = match validate_generated_embeddings(&config, &chunks, generated) {
1722 Ok(embedding_map) => embedding_map,
1723 Err(failure) => {
1724 self.persist_vector_regeneration_failure_best_effort(
1725 &config.profile,
1726 &audit_metadata,
1727 &failure,
1728 );
1729 return Err(failure.to_engine_error());
1730 }
1731 };
1732
1733 let mut conn = conn;
1734 let tx = conn.transaction_with_behavior(TransactionBehavior::Immediate)?;
1735 match self.schema_manager.ensure_vector_profile(
1736 &tx,
1737 &config.profile,
1738 &config.table_name,
1739 config.dimension,
1740 ) {
1741 Ok(()) => {}
1742 Err(SchemaError::MissingCapability(message)) => {
1743 let failure = VectorRegenerationFailure::new(
1744 VectorRegenerationFailureClass::UnsupportedVecCapability,
1745 message,
1746 );
1747 drop(tx);
1748 self.persist_vector_regeneration_failure_best_effort(
1749 &config.profile,
1750 &audit_metadata,
1751 &failure,
1752 );
1753 return Err(failure.to_engine_error());
1754 }
1755 Err(error) => return Err(EngineError::Schema(error)),
1756 }
1757 let apply_chunks = collect_regeneration_chunks(&tx)?;
1758 let apply_payload = build_regeneration_input(&config, apply_chunks.clone());
1759 let apply_hash = compute_snapshot_hash(&apply_payload)?;
1760 if apply_hash != snapshot_hash {
1761 let failure = VectorRegenerationFailure::new(
1762 VectorRegenerationFailureClass::SnapshotDrift,
1763 "chunk snapshot changed during generation; retry".to_owned(),
1764 );
1765 drop(tx);
1766 self.persist_vector_regeneration_failure_best_effort(
1767 &config.profile,
1768 &audit_metadata,
1769 &failure,
1770 );
1771 return Err(failure.to_engine_error());
1772 }
1773 persist_vector_contract(&tx, &config, &snapshot_hash)?;
1774 tx.execute("DELETE FROM vec_nodes_active", [])?;
1775 let mut stmt = tx
1776 .prepare_cached("INSERT INTO vec_nodes_active (chunk_id, embedding) VALUES (?1, ?2)")?;
1777 let mut regenerated_rows = 0usize;
1778 for chunk in &apply_chunks {
1779 let Some(embedding) = embedding_map.remove(&chunk.chunk_id) else {
1780 drop(stmt);
1781 drop(tx);
1782 let failure = VectorRegenerationFailure::new(
1783 VectorRegenerationFailureClass::MalformedGeneratorJson,
1784 format!(
1785 "generator did not return embedding for chunk '{}'",
1786 chunk.chunk_id
1787 ),
1788 );
1789 self.persist_vector_regeneration_failure_best_effort(
1790 &config.profile,
1791 &audit_metadata,
1792 &failure,
1793 );
1794 return Err(failure.to_engine_error());
1795 };
1796 stmt.execute(rusqlite::params![chunk.chunk_id.as_str(), embedding])?;
1797 regenerated_rows += 1;
1798 }
1799 drop(stmt);
1800 persist_vector_regeneration_event(
1801 &tx,
1802 "vector_regeneration_apply",
1803 &config.profile,
1804 &audit_metadata,
1805 )?;
1806 tx.commit()?;
1807
1808 Ok(VectorRegenerationReport {
1809 profile: config.profile.clone(),
1810 table_name: config.table_name.clone(),
1811 dimension: config.dimension,
1812 total_chunks: chunks.len(),
1813 regenerated_rows,
1814 contract_persisted: true,
1815 notes,
1816 })
1817 }
1818
1819 fn persist_vector_regeneration_failure_best_effort(
1820 &self,
1821 profile: &str,
1822 metadata: &VectorRegenerationAuditMetadata,
1823 failure: &VectorRegenerationFailure,
1824 ) {
1825 let Ok(conn) = self.connect() else {
1826 return;
1827 };
1828 let failure_metadata = VectorRegenerationAuditMetadata {
1829 profile: metadata.profile.clone(),
1830 model_identity: metadata.model_identity.clone(),
1831 model_version: metadata.model_version.clone(),
1832 chunk_count: metadata.chunk_count,
1833 snapshot_hash: metadata.snapshot_hash.clone(),
1834 failure_class: Some(failure.failure_class_label().to_owned()),
1835 };
1836 let _ = persist_vector_regeneration_event(
1837 &conn,
1838 "vector_regeneration_failed",
1839 profile,
1840 &failure_metadata,
1841 );
1842 }
1843
1844 pub fn trace_source(&self, source_ref: &str) -> Result<TraceReport, EngineError> {
1847 let conn = self.connect()?;
1848
1849 let node_logical_ids = collect_strings(
1850 &conn,
1851 "SELECT logical_id FROM nodes WHERE source_ref = ?1 ORDER BY created_at",
1852 source_ref,
1853 )?;
1854 let action_ids = collect_strings(
1855 &conn,
1856 "SELECT id FROM actions WHERE source_ref = ?1 ORDER BY created_at",
1857 source_ref,
1858 )?;
1859 let operational_mutation_ids = collect_strings(
1860 &conn,
1861 "SELECT id FROM operational_mutations WHERE source_ref = ?1 ORDER BY mutation_order",
1862 source_ref,
1863 )?;
1864
1865 Ok(TraceReport {
1866 source_ref: source_ref.to_owned(),
1867 node_rows: count_source_ref(&conn, "nodes", source_ref)?,
1868 edge_rows: count_source_ref(&conn, "edges", source_ref)?,
1869 action_rows: count_source_ref(&conn, "actions", source_ref)?,
1870 operational_mutation_rows: count_source_ref(
1871 &conn,
1872 "operational_mutations",
1873 source_ref,
1874 )?,
1875 node_logical_ids,
1876 action_ids,
1877 operational_mutation_ids,
1878 })
1879 }
1880
1881 #[allow(clippy::too_many_lines)]
1885 pub fn restore_logical_id(
1886 &self,
1887 logical_id: &str,
1888 ) -> Result<LogicalRestoreReport, EngineError> {
1889 let mut conn = self.connect()?;
1890 let tx = conn.transaction_with_behavior(TransactionBehavior::Immediate)?;
1891
1892 let active_count: i64 = tx.query_row(
1893 "SELECT count(*) FROM nodes WHERE logical_id = ?1 AND superseded_at IS NULL",
1894 [logical_id],
1895 |row| row.get(0),
1896 )?;
1897 if active_count > 0 {
1898 return Ok(LogicalRestoreReport {
1899 logical_id: logical_id.to_owned(),
1900 was_noop: true,
1901 restored_node_rows: 0,
1902 restored_edge_rows: 0,
1903 restored_chunk_rows: 0,
1904 restored_fts_rows: 0,
1905 restored_property_fts_rows: 0,
1906 restored_vec_rows: 0,
1907 skipped_edges: Vec::new(),
1908 notes: vec!["logical_id already active".to_owned()],
1909 });
1910 }
1911
1912 let restored_node: Option<(String, String)> = tx
1913 .query_row(
1914 "SELECT row_id, kind FROM nodes \
1915 WHERE logical_id = ?1 AND superseded_at IS NOT NULL \
1916 ORDER BY superseded_at DESC, created_at DESC, rowid DESC LIMIT 1",
1917 [logical_id],
1918 |row| Ok((row.get(0)?, row.get(1)?)),
1919 )
1920 .optional()?;
1921 let (restored_node_row_id, restored_kind) = restored_node.ok_or_else(|| {
1922 EngineError::InvalidWrite(format!("logical_id '{logical_id}' is not retired"))
1923 })?;
1924
1925 tx.execute(
1926 "UPDATE nodes SET superseded_at = NULL WHERE row_id = ?1",
1927 [restored_node_row_id.as_str()],
1928 )?;
1929
1930 let retire_scope: Option<(i64, Option<String>, i64)> = tx
1931 .query_row(
1932 "SELECT rowid, source_ref, created_at FROM provenance_events \
1933 WHERE event_type = 'node_retire' AND subject = ?1 \
1934 ORDER BY created_at DESC, rowid DESC LIMIT 1",
1935 [logical_id],
1936 |row| Ok((row.get(0)?, row.get(1)?, row.get(2)?)),
1937 )
1938 .optional()?;
1939 let (restored_edge_rows, skipped_edges) = if let Some((
1940 retire_event_rowid,
1941 retire_source_ref,
1942 retire_created_at,
1943 )) = retire_scope
1944 {
1945 restore_validated_edges(
1946 &tx,
1947 logical_id,
1948 retire_source_ref.as_deref(),
1949 retire_created_at,
1950 retire_event_rowid,
1951 )?
1952 } else {
1953 (0, Vec::new())
1954 };
1955
1956 let restored_chunk_rows: usize = tx
1957 .query_row(
1958 "SELECT count(*) FROM chunks WHERE node_logical_id = ?1",
1959 [logical_id],
1960 |row| row.get::<_, i64>(0),
1961 )
1962 .map(i64_to_usize)?;
1963 tx.execute(
1964 "DELETE FROM fts_nodes WHERE node_logical_id = ?1",
1965 [logical_id],
1966 )?;
1967 let restored_fts_rows = tx.execute(
1968 "INSERT INTO fts_nodes (chunk_id, node_logical_id, kind, text_content) \
1969 SELECT id, node_logical_id, ?2, text_content \
1970 FROM chunks WHERE node_logical_id = ?1",
1971 rusqlite::params![logical_id, restored_kind],
1972 )?;
1973 let restored_vec_rows = count_vec_rows_for_logical_id(&tx, logical_id)?;
1974
1975 tx.execute(
1977 "DELETE FROM fts_node_properties WHERE node_logical_id = ?1",
1978 [logical_id],
1979 )?;
1980 let restored_property_fts_rows =
1981 rebuild_single_node_property_fts(&tx, logical_id, &restored_kind)?;
1982
1983 persist_simple_provenance_event(
1984 &tx,
1985 "restore_logical_id",
1986 logical_id,
1987 Some(serde_json::json!({
1988 "restored_node_rows": 1,
1989 "restored_edge_rows": restored_edge_rows,
1990 "restored_chunk_rows": restored_chunk_rows,
1991 "restored_fts_rows": restored_fts_rows,
1992 "restored_property_fts_rows": restored_property_fts_rows,
1993 "restored_vec_rows": restored_vec_rows,
1994 })),
1995 )?;
1996 tx.commit()?;
1997
1998 Ok(LogicalRestoreReport {
1999 logical_id: logical_id.to_owned(),
2000 was_noop: false,
2001 restored_node_rows: 1,
2002 restored_edge_rows,
2003 restored_chunk_rows,
2004 restored_fts_rows,
2005 restored_property_fts_rows,
2006 restored_vec_rows,
2007 skipped_edges,
2008 notes: Vec::new(),
2009 })
2010 }
2011
2012 pub fn purge_logical_id(&self, logical_id: &str) -> Result<LogicalPurgeReport, EngineError> {
2016 let mut conn = self.connect()?;
2017 let tx = conn.transaction_with_behavior(TransactionBehavior::Immediate)?;
2018
2019 let active_count: i64 = tx.query_row(
2020 "SELECT count(*) FROM nodes WHERE logical_id = ?1 AND superseded_at IS NULL",
2021 [logical_id],
2022 |row| row.get(0),
2023 )?;
2024 if active_count > 0 {
2025 return Ok(LogicalPurgeReport {
2026 logical_id: logical_id.to_owned(),
2027 was_noop: true,
2028 deleted_node_rows: 0,
2029 deleted_edge_rows: 0,
2030 deleted_chunk_rows: 0,
2031 deleted_fts_rows: 0,
2032 deleted_vec_rows: 0,
2033 notes: vec!["logical_id is active; purge skipped".to_owned()],
2034 });
2035 }
2036
2037 let node_rows: i64 = tx.query_row(
2038 "SELECT count(*) FROM nodes WHERE logical_id = ?1",
2039 [logical_id],
2040 |row| row.get(0),
2041 )?;
2042 if node_rows == 0 {
2043 return Err(EngineError::InvalidWrite(format!(
2044 "logical_id '{logical_id}' does not exist"
2045 )));
2046 }
2047
2048 let deleted_vec_rows = delete_vec_rows_for_logical_id(&tx, logical_id)?;
2049 let deleted_fts_rows = tx.execute(
2050 "DELETE FROM fts_nodes WHERE node_logical_id = ?1",
2051 [logical_id],
2052 )?;
2053 let deleted_edge_rows = tx.execute(
2054 "DELETE FROM edges WHERE source_logical_id = ?1 OR target_logical_id = ?1",
2055 [logical_id],
2056 )?;
2057 let deleted_chunk_rows = tx.execute(
2058 "DELETE FROM chunks WHERE node_logical_id = ?1",
2059 [logical_id],
2060 )?;
2061 let deleted_node_rows =
2062 tx.execute("DELETE FROM nodes WHERE logical_id = ?1", [logical_id])?;
2063 tx.execute(
2064 "DELETE FROM node_access_metadata WHERE logical_id = ?1",
2065 [logical_id],
2066 )?;
2067
2068 persist_simple_provenance_event(
2069 &tx,
2070 "purge_logical_id",
2071 logical_id,
2072 Some(serde_json::json!({
2073 "deleted_node_rows": deleted_node_rows,
2074 "deleted_edge_rows": deleted_edge_rows,
2075 "deleted_chunk_rows": deleted_chunk_rows,
2076 "deleted_fts_rows": deleted_fts_rows,
2077 "deleted_vec_rows": deleted_vec_rows,
2078 })),
2079 )?;
2080 tx.commit()?;
2081
2082 Ok(LogicalPurgeReport {
2083 logical_id: logical_id.to_owned(),
2084 was_noop: false,
2085 deleted_node_rows,
2086 deleted_edge_rows,
2087 deleted_chunk_rows,
2088 deleted_fts_rows,
2089 deleted_vec_rows,
2090 notes: Vec::new(),
2091 })
2092 }
2093
2094 pub fn purge_provenance_events(
2104 &self,
2105 before_timestamp: i64,
2106 options: &ProvenancePurgeOptions,
2107 ) -> Result<ProvenancePurgeReport, EngineError> {
2108 let mut conn = self.connect()?;
2109 let tx = conn.transaction_with_behavior(TransactionBehavior::Immediate)?;
2110
2111 let preserved_types: Vec<&str> = if options.preserve_event_types.is_empty() {
2112 vec!["excise", "purge_logical_id"]
2113 } else {
2114 options
2115 .preserve_event_types
2116 .iter()
2117 .map(String::as_str)
2118 .collect()
2119 };
2120
2121 let placeholders: String = (0..preserved_types.len())
2123 .map(|i| format!("?{}", i + 2))
2124 .collect::<Vec<_>>()
2125 .join(", ");
2126 let count_query = format!(
2127 "SELECT count(*) FROM provenance_events \
2128 WHERE created_at < ?1 AND event_type NOT IN ({placeholders})"
2129 );
2130 let delete_query = format!(
2131 "DELETE FROM provenance_events WHERE rowid IN (\
2132 SELECT rowid FROM provenance_events \
2133 WHERE created_at < ?1 AND event_type NOT IN ({placeholders}) \
2134 LIMIT 10000)"
2135 );
2136
2137 let bind_params = |stmt: &mut rusqlite::Statement<'_>| -> Result<(), rusqlite::Error> {
2138 stmt.raw_bind_parameter(1, before_timestamp)?;
2139 for (i, event_type) in preserved_types.iter().enumerate() {
2140 stmt.raw_bind_parameter(i + 2, *event_type)?;
2141 }
2142 Ok(())
2143 };
2144
2145 let events_deleted = if options.dry_run {
2146 let mut stmt = tx.prepare(&count_query)?;
2147 bind_params(&mut stmt)?;
2148 stmt.raw_query()
2149 .next()?
2150 .map_or(0, |row| row.get::<_, u64>(0).unwrap_or(0))
2151 } else {
2152 let mut total_deleted: u64 = 0;
2153 loop {
2154 let mut stmt = tx.prepare(&delete_query)?;
2155 bind_params(&mut stmt)?;
2156 let deleted = stmt.raw_execute()?;
2157 if deleted == 0 {
2158 break;
2159 }
2160 total_deleted += deleted as u64;
2161 }
2162 total_deleted
2163 };
2164
2165 let total_after: u64 =
2166 tx.query_row("SELECT count(*) FROM provenance_events", [], |row| {
2167 row.get(0)
2168 })?;
2169
2170 let oldest_remaining: Option<i64> = tx
2171 .query_row("SELECT MIN(created_at) FROM provenance_events", [], |row| {
2172 row.get(0)
2173 })
2174 .optional()?
2175 .flatten();
2176
2177 if !options.dry_run {
2178 tx.commit()?;
2179 }
2180
2181 let events_preserved = if options.dry_run {
2184 total_after - events_deleted
2185 } else {
2186 total_after
2187 };
2188
2189 Ok(ProvenancePurgeReport {
2190 events_deleted,
2191 events_preserved,
2192 oldest_remaining,
2193 })
2194 }
2195
    /// Removes the active contribution of `source_ref` from the database and
    /// returns a fresh trace for that source.
    ///
    /// All steps run inside one immediate transaction:
    ///
    /// 1. Collect the `latest_state` operational collections that have
    ///    mutations from this source, and the `(row_id, logical_id)` pairs of
    ///    the active nodes written by it.
    /// 2. Mark active nodes, edges and actions from the source as superseded.
    /// 3. Clear the current rows of the affected operational collections and
    ///    delete the source's operational mutations.
    /// 4. Delete vector rows and chunks for every affected logical id.
    /// 5. For each excised node, reactivate the most recently created other
    ///    row with the same logical id, if one exists.
    /// 6. Drop access metadata for logical ids left without an active node.
    /// 7. Rebuild operational current rows, the chunk FTS table and the
    ///    property FTS table.
    /// 8. Record an `excise_source` provenance event and commit.
    ///
    /// # Errors
    ///
    /// Returns an [`EngineError`] if connecting, any statement, the commit, or
    /// the final `trace_source` call fails.
    #[allow(clippy::too_many_lines)]
    pub fn excise_source(&self, source_ref: &str) -> Result<TraceReport, EngineError> {
        let mut conn = self.connect()?;

        let tx = conn.transaction_with_behavior(TransactionBehavior::Immediate)?;
        // Must be captured before the mutations are deleted further down.
        let affected_operational_collections = collect_strings_tx(
            &tx,
            "SELECT DISTINCT m.collection_name \
             FROM operational_mutations m \
             JOIN operational_collections c ON c.name = m.collection_name \
             WHERE m.source_ref = ?1 AND c.kind = 'latest_state' \
             ORDER BY m.collection_name",
            source_ref,
        )?;

        // Active node rows written by this source, captured before they are
        // superseded so that earlier versions can be restored afterwards.
        let pairs: Vec<(String, String)> = {
            let mut stmt = tx.prepare(
                "SELECT row_id, logical_id FROM nodes \
                 WHERE source_ref = ?1 AND superseded_at IS NULL",
            )?;
            stmt.query_map([source_ref], |row| {
                Ok((row.get::<_, String>(0)?, row.get::<_, String>(1)?))
            })?
            .collect::<Result<Vec<_>, _>>()?
        };
        let affected_logical_ids: Vec<String> = pairs
            .iter()
            .map(|(_, logical_id)| logical_id.clone())
            .collect();

        // Supersede everything that is still active for this source.
        tx.execute(
            "UPDATE nodes SET superseded_at = unixepoch() \
             WHERE source_ref = ?1 AND superseded_at IS NULL",
            [source_ref],
        )?;
        tx.execute(
            "UPDATE edges SET superseded_at = unixepoch() \
             WHERE source_ref = ?1 AND superseded_at IS NULL",
            [source_ref],
        )?;
        tx.execute(
            "UPDATE actions SET superseded_at = unixepoch() \
             WHERE source_ref = ?1 AND superseded_at IS NULL",
            [source_ref],
        )?;
        clear_operational_current_rows(&tx, &affected_operational_collections)?;
        tx.execute(
            "DELETE FROM operational_mutations WHERE source_ref = ?1",
            [source_ref],
        )?;
        // Derived vector rows and chunks are removed for every affected logical id.
        for logical_id in &affected_logical_ids {
            delete_vec_rows_for_logical_id(&tx, logical_id)?;
            tx.execute(
                "DELETE FROM chunks WHERE node_logical_id = ?1",
                [logical_id.as_str()],
            )?;
        }

        // Reactivate the most recently created other row of each excised
        // logical id, if there is one.
        for (excised_row_id, logical_id) in &pairs {
            let prior: Option<String> = tx
                .query_row(
                    "SELECT row_id FROM nodes \
                     WHERE logical_id = ?1 AND row_id != ?2 \
                     ORDER BY created_at DESC LIMIT 1",
                    [logical_id.as_str(), excised_row_id.as_str()],
                    |row| row.get(0),
                )
                .optional()?;
            if let Some(prior_id) = prior {
                tx.execute(
                    "UPDATE nodes SET superseded_at = NULL WHERE row_id = ?1",
                    [prior_id.as_str()],
                )?;
            }
        }

        // Logical ids with no active node left lose their access metadata.
        for logical_id in &affected_logical_ids {
            let has_active_node = tx
                .query_row(
                    "SELECT 1 FROM nodes WHERE logical_id = ?1 AND superseded_at IS NULL LIMIT 1",
                    [logical_id.as_str()],
                    |row| row.get::<_, i64>(0),
                )
                .optional()?
                .is_some();
            if !has_active_node {
                tx.execute(
                    "DELETE FROM node_access_metadata WHERE logical_id = ?1",
                    [logical_id.as_str()],
                )?;
            }
        }

        rebuild_operational_current_rows(&tx, &affected_operational_collections)?;

        // Rebuild the chunk FTS table from scratch against the active nodes.
        tx.execute("DELETE FROM fts_nodes", [])?;
        tx.execute(
            r"
            INSERT INTO fts_nodes (chunk_id, node_logical_id, kind, text_content)
            SELECT c.id, n.logical_id, n.kind, c.text_content
            FROM chunks c
            JOIN nodes n
              ON n.logical_id = c.node_logical_id
             AND n.superseded_at IS NULL
            ",
            [],
        )?;

        rebuild_property_fts_in_tx(&tx)?;

        // Audit record; `source_ref` is bound to both `subject` and `source_ref`.
        tx.execute(
            "INSERT INTO provenance_events (id, event_type, subject, source_ref) \
             VALUES (?1, 'excise_source', ?2, ?2)",
            rusqlite::params![new_id(), source_ref],
        )?;

        tx.commit()?;

        self.trace_source(source_ref)
    }
2328
2329 pub fn safe_export(
2333 &self,
2334 destination_path: impl AsRef<Path>,
2335 options: SafeExportOptions,
2336 ) -> Result<SafeExportManifest, EngineError> {
2337 let destination_path = destination_path.as_ref();
2338
2339 let conn = self.connect()?;
2343
2344 if options.force_checkpoint {
2345 trace_info!("safe_export: wal checkpoint started");
2346 let (busy, log, checkpointed): (i64, i64, i64) =
2347 conn.query_row("PRAGMA wal_checkpoint(FULL)", [], |row| {
2348 Ok((row.get(0)?, row.get(1)?, row.get(2)?))
2349 })?;
2350 if busy != 0 {
2351 trace_warn!(
2352 busy,
2353 log_frames = log,
2354 checkpointed_frames = checkpointed,
2355 "safe_export: wal checkpoint blocked by active readers"
2356 );
2357 return Err(EngineError::Bridge(format!(
2358 "WAL checkpoint blocked: {busy} active reader(s) prevented a full checkpoint; \
2359 log frames={log}, checkpointed={checkpointed}; \
2360 retry export when no readers are active"
2361 )));
2362 }
2363 trace_info!(
2364 log_frames = log,
2365 checkpointed_frames = checkpointed,
2366 "safe_export: wal checkpoint completed"
2367 );
2368 }
2369
2370 let schema_version: u32 = conn
2371 .query_row(
2372 "SELECT COALESCE(MAX(version), 0) FROM fathom_schema_migrations",
2373 [],
2374 |row| row.get(0),
2375 )
2376 .unwrap_or(0);
2377
2378 if let Some(parent) = destination_path.parent() {
2381 fs::create_dir_all(parent)?;
2382 }
2383 conn.backup(DatabaseName::Main, destination_path, None)?;
2384
2385 drop(conn);
2386
2387 let page_count: u64 = {
2391 let export_conn = rusqlite::Connection::open_with_flags(
2392 destination_path,
2393 rusqlite::OpenFlags::SQLITE_OPEN_READ_ONLY
2394 | rusqlite::OpenFlags::SQLITE_OPEN_NO_MUTEX,
2395 )?;
2396 export_conn.query_row("PRAGMA page_count", [], |row| row.get(0))?
2397 };
2398
2399 let sha256 = {
2402 let mut file = fs::File::open(destination_path)?;
2403 let mut hasher = Sha256::new();
2404 io::copy(&mut file, &mut hasher)?;
2405 format!("{:x}", hasher.finalize())
2406 };
2407
2408 let exported_at = SystemTime::now()
2410 .duration_since(SystemTime::UNIX_EPOCH)
2411 .map_err(|e| EngineError::Bridge(format!("system clock error: {e}")))?
2412 .as_secs();
2413
2414 let manifest = SafeExportManifest {
2415 exported_at,
2416 sha256,
2417 schema_version,
2418 protocol_version: EXPORT_PROTOCOL_VERSION,
2419 page_count,
2420 };
2421
2422 let manifest_path = {
2424 let mut p = destination_path.to_path_buf();
2425 let stem = p
2426 .file_name()
2427 .map(|n| format!("{}.export-manifest.json", n.to_string_lossy()))
2428 .ok_or_else(|| {
2429 EngineError::Bridge("destination path has no filename".to_owned())
2430 })?;
2431 p.set_file_name(stem);
2432 p
2433 };
2434 let manifest_json =
2435 serde_json::to_string(&manifest).map_err(|e| EngineError::Bridge(e.to_string()))?;
2436
2437 let manifest_tmp = manifest_path.with_extension("json.tmp");
2440 if let Err(e) = fs::write(&manifest_tmp, &manifest_json)
2441 .and_then(|()| fs::rename(&manifest_tmp, &manifest_path))
2442 {
2443 let _ = fs::remove_file(&manifest_tmp);
2444 return Err(e.into());
2445 }
2446
2447 Ok(manifest)
2448 }
2449}
2450
/// In-memory form of a persisted vector embedding contract.
///
/// The field names match the columns written by `persist_vector_contract`
/// into `vector_embedding_contracts`, apart from `updated_at`, which has no
/// field here.
#[allow(dead_code)]
#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
struct VectorEmbeddingContractRecord {
    /// Vector profile the contract applies to.
    profile: String,
    /// Name of the vector table the contract targets.
    table_name: String,
    /// Identity of the embedding model.
    model_identity: String,
    /// Version of the embedding model.
    model_version: String,
    /// Number of components in each embedding.
    dimension: usize,
    normalization_policy: String,
    chunking_policy: String,
    preprocessing_policy: String,
    /// Generator command, stored as a JSON string.
    generator_command_json: String,
    /// Time the contract was applied; written with `unixepoch()`.
    applied_at: i64,
    /// Hash of the regeneration input snapshot the contract was applied against.
    snapshot_hash: String,
    /// Format version of the stored contract.
    contract_format_version: i64,
}
2467
/// One chunk of text included in a vector regeneration payload.
///
/// Instances are built by `collect_regeneration_chunks` from `chunks` rows
/// joined with their active node.
#[derive(Clone, Debug, PartialEq, Eq, Serialize)]
struct VectorRegenerationInputChunk {
    /// Chunk id (`chunks.id`).
    chunk_id: String,
    /// Logical id of the node that owns the chunk.
    node_logical_id: String,
    /// Kind of the owning node.
    kind: String,
    /// Text of the chunk.
    text_content: String,
    /// Optional `byte_start` column of the chunk row.
    byte_start: Option<i64>,
    /// Optional `byte_end` column of the chunk row.
    byte_end: Option<i64>,
    /// `source_ref` of the owning node, when set.
    source_ref: Option<String>,
    /// `created_at` column of the chunk row.
    created_at: i64,
}
2479
/// Payload describing a vector regeneration request.
///
/// It is serialized to JSON both to compute the snapshot hash
/// (`compute_snapshot_hash`) and to feed the generator process
/// (`run_vector_generator_bounded`). The contract fields are copied from the
/// regeneration config by `build_regeneration_input`.
#[derive(Clone, Debug, PartialEq, Eq, Serialize)]
struct VectorRegenerationInput {
    profile: String,
    table_name: String,
    model_identity: String,
    model_version: String,
    /// Expected number of components in each embedding.
    dimension: usize,
    normalization_policy: String,
    chunking_policy: String,
    preprocessing_policy: String,
    /// Chunks to embed.
    chunks: Vec<VectorRegenerationInputChunk>,
}
2492
/// A single embedding returned for one chunk.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
struct GeneratedEmbedding {
    /// Id of the chunk this embedding belongs to.
    chunk_id: String,
    /// Embedding components; `validate_generated_embeddings` requires the
    /// configured dimension and finite values.
    embedding: Vec<f32>,
}
2498
/// Collection of embeddings checked by `validate_generated_embeddings`.
///
/// NOTE(review): presumably deserialized from the generator's output; confirm
/// against the code that parses it.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
struct GeneratedEmbeddings {
    /// One entry per chunk.
    embeddings: Vec<GeneratedEmbedding>,
}
2503
/// Categories of failure that vector regeneration can report.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub(crate) enum VectorRegenerationFailureClass {
    InvalidContract,
    PayloadTooLarge,
    GeneratorTimeout,
    GeneratorStdoutOverflow,
    GeneratorStderrOverflow,
    GeneratorNonzeroExit,
    MalformedGeneratorJson,
    SnapshotDrift,
    UnsupportedVecCapability,
}

impl VectorRegenerationFailureClass {
    /// Human-readable label used in error messages and audit metadata.
    fn label(self) -> &'static str {
        use VectorRegenerationFailureClass as Class;

        match self {
            Class::InvalidContract => "invalid contract",
            Class::PayloadTooLarge => "payload too large",
            Class::GeneratorTimeout => "generator timeout",
            Class::GeneratorStdoutOverflow => "generator stdout overflow",
            Class::GeneratorStderrOverflow => "generator stderr overflow",
            Class::GeneratorNonzeroExit => "generator nonzero exit",
            Class::MalformedGeneratorJson => "malformed generator json",
            Class::SnapshotDrift => "snapshot drift",
            Class::UnsupportedVecCapability => "unsupported vec capability",
        }
    }

    /// Whether the failure is marked as retryable; only snapshot drift is.
    fn retryable(self) -> bool {
        match self {
            Self::SnapshotDrift => true,
            Self::InvalidContract
            | Self::PayloadTooLarge
            | Self::GeneratorTimeout
            | Self::GeneratorStdoutOverflow
            | Self::GeneratorStderrOverflow
            | Self::GeneratorNonzeroExit
            | Self::MalformedGeneratorJson
            | Self::UnsupportedVecCapability => false,
        }
    }
}
2536
2537#[derive(Clone, Debug, PartialEq, Eq)]
2538pub(crate) struct VectorRegenerationFailure {
2539 class: VectorRegenerationFailureClass,
2540 detail: String,
2541}
2542
2543impl VectorRegenerationFailure {
2544 pub(crate) fn new(class: VectorRegenerationFailureClass, detail: impl Into<String>) -> Self {
2545 Self {
2546 class,
2547 detail: detail.into(),
2548 }
2549 }
2550
2551 fn to_engine_error(&self) -> EngineError {
2552 let retry_suffix = if self.class.retryable() {
2553 " [retryable]"
2554 } else {
2555 ""
2556 };
2557 EngineError::Bridge(format!(
2558 "vector regeneration {}: {}{}",
2559 self.class.label(),
2560 self.detail,
2561 retry_suffix
2562 ))
2563 }
2564
2565 fn failure_class_label(&self) -> &'static str {
2566 self.class.label()
2567 }
2568}
2569
/// Metadata attached to vector regeneration provenance events.
///
/// `serialize_audit_metadata` turns it into the JSON string that
/// `persist_vector_regeneration_event` stores in
/// `provenance_events.metadata_json`.
#[derive(Clone, Debug, PartialEq, Eq, Serialize)]
struct VectorRegenerationAuditMetadata {
    profile: String,
    model_identity: String,
    model_version: String,
    /// Number of chunks involved in the regeneration.
    chunk_count: usize,
    snapshot_hash: String,
    /// Failure class label; left out of the JSON entirely when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    failure_class: Option<String>,
}
2580
/// Retention policy for an operational collection.
///
/// Deserialized from an internally tagged object: the `mode` field selects the
/// variant using snake_case names (`keep_all`, `purge_before_seconds`,
/// `keep_last`), and the remaining fields are the variant's payload.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Deserialize)]
#[serde(tag = "mode", rename_all = "snake_case")]
enum OperationalRetentionPolicy {
    /// Nothing is purged.
    KeepAll,
    /// Age-based policy parameterized by `max_age_seconds`.
    PurgeBeforeSeconds { max_age_seconds: i64 },
    /// Count-based policy parameterized by `max_rows`.
    KeepLast { max_rows: usize },
}
2588
2589pub fn load_vector_regeneration_config(
2592 path: impl AsRef<Path>,
2593) -> Result<VectorRegenerationConfig, EngineError> {
2594 let path = path.as_ref();
2595 let raw = fs::read_to_string(path)?;
2596 match path.extension().and_then(|ext| ext.to_str()) {
2597 Some("toml") => {
2598 toml::from_str(&raw).map_err(|error| EngineError::Bridge(error.to_string()))
2599 }
2600 Some("json") | None => {
2601 serde_json::from_str(&raw).map_err(|error| EngineError::Bridge(error.to_string()))
2602 }
2603 Some(other) => Err(EngineError::Bridge(format!(
2604 "unsupported vector regeneration config extension: {other}"
2605 ))),
2606 }
2607}
2608
2609fn validate_vector_regeneration_config(
2610 conn: &rusqlite::Connection,
2611 config: &VectorRegenerationConfig,
2612 policy: &VectorGeneratorPolicy,
2613) -> Result<VectorRegenerationConfig, VectorRegenerationFailure> {
2614 let profile = validate_bounded_text("profile", &config.profile, MAX_PROFILE_LEN)?;
2615 let table_name = validate_bounded_text("table_name", &config.table_name, MAX_PROFILE_LEN)?;
2616 if table_name != "vec_nodes_active" {
2617 return Err(VectorRegenerationFailure::new(
2618 VectorRegenerationFailureClass::InvalidContract,
2619 format!("table_name must be vec_nodes_active, got '{table_name}'"),
2620 ));
2621 }
2622 let model_identity = validate_bounded_text(
2623 "model_identity",
2624 &config.model_identity,
2625 MAX_MODEL_IDENTITY_LEN,
2626 )?;
2627 let model_version = validate_bounded_text(
2628 "model_version",
2629 &config.model_version,
2630 MAX_MODEL_VERSION_LEN,
2631 )?;
2632 if config.dimension == 0 {
2633 return Err(VectorRegenerationFailure::new(
2634 VectorRegenerationFailureClass::InvalidContract,
2635 "dimension must be greater than zero".to_owned(),
2636 ));
2637 }
2638 let normalization_policy = validate_bounded_text(
2639 "normalization_policy",
2640 &config.normalization_policy,
2641 MAX_POLICY_LEN,
2642 )?;
2643 let chunking_policy =
2644 validate_bounded_text("chunking_policy", &config.chunking_policy, MAX_POLICY_LEN)?;
2645 let preprocessing_policy = validate_bounded_text(
2646 "preprocessing_policy",
2647 &config.preprocessing_policy,
2648 MAX_POLICY_LEN,
2649 )?;
2650 let generator_command = validate_generator_command(&config.generator_command, policy)?;
2651
2652 if let Some(existing_dimension) = current_vector_profile_dimension(conn, &profile)?
2653 && existing_dimension != config.dimension
2654 {
2655 return Err(VectorRegenerationFailure::new(
2656 VectorRegenerationFailureClass::InvalidContract,
2657 format!(
2658 "dimension {} does not match existing vector profile dimension {}",
2659 config.dimension, existing_dimension
2660 ),
2661 ));
2662 }
2663
2664 validate_existing_contract_version(conn, &profile)?;
2665
2666 let normalized = VectorRegenerationConfig {
2667 profile,
2668 table_name,
2669 model_identity,
2670 model_version,
2671 dimension: config.dimension,
2672 normalization_policy,
2673 chunking_policy,
2674 preprocessing_policy,
2675 generator_command,
2676 };
2677 let serialized = serde_json::to_vec(&normalized).map_err(|error| {
2678 VectorRegenerationFailure::new(
2679 VectorRegenerationFailureClass::InvalidContract,
2680 error.to_string(),
2681 )
2682 })?;
2683 if serialized.len() > MAX_CONTRACT_JSON_BYTES {
2684 return Err(VectorRegenerationFailure::new(
2685 VectorRegenerationFailureClass::InvalidContract,
2686 format!("serialized contract exceeds {MAX_CONTRACT_JSON_BYTES} bytes"),
2687 ));
2688 }
2689
2690 Ok(normalized)
2691}
2692
2693#[allow(clippy::cast_possible_wrap)]
2694fn persist_vector_contract(
2695 conn: &rusqlite::Connection,
2696 config: &VectorRegenerationConfig,
2697 snapshot_hash: &str,
2698) -> Result<(), EngineError> {
2699 let generator_command_json = serde_json::to_string(&config.generator_command)
2700 .map_err(|error| EngineError::Bridge(error.to_string()))?;
2701 conn.execute(
2702 r"
2703 INSERT OR REPLACE INTO vector_embedding_contracts (
2704 profile,
2705 table_name,
2706 model_identity,
2707 model_version,
2708 dimension,
2709 normalization_policy,
2710 chunking_policy,
2711 preprocessing_policy,
2712 generator_command_json,
2713 applied_at,
2714 snapshot_hash,
2715 contract_format_version,
2716 updated_at
2717 ) VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9, unixepoch(), ?10, ?11, unixepoch())
2718 ",
2719 rusqlite::params![
2720 config.profile.as_str(),
2721 config.table_name.as_str(),
2722 config.model_identity.as_str(),
2723 config.model_version.as_str(),
2724 config.dimension as i64,
2725 config.normalization_policy.as_str(),
2726 config.chunking_policy.as_str(),
2727 config.preprocessing_policy.as_str(),
2728 generator_command_json,
2729 snapshot_hash,
2730 CURRENT_VECTOR_CONTRACT_FORMAT_VERSION,
2731 ],
2732 )?;
2733 Ok(())
2734}
2735
2736fn persist_vector_regeneration_event(
2737 conn: &rusqlite::Connection,
2738 event_type: &str,
2739 subject: &str,
2740 metadata: &VectorRegenerationAuditMetadata,
2741) -> Result<(), EngineError> {
2742 let metadata_json = serialize_audit_metadata(metadata)?;
2743 conn.execute(
2744 "INSERT INTO provenance_events (id, event_type, subject, metadata_json) VALUES (?1, ?2, ?3, ?4)",
2745 rusqlite::params![new_id(), event_type, subject, metadata_json],
2746 )?;
2747 Ok(())
2748}
2749
2750fn persist_simple_provenance_event(
2751 conn: &rusqlite::Connection,
2752 event_type: &str,
2753 subject: &str,
2754 metadata: Option<serde_json::Value>,
2755) -> Result<(), EngineError> {
2756 let metadata_json = metadata.map(|value| value.to_string()).unwrap_or_default();
2757 conn.execute(
2758 "INSERT INTO provenance_events (id, event_type, subject, metadata_json) VALUES (?1, ?2, ?3, ?4)",
2759 rusqlite::params![new_id(), event_type, subject, metadata_json],
2760 )?;
2761 Ok(())
2762}
2763
2764fn count_missing_property_fts_rows(conn: &rusqlite::Connection) -> Result<i64, EngineError> {
2768 let schemas = crate::writer::load_fts_property_schemas(conn)?;
2769 if schemas.is_empty() {
2770 return Ok(0);
2771 }
2772
2773 let mut missing = 0i64;
2774 for (kind, paths, separator) in &schemas {
2775 let mut stmt = conn.prepare(
2776 "SELECT n.logical_id, n.properties FROM nodes n \
2777 WHERE n.kind = ?1 AND n.superseded_at IS NULL \
2778 AND NOT EXISTS (SELECT 1 FROM fts_node_properties fp WHERE fp.node_logical_id = n.logical_id)",
2779 )?;
2780 let rows = stmt.query_map([kind.as_str()], |row| {
2781 Ok((row.get::<_, String>(0)?, row.get::<_, String>(1)?))
2782 })?;
2783 for row in rows {
2784 let (_logical_id, properties_str) = row?;
2785 let props: serde_json::Value =
2786 serde_json::from_str(&properties_str).unwrap_or_default();
2787 if crate::writer::compute_property_fts_text(&props, paths, separator).is_some() {
2788 missing += 1;
2789 }
2790 }
2791 }
2792 Ok(missing)
2793}
2794
2795fn count_drifted_property_fts_rows(conn: &rusqlite::Connection) -> Result<i64, EngineError> {
2800 let schemas = crate::writer::load_fts_property_schemas(conn)?;
2801 if schemas.is_empty() {
2802 return Ok(0);
2803 }
2804
2805 let mut drifted = 0i64;
2806 for (kind, paths, separator) in &schemas {
2807 let mut stmt = conn.prepare(
2808 "SELECT fp.node_logical_id, fp.text_content, n.properties \
2809 FROM fts_node_properties fp \
2810 JOIN nodes n ON n.logical_id = fp.node_logical_id AND n.superseded_at IS NULL \
2811 WHERE fp.kind = ?1 AND n.kind = ?1",
2812 )?;
2813 let rows = stmt.query_map([kind.as_str()], |row| {
2814 Ok((
2815 row.get::<_, String>(0)?,
2816 row.get::<_, String>(1)?,
2817 row.get::<_, String>(2)?,
2818 ))
2819 })?;
2820 for row in rows {
2821 let (_logical_id, stored_text, properties_str) = row?;
2822 let props: serde_json::Value =
2823 serde_json::from_str(&properties_str).unwrap_or_default();
2824 let expected = crate::writer::compute_property_fts_text(&props, paths, separator);
2825 match expected {
2826 Some(text) if text == stored_text => {}
2827 _ => drifted += 1,
2828 }
2829 }
2830 }
2831 Ok(drifted)
2832}
2833
2834fn rebuild_property_fts_in_tx(conn: &rusqlite::Connection) -> Result<usize, EngineError> {
2836 conn.execute("DELETE FROM fts_node_properties", [])?;
2837 let inserted = crate::projection::insert_property_fts_rows(
2838 conn,
2839 "SELECT logical_id, properties FROM nodes WHERE kind = ?1 AND superseded_at IS NULL",
2840 )?;
2841 Ok(inserted)
2842}
2843
2844fn rebuild_single_node_property_fts(
2847 conn: &rusqlite::Connection,
2848 logical_id: &str,
2849 kind: &str,
2850) -> Result<usize, EngineError> {
2851 let schema: Option<(Vec<String>, String)> = conn
2852 .query_row(
2853 "SELECT property_paths_json, separator FROM fts_property_schemas WHERE kind = ?1",
2854 [kind],
2855 |row| {
2856 let paths_json: String = row.get(0)?;
2857 let separator: String = row.get(1)?;
2858 let paths: Vec<String> = serde_json::from_str(&paths_json).unwrap_or_default();
2859 Ok((paths, separator))
2860 },
2861 )
2862 .optional()?;
2863 let Some((paths, separator)) = schema else {
2864 return Ok(0);
2865 };
2866 let properties_str: Option<String> = conn
2867 .query_row(
2868 "SELECT properties FROM nodes WHERE logical_id = ?1 AND superseded_at IS NULL",
2869 [logical_id],
2870 |row| row.get(0),
2871 )
2872 .optional()?;
2873 let Some(properties_str) = properties_str else {
2874 return Ok(0);
2875 };
2876 let props: serde_json::Value = serde_json::from_str(&properties_str).unwrap_or_default();
2877 let Some(text) = crate::writer::compute_property_fts_text(&props, &paths, &separator) else {
2878 return Ok(0);
2879 };
2880 conn.execute(
2881 "INSERT INTO fts_node_properties (node_logical_id, kind, text_content) VALUES (?1, ?2, ?3)",
2882 rusqlite::params![logical_id, kind, text],
2883 )?;
2884 Ok(1)
2885}
2886
2887fn validate_fts_property_paths(paths: &[String]) -> Result<(), EngineError> {
2888 if paths.is_empty() {
2889 return Err(EngineError::InvalidWrite(
2890 "FTS property paths must not be empty".to_owned(),
2891 ));
2892 }
2893 let mut seen = std::collections::HashSet::new();
2894 for path in paths {
2895 if !path.starts_with("$.") {
2896 return Err(EngineError::InvalidWrite(format!(
2897 "FTS property path must start with '$.' but got: {path}"
2898 )));
2899 }
2900 let after_prefix = &path[2..]; let segments: Vec<&str> = after_prefix.split('.').collect();
2902 if segments.is_empty() || segments.iter().any(|s| s.is_empty()) {
2903 return Err(EngineError::InvalidWrite(format!(
2904 "FTS property path has empty segment(s): {path}"
2905 )));
2906 }
2907 for seg in &segments {
2908 if !seg.chars().all(|c| c.is_alphanumeric() || c == '_') {
2909 return Err(EngineError::InvalidWrite(format!(
2910 "FTS property path segment contains invalid characters: {path}"
2911 )));
2912 }
2913 }
2914 if !seen.insert(path) {
2915 return Err(EngineError::InvalidWrite(format!(
2916 "duplicate FTS property path: {path}"
2917 )));
2918 }
2919 }
2920 Ok(())
2921}
2922
2923fn load_fts_property_schema_record(
2924 conn: &rusqlite::Connection,
2925 kind: &str,
2926) -> Result<Option<FtsPropertySchemaRecord>, EngineError> {
2927 let row = conn
2928 .query_row(
2929 "SELECT kind, property_paths_json, separator, format_version \
2930 FROM fts_property_schemas WHERE kind = ?1",
2931 [kind],
2932 |row| {
2933 let paths_json: String = row.get(1)?;
2934 let paths: Vec<String> = serde_json::from_str(&paths_json).unwrap_or_default();
2935 Ok(FtsPropertySchemaRecord {
2936 kind: row.get(0)?,
2937 property_paths: paths,
2938 separator: row.get(2)?,
2939 format_version: row.get(3)?,
2940 })
2941 },
2942 )
2943 .optional()?;
2944 Ok(row)
2945}
2946
2947fn build_regeneration_input(
2948 config: &VectorRegenerationConfig,
2949 chunks: Vec<VectorRegenerationInputChunk>,
2950) -> VectorRegenerationInput {
2951 VectorRegenerationInput {
2952 profile: config.profile.clone(),
2953 table_name: config.table_name.clone(),
2954 model_identity: config.model_identity.clone(),
2955 model_version: config.model_version.clone(),
2956 dimension: config.dimension,
2957 normalization_policy: config.normalization_policy.clone(),
2958 chunking_policy: config.chunking_policy.clone(),
2959 preprocessing_policy: config.preprocessing_policy.clone(),
2960 chunks,
2961 }
2962}
2963
2964fn compute_snapshot_hash(payload: &VectorRegenerationInput) -> Result<String, EngineError> {
2965 let bytes =
2966 serde_json::to_vec(payload).map_err(|error| EngineError::Bridge(error.to_string()))?;
2967 let mut hasher = Sha256::new();
2968 hasher.update(bytes);
2969 Ok(format!("{:x}", hasher.finalize()))
2970}
2971
2972fn collect_regeneration_chunks(
2973 conn: &rusqlite::Connection,
2974) -> Result<Vec<VectorRegenerationInputChunk>, EngineError> {
2975 let mut stmt = conn.prepare(
2976 r"
2977 SELECT c.id, c.node_logical_id, n.kind, c.text_content, c.byte_start, c.byte_end, n.source_ref, c.created_at
2978 FROM chunks c
2979 JOIN nodes n
2980 ON n.logical_id = c.node_logical_id
2981 AND n.superseded_at IS NULL
2982 ORDER BY c.created_at, c.id
2983 ",
2984 )?;
2985 let chunks = stmt
2986 .query_map([], |row| {
2987 Ok(VectorRegenerationInputChunk {
2988 chunk_id: row.get(0)?,
2989 node_logical_id: row.get(1)?,
2990 kind: row.get(2)?,
2991 text_content: row.get(3)?,
2992 byte_start: row.get(4)?,
2993 byte_end: row.get(5)?,
2994 source_ref: row.get(6)?,
2995 created_at: row.get(7)?,
2996 })
2997 })?
2998 .collect::<Result<Vec<_>, _>>()?;
2999 Ok(chunks)
3000}
3001
3002fn validate_generated_embeddings(
3003 config: &VectorRegenerationConfig,
3004 chunks: &[VectorRegenerationInputChunk],
3005 generated: GeneratedEmbeddings,
3006) -> Result<std::collections::HashMap<String, Vec<u8>>, VectorRegenerationFailure> {
3007 if generated.embeddings.len() != chunks.len() {
3008 return Err(VectorRegenerationFailure::new(
3009 VectorRegenerationFailureClass::MalformedGeneratorJson,
3010 format!(
3011 "generator returned {} embedding(s) for {} chunk(s)",
3012 generated.embeddings.len(),
3013 chunks.len()
3014 ),
3015 ));
3016 }
3017
3018 let mut embedding_map = std::collections::HashMap::new();
3019 for embedding in generated.embeddings {
3020 if embedding.embedding.len() != config.dimension {
3021 return Err(VectorRegenerationFailure::new(
3022 VectorRegenerationFailureClass::MalformedGeneratorJson,
3023 format!(
3024 "embedding for chunk '{}' has dimension {}, expected {}",
3025 embedding.chunk_id,
3026 embedding.embedding.len(),
3027 config.dimension
3028 ),
3029 ));
3030 }
3031 if embedding.embedding.iter().any(|value| !value.is_finite()) {
3032 return Err(VectorRegenerationFailure::new(
3033 VectorRegenerationFailureClass::MalformedGeneratorJson,
3034 format!(
3035 "embedding for chunk '{}' contains non-finite values",
3036 embedding.chunk_id
3037 ),
3038 ));
3039 }
3040 let bytes: Vec<u8> = embedding
3041 .embedding
3042 .iter()
3043 .flat_map(|value| value.to_le_bytes())
3044 .collect();
3045 if embedding_map
3046 .insert(embedding.chunk_id.clone(), bytes)
3047 .is_some()
3048 {
3049 return Err(VectorRegenerationFailure::new(
3050 VectorRegenerationFailureClass::MalformedGeneratorJson,
3051 format!(
3052 "duplicate embedding returned for chunk '{}'",
3053 embedding.chunk_id
3054 ),
3055 ));
3056 }
3057 }
3058
3059 Ok(embedding_map)
3060}
3061
3062fn generator_policy_notes(policy: &VectorGeneratorPolicy) -> Vec<String> {
3063 let mut notes = vec!["vector embeddings regenerated from application contract".to_owned()];
3064 if !policy.allowed_executable_roots.is_empty() {
3065 notes.push("generator executable roots enforced by operator policy".to_owned());
3066 }
3067 if !policy.preserve_env_vars.is_empty() {
3068 notes.push("generator environment reduced to preserved variables".to_owned());
3069 }
3070 notes
3071}
3072
/// Identifies one of the generator process's two output streams.
///
/// NOTE(review): the code that uses this enum is outside the visible part of
/// the file; presumably it tags which stream a read result came from. Confirm
/// against the reader code.
enum GeneratorStream {
    /// Standard output.
    Stdout,
    /// Standard error.
    Stderr,
}
3077
/// Outcome of reading a stream.
///
/// NOTE(review): the producer of these values is outside the visible part of
/// the file; the variant descriptions below follow from the names and payload
/// types and should be confirmed against it.
enum StreamReadResult {
    /// Carries the bytes that were read.
    Complete(Vec<u8>),
    /// No payload; presumably the stream exceeded its size limit.
    Overflow,
    /// Carries the I/O error that was encountered.
    Io(io::Error),
}
3083
3084fn validate_bounded_text(
3085 field: &str,
3086 value: &str,
3087 max_len: usize,
3088) -> Result<String, VectorRegenerationFailure> {
3089 let trimmed = value.trim();
3090 if trimmed.is_empty() {
3091 return Err(VectorRegenerationFailure::new(
3092 VectorRegenerationFailureClass::InvalidContract,
3093 format!("{field} must not be empty"),
3094 ));
3095 }
3096 if trimmed.len() > max_len {
3097 return Err(VectorRegenerationFailure::new(
3098 VectorRegenerationFailureClass::InvalidContract,
3099 format!("{field} exceeds max length {max_len}"),
3100 ));
3101 }
3102 Ok(trimmed.to_owned())
3103}
3104
3105fn validate_generator_command(
3106 command: &[String],
3107 policy: &VectorGeneratorPolicy,
3108) -> Result<Vec<String>, VectorRegenerationFailure> {
3109 if command.is_empty() {
3110 return Err(VectorRegenerationFailure::new(
3111 VectorRegenerationFailureClass::InvalidContract,
3112 "generator_command must contain at least one element".to_owned(),
3113 ));
3114 }
3115 let mut total_len = 0usize;
3116 for argument in command {
3117 if argument.is_empty() {
3118 return Err(VectorRegenerationFailure::new(
3119 VectorRegenerationFailureClass::InvalidContract,
3120 "generator_command entries must not be empty".to_owned(),
3121 ));
3122 }
3123 if argument.len() > MAX_GENERATOR_COMMAND_ARG_LEN {
3124 return Err(VectorRegenerationFailure::new(
3125 VectorRegenerationFailureClass::InvalidContract,
3126 format!(
3127 "generator_command argument exceeds max length {MAX_GENERATOR_COMMAND_ARG_LEN}"
3128 ),
3129 ));
3130 }
3131 total_len += argument.len();
3132 }
3133 if total_len > MAX_GENERATOR_COMMAND_TOTAL_LEN {
3134 return Err(VectorRegenerationFailure::new(
3135 VectorRegenerationFailureClass::InvalidContract,
3136 format!(
3137 "generator_command exceeds max serialized length {MAX_GENERATOR_COMMAND_TOTAL_LEN}"
3138 ),
3139 ));
3140 }
3141 executable_trust::validate_generator_executable(&command[0], policy)?;
3142 Ok(command.to_vec())
3143}
3144
3145fn current_vector_profile_dimension(
3146 conn: &rusqlite::Connection,
3147 profile: &str,
3148) -> Result<Option<usize>, VectorRegenerationFailure> {
3149 let dimension: Option<i64> = conn
3150 .query_row(
3151 "SELECT dimension FROM vector_profiles WHERE profile = ?1 AND enabled = 1",
3152 [profile],
3153 |row| row.get(0),
3154 )
3155 .optional()
3156 .map_err(|error| {
3157 VectorRegenerationFailure::new(
3158 VectorRegenerationFailureClass::InvalidContract,
3159 error.to_string(),
3160 )
3161 })?;
3162 dimension
3163 .map(|value| {
3164 usize::try_from(value).map_err(|_| {
3165 VectorRegenerationFailure::new(
3166 VectorRegenerationFailureClass::InvalidContract,
3167 format!("stored vector profile dimension is invalid: {value}"),
3168 )
3169 })
3170 })
3171 .transpose()
3172}
3173
3174fn validate_existing_contract_version(
3175 conn: &rusqlite::Connection,
3176 profile: &str,
3177) -> Result<(), VectorRegenerationFailure> {
3178 let version: Option<i64> = conn
3179 .query_row(
3180 "SELECT contract_format_version FROM vector_embedding_contracts WHERE profile = ?1",
3181 [profile],
3182 |row| row.get(0),
3183 )
3184 .optional()
3185 .map_err(|error| {
3186 VectorRegenerationFailure::new(
3187 VectorRegenerationFailureClass::InvalidContract,
3188 error.to_string(),
3189 )
3190 })?;
3191 if let Some(version) = version
3192 && version > CURRENT_VECTOR_CONTRACT_FORMAT_VERSION
3193 {
3194 return Err(VectorRegenerationFailure::new(
3195 VectorRegenerationFailureClass::InvalidContract,
3196 format!(
3197 "persisted contract format version {version} is unsupported; supported version is {CURRENT_VECTOR_CONTRACT_FORMAT_VERSION}"
3198 ),
3199 ));
3200 }
3201 Ok(())
3202}
3203
3204fn serialize_audit_metadata(
3205 metadata: &VectorRegenerationAuditMetadata,
3206) -> Result<String, EngineError> {
3207 let json =
3208 serde_json::to_string(metadata).map_err(|error| EngineError::Bridge(error.to_string()))?;
3209 if json.len() > MAX_AUDIT_METADATA_BYTES {
3210 return Err(VectorRegenerationFailure::new(
3211 VectorRegenerationFailureClass::InvalidContract,
3212 format!("audit metadata exceeds {MAX_AUDIT_METADATA_BYTES} bytes"),
3213 )
3214 .to_engine_error());
3215 }
3216 Ok(json)
3217}
3218
3219#[allow(clippy::too_many_lines)]
3220fn run_vector_generator_bounded(
3221 config: &VectorRegenerationConfig,
3222 payload: &VectorRegenerationInput,
3223 policy: &VectorGeneratorPolicy,
3224) -> Result<GeneratedEmbeddings, VectorRegenerationFailure> {
3225 if payload.chunks.len() > policy.max_chunks {
3226 return Err(VectorRegenerationFailure::new(
3227 VectorRegenerationFailureClass::PayloadTooLarge,
3228 format!(
3229 "chunk count {} exceeds max_chunks {}",
3230 payload.chunks.len(),
3231 policy.max_chunks
3232 ),
3233 ));
3234 }
3235
3236 let input = serde_json::to_vec(payload).map_err(|error| {
3237 VectorRegenerationFailure::new(
3238 VectorRegenerationFailureClass::MalformedGeneratorJson,
3239 error.to_string(),
3240 )
3241 })?;
3242 if input.len() > policy.max_input_bytes {
3243 return Err(VectorRegenerationFailure::new(
3244 VectorRegenerationFailureClass::PayloadTooLarge,
3245 format!(
3246 "serialized input {} bytes exceeds max_input_bytes {}",
3247 input.len(),
3248 policy.max_input_bytes
3249 ),
3250 ));
3251 }
3252
3253 let mut command = Command::new(config.generator_command.first().ok_or_else(|| {
3254 VectorRegenerationFailure::new(
3255 VectorRegenerationFailureClass::InvalidContract,
3256 "missing generator executable",
3257 )
3258 })?);
3259 command.args(config.generator_command.iter().skip(1));
3260 command.stdin(Stdio::piped());
3261 command.stdout(Stdio::piped());
3262 command.stderr(Stdio::piped());
3263 command.env_clear();
3264 for env_var in &policy.preserve_env_vars {
3265 if let Some(value) = std::env::var_os(env_var) {
3266 command.env(env_var, value);
3267 }
3268 }
3269
3270 let mut child = command.spawn().map_err(|error| {
3271 VectorRegenerationFailure::new(
3272 VectorRegenerationFailureClass::GeneratorNonzeroExit,
3273 format!("failed to spawn generator: {error}"),
3274 )
3275 })?;
3276 if let Some(mut stdin) = child.stdin.take() {
3277 stdin.write_all(&input).map_err(|error| {
3278 VectorRegenerationFailure::new(
3279 VectorRegenerationFailureClass::GeneratorNonzeroExit,
3280 format!("failed to write generator stdin: {error}"),
3281 )
3282 })?;
3283 } else {
3284 return Err(VectorRegenerationFailure::new(
3285 VectorRegenerationFailureClass::GeneratorNonzeroExit,
3286 "failed to open generator stdin",
3287 ));
3288 }
3289
3290 let stdout = child.stdout.take().ok_or_else(|| {
3291 VectorRegenerationFailure::new(
3292 VectorRegenerationFailureClass::GeneratorNonzeroExit,
3293 "failed to open generator stdout",
3294 )
3295 })?;
3296 let stderr = child.stderr.take().ok_or_else(|| {
3297 VectorRegenerationFailure::new(
3298 VectorRegenerationFailureClass::GeneratorNonzeroExit,
3299 "failed to open generator stderr",
3300 )
3301 })?;
3302
3303 let (tx, rx) = mpsc::channel();
3304 let stdout_handle = spawn_capped_reader(
3305 stdout,
3306 policy.max_stdout_bytes,
3307 GeneratorStream::Stdout,
3308 tx.clone(),
3309 );
3310 let stderr_handle =
3311 spawn_capped_reader(stderr, policy.max_stderr_bytes, GeneratorStream::Stderr, tx);
3312
3313 let start = Instant::now();
3314 let timeout = Duration::from_millis(policy.timeout_ms);
3315 let mut stdout_bytes: Option<Vec<u8>> = None;
3316 let mut stderr_bytes: Option<Vec<u8>> = None;
3317 let mut status = None;
3318 let mut stream_error: Option<VectorRegenerationFailure> = None;
3319
3320 while status.is_none() && stream_error.is_none() {
3321 while let Ok((stream, result)) = rx.try_recv() {
3322 match (stream, result) {
3323 (GeneratorStream::Stdout, StreamReadResult::Complete(bytes)) => {
3324 stdout_bytes = Some(bytes);
3325 }
3326 (GeneratorStream::Stderr, StreamReadResult::Complete(bytes)) => {
3327 stderr_bytes = Some(bytes);
3328 }
3329 (GeneratorStream::Stdout, StreamReadResult::Overflow) => {
3330 stream_error = Some(VectorRegenerationFailure::new(
3331 VectorRegenerationFailureClass::GeneratorStdoutOverflow,
3332 format!(
3333 "stdout exceeded max_stdout_bytes {}",
3334 policy.max_stdout_bytes
3335 ),
3336 ));
3337 }
3338 (GeneratorStream::Stderr, StreamReadResult::Overflow) => {
3339 stream_error = Some(VectorRegenerationFailure::new(
3340 VectorRegenerationFailureClass::GeneratorStderrOverflow,
3341 format!(
3342 "stderr exceeded max_stderr_bytes {}",
3343 policy.max_stderr_bytes
3344 ),
3345 ));
3346 }
3347 (_, StreamReadResult::Io(error)) => {
3348 stream_error = Some(VectorRegenerationFailure::new(
3349 VectorRegenerationFailureClass::GeneratorNonzeroExit,
3350 format!("failed to read generator stream: {error}"),
3351 ));
3352 }
3353 }
3354 }
3355
3356 if stream_error.is_some() {
3357 let _ = child.kill();
3358 break;
3359 }
3360 if start.elapsed() > timeout {
3361 let _ = child.kill();
3362 stream_error = Some(VectorRegenerationFailure::new(
3363 VectorRegenerationFailureClass::GeneratorTimeout,
3364 format!("generator exceeded timeout after {}ms", policy.timeout_ms),
3365 ));
3366 break;
3367 }
3368 status = child.try_wait().map_err(|error| {
3369 VectorRegenerationFailure::new(
3370 VectorRegenerationFailureClass::GeneratorNonzeroExit,
3371 format!("failed to poll generator status: {error}"),
3372 )
3373 })?;
3374 if status.is_none() {
3375 thread::sleep(Duration::from_millis(10));
3376 }
3377 }
3378
3379 let _ = child.wait();
3380 let _ = stdout_handle.join();
3381 let _ = stderr_handle.join();
3382
3383 while let Ok((stream, result)) = rx.try_recv() {
3384 match (stream, result) {
3385 (GeneratorStream::Stdout, StreamReadResult::Complete(bytes)) => {
3386 stdout_bytes = Some(bytes);
3387 }
3388 (GeneratorStream::Stderr, StreamReadResult::Complete(bytes)) => {
3389 stderr_bytes = Some(bytes);
3390 }
3391 (GeneratorStream::Stdout, StreamReadResult::Overflow) => {
3392 stream_error = Some(VectorRegenerationFailure::new(
3393 VectorRegenerationFailureClass::GeneratorStdoutOverflow,
3394 format!(
3395 "stdout exceeded max_stdout_bytes {}",
3396 policy.max_stdout_bytes
3397 ),
3398 ));
3399 }
3400 (GeneratorStream::Stderr, StreamReadResult::Overflow) => {
3401 stream_error = Some(VectorRegenerationFailure::new(
3402 VectorRegenerationFailureClass::GeneratorStderrOverflow,
3403 format!(
3404 "stderr exceeded max_stderr_bytes {}",
3405 policy.max_stderr_bytes
3406 ),
3407 ));
3408 }
3409 (_, StreamReadResult::Io(error)) => {
3410 stream_error = Some(VectorRegenerationFailure::new(
3411 VectorRegenerationFailureClass::GeneratorNonzeroExit,
3412 format!("failed to read generator stream: {error}"),
3413 ));
3414 }
3415 }
3416 }
3417
3418 if let Some(error) = stream_error {
3419 return Err(error);
3420 }
3421
3422 let status = status.ok_or_else(|| {
3423 VectorRegenerationFailure::new(
3424 VectorRegenerationFailureClass::GeneratorNonzeroExit,
3425 "vector generator exited without a status",
3426 )
3427 })?;
3428 if !status.success() {
3429 let stderr =
3430 truncate_error_text(&stderr_bytes.unwrap_or_default(), policy.max_stderr_bytes);
3431 return Err(VectorRegenerationFailure::new(
3432 VectorRegenerationFailureClass::GeneratorNonzeroExit,
3433 stderr,
3434 ));
3435 }
3436
3437 let stdout = stdout_bytes.unwrap_or_default();
3438 serde_json::from_slice(&stdout).map_err(|error| {
3439 VectorRegenerationFailure::new(
3440 VectorRegenerationFailureClass::MalformedGeneratorJson,
3441 format!("decode generator output: {error}"),
3442 )
3443 })
3444}
3445
3446fn spawn_capped_reader<R: Read + Send + 'static>(
3447 mut reader: R,
3448 max_bytes: usize,
3449 stream: GeneratorStream,
3450 tx: mpsc::Sender<(GeneratorStream, StreamReadResult)>,
3451) -> thread::JoinHandle<()> {
3452 thread::spawn(move || {
3453 let mut buffer = Vec::new();
3454 let mut chunk = [0u8; 8192];
3455 loop {
3456 match reader.read(&mut chunk) {
3457 Ok(0) => {
3458 let _ = tx.send((stream, StreamReadResult::Complete(buffer)));
3459 break;
3460 }
3461 Ok(read_bytes) => {
3462 if buffer.len() + read_bytes > max_bytes {
3463 let _ = tx.send((stream, StreamReadResult::Overflow));
3464 break;
3465 }
3466 buffer.extend_from_slice(&chunk[..read_bytes]);
3467 }
3468 Err(error) => {
3469 let _ = tx.send((stream, StreamReadResult::Io(error)));
3470 break;
3471 }
3472 }
3473 }
3474 })
3475}
3476
/// Converts raw error output to text, keeping at most `max_bytes` bytes.
///
/// Input that fits within `max_bytes` is decoded lossily and returned
/// unchanged. Longer input is cut to its first `max_bytes` bytes before
/// decoding, and `" [truncated]"` is appended. A cut that lands inside a
/// multi-byte UTF-8 sequence decodes to the replacement character.
fn truncate_error_text(bytes: &[u8], max_bytes: usize) -> String {
    if bytes.len() <= max_bytes {
        return String::from_utf8_lossy(bytes).into_owned();
    }
    let mut text = String::from_utf8_lossy(&bytes[..max_bytes]).into_owned();
    text.push_str(" [truncated]");
    text
}
3484
3485fn count_source_ref(
3486 conn: &rusqlite::Connection,
3487 table: &str,
3488 source_ref: &str,
3489) -> Result<usize, EngineError> {
3490 let sql = match table {
3491 "nodes" => "SELECT count(*) FROM nodes WHERE source_ref = ?1",
3492 "edges" => "SELECT count(*) FROM edges WHERE source_ref = ?1",
3493 "actions" => "SELECT count(*) FROM actions WHERE source_ref = ?1",
3494 "operational_mutations" => {
3495 "SELECT count(*) FROM operational_mutations WHERE source_ref = ?1"
3496 }
3497 other => return Err(EngineError::Bridge(format!("unknown table: {other}"))),
3498 };
3499 let count: i64 = conn.query_row(sql, [source_ref], |row| row.get(0))?;
3500 usize::try_from(count)
3503 .map_err(|_| EngineError::Bridge(format!("count overflow for table {table}: {count}")))
3504}
3505
3506fn rebuild_operational_current_rows(
3507 tx: &rusqlite::Transaction<'_>,
3508 collections: &[String],
3509) -> Result<usize, EngineError> {
3510 let mut rebuilt_rows = 0usize;
3511 clear_operational_current_rows(tx, collections)?;
3512 let mut ins_current = tx.prepare_cached(
3513 "INSERT INTO operational_current \
3514 (collection_name, record_key, payload_json, updated_at, last_mutation_id) \
3515 VALUES (?1, ?2, ?3, ?4, ?5)",
3516 )?;
3517
3518 for collection in collections {
3519 let mut stmt = tx.prepare(
3520 "SELECT id, collection_name, record_key, op_kind, payload_json, source_ref, created_at \
3521 FROM operational_mutations \
3522 WHERE collection_name = ?1 \
3523 ORDER BY record_key, mutation_order",
3524 )?;
3525 let mut latest_by_key: std::collections::HashMap<String, Option<(String, i64, String)>> =
3526 std::collections::HashMap::new();
3527 let rows = stmt.query_map([collection], map_operational_mutation_row)?;
3528 for row in rows {
3529 let mutation = row?;
3530 match mutation.op_kind.as_str() {
3531 "put" => {
3532 latest_by_key.insert(
3533 mutation.record_key,
3534 Some((mutation.payload_json, mutation.created_at, mutation.id)),
3535 );
3536 }
3537 "delete" => {
3538 latest_by_key.insert(mutation.record_key, None);
3539 }
3540 _ => {}
3541 }
3542 }
3543
3544 for (record_key, state) in latest_by_key {
3545 if let Some((payload_json, updated_at, last_mutation_id)) = state {
3546 ins_current.execute(rusqlite::params![
3547 collection,
3548 record_key,
3549 payload_json,
3550 updated_at,
3551 last_mutation_id,
3552 ])?;
3553 rebuilt_rows += 1;
3554 }
3555 }
3556 }
3557
3558 drop(ins_current);
3559 Ok(rebuilt_rows)
3560}
3561
3562fn clear_operational_current_rows(
3563 tx: &rusqlite::Transaction<'_>,
3564 collections: &[String],
3565) -> Result<(), EngineError> {
3566 let mut delete_current =
3567 tx.prepare_cached("DELETE FROM operational_current WHERE collection_name = ?1")?;
3568 let mut delete_secondary_current = tx.prepare_cached(
3569 "DELETE FROM operational_secondary_index_entries \
3570 WHERE collection_name = ?1 AND subject_kind = 'current'",
3571 )?;
3572 for collection in collections {
3573 delete_secondary_current.execute([collection])?;
3574 delete_current.execute([collection])?;
3575 }
3576 drop(delete_secondary_current);
3577 drop(delete_current);
3578 Ok(())
3579}
3580
3581fn clear_operational_secondary_index_entries(
3582 tx: &rusqlite::Transaction<'_>,
3583 collection_name: &str,
3584) -> Result<(), EngineError> {
3585 tx.execute(
3586 "DELETE FROM operational_secondary_index_entries WHERE collection_name = ?1",
3587 [collection_name],
3588 )?;
3589 Ok(())
3590}
3591
3592fn insert_operational_secondary_index_entry(
3593 tx: &rusqlite::Transaction<'_>,
3594 collection_name: &str,
3595 subject_kind: &str,
3596 mutation_id: &str,
3597 record_key: &str,
3598 entry: &crate::operational::OperationalSecondaryIndexEntry,
3599) -> Result<(), EngineError> {
3600 tx.execute(
3601 "INSERT INTO operational_secondary_index_entries \
3602 (collection_name, index_name, subject_kind, mutation_id, record_key, sort_timestamp, \
3603 slot1_text, slot1_integer, slot2_text, slot2_integer, slot3_text, slot3_integer) \
3604 VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9, ?10, ?11, ?12)",
3605 rusqlite::params![
3606 collection_name,
3607 entry.index_name,
3608 subject_kind,
3609 mutation_id,
3610 record_key,
3611 entry.sort_timestamp,
3612 entry.slot1_text,
3613 entry.slot1_integer,
3614 entry.slot2_text,
3615 entry.slot2_integer,
3616 entry.slot3_text,
3617 entry.slot3_integer,
3618 ],
3619 )?;
3620 Ok(())
3621}
3622
3623fn rebuild_operational_secondary_index_entries(
3624 tx: &rusqlite::Transaction<'_>,
3625 collection_name: &str,
3626 collection_kind: OperationalCollectionKind,
3627 indexes: &[OperationalSecondaryIndexDefinition],
3628) -> Result<(usize, usize), EngineError> {
3629 clear_operational_secondary_index_entries(tx, collection_name)?;
3630
3631 let mut mutation_entries_rebuilt = 0usize;
3632 if collection_kind == OperationalCollectionKind::AppendOnlyLog {
3633 let mut stmt = tx.prepare(
3634 "SELECT id, record_key, payload_json FROM operational_mutations \
3635 WHERE collection_name = ?1 ORDER BY mutation_order",
3636 )?;
3637 let rows = stmt
3638 .query_map([collection_name], |row| {
3639 Ok((
3640 row.get::<_, String>(0)?,
3641 row.get::<_, String>(1)?,
3642 row.get::<_, String>(2)?,
3643 ))
3644 })?
3645 .collect::<Result<Vec<_>, _>>()?;
3646 drop(stmt);
3647 for (mutation_id, record_key, payload_json) in rows {
3648 for entry in extract_secondary_index_entries_for_mutation(indexes, &payload_json) {
3649 insert_operational_secondary_index_entry(
3650 tx,
3651 collection_name,
3652 "mutation",
3653 &mutation_id,
3654 &record_key,
3655 &entry,
3656 )?;
3657 mutation_entries_rebuilt += 1;
3658 }
3659 }
3660 }
3661
3662 let mut current_entries_rebuilt = 0usize;
3663 if collection_kind == OperationalCollectionKind::LatestState {
3664 let mut stmt = tx.prepare(
3665 "SELECT record_key, payload_json, updated_at, last_mutation_id FROM operational_current \
3666 WHERE collection_name = ?1 ORDER BY updated_at DESC, record_key",
3667 )?;
3668 let rows = stmt
3669 .query_map([collection_name], |row| {
3670 Ok((
3671 row.get::<_, String>(0)?,
3672 row.get::<_, String>(1)?,
3673 row.get::<_, i64>(2)?,
3674 row.get::<_, String>(3)?,
3675 ))
3676 })?
3677 .collect::<Result<Vec<_>, _>>()?;
3678 drop(stmt);
3679 for (record_key, payload_json, updated_at, last_mutation_id) in rows {
3680 for entry in
3681 extract_secondary_index_entries_for_current(indexes, &payload_json, updated_at)
3682 {
3683 insert_operational_secondary_index_entry(
3684 tx,
3685 collection_name,
3686 "current",
3687 &last_mutation_id,
3688 &record_key,
3689 &entry,
3690 )?;
3691 current_entries_rebuilt += 1;
3692 }
3693 }
3694 }
3695
3696 Ok((mutation_entries_rebuilt, current_entries_rebuilt))
3697}
3698
3699fn collect_strings_tx(
3700 tx: &rusqlite::Transaction<'_>,
3701 sql: &str,
3702 value: &str,
3703) -> Result<Vec<String>, EngineError> {
3704 let mut stmt = tx.prepare(sql)?;
3705 let rows = stmt.query_map([value], |row| row.get::<_, String>(0))?;
3706 rows.collect::<Result<Vec<_>, _>>()
3707 .map_err(EngineError::from)
3708}
3709
/// Converts a SQL `count(*)` result to `usize`.
///
/// # Panics
///
/// Panics when `val` does not fit in `usize`, which for a row count means it
/// was negative.
#[allow(clippy::expect_used)]
fn i64_to_usize(val: i64) -> usize {
    let converted: Result<usize, _> = usize::try_from(val);
    converted.expect("count(*) must be non-negative")
}
3716
3717fn collect_strings(
3724 conn: &rusqlite::Connection,
3725 sql: &str,
3726 param: &str,
3727) -> Result<Vec<String>, EngineError> {
3728 let mut stmt = conn.prepare(sql)?;
3729 let values = stmt
3730 .query_map([param], |row| row.get::<_, String>(0))?
3731 .collect::<Result<Vec<_>, _>>()?;
3732 Ok(values)
3733}
3734
3735fn collect_edge_logical_ids_for_restore(
3736 tx: &rusqlite::Transaction<'_>,
3737 logical_id: &str,
3738 retire_source_ref: Option<&str>,
3739 retire_created_at: i64,
3740 retire_event_rowid: i64,
3741) -> Result<Vec<String>, EngineError> {
3742 let mut stmt = tx.prepare(
3743 "SELECT DISTINCT e.logical_id \
3744 FROM edges e \
3745 JOIN provenance_events p \
3746 ON p.subject = e.logical_id \
3747 AND p.event_type = 'edge_retire' \
3748 AND ( \
3749 p.created_at > ?3 \
3750 OR (p.created_at = ?3 AND p.rowid >= ?4) \
3751 ) \
3752 AND ((?2 IS NULL AND p.source_ref IS NULL) OR p.source_ref = ?2) \
3753 WHERE e.superseded_at IS NOT NULL \
3754 AND (e.source_logical_id = ?1 OR e.target_logical_id = ?1) \
3755 AND NOT EXISTS ( \
3756 SELECT 1 FROM edges active \
3757 WHERE active.logical_id = e.logical_id \
3758 AND active.superseded_at IS NULL \
3759 ) \
3760 ORDER BY e.logical_id",
3761 )?;
3762 let edge_ids = stmt
3763 .query_map(
3764 rusqlite::params![
3765 logical_id,
3766 retire_source_ref,
3767 retire_created_at,
3768 retire_event_rowid
3769 ],
3770 |row| row.get::<_, String>(0),
3771 )?
3772 .collect::<Result<Vec<_>, _>>()?;
3773 Ok(edge_ids)
3774}
3775
3776fn restore_validated_edges(
3779 tx: &rusqlite::Transaction<'_>,
3780 logical_id: &str,
3781 retire_source_ref: Option<&str>,
3782 retire_created_at: i64,
3783 retire_event_rowid: i64,
3784) -> Result<(usize, Vec<SkippedEdge>), EngineError> {
3785 let edge_logical_ids = collect_edge_logical_ids_for_restore(
3786 tx,
3787 logical_id,
3788 retire_source_ref,
3789 retire_created_at,
3790 retire_event_rowid,
3791 )?;
3792 let mut restored = 0usize;
3793 let mut skipped = Vec::new();
3794 for edge_logical_id in &edge_logical_ids {
3795 let edge_detail: Option<(String, String, String)> = tx
3796 .query_row(
3797 "SELECT row_id, source_logical_id, target_logical_id FROM edges \
3798 WHERE logical_id = ?1 AND superseded_at IS NOT NULL \
3799 ORDER BY superseded_at DESC, created_at DESC, rowid DESC LIMIT 1",
3800 [edge_logical_id.as_str()],
3801 |row| Ok((row.get(0)?, row.get(1)?, row.get(2)?)),
3802 )
3803 .optional()?;
3804 let Some((edge_row_id, source_lid, target_lid)) = edge_detail else {
3805 continue;
3806 };
3807 let other_endpoint = if source_lid == logical_id {
3808 &target_lid
3809 } else {
3810 &source_lid
3811 };
3812 let endpoint_active: bool = tx
3813 .query_row(
3814 "SELECT 1 FROM nodes WHERE logical_id = ?1 AND superseded_at IS NULL LIMIT 1",
3815 [other_endpoint.as_str()],
3816 |_| Ok(true),
3817 )
3818 .optional()?
3819 .unwrap_or(false);
3820 if !endpoint_active {
3821 skipped.push(SkippedEdge {
3822 edge_logical_id: edge_logical_id.clone(),
3823 missing_endpoint: other_endpoint.clone(),
3824 });
3825 continue;
3826 }
3827 restored += tx.execute(
3828 "UPDATE edges SET superseded_at = NULL WHERE row_id = ?1",
3829 [edge_row_id.as_str()],
3830 )?;
3831 }
3832 Ok((restored, skipped))
3833}
3834
/// Counts the rows in `vec_nodes_active` that belong to chunks of the node
/// `logical_id`.
///
/// A SQLite failure whose message mentions `vec_nodes_active` or
/// `no such module: vec0` is treated as "no vector rows" and yields `0`; any
/// other error is returned as `EngineError::Sqlite`.
#[cfg(feature = "sqlite-vec")]
fn count_vec_rows_for_logical_id(
    tx: &rusqlite::Transaction<'_>,
    logical_id: &str,
) -> Result<usize, EngineError> {
    let counted = tx.query_row(
        "SELECT count(*) FROM vec_nodes_active v \
         JOIN chunks c ON c.id = v.chunk_id \
         WHERE c.node_logical_id = ?1",
        [logical_id],
        |row| row.get::<_, i64>(0),
    );
    match counted {
        Ok(count) => Ok(i64_to_usize(count)),
        Err(error) => {
            let vec_table_unavailable = matches!(
                &error,
                rusqlite::Error::SqliteFailure(_, Some(msg))
                    if msg.contains("vec_nodes_active") || msg.contains("no such module: vec0")
            );
            if vec_table_unavailable {
                Ok(0)
            } else {
                Err(EngineError::Sqlite(error))
            }
        }
    }
}
3856
/// Variant compiled when the `sqlite-vec` feature is disabled: ignores both
/// arguments and always reports zero vector rows.
///
/// The `Result` return type mirrors the `sqlite-vec` variant's signature.
#[cfg(not(feature = "sqlite-vec"))]
#[allow(clippy::unnecessary_wraps)]
fn count_vec_rows_for_logical_id(
    _tx: &rusqlite::Transaction<'_>,
    _logical_id: &str,
) -> Result<usize, EngineError> {
    Ok(0)
}
3865
/// Deletes the rows in `vec_nodes_active` that belong to chunks of the node
/// `logical_id` and returns how many were removed.
///
/// A SQLite failure whose message mentions `vec_nodes_active` or
/// `no such module: vec0` is treated as "nothing to delete" and yields `0`;
/// any other error is returned as `EngineError::Sqlite`.
#[cfg(feature = "sqlite-vec")]
fn delete_vec_rows_for_logical_id(
    tx: &rusqlite::Transaction<'_>,
    logical_id: &str,
) -> Result<usize, EngineError> {
    tx.execute(
        "DELETE FROM vec_nodes_active \
         WHERE chunk_id IN (SELECT id FROM chunks WHERE node_logical_id = ?1)",
        [logical_id],
    )
    .or_else(|error| {
        let vec_table_unavailable = matches!(
            &error,
            rusqlite::Error::SqliteFailure(_, Some(msg))
                if msg.contains("vec_nodes_active") || msg.contains("no such module: vec0")
        );
        if vec_table_unavailable {
            Ok(0)
        } else {
            Err(EngineError::Sqlite(error))
        }
    })
}
3885
/// Variant compiled when the `sqlite-vec` feature is disabled: ignores both
/// arguments, deletes nothing, and always reports zero removed rows.
///
/// The `Result` return type mirrors the `sqlite-vec` variant's signature.
#[cfg(not(feature = "sqlite-vec"))]
#[allow(clippy::unnecessary_wraps)]
fn delete_vec_rows_for_logical_id(
    _tx: &rusqlite::Transaction<'_>,
    _logical_id: &str,
) -> Result<usize, EngineError> {
    Ok(0)
}
3894
3895fn ensure_operational_collection_registered(
3896 conn: &rusqlite::Connection,
3897 collection_name: &str,
3898) -> Result<(), EngineError> {
3899 if load_operational_collection_record(conn, collection_name)?.is_none() {
3900 return Err(EngineError::InvalidWrite(format!(
3901 "operational collection '{collection_name}' is not registered"
3902 )));
3903 }
3904 Ok(())
3905}
3906
3907fn load_operational_collection_record(
3908 conn: &rusqlite::Connection,
3909 name: &str,
3910) -> Result<Option<OperationalCollectionRecord>, EngineError> {
3911 conn.query_row(
3912 "SELECT name, kind, schema_json, retention_json, filter_fields_json, validation_json, secondary_indexes_json, format_version, created_at, disabled_at \
3913 FROM operational_collections WHERE name = ?1",
3914 [name],
3915 map_operational_collection_row,
3916 )
3917 .optional()
3918 .map_err(EngineError::Sqlite)
3919}
3920
3921fn validate_append_only_operational_collection(
3922 record: &OperationalCollectionRecord,
3923 operation: &str,
3924) -> Result<(), EngineError> {
3925 if record.kind != OperationalCollectionKind::AppendOnlyLog {
3926 return Err(EngineError::InvalidWrite(format!(
3927 "operational collection '{}' must be append_only_log to {operation}",
3928 record.name
3929 )));
3930 }
3931 Ok(())
3932}
3933
/// A read filter clause after validation against a collection's declared
/// filter fields.
#[derive(Clone, Debug, PartialEq, Eq)]
struct CompiledOperationalReadFilter {
    /// Name of the declared filter field the clause applies to.
    field: String,
    /// The typed comparison to apply to that field.
    condition: OperationalReadCondition,
}
3939
/// A set of read filters that can be answered from one append-only
/// field/time secondary index.
#[derive(Clone, Debug)]
struct MatchedAppendOnlySecondaryIndexRead<'a> {
    /// Name of the secondary index that matched.
    index_name: &'a str,
    /// The filter on the index's value field (exact or prefix).
    value_filter: &'a CompiledOperationalReadFilter,
    /// Optional range filter on the index's time field.
    time_range: Option<&'a CompiledOperationalReadFilter>,
}
3946
/// Typed comparison carried by a compiled operational read filter.
#[derive(Clone, Debug, PartialEq, Eq)]
enum OperationalReadCondition {
    /// Equality against a string value.
    ExactString(String),
    /// Equality against an integer value (used for integer and timestamp fields).
    ExactInteger(i64),
    /// String prefix match; executed as a SQL `GLOB` pattern.
    Prefix(String),
    /// Integer range; each bound that is present is applied inclusively
    /// (`>=` lower, `<=` upper).
    Range {
        lower: Option<i64>,
        upper: Option<i64>,
    },
}
3957
3958fn operational_read_limit(limit: Option<usize>) -> Result<usize, EngineError> {
3959 let applied_limit = limit.unwrap_or(DEFAULT_OPERATIONAL_READ_LIMIT);
3960 if applied_limit == 0 {
3961 return Err(EngineError::InvalidWrite(
3962 "operational read limit must be greater than zero".to_owned(),
3963 ));
3964 }
3965 Ok(applied_limit.min(MAX_OPERATIONAL_READ_LIMIT))
3966}
3967
3968fn parse_operational_filter_fields(
3969 filter_fields_json: &str,
3970) -> Result<Vec<OperationalFilterField>, String> {
3971 let fields: Vec<OperationalFilterField> = serde_json::from_str(filter_fields_json)
3972 .map_err(|error| format!("invalid filter_fields_json: {error}"))?;
3973 let mut seen = std::collections::HashSet::new();
3974 for field in &fields {
3975 if field.name.trim().is_empty() {
3976 return Err("filter_fields_json field names must not be empty".to_owned());
3977 }
3978 if !seen.insert(field.name.as_str()) {
3979 return Err(format!(
3980 "filter_fields_json contains duplicate field '{}'",
3981 field.name
3982 ));
3983 }
3984 if field.modes.is_empty() {
3985 return Err(format!(
3986 "filter_fields_json field '{}' must declare at least one mode",
3987 field.name
3988 ));
3989 }
3990 if field.modes.contains(&OperationalFilterMode::Prefix)
3991 && field.field_type != OperationalFilterFieldType::String
3992 {
3993 return Err(format!(
3994 "filter field '{}' only supports prefix for string types",
3995 field.name
3996 ));
3997 }
3998 }
3999 Ok(fields)
4000}
4001
4002fn compile_operational_read_filters(
4003 filters: &[OperationalFilterClause],
4004 declared_fields: &[OperationalFilterField],
4005) -> Result<Vec<CompiledOperationalReadFilter>, EngineError> {
4006 let field_map = declared_fields
4007 .iter()
4008 .map(|field| (field.name.as_str(), field))
4009 .collect::<std::collections::HashMap<_, _>>();
4010 filters
4011 .iter()
4012 .map(|filter| match filter {
4013 OperationalFilterClause::Exact { field, value } => {
4014 let declared = field_map.get(field.as_str()).ok_or_else(|| {
4015 EngineError::InvalidWrite(format!(
4016 "operational read filter uses undeclared field '{field}'"
4017 ))
4018 })?;
4019 if !declared.modes.contains(&OperationalFilterMode::Exact) {
4020 return Err(EngineError::InvalidWrite(format!(
4021 "operational read field '{field}' does not allow exact filters"
4022 )));
4023 }
4024 let condition = match (declared.field_type, value) {
4025 (OperationalFilterFieldType::String, OperationalFilterValue::String(value)) => {
4026 OperationalReadCondition::ExactString(value.clone())
4027 }
4028 (
4029 OperationalFilterFieldType::Integer | OperationalFilterFieldType::Timestamp,
4030 OperationalFilterValue::Integer(value),
4031 ) => OperationalReadCondition::ExactInteger(*value),
4032 _ => {
4033 return Err(EngineError::InvalidWrite(format!(
4034 "operational read field '{field}' received a value with the wrong type"
4035 )));
4036 }
4037 };
4038 Ok(CompiledOperationalReadFilter {
4039 field: field.clone(),
4040 condition,
4041 })
4042 }
4043 OperationalFilterClause::Prefix { field, value } => {
4044 let declared = field_map.get(field.as_str()).ok_or_else(|| {
4045 EngineError::InvalidWrite(format!(
4046 "operational read filter uses undeclared field '{field}'"
4047 ))
4048 })?;
4049 if !declared.modes.contains(&OperationalFilterMode::Prefix) {
4050 return Err(EngineError::InvalidWrite(format!(
4051 "operational read field '{field}' does not allow prefix filters"
4052 )));
4053 }
4054 if declared.field_type != OperationalFilterFieldType::String {
4055 return Err(EngineError::InvalidWrite(format!(
4056 "operational read field '{field}' only supports prefix filters for strings"
4057 )));
4058 }
4059 Ok(CompiledOperationalReadFilter {
4060 field: field.clone(),
4061 condition: OperationalReadCondition::Prefix(value.clone()),
4062 })
4063 }
4064 OperationalFilterClause::Range {
4065 field,
4066 lower,
4067 upper,
4068 } => {
4069 let declared = field_map.get(field.as_str()).ok_or_else(|| {
4070 EngineError::InvalidWrite(format!(
4071 "operational read filter uses undeclared field '{field}'"
4072 ))
4073 })?;
4074 if !declared.modes.contains(&OperationalFilterMode::Range) {
4075 return Err(EngineError::InvalidWrite(format!(
4076 "operational read field '{field}' does not allow range filters"
4077 )));
4078 }
4079 if !matches!(
4080 declared.field_type,
4081 OperationalFilterFieldType::Integer | OperationalFilterFieldType::Timestamp
4082 ) {
4083 return Err(EngineError::InvalidWrite(format!(
4084 "operational read field '{field}' only supports range filters for integer/timestamp fields"
4085 )));
4086 }
4087 if lower.is_none() && upper.is_none() {
4088 return Err(EngineError::InvalidWrite(format!(
4089 "operational read range filter for '{field}' must specify a lower or upper bound"
4090 )));
4091 }
4092 Ok(CompiledOperationalReadFilter {
4093 field: field.clone(),
4094 condition: OperationalReadCondition::Range {
4095 lower: *lower,
4096 upper: *upper,
4097 },
4098 })
4099 }
4100 })
4101 .collect()
4102}
4103
4104fn match_append_only_secondary_index_read<'a>(
4105 filters: &'a [CompiledOperationalReadFilter],
4106 indexes: &'a [OperationalSecondaryIndexDefinition],
4107) -> Option<MatchedAppendOnlySecondaryIndexRead<'a>> {
4108 indexes.iter().find_map(|index| {
4109 let OperationalSecondaryIndexDefinition::AppendOnlyFieldTime {
4110 name,
4111 field,
4112 value_type,
4113 time_field,
4114 } = index
4115 else {
4116 return None;
4117 };
4118 if !(1..=2).contains(&filters.len()) {
4119 return None;
4120 }
4121
4122 let mut value_filter = None;
4123 let mut time_range = None;
4124 for filter in filters {
4125 if filter.field == *field {
4126 let supported = matches!(
4127 (&filter.condition, value_type),
4128 (
4129 OperationalReadCondition::ExactString(_)
4130 | OperationalReadCondition::Prefix(_),
4131 crate::operational::OperationalSecondaryIndexValueType::String
4132 ) | (
4133 OperationalReadCondition::ExactInteger(_),
4134 crate::operational::OperationalSecondaryIndexValueType::Integer
4135 | crate::operational::OperationalSecondaryIndexValueType::Timestamp
4136 )
4137 );
4138 if !supported || value_filter.is_some() {
4139 return None;
4140 }
4141 value_filter = Some(filter);
4142 continue;
4143 }
4144 if filter.field == *time_field {
4145 if !matches!(filter.condition, OperationalReadCondition::Range { .. })
4146 || time_range.is_some()
4147 {
4148 return None;
4149 }
4150 time_range = Some(filter);
4151 continue;
4152 }
4153 return None;
4154 }
4155
4156 value_filter.map(|value_filter| MatchedAppendOnlySecondaryIndexRead {
4157 index_name: name.as_str(),
4158 value_filter,
4159 time_range,
4160 })
4161 })
4162}
4163
4164fn execute_operational_secondary_index_read(
4165 conn: &rusqlite::Connection,
4166 collection_name: &str,
4167 filters: &[CompiledOperationalReadFilter],
4168 indexes: &[OperationalSecondaryIndexDefinition],
4169 applied_limit: usize,
4170) -> Result<Option<OperationalReadReport>, EngineError> {
4171 use rusqlite::types::Value;
4172
4173 let Some(matched) = match_append_only_secondary_index_read(filters, indexes) else {
4174 return Ok(None);
4175 };
4176
4177 let mut sql = String::from(
4178 "SELECT m.id, m.collection_name, m.record_key, m.op_kind, m.payload_json, m.source_ref, m.created_at \
4179 FROM operational_secondary_index_entries s \
4180 JOIN operational_mutations m ON m.id = s.mutation_id \
4181 WHERE s.collection_name = ?1 AND s.index_name = ?2 AND s.subject_kind = 'mutation' ",
4182 );
4183 let mut params = vec![
4184 Value::from(collection_name.to_owned()),
4185 Value::from(matched.index_name.to_owned()),
4186 ];
4187
4188 match &matched.value_filter.condition {
4189 OperationalReadCondition::ExactString(value) => {
4190 let _ = write!(sql, "AND s.slot1_text = ?{} ", params.len() + 1);
4191 params.push(Value::from(value.clone()));
4192 }
4193 OperationalReadCondition::Prefix(value) => {
4194 let _ = write!(sql, "AND s.slot1_text GLOB ?{} ", params.len() + 1);
4195 params.push(Value::from(glob_prefix_pattern(value)));
4196 }
4197 OperationalReadCondition::ExactInteger(value) => {
4198 let _ = write!(sql, "AND s.slot1_integer = ?{} ", params.len() + 1);
4199 params.push(Value::from(*value));
4200 }
4201 OperationalReadCondition::Range { .. } => return Ok(None),
4202 }
4203
4204 if let Some(time_range) = matched.time_range
4205 && let OperationalReadCondition::Range { lower, upper } = &time_range.condition
4206 {
4207 if let Some(lower) = lower {
4208 let _ = write!(sql, "AND s.sort_timestamp >= ?{} ", params.len() + 1);
4209 params.push(Value::from(*lower));
4210 }
4211 if let Some(upper) = upper {
4212 let _ = write!(sql, "AND s.sort_timestamp <= ?{} ", params.len() + 1);
4213 params.push(Value::from(*upper));
4214 }
4215 }
4216
4217 let _ = write!(
4218 sql,
4219 "ORDER BY s.sort_timestamp DESC, m.mutation_order DESC LIMIT ?{}",
4220 params.len() + 1
4221 );
4222 params.push(Value::from(i64::try_from(applied_limit + 1).map_err(
4223 |_| EngineError::Bridge("operational read limit overflow".to_owned()),
4224 )?));
4225
4226 let mut stmt = conn.prepare(&sql)?;
4227 let mut rows = stmt
4228 .query_map(
4229 rusqlite::params_from_iter(params),
4230 map_operational_mutation_row,
4231 )?
4232 .collect::<Result<Vec<_>, _>>()?;
4233 let was_limited = rows.len() > applied_limit;
4234 if was_limited {
4235 rows.truncate(applied_limit);
4236 }
4237
4238 Ok(Some(OperationalReadReport {
4239 collection_name: collection_name.to_owned(),
4240 row_count: rows.len(),
4241 applied_limit,
4242 was_limited,
4243 rows,
4244 }))
4245}
4246
4247fn execute_operational_filtered_read(
4248 conn: &rusqlite::Connection,
4249 collection_name: &str,
4250 filters: &[CompiledOperationalReadFilter],
4251 applied_limit: usize,
4252) -> Result<OperationalReadReport, EngineError> {
4253 use rusqlite::types::Value;
4254
4255 let mut sql = String::from(
4256 "SELECT m.id, m.collection_name, m.record_key, m.op_kind, m.payload_json, m.source_ref, m.created_at \
4257 FROM operational_mutations m ",
4258 );
4259 let mut params = vec![Value::from(collection_name.to_owned())];
4260 for (index, filter) in filters.iter().enumerate() {
4261 let _ = write!(
4262 sql,
4263 "JOIN operational_filter_values f{index} \
4264 ON f{index}.mutation_id = m.id \
4265 AND f{index}.collection_name = m.collection_name "
4266 );
4267 match &filter.condition {
4268 OperationalReadCondition::ExactString(value) => {
4269 let _ = write!(
4270 sql,
4271 "AND f{index}.field_name = ?{} AND f{index}.string_value = ?{} ",
4272 params.len() + 1,
4273 params.len() + 2
4274 );
4275 params.push(Value::from(filter.field.clone()));
4276 params.push(Value::from(value.clone()));
4277 }
4278 OperationalReadCondition::ExactInteger(value) => {
4279 let _ = write!(
4280 sql,
4281 "AND f{index}.field_name = ?{} AND f{index}.integer_value = ?{} ",
4282 params.len() + 1,
4283 params.len() + 2
4284 );
4285 params.push(Value::from(filter.field.clone()));
4286 params.push(Value::from(*value));
4287 }
4288 OperationalReadCondition::Prefix(value) => {
4289 let _ = write!(
4290 sql,
4291 "AND f{index}.field_name = ?{} AND f{index}.string_value GLOB ?{} ",
4292 params.len() + 1,
4293 params.len() + 2
4294 );
4295 params.push(Value::from(filter.field.clone()));
4296 params.push(Value::from(glob_prefix_pattern(value)));
4297 }
4298 OperationalReadCondition::Range { lower, upper } => {
4299 let _ = write!(sql, "AND f{index}.field_name = ?{} ", params.len() + 1);
4300 params.push(Value::from(filter.field.clone()));
4301 if let Some(lower) = lower {
4302 let _ = write!(sql, "AND f{index}.integer_value >= ?{} ", params.len() + 1);
4303 params.push(Value::from(*lower));
4304 }
4305 if let Some(upper) = upper {
4306 let _ = write!(sql, "AND f{index}.integer_value <= ?{} ", params.len() + 1);
4307 params.push(Value::from(*upper));
4308 }
4309 }
4310 }
4311 }
4312 let _ = write!(
4313 sql,
4314 "WHERE m.collection_name = ?1 ORDER BY m.mutation_order DESC LIMIT ?{}",
4315 params.len() + 1
4316 );
4317 params.push(Value::from(i64::try_from(applied_limit + 1).map_err(
4318 |_| EngineError::Bridge("operational read limit overflow".to_owned()),
4319 )?));
4320
4321 let mut stmt = conn.prepare(&sql)?;
4322 let mut rows = stmt
4323 .query_map(
4324 rusqlite::params_from_iter(params),
4325 map_operational_mutation_row,
4326 )?
4327 .collect::<Result<Vec<_>, _>>()?;
4328 let was_limited = rows.len() > applied_limit;
4329 if was_limited {
4330 rows.truncate(applied_limit);
4331 }
4332 Ok(OperationalReadReport {
4333 collection_name: collection_name.to_owned(),
4334 row_count: rows.len(),
4335 applied_limit,
4336 was_limited,
4337 rows,
4338 })
4339}
4340
/// Builds the pattern used with SQL `GLOB` to match values starting with `value`.
///
/// The characters `*`, `?` and `[` are each wrapped in a single-character
/// bracket class (`[*]`, `[?]`, `[[]`) so they are compared literally, and a
/// trailing `*` is appended to accept any suffix.
fn glob_prefix_pattern(value: &str) -> String {
    let mut escaped = String::with_capacity(value.len() + 1);
    for symbol in value.chars() {
        if matches!(symbol, '*' | '?' | '[') {
            escaped.push('[');
            escaped.push(symbol);
            escaped.push(']');
        } else {
            escaped.push(symbol);
        }
    }
    escaped.push('*');
    escaped
}
4354
/// One filterable value pulled out of a mutation payload by
/// `extract_operational_filter_values`.
///
/// Exactly one of `string_value` / `integer_value` is populated by that
/// function, depending on the declared type of the filter field.
#[derive(Clone, Debug, PartialEq, Eq)]
struct ExtractedOperationalFilterValue {
    /// Name of the declared filter field the value was read from.
    field_name: String,
    /// Populated for fields declared as strings; `None` otherwise.
    string_value: Option<String>,
    /// Populated for fields declared as integers or timestamps; `None` otherwise.
    integer_value: Option<i64>,
}
4361
4362fn extract_operational_filter_values(
4363 filter_fields: &[OperationalFilterField],
4364 payload_json: &str,
4365) -> Vec<ExtractedOperationalFilterValue> {
4366 let Ok(parsed) = serde_json::from_str::<serde_json::Value>(payload_json) else {
4367 return Vec::new();
4368 };
4369 let Some(object) = parsed.as_object() else {
4370 return Vec::new();
4371 };
4372
4373 filter_fields
4374 .iter()
4375 .filter_map(|field| {
4376 let value = object.get(&field.name)?;
4377 match field.field_type {
4378 OperationalFilterFieldType::String => {
4379 value
4380 .as_str()
4381 .map(|string_value| ExtractedOperationalFilterValue {
4382 field_name: field.name.clone(),
4383 string_value: Some(string_value.to_owned()),
4384 integer_value: None,
4385 })
4386 }
4387 OperationalFilterFieldType::Integer | OperationalFilterFieldType::Timestamp => {
4388 value
4389 .as_i64()
4390 .map(|integer_value| ExtractedOperationalFilterValue {
4391 field_name: field.name.clone(),
4392 string_value: None,
4393 integer_value: Some(integer_value),
4394 })
4395 }
4396 }
4397 })
4398 .collect()
4399}
4400
4401fn operational_compaction_candidates(
4402 conn: &rusqlite::Connection,
4403 retention_json: &str,
4404 collection_name: &str,
4405) -> Result<(Vec<String>, Option<i64>), EngineError> {
4406 operational_compaction_candidates_at(
4407 conn,
4408 retention_json,
4409 collection_name,
4410 current_unix_timestamp()?,
4411 )
4412}
4413
4414fn operational_compaction_candidates_at(
4415 conn: &rusqlite::Connection,
4416 retention_json: &str,
4417 collection_name: &str,
4418 now_timestamp: i64,
4419) -> Result<(Vec<String>, Option<i64>), EngineError> {
4420 let policy = parse_operational_retention_policy(retention_json)?;
4421 match policy {
4422 OperationalRetentionPolicy::KeepAll => Ok((Vec::new(), None)),
4423 OperationalRetentionPolicy::PurgeBeforeSeconds { max_age_seconds } => {
4424 let before_timestamp = now_timestamp - max_age_seconds;
4425 let mut stmt = conn.prepare(
4426 "SELECT id FROM operational_mutations \
4427 WHERE collection_name = ?1 AND created_at < ?2 \
4428 ORDER BY mutation_order",
4429 )?;
4430 let mutation_ids = stmt
4431 .query_map(
4432 rusqlite::params![collection_name, before_timestamp],
4433 |row| row.get::<_, String>(0),
4434 )?
4435 .collect::<Result<Vec<_>, _>>()?;
4436 Ok((mutation_ids, Some(before_timestamp)))
4437 }
4438 OperationalRetentionPolicy::KeepLast { max_rows } => {
4439 let mut stmt = conn.prepare(
4440 "SELECT id FROM operational_mutations \
4441 WHERE collection_name = ?1 \
4442 ORDER BY mutation_order DESC",
4443 )?;
4444 let ordered_ids = stmt
4445 .query_map([collection_name], |row| row.get::<_, String>(0))?
4446 .collect::<Result<Vec<_>, _>>()?;
4447 Ok((ordered_ids.into_iter().skip(max_rows).collect(), None))
4448 }
4449 }
4450}
4451
4452fn parse_operational_retention_policy(
4453 retention_json: &str,
4454) -> Result<OperationalRetentionPolicy, EngineError> {
4455 let policy: OperationalRetentionPolicy = serde_json::from_str(retention_json)
4456 .map_err(|error| EngineError::InvalidWrite(format!("invalid retention_json: {error}")))?;
4457 match policy {
4458 OperationalRetentionPolicy::KeepAll => Ok(policy),
4459 OperationalRetentionPolicy::PurgeBeforeSeconds { max_age_seconds } => {
4460 if max_age_seconds <= 0 {
4461 return Err(EngineError::InvalidWrite(
4462 "retention_json max_age_seconds must be greater than zero".to_owned(),
4463 ));
4464 }
4465 Ok(policy)
4466 }
4467 OperationalRetentionPolicy::KeepLast { max_rows } => {
4468 if max_rows == 0 {
4469 return Err(EngineError::InvalidWrite(
4470 "retention_json max_rows must be greater than zero".to_owned(),
4471 ));
4472 }
4473 Ok(policy)
4474 }
4475 }
4476}
4477
4478fn load_operational_retention_records(
4479 conn: &rusqlite::Connection,
4480 collection_names: Option<&[String]>,
4481 max_collections: Option<usize>,
4482) -> Result<Vec<OperationalCollectionRecord>, EngineError> {
4483 let limit = max_collections.unwrap_or(usize::MAX);
4484 if limit == 0 {
4485 return Err(EngineError::InvalidWrite(
4486 "max_collections must be greater than zero".to_owned(),
4487 ));
4488 }
4489
4490 let mut records = Vec::new();
4491 if let Some(collection_names) = collection_names {
4492 for name in collection_names.iter().take(limit) {
4493 let record = load_operational_collection_record(conn, name)?.ok_or_else(|| {
4494 EngineError::InvalidWrite(format!(
4495 "operational collection '{name}' is not registered"
4496 ))
4497 })?;
4498 records.push(record);
4499 }
4500 return Ok(records);
4501 }
4502
4503 let mut stmt = conn.prepare(
4504 "SELECT name, kind, schema_json, retention_json, filter_fields_json, validation_json, secondary_indexes_json, format_version, created_at, disabled_at \
4505 FROM operational_collections ORDER BY name",
4506 )?;
4507 let rows = stmt
4508 .query_map([], map_operational_collection_row)?
4509 .take(limit)
4510 .collect::<Result<Vec<_>, _>>()?;
4511 Ok(rows)
4512}
4513
4514fn last_operational_retention_run_at(
4515 conn: &rusqlite::Connection,
4516 collection_name: &str,
4517) -> Result<Option<i64>, EngineError> {
4518 conn.query_row(
4519 "SELECT MAX(executed_at) FROM operational_retention_runs WHERE collection_name = ?1",
4520 [collection_name],
4521 |row| row.get(0),
4522 )
4523 .optional()
4524 .map_err(EngineError::Sqlite)
4525 .map(Option::flatten)
4526}
4527
4528fn count_operational_mutations_for_collection(
4529 conn: &rusqlite::Connection,
4530 collection_name: &str,
4531) -> Result<usize, EngineError> {
4532 let count: i64 = conn.query_row(
4533 "SELECT count(*) FROM operational_mutations WHERE collection_name = ?1",
4534 [collection_name],
4535 |row| row.get(0),
4536 )?;
4537 usize::try_from(count).map_err(|_| {
4538 EngineError::Bridge(format!("count overflow for collection {collection_name}"))
4539 })
4540}
4541
4542fn retention_action_kind_and_limit(
4543 policy: &OperationalRetentionPolicy,
4544) -> (OperationalRetentionActionKind, Option<usize>) {
4545 match policy {
4546 OperationalRetentionPolicy::KeepAll => (OperationalRetentionActionKind::Noop, None),
4547 OperationalRetentionPolicy::PurgeBeforeSeconds { .. } => {
4548 (OperationalRetentionActionKind::PurgeBeforeSeconds, None)
4549 }
4550 OperationalRetentionPolicy::KeepLast { max_rows } => {
4551 (OperationalRetentionActionKind::KeepLast, Some(*max_rows))
4552 }
4553 }
4554}
4555
4556fn plan_operational_retention_item(
4557 conn: &rusqlite::Connection,
4558 record: &OperationalCollectionRecord,
4559 now_timestamp: i64,
4560) -> Result<OperationalRetentionPlanItem, EngineError> {
4561 let last_run_at = last_operational_retention_run_at(conn, &record.name)?;
4562 if record.kind != OperationalCollectionKind::AppendOnlyLog {
4563 return Ok(OperationalRetentionPlanItem {
4564 collection_name: record.name.clone(),
4565 action_kind: OperationalRetentionActionKind::Noop,
4566 candidate_deletions: 0,
4567 before_timestamp: None,
4568 max_rows: None,
4569 last_run_at,
4570 });
4571 }
4572 let policy = parse_operational_retention_policy(&record.retention_json)?;
4573 let (action_kind, max_rows) = retention_action_kind_and_limit(&policy);
4574 let (candidate_ids, before_timestamp) = operational_compaction_candidates_at(
4575 conn,
4576 &record.retention_json,
4577 &record.name,
4578 now_timestamp,
4579 )?;
4580 Ok(OperationalRetentionPlanItem {
4581 collection_name: record.name.clone(),
4582 action_kind,
4583 candidate_deletions: candidate_ids.len(),
4584 before_timestamp,
4585 max_rows,
4586 last_run_at,
4587 })
4588}
4589
/// Applies (or, with `dry_run`, simulates) retention for one collection inside
/// the caller's transaction.
///
/// The plan is computed first. Deletions happen only for append-only logs with
/// a non-`Noop` action, at least one candidate, and `dry_run == false`; in that
/// case a provenance event of type `operational_retention_run` is persisted as
/// well. A row is written to `operational_retention_runs` for every non-dry-run
/// whose action is not `Noop`, even when nothing was deleted.
///
/// In the returned item, a dry run reports the planned candidate count as
/// `deleted_mutations` and subtracts it from the live row count; a real run
/// reports the deletions performed and the live row count afterwards.
///
/// # Errors
///
/// Propagates errors from planning, the candidate query, the deletes, the
/// provenance write, the row count and the run-log insert.
fn run_operational_retention_item(
    tx: &rusqlite::Transaction<'_>,
    record: &OperationalCollectionRecord,
    now_timestamp: i64,
    dry_run: bool,
) -> Result<OperationalRetentionRunItem, EngineError> {
    let plan = plan_operational_retention_item(tx, record, now_timestamp)?;
    let mut deleted_mutations = 0usize;
    if record.kind == OperationalCollectionKind::AppendOnlyLog
        && plan.action_kind != OperationalRetentionActionKind::Noop
        && plan.candidate_deletions > 0
        && !dry_run
    {
        // The plan only carries a count, so the candidate ids are queried again here.
        let (candidate_ids, _) = operational_compaction_candidates_at(
            tx,
            &record.retention_json,
            &record.name,
            now_timestamp,
        )?;
        let mut delete_stmt =
            tx.prepare_cached("DELETE FROM operational_mutations WHERE id = ?1")?;
        for mutation_id in &candidate_ids {
            delete_stmt.execute([mutation_id.as_str()])?;
            // Counts executed DELETE statements; the affected-row count that
            // `execute` returns is not consulted.
            deleted_mutations += 1;
        }
        drop(delete_stmt);

        persist_simple_provenance_event(
            tx,
            "operational_retention_run",
            &record.name,
            Some(serde_json::json!({
                "action_kind": plan.action_kind,
                "deleted_mutations": deleted_mutations,
                "before_timestamp": plan.before_timestamp,
                "max_rows": plan.max_rows,
                "executed_at": now_timestamp,
            })),
        )?;
    }

    // Counted after any deletes, so for a real run this is already the final figure.
    let live_rows_remaining = count_operational_mutations_for_collection(tx, &record.name)?;
    let effective_deleted_mutations = if dry_run {
        plan.candidate_deletions
    } else {
        deleted_mutations
    };
    // A dry run deleted nothing, so project the remaining rows from the plan.
    let rows_remaining = if dry_run {
        live_rows_remaining.saturating_sub(effective_deleted_mutations)
    } else {
        live_rows_remaining
    };
    if !dry_run && plan.action_kind != OperationalRetentionActionKind::Noop {
        // `action_kind` is stored as its serialized JSON text with the
        // surrounding quotes trimmed; `dry_run` is always false in this branch.
        tx.execute(
            "INSERT INTO operational_retention_runs \
             (id, collection_name, executed_at, action_kind, dry_run, deleted_mutations, rows_remaining, metadata_json) \
             VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8)",
            rusqlite::params![
                new_id(),
                record.name,
                now_timestamp,
                serde_json::to_string(&plan.action_kind)
                    .unwrap_or_else(|_| "\"noop\"".to_owned())
                    .trim_matches('"')
                    .to_owned(),
                i32::from(dry_run),
                deleted_mutations,
                rows_remaining,
                serde_json::json!({
                    "before_timestamp": plan.before_timestamp,
                    "max_rows": plan.max_rows,
                })
                .to_string(),
            ],
        )?;
    }

    Ok(OperationalRetentionRunItem {
        collection_name: plan.collection_name,
        action_kind: plan.action_kind,
        deleted_mutations: effective_deleted_mutations,
        before_timestamp: plan.before_timestamp,
        max_rows: plan.max_rows,
        rows_remaining,
    })
}
4676
4677fn current_unix_timestamp() -> Result<i64, EngineError> {
4678 let now = SystemTime::now()
4679 .duration_since(SystemTime::UNIX_EPOCH)
4680 .map_err(|error| EngineError::Bridge(format!("system clock error: {error}")))?;
4681 i64::try_from(now.as_secs())
4682 .map_err(|_| EngineError::Bridge("unix timestamp overflow".to_owned()))
4683}
4684
4685fn map_operational_collection_row(
4686 row: &rusqlite::Row<'_>,
4687) -> Result<OperationalCollectionRecord, rusqlite::Error> {
4688 let kind_text: String = row.get(1)?;
4689 let kind = OperationalCollectionKind::try_from(kind_text.as_str()).map_err(|message| {
4690 rusqlite::Error::FromSqlConversionFailure(
4691 1,
4692 rusqlite::types::Type::Text,
4693 Box::new(io::Error::new(io::ErrorKind::InvalidData, message)),
4694 )
4695 })?;
4696 Ok(OperationalCollectionRecord {
4697 name: row.get(0)?,
4698 kind,
4699 schema_json: row.get(2)?,
4700 retention_json: row.get(3)?,
4701 filter_fields_json: row.get(4)?,
4702 validation_json: row.get(5)?,
4703 secondary_indexes_json: row.get(6)?,
4704 format_version: row.get(7)?,
4705 created_at: row.get(8)?,
4706 disabled_at: row.get(9)?,
4707 })
4708}
4709
4710fn map_operational_mutation_row(
4711 row: &rusqlite::Row<'_>,
4712) -> Result<OperationalMutationRow, rusqlite::Error> {
4713 Ok(OperationalMutationRow {
4714 id: row.get(0)?,
4715 collection_name: row.get(1)?,
4716 record_key: row.get(2)?,
4717 op_kind: row.get(3)?,
4718 payload_json: row.get(4)?,
4719 source_ref: row.get(5)?,
4720 created_at: row.get(6)?,
4721 })
4722}
4723
4724fn map_operational_current_row(
4725 row: &rusqlite::Row<'_>,
4726) -> Result<OperationalCurrentRow, rusqlite::Error> {
4727 Ok(OperationalCurrentRow {
4728 collection_name: row.get(0)?,
4729 record_key: row.get(1)?,
4730 payload_json: row.get(2)?,
4731 updated_at: row.get(3)?,
4732 last_mutation_id: row.get(4)?,
4733 })
4734}
4735
4736#[cfg(test)]
4737#[allow(clippy::expect_used)]
4738mod tests {
4739 use std::fs;
4740 use std::sync::Arc;
4741
4742 use fathomdb_schema::SchemaManager;
4743 use tempfile::NamedTempFile;
4744
4745 use super::{AdminService, SafeExportOptions, VectorRegenerationConfig};
4746 use crate::projection::ProjectionTarget;
4747 use crate::sqlite;
4748 use crate::{
4749 EngineError, ExecutionCoordinator, OperationalCollectionKind, OperationalRegisterRequest,
4750 TelemetryCounters,
4751 };
4752
4753 use fathomdb_query::QueryBuilder;
4754
4755 #[cfg(feature = "sqlite-vec")]
4756 use super::{VectorGeneratorPolicy, load_vector_regeneration_config};
4757
4758 #[allow(dead_code)]
4759 #[cfg(unix)]
4760 fn set_file_mode(path: &std::path::Path, mode: u32) {
4761 use std::os::unix::fs::PermissionsExt;
4762
4763 let mut permissions = fs::metadata(path).expect("script metadata").permissions();
4764 permissions.set_mode(mode);
4765 fs::set_permissions(path, permissions).expect("chmod");
4766 }
4767
    /// No-op counterpart of the Unix `set_file_mode` helper, compiled on
    /// non-Unix targets so callers build on every platform.
    #[allow(dead_code)]
    #[cfg(not(unix))]
    fn set_file_mode(_path: &std::path::Path, _mode: u32) {}
4771
4772 fn setup() -> (NamedTempFile, AdminService) {
4773 let db = NamedTempFile::new().expect("temp file");
4774 let schema = Arc::new(SchemaManager::new());
4775 {
4776 let conn = sqlite::open_connection(db.path()).expect("connection");
4777 schema.bootstrap(&conn).expect("bootstrap");
4778 }
4779 let service = AdminService::new(db.path(), Arc::clone(&schema));
4780 (db, service)
4781 }
4782
    /// On a freshly bootstrapped database the integrity report counts no
    /// duplicate active logical ids and no operational inconsistencies.
    #[test]
    fn check_integrity_includes_active_uniqueness_count() {
        let (_db, service) = setup();
        let report = service.check_integrity().expect("integrity check");
        assert_eq!(report.duplicate_active_logical_ids, 0);
        assert_eq!(report.operational_missing_collections, 0);
        assert_eq!(report.operational_missing_last_mutations, 0);
    }
4791
    /// Tracing a source reports the node row written with that `source_ref`
    /// along with its logical id.
    #[test]
    fn trace_source_returns_node_logical_ids() {
        let (db, service) = setup();
        {
            let conn = sqlite::open_connection(db.path()).expect("conn");
            conn.execute(
                "INSERT INTO nodes (row_id, logical_id, kind, properties, created_at, source_ref) \
                 VALUES ('r1', 'lg1', 'Meeting', '{}', 100, 'source-1')",
                [],
            )
            .expect("insert node");
        }
        let report = service.trace_source("source-1").expect("trace");
        assert_eq!(report.node_rows, 1);
        assert_eq!(report.node_logical_ids, vec!["lg1"]);
    }
4808
    /// Tracing a source also reports operational mutations whose `source_ref`
    /// matches, including their ids.
    #[test]
    fn trace_source_includes_operational_mutations() {
        let (db, service) = setup();
        {
            let conn = sqlite::open_connection(db.path()).expect("conn");
            conn.execute(
                "INSERT INTO operational_collections \
                 (name, kind, schema_json, retention_json, format_version, created_at) \
                 VALUES ('connector_health', 'latest_state', '{}', '{}', 1, 100)",
                [],
            )
            .expect("insert collection");
            conn.execute(
                "INSERT INTO operational_mutations \
                 (id, collection_name, record_key, op_kind, payload_json, source_ref, created_at, mutation_order) \
                 VALUES ('m1', 'connector_health', 'gmail', 'put', '{\"status\":\"ok\"}', 'source-1', 100, 1)",
                [],
            )
            .expect("insert mutation");
        }

        let report = service.trace_source("source-1").expect("trace");
        assert_eq!(report.operational_mutation_rows, 1);
        assert_eq!(report.operational_mutation_ids, vec!["m1"]);
    }
4834
    /// Excising the source of the active node version leaves the previously
    /// superseded version (`r1`) as the only active row for the logical id.
    #[test]
    fn excise_source_restores_prior_active_node() {
        let (db, service) = setup();
        {
            let conn = sqlite::open_connection(db.path()).expect("conn");
            conn.execute(
                "INSERT INTO nodes (row_id, logical_id, kind, properties, created_at, superseded_at, source_ref) \
                 VALUES ('r1', 'lg1', 'Meeting', '{}', 100, 200, 'source-1')",
                [],
            )
            .expect("insert v1 superseded");
            conn.execute(
                "INSERT INTO nodes (row_id, logical_id, kind, properties, created_at, source_ref) \
                 VALUES ('r2', 'lg1', 'Meeting', '{}', 200, 'source-2')",
                [],
            )
            .expect("insert v2 active");
        }
        service.excise_source("source-2").expect("excise");
        {
            let conn = sqlite::open_connection(db.path()).expect("conn");
            let active_row_id: String = conn
                .query_row(
                    "SELECT row_id FROM nodes WHERE logical_id = 'lg1' AND superseded_at IS NULL",
                    [],
                    |row| row.get(0),
                )
                .expect("active row exists after excise");
            assert_eq!(active_row_id, "r1");
        }
    }
4866
    /// Excising a source removes its operational mutations and rebuilds the
    /// `operational_current` row from the remaining older mutation (`m1`).
    #[test]
    fn excise_source_deletes_operational_mutations_and_repairs_latest_state_current() {
        let (db, service) = setup();
        {
            let conn = sqlite::open_connection(db.path()).expect("conn");
            conn.execute(
                "INSERT INTO operational_collections \
                 (name, kind, schema_json, retention_json, format_version, created_at) \
                 VALUES ('connector_health', 'latest_state', '{}', '{}', 1, 100)",
                [],
            )
            .expect("insert collection");
            conn.execute(
                "INSERT INTO operational_mutations \
                 (id, collection_name, record_key, op_kind, payload_json, source_ref, created_at, mutation_order) \
                 VALUES ('m1', 'connector_health', 'gmail', 'put', '{\"status\":\"old\"}', 'source-1', 100, 1)",
                [],
            )
            .expect("insert prior mutation");
            conn.execute(
                "INSERT INTO operational_mutations \
                 (id, collection_name, record_key, op_kind, payload_json, source_ref, created_at, mutation_order) \
                 VALUES ('m2', 'connector_health', 'gmail', 'put', '{\"status\":\"new\"}', 'source-2', 200, 2)",
                [],
            )
            .expect("insert excised mutation");
            conn.execute(
                "INSERT INTO operational_current \
                 (collection_name, record_key, payload_json, updated_at, last_mutation_id) \
                 VALUES ('connector_health', 'gmail', '{\"status\":\"new\"}', 200, 'm2')",
                [],
            )
            .expect("insert current row");
        }

        // Before excision the source is still traceable to mutation m2.
        let traced = service
            .trace_source("source-2")
            .expect("trace before excise");
        assert_eq!(traced.operational_mutation_rows, 1);
        assert_eq!(traced.operational_mutation_ids, vec!["m2"]);

        let excised = service.excise_source("source-2").expect("excise");
        assert_eq!(excised.operational_mutation_rows, 0);
        assert!(excised.operational_mutation_ids.is_empty());

        {
            let conn = sqlite::open_connection(db.path()).expect("conn");
            let remaining: i64 = conn
                .query_row(
                    "SELECT count(*) FROM operational_mutations WHERE source_ref = 'source-2'",
                    [],
                    |row| row.get(0),
                )
                .expect("remaining count");
            assert_eq!(remaining, 0);

            let current: (String, String) = conn
                .query_row(
                    "SELECT payload_json, last_mutation_id FROM operational_current \
                     WHERE collection_name = 'connector_health' AND record_key = 'gmail'",
                    [],
                    |row| Ok((row.get(0)?, row.get(1)?)),
                )
                .expect("rebuilt current row");
            assert_eq!(current.0, "{\"status\":\"old\"}");
            assert_eq!(current.1, "m1");
        }
    }
4935
    /// Restoring a retired logical id reactivates its node row and the edge
    /// retired at the same timestamp, and repopulates the cleared FTS row for
    /// its chunk; the report counts one restored row of each kind.
    #[test]
    fn restore_logical_id_reestablishes_last_pre_retire_content_and_attached_edges() {
        let (db, service) = setup();
        {
            let conn = sqlite::open_connection(db.path()).expect("conn");
            conn.execute(
                "INSERT INTO nodes (row_id, logical_id, kind, properties, created_at, source_ref) \
                 VALUES ('node-row-1', 'doc-1', 'Document', '{\"title\":\"Budget\"}', 100, 'seed')",
                [],
            )
            .expect("insert node");
            conn.execute(
                "INSERT INTO nodes (row_id, logical_id, kind, properties, created_at, source_ref) \
                 VALUES ('node-row-topic', 'topic-1', 'Topic', '{}', 100, 'seed')",
                [],
            )
            .expect("insert target node");
            conn.execute(
                "INSERT INTO chunks (id, node_logical_id, text_content, created_at) \
                 VALUES ('chunk-1', 'doc-1', 'budget narrative', 100)",
                [],
            )
            .expect("insert chunk");
            conn.execute(
                "INSERT INTO edges \
                 (row_id, logical_id, source_logical_id, target_logical_id, kind, properties, created_at, source_ref) \
                 VALUES ('edge-row-1', 'edge-1', 'doc-1', 'topic-1', 'TAGGED', '{}', 100, 'seed')",
                [],
            )
            .expect("insert edge");
            conn.execute(
                "INSERT INTO provenance_events (id, event_type, subject, source_ref, created_at, metadata_json) \
                 VALUES ('evt-node-retire', 'node_retire', 'doc-1', 'forget-1', 200, '')",
                [],
            )
            .expect("insert node retire event");
            conn.execute(
                "INSERT INTO provenance_events (id, event_type, subject, source_ref, created_at, metadata_json) \
                 VALUES ('evt-edge-retire', 'edge_retire', 'edge-1', 'forget-1', 200, '')",
                [],
            )
            .expect("insert edge retire event");
            conn.execute(
                "UPDATE nodes SET superseded_at = 200 WHERE logical_id = 'doc-1'",
                [],
            )
            .expect("retire node");
            conn.execute(
                "UPDATE edges SET superseded_at = 200 WHERE logical_id = 'edge-1'",
                [],
            )
            .expect("retire edge");
            // Start from an empty FTS table so the restored FTS row is observable.
            conn.execute("DELETE FROM fts_nodes", [])
                .expect("clear fts");
        }

        let report = service.restore_logical_id("doc-1").expect("restore");
        assert_eq!(report.logical_id, "doc-1");
        assert!(!report.was_noop);
        assert_eq!(report.restored_node_rows, 1);
        assert_eq!(report.restored_edge_rows, 1);
        assert_eq!(report.restored_chunk_rows, 1);
        assert_eq!(report.restored_fts_rows, 1);

        let conn = sqlite::open_connection(db.path()).expect("conn");
        let active_node_count: i64 = conn
            .query_row(
                "SELECT count(*) FROM nodes WHERE logical_id = 'doc-1' AND superseded_at IS NULL",
                [],
                |row| row.get(0),
            )
            .expect("active node count");
        assert_eq!(active_node_count, 1);
        let active_edge_count: i64 = conn
            .query_row(
                "SELECT count(*) FROM edges WHERE logical_id = 'edge-1' AND superseded_at IS NULL",
                [],
                |row| row.get(0),
            )
            .expect("active edge count");
        assert_eq!(active_edge_count, 1);
        let fts_count: i64 = conn
            .query_row(
                "SELECT count(*) FROM fts_nodes WHERE chunk_id = 'chunk-1'",
                [],
                |row| row.get(0),
            )
            .expect("fts count");
        assert_eq!(fts_count, 1);
    }
5026
    /// An edge retired one tick after the node's retire event (201 vs 200) is
    /// still restored together with the node.
    #[test]
    fn restore_logical_id_restores_edges_retired_after_the_node_retire_event() {
        let (db, service) = setup();
        {
            let conn = sqlite::open_connection(db.path()).expect("conn");
            conn.execute(
                "INSERT INTO nodes (row_id, logical_id, kind, properties, created_at, source_ref) \
                 VALUES ('node-row-1', 'doc-1', 'Document', '{\"title\":\"Budget\"}', 100, 'seed')",
                [],
            )
            .expect("insert node");
            conn.execute(
                "INSERT INTO nodes (row_id, logical_id, kind, properties, created_at, source_ref) \
                 VALUES ('node-row-topic', 'topic-1', 'Topic', '{}', 100, 'seed')",
                [],
            )
            .expect("insert target node");
            conn.execute(
                "INSERT INTO edges \
                 (row_id, logical_id, source_logical_id, target_logical_id, kind, properties, created_at, source_ref) \
                 VALUES ('edge-row-1', 'edge-1', 'doc-1', 'topic-1', 'TAGGED', '{}', 100, 'seed')",
                [],
            )
            .expect("insert edge");
            conn.execute(
                "INSERT INTO provenance_events (id, event_type, subject, source_ref, created_at, metadata_json) \
                 VALUES ('evt-node-retire', 'node_retire', 'doc-1', 'forget-1', 200, '')",
                [],
            )
            .expect("insert node retire event");
            conn.execute(
                "INSERT INTO provenance_events (id, event_type, subject, source_ref, created_at, metadata_json) \
                 VALUES ('evt-edge-retire', 'edge_retire', 'edge-1', 'forget-1', 201, '')",
                [],
            )
            .expect("insert edge retire event");
            conn.execute(
                "UPDATE nodes SET superseded_at = 200 WHERE logical_id = 'doc-1'",
                [],
            )
            .expect("retire node");
            conn.execute(
                "UPDATE edges SET superseded_at = 201 WHERE logical_id = 'edge-1'",
                [],
            )
            .expect("retire edge");
        }

        let report = service.restore_logical_id("doc-1").expect("restore");
        assert_eq!(report.restored_edge_rows, 1);

        let conn = sqlite::open_connection(db.path()).expect("conn");
        let active_edge_count: i64 = conn
            .query_row(
                "SELECT count(*) FROM edges WHERE logical_id = 'edge-1' AND superseded_at IS NULL",
                [],
                |row| row.get(0),
            )
            .expect("active edge count");
        assert_eq!(active_edge_count, 1);
    }
5088
    /// When two retired revisions share identical `created_at` and
    /// `superseded_at` values, restore reactivates the later-inserted row
    /// (`node-row-newer`).
    #[test]
    fn restore_logical_id_prefers_latest_retired_revision_when_timestamps_tie() {
        let (db, service) = setup();
        {
            let conn = sqlite::open_connection(db.path()).expect("conn");
            conn.execute(
                "INSERT INTO nodes \
                 (row_id, logical_id, kind, properties, created_at, superseded_at, source_ref) \
                 VALUES ('node-row-older', 'doc-1', 'Document', '{\"title\":\"older\"}', 100, 200, 'forget-1')",
                [],
            )
            .expect("insert older retired node");
            conn.execute(
                "INSERT INTO nodes \
                 (row_id, logical_id, kind, properties, created_at, superseded_at, source_ref) \
                 VALUES ('node-row-newer', 'doc-1', 'Document', '{\"title\":\"newer\"}', 100, 200, 'forget-1')",
                [],
            )
            .expect("insert newer retired node");
            conn.execute(
                "INSERT INTO provenance_events (id, event_type, subject, source_ref, created_at, metadata_json) \
                 VALUES ('evt-retire-older', 'node_retire', 'doc-1', 'forget-1', 200, '')",
                [],
            )
            .expect("insert older retire event");
            conn.execute(
                "INSERT INTO provenance_events (id, event_type, subject, source_ref, created_at, metadata_json) \
                 VALUES ('evt-retire-newer', 'node_retire', 'doc-1', 'forget-1', 200, '')",
                [],
            )
            .expect("insert newer retire event");
        }

        let report = service.restore_logical_id("doc-1").expect("restore");

        assert!(!report.was_noop);
        let conn = sqlite::open_connection(db.path()).expect("conn");
        let active_row: (String, String) = conn
            .query_row(
                "SELECT row_id, properties FROM nodes \
                 WHERE logical_id = 'doc-1' AND superseded_at IS NULL",
                [],
                |row| Ok((row.get(0)?, row.get(1)?)),
            )
            .expect("restored active row");
        assert_eq!(active_row.0, "node-row-newer");
        assert_eq!(active_row.1, "{\"title\":\"newer\"}");
    }
5137
    /// Purging a retired logical id deletes its node, edge, chunk and FTS rows
    /// and records exactly one `purge_logical_id` provenance event.
    #[test]
    fn purge_logical_id_removes_retired_content_and_records_tombstone() {
        let (db, service) = setup();
        {
            let conn = sqlite::open_connection(db.path()).expect("conn");
            conn.execute(
                "INSERT INTO nodes (row_id, logical_id, kind, properties, created_at, superseded_at, source_ref) \
                 VALUES ('node-row-1', 'doc-1', 'Document', '{\"title\":\"Budget\"}', 100, 200, 'seed')",
                [],
            )
            .expect("insert retired node");
            conn.execute(
                "INSERT INTO chunks (id, node_logical_id, text_content, created_at) \
                 VALUES ('chunk-1', 'doc-1', 'budget narrative', 100)",
                [],
            )
            .expect("insert chunk");
            conn.execute(
                "INSERT INTO edges \
                 (row_id, logical_id, source_logical_id, target_logical_id, kind, properties, created_at, superseded_at, source_ref) \
                 VALUES ('edge-row-1', 'edge-1', 'doc-1', 'topic-1', 'TAGGED', '{}', 100, 200, 'seed')",
                [],
            )
            .expect("insert retired edge");
            conn.execute(
                "INSERT INTO fts_nodes (chunk_id, node_logical_id, kind, text_content) \
                 VALUES ('chunk-1', 'doc-1', 'Document', 'budget narrative')",
                [],
            )
            .expect("insert fts");
        }

        let report = service.purge_logical_id("doc-1").expect("purge");
        assert_eq!(report.logical_id, "doc-1");
        assert!(!report.was_noop);
        assert_eq!(report.deleted_node_rows, 1);
        assert_eq!(report.deleted_edge_rows, 1);
        assert_eq!(report.deleted_chunk_rows, 1);
        assert_eq!(report.deleted_fts_rows, 1);

        let conn = sqlite::open_connection(db.path()).expect("conn");
        let remaining_nodes: i64 = conn
            .query_row(
                "SELECT count(*) FROM nodes WHERE logical_id = 'doc-1'",
                [],
                |row| row.get(0),
            )
            .expect("remaining nodes");
        assert_eq!(remaining_nodes, 0);
        let remaining_edges: i64 = conn
            .query_row(
                "SELECT count(*) FROM edges WHERE logical_id = 'edge-1'",
                [],
                |row| row.get(0),
            )
            .expect("remaining edges");
        assert_eq!(remaining_edges, 0);
        let remaining_chunks: i64 = conn
            .query_row(
                "SELECT count(*) FROM chunks WHERE id = 'chunk-1'",
                [],
                |row| row.get(0),
            )
            .expect("remaining chunks");
        assert_eq!(remaining_chunks, 0);
        let purge_events: i64 = conn
            .query_row(
                "SELECT count(*) FROM provenance_events WHERE event_type = 'purge_logical_id' AND subject = 'doc-1'",
                [],
                |row| row.get(0),
            )
            .expect("purge events");
        assert_eq!(purge_events, 1);
    }
5212
    /// A chunk whose node exists only as a retired (superseded) row is not
    /// counted as orphaned.
    #[test]
    fn check_semantics_accepts_preserved_retired_chunks() {
        let (db, service) = setup();
        {
            let conn = sqlite::open_connection(db.path()).expect("conn");
            conn.execute(
                "INSERT INTO nodes (row_id, logical_id, kind, properties, created_at, superseded_at, source_ref) \
                 VALUES ('node-row-1', 'doc-1', 'Document', '{}', 100, 200, 'seed')",
                [],
            )
            .expect("insert retired node");
            conn.execute(
                "INSERT INTO chunks (id, node_logical_id, text_content, created_at) \
                 VALUES ('chunk-1', 'doc-1', 'budget narrative', 100)",
                [],
            )
            .expect("insert chunk");
        }

        let report = service.check_semantics().expect("semantics");
        assert_eq!(report.orphaned_chunks, 0);
    }
5235
5236 #[test]
5237 fn check_semantics_detects_missing_retired_node_history_for_preserved_chunks() {
5238 let (db, service) = setup();
5239 {
5240 let conn = sqlite::open_connection(db.path()).expect("conn");
5241 conn.execute(
5242 "INSERT INTO chunks (id, node_logical_id, text_content, created_at) \
5243 VALUES ('chunk-1', 'ghost-doc', 'budget narrative', 100)",
5244 [],
5245 )
5246 .expect("insert orphaned chunk");
5247 }
5248
5249 let report = service.check_semantics().expect("semantics");
5250 assert_eq!(report.orphaned_chunks, 1);
5251 }
5252
5253 #[cfg(feature = "sqlite-vec")]
5254 #[test]
5255 fn check_semantics_detects_missing_retired_node_history_for_preserved_vec_rows() {
5256 let (db, service) = setup();
5257 {
5258 let conn = crate::sqlite::open_connection_with_vec(db.path()).expect("vec conn");
5259 service
5260 .schema_manager
5261 .ensure_vector_profile(&conn, "default", "vec_nodes_active", 4)
5262 .expect("ensure vec profile");
5263 conn.execute(
5264 "INSERT INTO chunks (id, node_logical_id, text_content, created_at) \
5265 VALUES ('chunk-1', 'ghost-doc', 'budget narrative', 100)",
5266 [],
5267 )
5268 .expect("insert orphaned chunk");
5269 conn.execute(
5270 "INSERT INTO vec_nodes_active (chunk_id, embedding) VALUES ('chunk-1', zeroblob(16))",
5271 [],
5272 )
5273 .expect("insert vec row");
5274 }
5275
5276 let report = service.check_semantics().expect("semantics");
5277 assert_eq!(report.orphaned_chunks, 1);
5278 assert_eq!(report.vec_rows_for_superseded_nodes, 1);
5279 }
5280
5281 #[cfg(feature = "sqlite-vec")]
5282 #[test]
5283 fn restore_logical_id_reestablishes_vector_search_without_reingest() {
5284 let (db, service) = setup();
5285 {
5286 let conn = crate::sqlite::open_connection_with_vec(db.path()).expect("vec conn");
5287 service
5288 .schema_manager
5289 .ensure_vector_profile(&conn, "default", "vec_nodes_active", 4)
5290 .expect("ensure vec profile");
5291 conn.execute(
5292 "INSERT INTO nodes (row_id, logical_id, kind, properties, created_at, superseded_at, source_ref) \
5293 VALUES ('node-row-1', 'doc-1', 'Document', '{\"title\":\"Budget\"}', 100, 200, 'seed')",
5294 [],
5295 )
5296 .expect("insert retired node");
5297 conn.execute(
5298 "INSERT INTO chunks (id, node_logical_id, text_content, created_at) \
5299 VALUES ('chunk-1', 'doc-1', 'budget narrative', 100)",
5300 [],
5301 )
5302 .expect("insert chunk");
5303 conn.execute(
5304 "INSERT INTO vec_nodes_active (chunk_id, embedding) VALUES ('chunk-1', zeroblob(16))",
5305 [],
5306 )
5307 .expect("insert vec row");
5308 conn.execute(
5309 "INSERT INTO provenance_events (id, event_type, subject, source_ref, created_at, metadata_json) \
5310 VALUES ('evt-node-retire', 'node_retire', 'doc-1', 'forget-1', 200, '')",
5311 [],
5312 )
5313 .expect("insert retire event");
5314 }
5315
5316 let report = service.restore_logical_id("doc-1").expect("restore");
5317 assert_eq!(report.restored_vec_rows, 1);
5318
5319 let coordinator = ExecutionCoordinator::open(
5320 db.path(),
5321 Arc::new(SchemaManager::new()),
5322 Some(4),
5323 1,
5324 Arc::new(TelemetryCounters::default()),
5325 )
5326 .expect("coordinator");
5327 let compiled = QueryBuilder::nodes("Document")
5328 .vector_search("[0.0, 0.0, 0.0, 0.0]", 5)
5329 .compile()
5330 .expect("compile");
5331 let rows = coordinator
5332 .execute_compiled_read(&compiled)
5333 .expect("vector read");
5334 assert!(
5335 rows.nodes.iter().any(|row| row.logical_id == "doc-1"),
5336 "restore should make the preserved vec row visible again without re-ingest"
5337 );
5338 }
5339
5340 #[cfg(feature = "sqlite-vec")]
5341 #[test]
5342 fn purge_logical_id_deletes_vec_rows_for_retired_content() {
5343 let (db, service) = setup();
5344 {
5345 let conn = crate::sqlite::open_connection_with_vec(db.path()).expect("vec conn");
5346 service
5347 .schema_manager
5348 .ensure_vector_profile(&conn, "default", "vec_nodes_active", 4)
5349 .expect("ensure vec profile");
5350 conn.execute(
5351 "INSERT INTO nodes (row_id, logical_id, kind, properties, created_at, superseded_at, source_ref) \
5352 VALUES ('node-row-1', 'doc-1', 'Document', '{\"title\":\"Budget\"}', 100, 200, 'seed')",
5353 [],
5354 )
5355 .expect("insert retired node");
5356 conn.execute(
5357 "INSERT INTO chunks (id, node_logical_id, text_content, created_at) \
5358 VALUES ('chunk-1', 'doc-1', 'budget narrative', 100)",
5359 [],
5360 )
5361 .expect("insert chunk");
5362 conn.execute(
5363 "INSERT INTO vec_nodes_active (chunk_id, embedding) VALUES ('chunk-1', zeroblob(16))",
5364 [],
5365 )
5366 .expect("insert vec row");
5367 }
5368
5369 let report = service.purge_logical_id("doc-1").expect("purge");
5370 assert_eq!(report.deleted_vec_rows, 1);
5371
5372 let conn = crate::sqlite::open_connection_with_vec(db.path()).expect("vec conn");
5373 let vec_count: i64 = conn
5374 .query_row("SELECT count(*) FROM vec_nodes_active", [], |row| {
5375 row.get(0)
5376 })
5377 .expect("vec count");
5378 assert_eq!(vec_count, 0);
5379 }
5380
5381 #[cfg(feature = "sqlite-vec")]
5382 #[test]
5383 fn restore_logical_id_restores_visibility_of_regenerated_vectors() {
5384 let (db, service) = setup();
5385 let temp_dir = tempfile::tempdir().expect("temp dir");
5386 let script_path = temp_dir.path().join("vector-generator-restore.sh");
5387 fs::write(
5388 &script_path,
5389 r#"#!/usr/bin/env bash
5390set -euo pipefail
5391python3 -c 'import json, sys
5392payload = json.load(sys.stdin)
5393json.dump({"embeddings": [{"chunk_id": payload["chunks"][0]["chunk_id"], "embedding": [0.0, 0.0, 0.0, 0.0]}]}, sys.stdout)'
5394"#,
5395 )
5396 .expect("write script");
5397 set_file_mode(&script_path, 0o755);
5398
5399 {
5400 let conn = crate::sqlite::open_connection_with_vec(db.path()).expect("vec conn");
5401 service
5402 .schema_manager
5403 .ensure_vector_profile(&conn, "default", "vec_nodes_active", 4)
5404 .expect("ensure vec profile");
5405 conn.execute(
5406 "INSERT INTO nodes (row_id, logical_id, kind, properties, created_at, source_ref) \
5407 VALUES ('node-row-1', 'doc-1', 'Document', '{\"title\":\"Budget\"}', 100, 'seed')",
5408 [],
5409 )
5410 .expect("insert node");
5411 conn.execute(
5412 "INSERT INTO chunks (id, node_logical_id, text_content, created_at) \
5413 VALUES ('chunk-1', 'doc-1', 'budget narrative', 100)",
5414 [],
5415 )
5416 .expect("insert chunk");
5417 }
5418
5419 service
5420 .regenerate_vector_embeddings(&VectorRegenerationConfig {
5421 profile: "default".to_owned(),
5422 table_name: "vec_nodes_active".to_owned(),
5423 model_identity: "model".to_owned(),
5424 model_version: "1.0.0".to_owned(),
5425 dimension: 4,
5426 normalization_policy: "l2".to_owned(),
5427 chunking_policy: "per_chunk".to_owned(),
5428 preprocessing_policy: "trim".to_owned(),
5429 generator_command: vec![script_path.to_string_lossy().to_string()],
5430 })
5431 .expect("regenerate");
5432
5433 {
5434 let conn = crate::sqlite::open_connection_with_vec(db.path()).expect("vec conn");
5435 conn.execute(
5436 "INSERT INTO provenance_events (id, event_type, subject, source_ref, created_at, metadata_json) \
5437 VALUES ('evt-node-retire', 'node_retire', 'doc-1', 'forget-1', 200, '')",
5438 [],
5439 )
5440 .expect("insert retire event");
5441 conn.execute(
5442 "UPDATE nodes SET superseded_at = 200 WHERE logical_id = 'doc-1'",
5443 [],
5444 )
5445 .expect("retire node");
5446 }
5447
5448 let report = service.restore_logical_id("doc-1").expect("restore");
5449 assert_eq!(report.restored_vec_rows, 1);
5450
5451 let coordinator = ExecutionCoordinator::open(
5452 db.path(),
5453 Arc::new(SchemaManager::new()),
5454 Some(4),
5455 1,
5456 Arc::new(TelemetryCounters::default()),
5457 )
5458 .expect("coordinator");
5459 let compiled = QueryBuilder::nodes("Document")
5460 .vector_search("[0.0, 0.0, 0.0, 0.0]", 5)
5461 .compile()
5462 .expect("compile");
5463 let rows = coordinator
5464 .execute_compiled_read(&compiled)
5465 .expect("vector read");
5466 assert!(
5467 rows.nodes.iter().any(|row| row.logical_id == "doc-1"),
5468 "restored logical_id should become visible through regenerated vectors"
5469 );
5470 }
5471
5472 #[test]
5473 fn check_semantics_clean_db_returns_zeros() {
5474 let (_db, service) = setup();
5475 let report = service.check_semantics().expect("semantics check");
5476 assert_eq!(report.orphaned_chunks, 0);
5477 assert_eq!(report.null_source_ref_nodes, 0);
5478 assert_eq!(report.broken_step_fk, 0);
5479 assert_eq!(report.broken_action_fk, 0);
5480 assert_eq!(report.stale_fts_rows, 0);
5481 assert_eq!(report.fts_rows_for_superseded_nodes, 0);
5482 assert_eq!(report.dangling_edges, 0);
5483 assert_eq!(report.orphaned_supersession_chains, 0);
5484 assert_eq!(report.stale_vec_rows, 0);
5485 assert_eq!(report.vec_rows_for_superseded_nodes, 0);
5486 assert_eq!(report.missing_operational_current_rows, 0);
5487 assert_eq!(report.stale_operational_current_rows, 0);
5488 assert_eq!(report.disabled_collection_mutations, 0);
5489 assert_eq!(report.mismatched_kind_property_fts_rows, 0);
5490 assert_eq!(report.duplicate_property_fts_rows, 0);
5491 assert_eq!(report.drifted_property_fts_rows, 0);
5492 assert!(report.warnings.is_empty());
5493 }
5494
5495 #[test]
5496 fn register_operational_collection_persists_and_emits_provenance() {
5497 let (db, service) = setup();
5498 let record = service
5499 .register_operational_collection(&OperationalRegisterRequest {
5500 name: "connector_health".to_owned(),
5501 kind: OperationalCollectionKind::LatestState,
5502 schema_json: "{}".to_owned(),
5503 retention_json: "{}".to_owned(),
5504 filter_fields_json: "[]".to_owned(),
5505 validation_json: String::new(),
5506 secondary_indexes_json: "[]".to_owned(),
5507 format_version: 1,
5508 })
5509 .expect("register collection");
5510
5511 assert_eq!(record.name, "connector_health");
5512 assert_eq!(record.kind, OperationalCollectionKind::LatestState);
5513 assert_eq!(record.schema_json, "{}");
5514 assert_eq!(record.retention_json, "{}");
5515 assert_eq!(record.filter_fields_json, "[]");
5516 assert!(record.created_at > 0);
5517 assert_eq!(record.disabled_at, None);
5518
5519 let described = service
5520 .describe_operational_collection("connector_health")
5521 .expect("describe collection")
5522 .expect("collection exists");
5523 assert_eq!(described, record);
5524
5525 let conn = sqlite::open_connection(db.path()).expect("conn");
5526 let provenance_count: i64 = conn
5527 .query_row(
5528 "SELECT count(*) FROM provenance_events \
5529 WHERE event_type = 'operational_collection_registered' AND subject = 'connector_health'",
5530 [],
5531 |row| row.get(0),
5532 )
5533 .expect("provenance count");
5534 assert_eq!(provenance_count, 1);
5535 }
5536
5537 #[test]
5538 fn register_and_update_operational_collection_validation_round_trip() {
5539 let (db, service) = setup();
5540 let record = service
5541 .register_operational_collection(&OperationalRegisterRequest {
5542 name: "connector_health".to_owned(),
5543 kind: OperationalCollectionKind::LatestState,
5544 schema_json: "{}".to_owned(),
5545 retention_json: "{}".to_owned(),
5546 filter_fields_json: "[]".to_owned(),
5547 validation_json: String::new(),
5548 secondary_indexes_json: "[]".to_owned(),
5549 format_version: 1,
5550 })
5551 .expect("register collection");
5552 assert_eq!(record.validation_json, "");
5553
5554 let validation_json = r#"{"format_version":1,"mode":"enforce","additional_properties":false,"fields":[{"name":"status","type":"string","required":true,"enum":["ok","failed"]}]}"#;
5555 let updated = service
5556 .update_operational_collection_validation("connector_health", validation_json)
5557 .expect("update validation");
5558 assert_eq!(updated.validation_json, validation_json);
5559
5560 let described = service
5561 .describe_operational_collection("connector_health")
5562 .expect("describe collection")
5563 .expect("collection exists");
5564 assert_eq!(described.validation_json, validation_json);
5565
5566 let conn = sqlite::open_connection(db.path()).expect("conn");
5567 let provenance_count: i64 = conn
5568 .query_row(
5569 "SELECT count(*) FROM provenance_events \
5570 WHERE event_type = 'operational_collection_validation_updated' \
5571 AND subject = 'connector_health'",
5572 [],
5573 |row| row.get(0),
5574 )
5575 .expect("provenance count");
5576 assert_eq!(provenance_count, 1);
5577 }
5578
5579 #[test]
5580 fn register_update_and_rebuild_operational_secondary_indexes_round_trip() {
5581 let (db, service) = setup();
5582 let record = service
5583 .register_operational_collection(&OperationalRegisterRequest {
5584 name: "audit_log".to_owned(),
5585 kind: OperationalCollectionKind::AppendOnlyLog,
5586 schema_json: "{}".to_owned(),
5587 retention_json: r#"{"mode":"keep_all"}"#.to_owned(),
5588 filter_fields_json: r#"[{"name":"actor","type":"string","modes":["exact","prefix"]},{"name":"ts","type":"timestamp","modes":["range"]}]"#.to_owned(),
5589 validation_json: String::new(),
5590 secondary_indexes_json: "[]".to_owned(),
5591 format_version: 1,
5592 })
5593 .expect("register collection");
5594 assert_eq!(record.secondary_indexes_json, "[]");
5595
5596 {
5597 let writer = crate::WriterActor::start(
5598 db.path(),
5599 Arc::new(SchemaManager::new()),
5600 crate::ProvenanceMode::Warn,
5601 Arc::new(crate::TelemetryCounters::default()),
5602 )
5603 .expect("writer");
5604 writer
5605 .submit(crate::WriteRequest {
5606 label: "secondary-index-seed".to_owned(),
5607 nodes: vec![],
5608 node_retires: vec![],
5609 edges: vec![],
5610 edge_retires: vec![],
5611 chunks: vec![],
5612 runs: vec![],
5613 steps: vec![],
5614 actions: vec![],
5615 optional_backfills: vec![],
5616 vec_inserts: vec![],
5617 operational_writes: vec![
5618 crate::OperationalWrite::Append {
5619 collection: "audit_log".to_owned(),
5620 record_key: "evt-1".to_owned(),
5621 payload_json: r#"{"actor":"alice","ts":100}"#.to_owned(),
5622 source_ref: Some("src-1".to_owned()),
5623 },
5624 crate::OperationalWrite::Append {
5625 collection: "audit_log".to_owned(),
5626 record_key: "evt-2".to_owned(),
5627 payload_json: r#"{"actor":"bob","ts":200}"#.to_owned(),
5628 source_ref: Some("src-2".to_owned()),
5629 },
5630 ],
5631 })
5632 .expect("seed writes");
5633 }
5634
5635 let secondary_indexes_json = r#"[{"name":"actor_ts","kind":"append_only_field_time","field":"actor","value_type":"string","time_field":"ts"}]"#;
5636 let updated = service
5637 .update_operational_collection_secondary_indexes("audit_log", secondary_indexes_json)
5638 .expect("update secondary indexes");
5639 assert_eq!(updated.secondary_indexes_json, secondary_indexes_json);
5640
5641 let conn = sqlite::open_connection(db.path()).expect("conn");
5642 let entry_count: i64 = conn
5643 .query_row(
5644 "SELECT count(*) FROM operational_secondary_index_entries \
5645 WHERE collection_name = 'audit_log' AND index_name = 'actor_ts'",
5646 [],
5647 |row| row.get(0),
5648 )
5649 .expect("secondary index count");
5650 assert_eq!(entry_count, 2);
5651 conn.execute(
5652 "DELETE FROM operational_secondary_index_entries WHERE collection_name = 'audit_log'",
5653 [],
5654 )
5655 .expect("clear index entries");
5656 drop(conn);
5657
5658 let rebuild = service
5659 .rebuild_operational_secondary_indexes("audit_log")
5660 .expect("rebuild secondary indexes");
5661 assert_eq!(rebuild.collection_name, "audit_log");
5662 assert_eq!(rebuild.mutation_entries_rebuilt, 2);
5663 assert_eq!(rebuild.current_entries_rebuilt, 0);
5664 }
5665
5666 #[test]
5667 fn register_operational_collection_rejects_invalid_validation_contract() {
5668 let (_db, service) = setup();
5669
5670 let error = service
5671 .register_operational_collection(&OperationalRegisterRequest {
5672 name: "connector_health".to_owned(),
5673 kind: OperationalCollectionKind::LatestState,
5674 schema_json: "{}".to_owned(),
5675 retention_json: "{}".to_owned(),
5676 filter_fields_json: "[]".to_owned(),
5677 validation_json: r#"{"format_version":1,"mode":"enforce","fields":[{"name":"status","type":"string","minimum":0}]}"#
5678 .to_owned(),
5679 secondary_indexes_json: "[]".to_owned(),
5680 format_version: 1,
5681 })
5682 .expect_err("invalid validation contract should reject");
5683
5684 assert!(matches!(error, EngineError::InvalidWrite(_)));
5685 assert!(error.to_string().contains("minimum/maximum"));
5686 }
5687
5688 #[test]
5689 fn validate_operational_collection_history_reports_invalid_rows_without_mutation() {
5690 let (db, service) = setup();
5691 service
5692 .register_operational_collection(&OperationalRegisterRequest {
5693 name: "audit_log".to_owned(),
5694 kind: OperationalCollectionKind::AppendOnlyLog,
5695 schema_json: "{}".to_owned(),
5696 retention_json: r#"{"mode":"keep_all"}"#.to_owned(),
5697 filter_fields_json: "[]".to_owned(),
5698 validation_json: r#"{"format_version":1,"mode":"disabled","additional_properties":false,"fields":[{"name":"status","type":"string","required":true,"enum":["ok","failed"]}]}"#
5699 .to_owned(),
5700 secondary_indexes_json: "[]".to_owned(),
5701 format_version: 1,
5702 })
5703 .expect("register collection");
5704 {
5705 let writer = crate::WriterActor::start(
5706 db.path(),
5707 Arc::new(SchemaManager::new()),
5708 crate::ProvenanceMode::Warn,
5709 Arc::new(crate::TelemetryCounters::default()),
5710 )
5711 .expect("writer");
5712 writer
5713 .submit(crate::WriteRequest {
5714 label: "history-validation".to_owned(),
5715 nodes: vec![],
5716 node_retires: vec![],
5717 edges: vec![],
5718 edge_retires: vec![],
5719 chunks: vec![],
5720 runs: vec![],
5721 steps: vec![],
5722 actions: vec![],
5723 optional_backfills: vec![],
5724 vec_inserts: vec![],
5725 operational_writes: vec![
5726 crate::OperationalWrite::Append {
5727 collection: "audit_log".to_owned(),
5728 record_key: "evt-1".to_owned(),
5729 payload_json: r#"{"status":"ok"}"#.to_owned(),
5730 source_ref: Some("src-1".to_owned()),
5731 },
5732 crate::OperationalWrite::Append {
5733 collection: "audit_log".to_owned(),
5734 record_key: "evt-2".to_owned(),
5735 payload_json: r#"{"status":"bogus"}"#.to_owned(),
5736 source_ref: Some("src-2".to_owned()),
5737 },
5738 ],
5739 })
5740 .expect("write");
5741 }
5742
5743 let report = service
5744 .validate_operational_collection_history("audit_log")
5745 .expect("validate history");
5746 assert_eq!(report.collection_name, "audit_log");
5747 assert_eq!(report.checked_rows, 2);
5748 assert_eq!(report.invalid_row_count, 1);
5749 assert_eq!(report.issues.len(), 1);
5750 assert_eq!(report.issues[0].record_key, "evt-2");
5751 assert!(report.issues[0].message.contains("must be one of"));
5752
5753 let trace = service
5754 .trace_operational_collection("audit_log", None)
5755 .expect("trace");
5756 assert_eq!(trace.mutation_count, 2);
5757
5758 let conn = sqlite::open_connection(db.path()).expect("conn");
5759 let provenance_count: i64 = conn
5760 .query_row(
5761 "SELECT count(*) FROM provenance_events \
5762 WHERE event_type = 'operational_collection_history_validated' \
5763 AND subject = 'audit_log'",
5764 [],
5765 |row| row.get(0),
5766 )
5767 .expect("provenance count");
5768 assert_eq!(provenance_count, 0);
5769 }
5770
5771 #[test]
5772 fn trace_operational_collection_returns_mutations_and_current_rows() {
5773 let (db, service) = setup();
5774 service
5775 .register_operational_collection(&OperationalRegisterRequest {
5776 name: "connector_health".to_owned(),
5777 kind: OperationalCollectionKind::LatestState,
5778 schema_json: "{}".to_owned(),
5779 retention_json: "{}".to_owned(),
5780 filter_fields_json: "[]".to_owned(),
5781 validation_json: String::new(),
5782 secondary_indexes_json: "[]".to_owned(),
5783 format_version: 1,
5784 })
5785 .expect("register collection");
5786 {
5787 let writer = crate::WriterActor::start(
5788 db.path(),
5789 Arc::new(SchemaManager::new()),
5790 crate::ProvenanceMode::Warn,
5791 Arc::new(crate::TelemetryCounters::default()),
5792 )
5793 .expect("writer");
5794 writer
5795 .submit(crate::WriteRequest {
5796 label: "operational".to_owned(),
5797 nodes: vec![],
5798 node_retires: vec![],
5799 edges: vec![],
5800 edge_retires: vec![],
5801 chunks: vec![],
5802 runs: vec![],
5803 steps: vec![],
5804 actions: vec![],
5805 optional_backfills: vec![],
5806 vec_inserts: vec![],
5807 operational_writes: vec![crate::OperationalWrite::Put {
5808 collection: "connector_health".to_owned(),
5809 record_key: "gmail".to_owned(),
5810 payload_json: r#"{"status":"ok"}"#.to_owned(),
5811 source_ref: Some("src-1".to_owned()),
5812 }],
5813 })
5814 .expect("write");
5815 }
5816
5817 let report = service
5818 .trace_operational_collection("connector_health", Some("gmail"))
5819 .expect("trace");
5820 assert_eq!(report.collection_name, "connector_health");
5821 assert_eq!(report.record_key.as_deref(), Some("gmail"));
5822 assert_eq!(report.mutation_count, 1);
5823 assert_eq!(report.current_count, 1);
5824 assert_eq!(report.mutations[0].op_kind, "put");
5825 assert_eq!(report.current_rows[0].payload_json, r#"{"status":"ok"}"#);
5826 }
5827
5828 #[test]
5829 fn trace_operational_collection_rejects_unknown_collection() {
5830 let (_db, service) = setup();
5831
5832 let error = service
5833 .trace_operational_collection("missing_collection", None)
5834 .expect_err("unknown collection should fail");
5835
5836 assert!(matches!(error, EngineError::InvalidWrite(_)));
5837 assert!(error.to_string().contains("is not registered"));
5838 }
5839
5840 #[test]
5841 fn rebuild_operational_current_repairs_missing_latest_state_rows() {
5842 let (db, service) = setup();
5843 service
5844 .register_operational_collection(&OperationalRegisterRequest {
5845 name: "connector_health".to_owned(),
5846 kind: OperationalCollectionKind::LatestState,
5847 schema_json: "{}".to_owned(),
5848 retention_json: "{}".to_owned(),
5849 filter_fields_json: "[]".to_owned(),
5850 validation_json: String::new(),
5851 secondary_indexes_json: "[]".to_owned(),
5852 format_version: 1,
5853 })
5854 .expect("register collection");
5855 {
5856 let writer = crate::WriterActor::start(
5857 db.path(),
5858 Arc::new(SchemaManager::new()),
5859 crate::ProvenanceMode::Warn,
5860 Arc::new(crate::TelemetryCounters::default()),
5861 )
5862 .expect("writer");
5863 writer
5864 .submit(crate::WriteRequest {
5865 label: "operational".to_owned(),
5866 nodes: vec![],
5867 node_retires: vec![],
5868 edges: vec![],
5869 edge_retires: vec![],
5870 chunks: vec![],
5871 runs: vec![],
5872 steps: vec![],
5873 actions: vec![],
5874 optional_backfills: vec![],
5875 vec_inserts: vec![],
5876 operational_writes: vec![crate::OperationalWrite::Put {
5877 collection: "connector_health".to_owned(),
5878 record_key: "gmail".to_owned(),
5879 payload_json: r#"{"status":"ok"}"#.to_owned(),
5880 source_ref: Some("src-1".to_owned()),
5881 }],
5882 })
5883 .expect("write");
5884 }
5885 {
5886 let conn = sqlite::open_connection(db.path()).expect("conn");
5887 conn.execute(
5888 "DELETE FROM operational_current WHERE collection_name = 'connector_health' AND record_key = 'gmail'",
5889 [],
5890 )
5891 .expect("delete current row");
5892 }
5893
5894 let before = service.check_semantics().expect("semantics before rebuild");
5895 assert_eq!(before.missing_operational_current_rows, 1);
5896
5897 let repair = service
5898 .rebuild_operational_current(Some("connector_health"))
5899 .expect("rebuild current");
5900 assert_eq!(repair.collections_rebuilt, 1);
5901 assert_eq!(repair.current_rows_rebuilt, 1);
5902
5903 let after = service.check_semantics().expect("semantics after rebuild");
5904 assert_eq!(after.missing_operational_current_rows, 0);
5905
5906 let conn = sqlite::open_connection(db.path()).expect("conn");
5907 let payload: String = conn
5908 .query_row(
5909 "SELECT payload_json FROM operational_current \
5910 WHERE collection_name = 'connector_health' AND record_key = 'gmail'",
5911 [],
5912 |row| row.get(0),
5913 )
5914 .expect("restored payload");
5915 assert_eq!(payload, r#"{"status":"ok"}"#);
5916 }
5917
5918 #[test]
5919 fn rebuild_operational_current_restores_latest_state_secondary_index_entries() {
5920 let (db, service) = setup();
5921 service
5922 .register_operational_collection(&OperationalRegisterRequest {
5923 name: "connector_health".to_owned(),
5924 kind: OperationalCollectionKind::LatestState,
5925 schema_json: "{}".to_owned(),
5926 retention_json: "{}".to_owned(),
5927 filter_fields_json: "[]".to_owned(),
5928 validation_json: String::new(),
5929 secondary_indexes_json: r#"[{"name":"status_current","kind":"latest_state_field","field":"status","value_type":"string"}]"#.to_owned(),
5930 format_version: 1,
5931 })
5932 .expect("register collection");
5933 {
5934 let writer = crate::WriterActor::start(
5935 db.path(),
5936 Arc::new(SchemaManager::new()),
5937 crate::ProvenanceMode::Warn,
5938 Arc::new(crate::TelemetryCounters::default()),
5939 )
5940 .expect("writer");
5941 writer
5942 .submit(crate::WriteRequest {
5943 label: "operational".to_owned(),
5944 nodes: vec![],
5945 node_retires: vec![],
5946 edges: vec![],
5947 edge_retires: vec![],
5948 chunks: vec![],
5949 runs: vec![],
5950 steps: vec![],
5951 actions: vec![],
5952 optional_backfills: vec![],
5953 vec_inserts: vec![],
5954 operational_writes: vec![crate::OperationalWrite::Put {
5955 collection: "connector_health".to_owned(),
5956 record_key: "gmail".to_owned(),
5957 payload_json: r#"{"status":"ok"}"#.to_owned(),
5958 source_ref: Some("src-1".to_owned()),
5959 }],
5960 })
5961 .expect("write");
5962 }
5963 {
5964 let conn = sqlite::open_connection(db.path()).expect("conn");
5965 let entry_count: i64 = conn
5966 .query_row(
5967 "SELECT count(*) FROM operational_secondary_index_entries \
5968 WHERE collection_name = 'connector_health' AND subject_kind = 'current'",
5969 [],
5970 |row| row.get(0),
5971 )
5972 .expect("secondary index count before repair");
5973 assert_eq!(entry_count, 1);
5974 conn.execute(
5975 "DELETE FROM operational_current WHERE collection_name = 'connector_health' AND record_key = 'gmail'",
5976 [],
5977 )
5978 .expect("delete current row");
5979 }
5980
5981 service
5982 .rebuild_operational_current(Some("connector_health"))
5983 .expect("rebuild current");
5984
5985 let conn = sqlite::open_connection(db.path()).expect("conn");
5986 let entry_count: i64 = conn
5987 .query_row(
5988 "SELECT count(*) FROM operational_secondary_index_entries \
5989 WHERE collection_name = 'connector_health' AND subject_kind = 'current'",
5990 [],
5991 |row| row.get(0),
5992 )
5993 .expect("secondary index count after repair");
5994 assert_eq!(entry_count, 1);
5995 }
5996
5997 #[test]
5998 fn operational_current_semantics_and_rebuild_follow_mutation_order() {
5999 let (db, service) = setup();
6000 {
6001 let conn = sqlite::open_connection(db.path()).expect("conn");
6002 conn.execute(
6003 "INSERT INTO operational_collections (name, kind, schema_json, retention_json, format_version, created_at) \
6004 VALUES ('connector_health', 'latest_state', '{}', '{}', 1, 100)",
6005 [],
6006 )
6007 .expect("seed collection");
6008 conn.execute(
6009 "INSERT INTO operational_mutations \
6010 (id, collection_name, record_key, op_kind, payload_json, source_ref, created_at, mutation_order) \
6011 VALUES ('m3', 'connector_health', 'gmail', 'put', '{\"status\":\"old\"}', 'src-1', 100, 1)",
6012 [],
6013 )
6014 .expect("seed first put");
6015 conn.execute(
6016 "INSERT INTO operational_mutations \
6017 (id, collection_name, record_key, op_kind, payload_json, source_ref, created_at, mutation_order) \
6018 VALUES ('m2', 'connector_health', 'gmail', 'delete', '', 'src-2', 100, 2)",
6019 [],
6020 )
6021 .expect("seed delete");
6022 conn.execute(
6023 "INSERT INTO operational_mutations \
6024 (id, collection_name, record_key, op_kind, payload_json, source_ref, created_at, mutation_order) \
6025 VALUES ('m1', 'connector_health', 'gmail', 'put', '{\"status\":\"new\"}', 'src-3', 100, 3)",
6026 [],
6027 )
6028 .expect("seed final put");
6029 conn.execute(
6030 "INSERT INTO operational_current \
6031 (collection_name, record_key, payload_json, updated_at, last_mutation_id) \
6032 VALUES ('connector_health', 'gmail', '{\"status\":\"new\"}', 100, 'm1')",
6033 [],
6034 )
6035 .expect("seed current");
6036 }
6037
6038 let before = service.check_semantics().expect("semantics before rebuild");
6039 assert_eq!(before.missing_operational_current_rows, 0);
6040 assert_eq!(before.stale_operational_current_rows, 0);
6041
6042 {
6043 let conn = sqlite::open_connection(db.path()).expect("conn");
6044 conn.execute(
6045 "DELETE FROM operational_current WHERE collection_name = 'connector_health' AND record_key = 'gmail'",
6046 [],
6047 )
6048 .expect("delete current row");
6049 }
6050
6051 let missing = service.check_semantics().expect("semantics after delete");
6052 assert_eq!(missing.missing_operational_current_rows, 1);
6053 assert_eq!(missing.stale_operational_current_rows, 0);
6054
6055 service
6056 .rebuild_operational_current(Some("connector_health"))
6057 .expect("rebuild current");
6058
6059 let after = service.check_semantics().expect("semantics after rebuild");
6060 assert_eq!(after.missing_operational_current_rows, 0);
6061 assert_eq!(after.stale_operational_current_rows, 0);
6062
6063 let conn = sqlite::open_connection(db.path()).expect("conn");
6064 let payload: String = conn
6065 .query_row(
6066 "SELECT payload_json FROM operational_current \
6067 WHERE collection_name = 'connector_health' AND record_key = 'gmail'",
6068 [],
6069 |row| row.get(0),
6070 )
6071 .expect("restored payload");
6072 assert_eq!(payload, r#"{"status":"new"}"#);
6073 }
6074
6075 #[test]
6076 fn disable_operational_collection_sets_disabled_at_and_emits_provenance() {
6077 let (db, service) = setup();
6078 service
6079 .register_operational_collection(&OperationalRegisterRequest {
6080 name: "audit_log".to_owned(),
6081 kind: OperationalCollectionKind::AppendOnlyLog,
6082 schema_json: "{}".to_owned(),
6083 retention_json: r#"{"mode":"keep_all"}"#.to_owned(),
6084 filter_fields_json: "[]".to_owned(),
6085 validation_json: String::new(),
6086 secondary_indexes_json: "[]".to_owned(),
6087 format_version: 1,
6088 })
6089 .expect("register collection");
6090
6091 let record = service
6092 .disable_operational_collection("audit_log")
6093 .expect("disable collection");
6094 assert_eq!(record.name, "audit_log");
6095 assert!(record.disabled_at.is_some());
6096
6097 let disabled_at = record.disabled_at.expect("disabled_at");
6098 let described = service
6099 .describe_operational_collection("audit_log")
6100 .expect("describe collection")
6101 .expect("collection exists");
6102 assert_eq!(described.disabled_at, Some(disabled_at));
6103
6104 let writer = crate::WriterActor::start(
6105 db.path(),
6106 Arc::new(SchemaManager::new()),
6107 crate::ProvenanceMode::Warn,
6108 Arc::new(crate::TelemetryCounters::default()),
6109 )
6110 .expect("writer");
6111 let error = writer
6112 .submit(crate::WriteRequest {
6113 label: "disabled-operational".to_owned(),
6114 nodes: vec![],
6115 node_retires: vec![],
6116 edges: vec![],
6117 edge_retires: vec![],
6118 chunks: vec![],
6119 runs: vec![],
6120 steps: vec![],
6121 actions: vec![],
6122 optional_backfills: vec![],
6123 vec_inserts: vec![],
6124 operational_writes: vec![crate::OperationalWrite::Append {
6125 collection: "audit_log".to_owned(),
6126 record_key: "evt-1".to_owned(),
6127 payload_json: r#"{"type":"sync"}"#.to_owned(),
6128 source_ref: Some("src-1".to_owned()),
6129 }],
6130 })
6131 .expect_err("disabled collection should reject writes");
6132 assert!(matches!(error, EngineError::InvalidWrite(_)));
6133 assert!(error.to_string().contains("is disabled"));
6134
6135 let conn = sqlite::open_connection(db.path()).expect("conn");
6136 let provenance_count: i64 = conn
6137 .query_row(
6138 "SELECT count(*) FROM provenance_events \
6139 WHERE event_type = 'operational_collection_disabled' AND subject = 'audit_log'",
6140 [],
6141 |row| row.get(0),
6142 )
6143 .expect("provenance count");
6144 assert_eq!(provenance_count, 1);
6145 }
6146
6147 #[test]
6148 fn purge_operational_collection_deletes_append_only_rows_before_cutoff() {
6149 let (db, service) = setup();
6150 {
6151 let conn = sqlite::open_connection(db.path()).expect("conn");
6152 conn.execute(
6153 "INSERT INTO operational_collections (name, kind, schema_json, retention_json, format_version, created_at) \
6154 VALUES ('audit_log', 'append_only_log', '{}', '{\"mode\":\"keep_all\"}', 1, 100)",
6155 [],
6156 )
6157 .expect("seed collection");
6158 conn.execute(
6159 "INSERT INTO operational_mutations \
6160 (id, collection_name, record_key, op_kind, payload_json, source_ref, created_at, mutation_order) \
6161 VALUES ('evt-1', 'audit_log', 'evt-1', 'append', '{\"seq\":1}', 'src-1', 100, 1)",
6162 [],
6163 )
6164 .expect("seed event 1");
6165 conn.execute(
6166 "INSERT INTO operational_mutations \
6167 (id, collection_name, record_key, op_kind, payload_json, source_ref, created_at, mutation_order) \
6168 VALUES ('evt-2', 'audit_log', 'evt-2', 'append', '{\"seq\":2}', 'src-2', 200, 2)",
6169 [],
6170 )
6171 .expect("seed event 2");
6172 conn.execute(
6173 "INSERT INTO operational_mutations \
6174 (id, collection_name, record_key, op_kind, payload_json, source_ref, created_at, mutation_order) \
6175 VALUES ('evt-3', 'audit_log', 'evt-3', 'append', '{\"seq\":3}', 'src-3', 300, 3)",
6176 [],
6177 )
6178 .expect("seed event 3");
6179 }
6180
6181 let report = service
6182 .purge_operational_collection("audit_log", 250)
6183 .expect("purge collection");
6184 assert_eq!(report.collection_name, "audit_log");
6185 assert_eq!(report.deleted_mutations, 2);
6186 assert_eq!(report.before_timestamp, 250);
6187
6188 let conn = sqlite::open_connection(db.path()).expect("conn");
6189 let remaining: Vec<String> = {
6190 let mut stmt = conn
6191 .prepare(
6192 "SELECT id FROM operational_mutations \
6193 WHERE collection_name = 'audit_log' ORDER BY mutation_order",
6194 )
6195 .expect("stmt");
6196 stmt.query_map([], |row| row.get(0))
6197 .expect("rows")
6198 .collect::<Result<_, _>>()
6199 .expect("collect")
6200 };
6201 assert_eq!(remaining, vec!["evt-3".to_owned()]);
6202 let provenance_count: i64 = conn
6203 .query_row(
6204 "SELECT count(*) FROM provenance_events \
6205 WHERE event_type = 'operational_collection_purged' AND subject = 'audit_log'",
6206 [],
6207 |row| row.get(0),
6208 )
6209 .expect("provenance count");
6210 assert_eq!(provenance_count, 1);
6211 }
6212
6213 #[test]
6214 fn compact_operational_collection_dry_run_reports_without_mutation() {
6215 let (db, service) = setup();
6216 {
6217 let conn = sqlite::open_connection(db.path()).expect("conn");
6218 conn.execute(
6219 "INSERT INTO operational_collections (name, kind, schema_json, retention_json, format_version, created_at) \
6220 VALUES ('audit_log', 'append_only_log', '{}', '{\"mode\":\"keep_last\",\"max_rows\":2}', 1, 100)",
6221 [],
6222 )
6223 .expect("seed collection");
6224 for (index, created_at) in [(1_i64, 100_i64), (2, 200), (3, 300)] {
6225 conn.execute(
6226 "INSERT INTO operational_mutations \
6227 (id, collection_name, record_key, op_kind, payload_json, source_ref, created_at, mutation_order) \
6228 VALUES (?1, 'audit_log', ?1, 'append', ?2, 'src', ?3, ?4)",
6229 rusqlite::params![
6230 format!("evt-{index}"),
6231 format!("{{\"seq\":{index}}}"),
6232 created_at,
6233 index,
6234 ],
6235 )
6236 .expect("seed event");
6237 }
6238 }
6239
6240 let report = service
6241 .compact_operational_collection("audit_log", true)
6242 .expect("compact collection");
6243 assert_eq!(report.collection_name, "audit_log");
6244 assert_eq!(report.deleted_mutations, 1);
6245 assert!(report.dry_run);
6246 assert_eq!(report.before_timestamp, None);
6247
6248 let conn = sqlite::open_connection(db.path()).expect("conn");
6249 let remaining_count: i64 = conn
6250 .query_row(
6251 "SELECT count(*) FROM operational_mutations WHERE collection_name = 'audit_log'",
6252 [],
6253 |row| row.get(0),
6254 )
6255 .expect("remaining count");
6256 assert_eq!(remaining_count, 3);
6257 let provenance_count: i64 = conn
6258 .query_row(
6259 "SELECT count(*) FROM provenance_events \
6260 WHERE event_type = 'operational_collection_compacted' AND subject = 'audit_log'",
6261 [],
6262 |row| row.get(0),
6263 )
6264 .expect("provenance count");
6265 assert_eq!(provenance_count, 0);
6266 }
6267
6268 #[test]
6269 fn compact_operational_collection_keep_last_deletes_oldest_rows() {
6270 let (db, service) = setup();
6271 {
6272 let conn = sqlite::open_connection(db.path()).expect("conn");
6273 conn.execute(
6274 "INSERT INTO operational_collections (name, kind, schema_json, retention_json, format_version, created_at) \
6275 VALUES ('audit_log', 'append_only_log', '{}', '{\"mode\":\"keep_last\",\"max_rows\":2}', 1, 100)",
6276 [],
6277 )
6278 .expect("seed collection");
6279 for (index, created_at) in [(1_i64, 100_i64), (2, 200), (3, 300)] {
6280 conn.execute(
6281 "INSERT INTO operational_mutations \
6282 (id, collection_name, record_key, op_kind, payload_json, source_ref, created_at, mutation_order) \
6283 VALUES (?1, 'audit_log', ?1, 'append', ?2, 'src', ?3, ?4)",
6284 rusqlite::params![
6285 format!("evt-{index}"),
6286 format!("{{\"seq\":{index}}}"),
6287 created_at,
6288 index,
6289 ],
6290 )
6291 .expect("seed event");
6292 }
6293 }
6294
6295 let report = service
6296 .compact_operational_collection("audit_log", false)
6297 .expect("compact collection");
6298 assert_eq!(report.deleted_mutations, 1);
6299 assert!(!report.dry_run);
6300
6301 let conn = sqlite::open_connection(db.path()).expect("conn");
6302 let remaining: Vec<String> = {
6303 let mut stmt = conn
6304 .prepare(
6305 "SELECT id FROM operational_mutations \
6306 WHERE collection_name = 'audit_log' ORDER BY mutation_order",
6307 )
6308 .expect("stmt");
6309 stmt.query_map([], |row| row.get(0))
6310 .expect("rows")
6311 .collect::<Result<_, _>>()
6312 .expect("collect")
6313 };
6314 assert_eq!(remaining, vec!["evt-2".to_owned(), "evt-3".to_owned()]);
6315 let provenance_count: i64 = conn
6316 .query_row(
6317 "SELECT count(*) FROM provenance_events \
6318 WHERE event_type = 'operational_collection_compacted' AND subject = 'audit_log'",
6319 [],
6320 |row| row.get(0),
6321 )
6322 .expect("provenance count");
6323 assert_eq!(provenance_count, 1);
6324 }
6325
6326 #[test]
6327 fn plan_and_run_operational_retention_keep_last() {
6328 let (db, service) = setup();
6329 {
6330 let conn = sqlite::open_connection(db.path()).expect("conn");
6331 conn.execute(
6332 "INSERT INTO operational_collections (name, kind, schema_json, retention_json, format_version, created_at) \
6333 VALUES ('audit_log', 'append_only_log', '{}', '{\"mode\":\"keep_last\",\"max_rows\":2}', 1, 100)",
6334 [],
6335 )
6336 .expect("seed collection");
6337 for (index, created_at) in [(1_i64, 100_i64), (2, 200), (3, 300)] {
6338 conn.execute(
6339 "INSERT INTO operational_mutations \
6340 (id, collection_name, record_key, op_kind, payload_json, source_ref, created_at, mutation_order) \
6341 VALUES (?1, 'audit_log', ?1, 'append', ?2, 'src', ?3, ?4)",
6342 rusqlite::params![
6343 format!("evt-{index}"),
6344 format!("{{\"seq\":{index}}}"),
6345 created_at,
6346 index,
6347 ],
6348 )
6349 .expect("seed event");
6350 }
6351 }
6352
6353 let plan = service
6354 .plan_operational_retention(1_000, None, Some(10))
6355 .expect("plan retention");
6356 assert_eq!(plan.collections_examined, 1);
6357 assert_eq!(plan.items[0].collection_name, "audit_log");
6358 assert_eq!(
6359 plan.items[0].action_kind,
6360 crate::operational::OperationalRetentionActionKind::KeepLast
6361 );
6362 assert_eq!(plan.items[0].candidate_deletions, 1);
6363 assert_eq!(plan.items[0].max_rows, Some(2));
6364 assert_eq!(plan.items[0].last_run_at, None);
6365
6366 let dry_run = service
6367 .run_operational_retention(1_000, None, Some(10), true)
6368 .expect("dry-run retention");
6369 assert!(dry_run.dry_run);
6370 assert_eq!(dry_run.collections_acted_on, 1);
6371 assert_eq!(dry_run.items[0].deleted_mutations, 1);
6372 assert_eq!(dry_run.items[0].rows_remaining, 2);
6373
6374 let conn = sqlite::open_connection(db.path()).expect("conn");
6375 let remaining_count: i64 = conn
6376 .query_row(
6377 "SELECT count(*) FROM operational_mutations WHERE collection_name = 'audit_log'",
6378 [],
6379 |row| row.get(0),
6380 )
6381 .expect("remaining count after dry run");
6382 assert_eq!(remaining_count, 3);
6383 let retention_run_count: i64 = conn
6384 .query_row(
6385 "SELECT count(*) FROM operational_retention_runs WHERE collection_name = 'audit_log'",
6386 [],
6387 |row| row.get(0),
6388 )
6389 .expect("retention run count");
6390 assert_eq!(retention_run_count, 0);
6391 drop(conn);
6392
6393 let executed = service
6394 .run_operational_retention(1_000, None, Some(10), false)
6395 .expect("execute retention");
6396 assert_eq!(executed.collections_acted_on, 1);
6397 assert_eq!(executed.items[0].deleted_mutations, 1);
6398 assert_eq!(executed.items[0].rows_remaining, 2);
6399
6400 let conn = sqlite::open_connection(db.path()).expect("conn");
6401 let remaining: Vec<String> = {
6402 let mut stmt = conn
6403 .prepare(
6404 "SELECT id FROM operational_mutations \
6405 WHERE collection_name = 'audit_log' ORDER BY mutation_order",
6406 )
6407 .expect("stmt");
6408 stmt.query_map([], |row| row.get(0))
6409 .expect("rows")
6410 .collect::<Result<_, _>>()
6411 .expect("collect")
6412 };
6413 assert_eq!(remaining, vec!["evt-2".to_owned(), "evt-3".to_owned()]);
6414 let last_run_at: i64 = conn
6415 .query_row(
6416 "SELECT executed_at FROM operational_retention_runs \
6417 WHERE collection_name = 'audit_log' ORDER BY executed_at DESC LIMIT 1",
6418 [],
6419 |row| row.get(0),
6420 )
6421 .expect("last run at");
6422 assert_eq!(last_run_at, 1_000);
6423 }
6424
6425 #[test]
6426 fn dry_run_operational_retention_does_not_mark_noop_collection_as_acted_on() {
6427 let (db, service) = setup();
6428 let conn = sqlite::open_connection(db.path()).expect("conn");
6429 conn.execute(
6430 "INSERT INTO operational_collections (name, kind, schema_json, retention_json, format_version, created_at) \
6431 VALUES ('audit_log', 'append_only_log', '{}', '{\"mode\":\"keep_last\",\"max_rows\":2}', 1, 100)",
6432 [],
6433 )
6434 .expect("seed collection");
6435 for (index, created_at) in [(1_i64, 100_i64), (2, 200)] {
6436 conn.execute(
6437 "INSERT INTO operational_mutations \
6438 (id, collection_name, record_key, op_kind, payload_json, source_ref, created_at, mutation_order) \
6439 VALUES (?1, 'audit_log', ?1, 'append', ?2, 'src', ?3, ?4)",
6440 rusqlite::params![
6441 format!("evt-{index}"),
6442 format!("{{\"seq\":{index}}}"),
6443 created_at,
6444 index,
6445 ],
6446 )
6447 .expect("seed event");
6448 }
6449 drop(conn);
6450
6451 let dry_run = service
6452 .run_operational_retention(1_000, None, Some(10), true)
6453 .expect("dry-run retention");
6454 assert!(dry_run.dry_run);
6455 assert_eq!(dry_run.collections_acted_on, 0);
6456 assert_eq!(dry_run.items[0].deleted_mutations, 0);
6457 assert_eq!(dry_run.items[0].rows_remaining, 2);
6458 }
6459
6460 #[test]
6461 fn compact_operational_collection_rejects_latest_state() {
6462 let (_db, service) = setup();
6463 service
6464 .register_operational_collection(&OperationalRegisterRequest {
6465 name: "connector_health".to_owned(),
6466 kind: OperationalCollectionKind::LatestState,
6467 schema_json: "{}".to_owned(),
6468 retention_json: r#"{"mode":"keep_all"}"#.to_owned(),
6469 filter_fields_json: "[]".to_owned(),
6470 validation_json: String::new(),
6471 secondary_indexes_json: "[]".to_owned(),
6472 format_version: 1,
6473 })
6474 .expect("register collection");
6475
6476 let error = service
6477 .compact_operational_collection("connector_health", false)
6478 .expect_err("latest_state compaction should be rejected");
6479 assert!(matches!(error, EngineError::InvalidWrite(_)));
6480 assert!(error.to_string().contains("append_only_log"));
6481 }
6482
6483 #[test]
6484 fn register_operational_collection_persists_filter_fields_json() {
6485 let (_db, service) = setup();
6486
6487 let record = service
6488 .register_operational_collection(&OperationalRegisterRequest {
6489 name: "audit_log".to_owned(),
6490 kind: OperationalCollectionKind::AppendOnlyLog,
6491 schema_json: "{}".to_owned(),
6492 retention_json: r#"{"mode":"keep_all"}"#.to_owned(),
6493 filter_fields_json: r#"[{"name":"actor","type":"string","modes":["exact","prefix"]},{"name":"ts","type":"timestamp","modes":["range"]}]"#.to_owned(),
6494 validation_json: String::new(),
6495 secondary_indexes_json: "[]".to_owned(),
6496 format_version: 1,
6497 })
6498 .expect("register collection");
6499
6500 assert_eq!(
6501 record.filter_fields_json,
6502 r#"[{"name":"actor","type":"string","modes":["exact","prefix"]},{"name":"ts","type":"timestamp","modes":["range"]}]"#
6503 );
6504 }
6505
6506 #[test]
6507 fn read_operational_collection_filters_append_only_rows_by_declared_fields() {
6508 let (db, service) = setup();
6509 service
6510 .register_operational_collection(&OperationalRegisterRequest {
6511 name: "audit_log".to_owned(),
6512 kind: OperationalCollectionKind::AppendOnlyLog,
6513 schema_json: "{}".to_owned(),
6514 retention_json: r#"{"mode":"keep_all"}"#.to_owned(),
6515 filter_fields_json: r#"[{"name":"actor","type":"string","modes":["exact","prefix"]},{"name":"seq","type":"integer","modes":["exact","range"]},{"name":"ts","type":"timestamp","modes":["exact","range"]}]"#.to_owned(),
6516 validation_json: String::new(),
6517 secondary_indexes_json: "[]".to_owned(),
6518 format_version: 1,
6519 })
6520 .expect("register collection");
6521 {
6522 let writer = crate::WriterActor::start(
6523 db.path(),
6524 Arc::new(SchemaManager::new()),
6525 crate::ProvenanceMode::Warn,
6526 Arc::new(crate::TelemetryCounters::default()),
6527 )
6528 .expect("writer");
6529 writer
6530 .submit(crate::WriteRequest {
6531 label: "operational".to_owned(),
6532 nodes: vec![],
6533 node_retires: vec![],
6534 edges: vec![],
6535 edge_retires: vec![],
6536 chunks: vec![],
6537 runs: vec![],
6538 steps: vec![],
6539 actions: vec![],
6540 optional_backfills: vec![],
6541 vec_inserts: vec![],
6542 operational_writes: vec![
6543 crate::OperationalWrite::Append {
6544 collection: "audit_log".to_owned(),
6545 record_key: "evt-1".to_owned(),
6546 payload_json: r#"{"actor":"alice","seq":1,"ts":100}"#.to_owned(),
6547 source_ref: Some("src-1".to_owned()),
6548 },
6549 crate::OperationalWrite::Append {
6550 collection: "audit_log".to_owned(),
6551 record_key: "evt-2".to_owned(),
6552 payload_json: r#"{"actor":"alice-admin","seq":2,"ts":200}"#.to_owned(),
6553 source_ref: Some("src-2".to_owned()),
6554 },
6555 crate::OperationalWrite::Append {
6556 collection: "audit_log".to_owned(),
6557 record_key: "evt-3".to_owned(),
6558 payload_json: r#"{"actor":"bob","seq":3,"ts":300}"#.to_owned(),
6559 source_ref: Some("src-3".to_owned()),
6560 },
6561 ],
6562 })
6563 .expect("write");
6564 }
6565
6566 let report = service
6567 .read_operational_collection(&crate::operational::OperationalReadRequest {
6568 collection_name: "audit_log".to_owned(),
6569 filters: vec![
6570 crate::operational::OperationalFilterClause::Prefix {
6571 field: "actor".to_owned(),
6572 value: "alice".to_owned(),
6573 },
6574 crate::operational::OperationalFilterClause::Range {
6575 field: "ts".to_owned(),
6576 lower: Some(150),
6577 upper: Some(250),
6578 },
6579 ],
6580 limit: Some(10),
6581 })
6582 .expect("filtered read");
6583
6584 assert_eq!(report.collection_name, "audit_log");
6585 assert_eq!(report.row_count, 1);
6586 assert!(!report.was_limited);
6587 assert_eq!(report.rows.len(), 1);
6588 assert_eq!(report.rows[0].record_key, "evt-2");
6589 assert_eq!(
6590 report.rows[0].payload_json,
6591 r#"{"actor":"alice-admin","seq":2,"ts":200}"#
6592 );
6593 }
6594
6595 #[test]
6596 fn read_operational_collection_uses_secondary_index_when_filter_values_are_missing() {
6597 let (db, service) = setup();
6598 service
6599 .register_operational_collection(&OperationalRegisterRequest {
6600 name: "audit_log".to_owned(),
6601 kind: OperationalCollectionKind::AppendOnlyLog,
6602 schema_json: "{}".to_owned(),
6603 retention_json: r#"{"mode":"keep_all"}"#.to_owned(),
6604 filter_fields_json: r#"[{"name":"actor","type":"string","modes":["exact","prefix"]},{"name":"ts","type":"timestamp","modes":["range"]}]"#.to_owned(),
6605 validation_json: String::new(),
6606 secondary_indexes_json: r#"[{"name":"actor_ts","kind":"append_only_field_time","field":"actor","value_type":"string","time_field":"ts"}]"#.to_owned(),
6607 format_version: 1,
6608 })
6609 .expect("register collection");
6610 {
6611 let writer = crate::WriterActor::start(
6612 db.path(),
6613 Arc::new(SchemaManager::new()),
6614 crate::ProvenanceMode::Warn,
6615 Arc::new(crate::TelemetryCounters::default()),
6616 )
6617 .expect("writer");
6618 writer
6619 .submit(crate::WriteRequest {
6620 label: "operational".to_owned(),
6621 nodes: vec![],
6622 node_retires: vec![],
6623 edges: vec![],
6624 edge_retires: vec![],
6625 chunks: vec![],
6626 runs: vec![],
6627 steps: vec![],
6628 actions: vec![],
6629 optional_backfills: vec![],
6630 vec_inserts: vec![],
6631 operational_writes: vec![
6632 crate::OperationalWrite::Append {
6633 collection: "audit_log".to_owned(),
6634 record_key: "evt-1".to_owned(),
6635 payload_json: r#"{"actor":"alice","ts":100}"#.to_owned(),
6636 source_ref: Some("src-1".to_owned()),
6637 },
6638 crate::OperationalWrite::Append {
6639 collection: "audit_log".to_owned(),
6640 record_key: "evt-2".to_owned(),
6641 payload_json: r#"{"actor":"alice-admin","ts":200}"#.to_owned(),
6642 source_ref: Some("src-2".to_owned()),
6643 },
6644 ],
6645 })
6646 .expect("write");
6647 }
6648 let conn = sqlite::open_connection(db.path()).expect("conn");
6649 conn.execute(
6650 "DELETE FROM operational_filter_values WHERE collection_name = 'audit_log'",
6651 [],
6652 )
6653 .expect("clear filter values");
6654 drop(conn);
6655
6656 let report = service
6657 .read_operational_collection(&crate::operational::OperationalReadRequest {
6658 collection_name: "audit_log".to_owned(),
6659 filters: vec![
6660 crate::operational::OperationalFilterClause::Prefix {
6661 field: "actor".to_owned(),
6662 value: "alice".to_owned(),
6663 },
6664 crate::operational::OperationalFilterClause::Range {
6665 field: "ts".to_owned(),
6666 lower: Some(150),
6667 upper: Some(250),
6668 },
6669 ],
6670 limit: Some(10),
6671 })
6672 .expect("secondary-index read");
6673
6674 assert_eq!(report.row_count, 1);
6675 assert_eq!(report.rows[0].record_key, "evt-2");
6676 }
6677
6678 #[test]
6679 fn read_operational_collection_rejects_undeclared_fields_and_latest_state_collections() {
6680 let (_db, service) = setup();
6681 service
6682 .register_operational_collection(&OperationalRegisterRequest {
6683 name: "connector_health".to_owned(),
6684 kind: OperationalCollectionKind::LatestState,
6685 schema_json: "{}".to_owned(),
6686 retention_json: "{}".to_owned(),
6687 filter_fields_json: r#"[{"name":"status","type":"string","modes":["exact"]}]"#
6688 .to_owned(),
6689 validation_json: String::new(),
6690 secondary_indexes_json: "[]".to_owned(),
6691 format_version: 1,
6692 })
6693 .expect("register collection");
6694
6695 let latest_state_error = service
6696 .read_operational_collection(&crate::operational::OperationalReadRequest {
6697 collection_name: "connector_health".to_owned(),
6698 filters: vec![crate::operational::OperationalFilterClause::Exact {
6699 field: "status".to_owned(),
6700 value: crate::operational::OperationalFilterValue::String("ok".to_owned()),
6701 }],
6702 limit: Some(10),
6703 })
6704 .expect_err("latest_state filtered reads should be rejected");
6705 assert!(latest_state_error.to_string().contains("append_only_log"));
6706
6707 service
6708 .register_operational_collection(&OperationalRegisterRequest {
6709 name: "audit_log".to_owned(),
6710 kind: OperationalCollectionKind::AppendOnlyLog,
6711 schema_json: "{}".to_owned(),
6712 retention_json: r#"{"mode":"keep_all"}"#.to_owned(),
6713 filter_fields_json: r#"[{"name":"actor","type":"string","modes":["exact"]}]"#
6714 .to_owned(),
6715 validation_json: String::new(),
6716 secondary_indexes_json: "[]".to_owned(),
6717 format_version: 1,
6718 })
6719 .expect("register append-only collection");
6720
6721 let undeclared_error = service
6722 .read_operational_collection(&crate::operational::OperationalReadRequest {
6723 collection_name: "audit_log".to_owned(),
6724 filters: vec![crate::operational::OperationalFilterClause::Exact {
6725 field: "missing".to_owned(),
6726 value: crate::operational::OperationalFilterValue::String("x".to_owned()),
6727 }],
6728 limit: Some(10),
6729 })
6730 .expect_err("undeclared field should be rejected");
6731 assert!(undeclared_error.to_string().contains("undeclared"));
6732 }
6733
6734 #[test]
6735 fn read_operational_collection_applies_limit_and_reports_truncation() {
6736 let (db, service) = setup();
6737 service
6738 .register_operational_collection(&OperationalRegisterRequest {
6739 name: "audit_log".to_owned(),
6740 kind: OperationalCollectionKind::AppendOnlyLog,
6741 schema_json: "{}".to_owned(),
6742 retention_json: r#"{"mode":"keep_all"}"#.to_owned(),
6743 filter_fields_json: r#"[{"name":"actor","type":"string","modes":["prefix"]}]"#
6744 .to_owned(),
6745 validation_json: String::new(),
6746 secondary_indexes_json: "[]".to_owned(),
6747 format_version: 1,
6748 })
6749 .expect("register collection");
6750 {
6751 let writer = crate::WriterActor::start(
6752 db.path(),
6753 Arc::new(SchemaManager::new()),
6754 crate::ProvenanceMode::Warn,
6755 Arc::new(crate::TelemetryCounters::default()),
6756 )
6757 .expect("writer");
6758 writer
6759 .submit(crate::WriteRequest {
6760 label: "operational".to_owned(),
6761 nodes: vec![],
6762 node_retires: vec![],
6763 edges: vec![],
6764 edge_retires: vec![],
6765 chunks: vec![],
6766 runs: vec![],
6767 steps: vec![],
6768 actions: vec![],
6769 optional_backfills: vec![],
6770 vec_inserts: vec![],
6771 operational_writes: vec![
6772 crate::OperationalWrite::Append {
6773 collection: "audit_log".to_owned(),
6774 record_key: "evt-1".to_owned(),
6775 payload_json: r#"{"actor":"alice-1"}"#.to_owned(),
6776 source_ref: Some("src-1".to_owned()),
6777 },
6778 crate::OperationalWrite::Append {
6779 collection: "audit_log".to_owned(),
6780 record_key: "evt-2".to_owned(),
6781 payload_json: r#"{"actor":"alice-2"}"#.to_owned(),
6782 source_ref: Some("src-2".to_owned()),
6783 },
6784 ],
6785 })
6786 .expect("write");
6787 }
6788
6789 let report = service
6790 .read_operational_collection(&crate::operational::OperationalReadRequest {
6791 collection_name: "audit_log".to_owned(),
6792 filters: vec![crate::operational::OperationalFilterClause::Prefix {
6793 field: "actor".to_owned(),
6794 value: "alice".to_owned(),
6795 }],
6796 limit: Some(1),
6797 })
6798 .expect("limited read");
6799
6800 assert_eq!(report.row_count, 1);
6801 assert_eq!(report.applied_limit, 1);
6802 assert!(report.was_limited);
6803 assert_eq!(report.rows[0].record_key, "evt-2");
6804 }
6805
    /// Starts from a hand-built database whose `operational_collections`
    /// table has no filter-contract column, then checks that:
    /// 1. a filtered read is rejected as using an undeclared field,
    /// 2. `update_operational_collection_filters` installs a filter contract,
    /// 3. a subsequent range read on `ts` returns the pre-existing event.
    #[test]
    fn preexisting_operational_collection_can_gain_filter_contract_after_upgrade() {
        let db = NamedTempFile::new().expect("temp db");
        let conn = sqlite::open_connection(db.path()).expect("conn");
        // Minimal legacy tables plus one collection and one event (payload ts = 0).
        conn.execute_batch(
            r#"
            CREATE TABLE operational_collections (
                name TEXT PRIMARY KEY,
                kind TEXT NOT NULL,
                schema_json TEXT NOT NULL,
                retention_json TEXT NOT NULL,
                format_version INTEGER NOT NULL DEFAULT 1,
                created_at INTEGER NOT NULL DEFAULT 100,
                disabled_at INTEGER
            );
            CREATE TABLE operational_mutations (
                id TEXT PRIMARY KEY,
                collection_name TEXT NOT NULL,
                record_key TEXT NOT NULL,
                op_kind TEXT NOT NULL,
                payload_json TEXT NOT NULL,
                source_ref TEXT,
                created_at INTEGER NOT NULL DEFAULT 100,
                mutation_order INTEGER NOT NULL DEFAULT 1
            );
            INSERT INTO operational_collections (name, kind, schema_json, retention_json, format_version, created_at)
            VALUES ('audit_log', 'append_only_log', '{}', '{"mode":"keep_all"}', 1, 100);
            INSERT INTO operational_mutations
                (id, collection_name, record_key, op_kind, payload_json, source_ref, created_at, mutation_order)
            VALUES
                ('evt-1', 'audit_log', 'evt-1', 'append', '{"actor":"alice","ts":0}', 'src-1', 100, 1);
            "#,
        )
        .expect("seed pre-v10 schema");
        // Close the seeding connection before the service touches the file.
        drop(conn);

        let service = AdminService::new(db.path(), Arc::new(SchemaManager::new()));
        // Before any filter contract exists, filtering on `actor` must be refused.
        let pre_update = service
            .read_operational_collection(&crate::operational::OperationalReadRequest {
                collection_name: "audit_log".to_owned(),
                filters: vec![crate::operational::OperationalFilterClause::Exact {
                    field: "actor".to_owned(),
                    value: crate::operational::OperationalFilterValue::String("alice".to_owned()),
                }],
                limit: Some(10),
            })
            .expect_err("read should reject undeclared fields before migration update");
        assert!(pre_update.to_string().contains("undeclared"));

        // Declare `actor` (exact) and `ts` (range) as filterable fields.
        let updated = service
            .update_operational_collection_filters(
                "audit_log",
                r#"[{"name":"actor","type":"string","modes":["exact"]},{"name":"ts","type":"timestamp","modes":["range"]}]"#,
            )
            .expect("update filter contract");
        assert!(updated.filter_fields_json.contains("\"actor\""));

        // The event seeded before the update (ts = 0) must now be found by a
        // range read whose lower and upper bounds are both 0.
        let report = service
            .read_operational_collection(&crate::operational::OperationalReadRequest {
                collection_name: "audit_log".to_owned(),
                filters: vec![crate::operational::OperationalFilterClause::Range {
                    field: "ts".to_owned(),
                    lower: Some(0),
                    upper: Some(0),
                }],
                limit: Some(10),
            })
            .expect("read after explicit filter update");
        assert_eq!(report.row_count, 1);
        assert_eq!(report.rows[0].record_key, "evt-1");
    }
6877
    /// Inserts a vector row for a chunk id (`ghost-chunk`) that this test
    /// never creates as a chunk, then expects `check_semantics` to count one
    /// stale vec row and emit a warning containing "stale vec".
    ///
    /// Only compiled when the `sqlite-vec` feature is enabled.
    #[cfg(feature = "sqlite-vec")]
    #[test]
    fn check_semantics_detects_stale_vec_rows() {
        use crate::sqlite::open_connection_with_vec;

        let db = NamedTempFile::new().expect("temp file");
        let schema = Arc::new(SchemaManager::new());
        {
            let conn = open_connection_with_vec(db.path()).expect("vec conn");
            schema.bootstrap(&conn).expect("bootstrap");
            // Vector profile "default" backed by `vec_nodes_active`, dimension 3.
            schema
                .ensure_vector_profile(&conn, "default", "vec_nodes_active", 3)
                .expect("vec profile");
            // Three f32 components serialised as little-endian bytes.
            let bytes: Vec<u8> = [0.1f32, 0.2f32, 0.3f32]
                .iter()
                .flat_map(|f| f.to_le_bytes())
                .collect();
            conn.execute(
                "INSERT INTO vec_nodes_active (chunk_id, embedding) VALUES ('ghost-chunk', ?1)",
                rusqlite::params![bytes],
            )
            .expect("insert stale vec row");
        }
        let service = AdminService::new(db.path(), Arc::clone(&schema));
        let report = service.check_semantics().expect("semantics check");
        assert_eq!(report.stale_vec_rows, 1);
        assert!(
            report.warnings.iter().any(|w| w.contains("stale vec")),
            "warning must mention stale vec"
        );
    }
6910
6911 #[cfg(feature = "sqlite-vec")]
6912 #[test]
6913 fn restore_vector_profiles_recreates_vec_table_from_metadata() {
6914 let db = NamedTempFile::new().expect("temp file");
6915 let schema = Arc::new(SchemaManager::new());
6916 {
6917 let conn = crate::sqlite::open_connection_with_vec(db.path()).expect("vec conn");
6918 schema.bootstrap(&conn).expect("bootstrap");
6919 conn.execute(
6920 "INSERT INTO vector_profiles (profile, table_name, dimension, enabled) \
6921 VALUES ('default', 'vec_nodes_active', 3, 1)",
6922 [],
6923 )
6924 .expect("insert vector profile");
6925 }
6926
6927 let service = AdminService::new(db.path(), Arc::clone(&schema));
6928 let report = service
6929 .restore_vector_profiles()
6930 .expect("restore vector profiles");
6931 assert_eq!(
6932 report.targets,
6933 vec![crate::projection::ProjectionTarget::Vec]
6934 );
6935 assert_eq!(report.rebuilt_rows, 1);
6936
6937 let conn = crate::sqlite::open_connection_with_vec(db.path()).expect("vec conn");
6938 let count: i64 = conn
6939 .query_row(
6940 "SELECT count(*) FROM sqlite_schema WHERE name = 'vec_nodes_active'",
6941 [],
6942 |row| row.get(0),
6943 )
6944 .expect("vec schema count");
6945 assert_eq!(count, 1, "vec table should exist after restore");
6946 }
6947
6948 #[cfg(feature = "sqlite-vec")]
6949 #[test]
6950 fn load_vector_regeneration_config_supports_json_and_toml() {
6951 let dir = tempfile::tempdir().expect("temp dir");
6952 let json_path = dir.path().join("regen.json");
6953 let toml_path = dir.path().join("regen.toml");
6954
6955 let config = VectorRegenerationConfig {
6956 profile: "default".to_owned(),
6957 table_name: "vec_nodes_active".to_owned(),
6958 model_identity: "model-a".to_owned(),
6959 model_version: "1.0".to_owned(),
6960 dimension: 4,
6961 normalization_policy: "l2".to_owned(),
6962 chunking_policy: "per_chunk".to_owned(),
6963 preprocessing_policy: "trim".to_owned(),
6964 generator_command: vec!["/bin/echo".to_owned()],
6965 };
6966
6967 fs::write(&json_path, serde_json::to_string(&config).expect("json")).expect("write json");
6968 fs::write(&toml_path, toml::to_string(&config).expect("toml")).expect("write toml");
6969
6970 let parsed_json = load_vector_regeneration_config(&json_path).expect("json parse");
6971 let parsed_toml = load_vector_regeneration_config(&toml_path).expect("toml parse");
6972
6973 assert_eq!(parsed_json, config);
6974 assert_eq!(parsed_toml, config);
6975 }
6976
    /// When the crate is built without the `sqlite-vec` feature (on unix),
    /// regenerating embeddings is expected to fail with an
    /// "unsupported vec capability" error while still recording one
    /// `vector_regeneration_requested` and one `vector_regeneration_failed`
    /// provenance event for the `default` profile, the latter carrying the
    /// matching `failure_class` in its metadata.
    #[cfg(all(not(feature = "sqlite-vec"), unix))]
    #[test]
    fn regenerate_vector_embeddings_unsupported_vec_capability_writes_request_and_failed_audit() {
        let db = NamedTempFile::new().expect("temp file");
        let schema = Arc::new(SchemaManager::new());
        let temp_dir = tempfile::tempdir().expect("temp dir");
        let script_path = temp_dir.path().join("vector-generator-no-vec.sh");

        // Generator script: reads a JSON payload on stdin and emits a fixed
        // 4-dimensional embedding for every chunk it was given.
        fs::write(
            &script_path,
            r#"#!/usr/bin/env bash
set -euo pipefail
python3 -c 'import json, sys
payload = json.load(sys.stdin)
embeddings = [{"chunk_id": chunk["chunk_id"], "embedding": [1.0, 0.0, 0.0, 0.0]} for chunk in payload["chunks"]]
json.dump({"embeddings": embeddings}, sys.stdout)'
"#,
        )
        .expect("write generator script");
        // Mark the script as executable (rwxr-xr-x).
        set_file_mode(&script_path, 0o755);

        {
            // Seed one document node with a single chunk.
            let conn = sqlite::open_connection(db.path()).expect("connection");
            schema.bootstrap(&conn).expect("bootstrap");
            conn.execute(
                "INSERT INTO nodes (row_id, logical_id, kind, properties, created_at, source_ref) \
                 VALUES ('row-1', 'doc-1', 'Document', '{}', 100, 'source-1')",
                [],
            )
            .expect("insert node");
            conn.execute(
                "INSERT INTO chunks (id, node_logical_id, text_content, created_at) \
                 VALUES ('chunk-1', 'doc-1', 'budget discussion', 100)",
                [],
            )
            .expect("insert chunk");
        }

        let service = AdminService::new(db.path(), Arc::clone(&schema));
        let error = service
            .regenerate_vector_embeddings(&VectorRegenerationConfig {
                profile: "default".to_owned(),
                table_name: "vec_nodes_active".to_owned(),
                model_identity: "test-model".to_owned(),
                model_version: "1.0.0".to_owned(),
                dimension: 4,
                normalization_policy: "l2".to_owned(),
                chunking_policy: "per_chunk".to_owned(),
                preprocessing_policy: "trim".to_owned(),
                generator_command: vec![script_path.to_string_lossy().to_string()],
            })
            .expect_err("sqlite-vec capability should be required");

        assert!(error.to_string().contains("unsupported vec capability"));

        // Both the request and the failure must have been audited exactly once.
        let conn = sqlite::open_connection(db.path()).expect("connection");
        let request_count: i64 = conn
            .query_row(
                "SELECT count(*) FROM provenance_events WHERE event_type = 'vector_regeneration_requested' AND subject = 'default'",
                [],
                |row| row.get(0),
            )
            .expect("request count");
        assert_eq!(request_count, 1);
        let failed_count: i64 = conn
            .query_row(
                "SELECT count(*) FROM provenance_events WHERE event_type = 'vector_regeneration_failed' AND subject = 'default'",
                [],
                |row| row.get(0),
            )
            .expect("failed count");
        assert_eq!(failed_count, 1);
        // The failure event's metadata names the failure class.
        let metadata_json: String = conn
            .query_row(
                "SELECT metadata_json FROM provenance_events WHERE event_type = 'vector_regeneration_failed' AND subject = 'default'",
                [],
                |row| row.get(0),
            )
            .expect("failed metadata");
        assert!(metadata_json.contains("\"failure_class\":\"unsupported vec capability\""));
    }
7058
/// Happy path for vector regeneration.
///
/// Seeds one node with two chunks, runs a generator script that returns one
/// four-dimensional embedding per chunk, and then checks the returned report,
/// the row count in `vec_nodes_active`, the persisted embedding contract, and
/// the `vector_regeneration_requested` / `vector_regeneration_apply`
/// provenance events.
#[cfg(feature = "sqlite-vec")]
#[test]
fn regenerate_vector_embeddings_rebuilds_embeddings_from_generator() {
    let db = NamedTempFile::new().expect("temp file");
    let schema = Arc::new(SchemaManager::new());
    let temp_dir = tempfile::tempdir().expect("temp dir");
    let script_path = temp_dir.path().join("vector-generator.sh");

    // Chunks mentioning "budget" land on the first axis, everything else on the second.
    let generator_script = r#"#!/usr/bin/env bash
set -euo pipefail
python3 -c 'import json, sys
payload = json.load(sys.stdin)
embeddings = []
for chunk in payload["chunks"]:
    text = chunk["text_content"].lower()
    if "budget" in text:
        embedding = [1.0, 0.0, 0.0, 0.0]
    else:
        embedding = [0.0, 1.0, 0.0, 0.0]
    embeddings.append({"chunk_id": chunk["chunk_id"], "embedding": embedding})
json.dump({"embeddings": embeddings}, sys.stdout)'
"#;
    fs::write(&script_path, generator_script).expect("write generator script");
    set_file_mode(&script_path, 0o755);

    // Seed through a short-lived connection that is closed before the service runs.
    let seed_conn = crate::sqlite::open_connection_with_vec(db.path()).expect("vec conn");
    schema.bootstrap(&seed_conn).expect("bootstrap");
    let seed_statements = [
        (
            "INSERT INTO nodes (row_id, logical_id, kind, properties, created_at, source_ref) \
             VALUES ('row-1', 'doc-1', 'Document', '{}', 100, 'source-1')",
            "insert node",
        ),
        (
            "INSERT INTO chunks (id, node_logical_id, text_content, created_at) \
             VALUES ('chunk-1', 'doc-1', 'budget discussion', 100)",
            "insert chunk 1",
        ),
        (
            "INSERT INTO chunks (id, node_logical_id, text_content, created_at) \
             VALUES ('chunk-2', 'doc-1', 'travel plan', 101)",
            "insert chunk 2",
        ),
    ];
    for (sql, label) in seed_statements {
        seed_conn.execute(sql, []).expect(label);
    }
    drop(seed_conn);

    let service = AdminService::new(db.path(), Arc::clone(&schema));
    let config = VectorRegenerationConfig {
        profile: String::from("default"),
        table_name: String::from("vec_nodes_active"),
        model_identity: String::from("test-model"),
        model_version: String::from("1.0.0"),
        dimension: 4,
        normalization_policy: String::from("l2"),
        chunking_policy: String::from("per_chunk"),
        preprocessing_policy: String::from("trim"),
        generator_command: vec![script_path.to_string_lossy().into_owned()],
    };
    let report = service
        .regenerate_vector_embeddings(&config)
        .expect("regenerate vectors");

    assert_eq!(report.profile, "default");
    assert_eq!(report.table_name, "vec_nodes_active");
    assert_eq!(report.dimension, 4);
    assert_eq!(report.total_chunks, 2);
    assert_eq!(report.regenerated_rows, 2);
    assert!(report.contract_persisted);

    // Inspect the database directly; each helper reads the first column of a single row.
    let conn = crate::sqlite::open_connection_with_vec(db.path()).expect("vec conn");
    let scalar_i64 = |sql: &str, label: &str| -> i64 {
        conn.query_row(sql, [], |row| row.get::<_, i64>(0))
            .expect(label)
    };
    let scalar_text = |sql: &str, label: &str| -> String {
        conn.query_row(sql, [], |row| row.get::<_, String>(0))
            .expect(label)
    };

    assert_eq!(
        scalar_i64("SELECT count(*) FROM vec_nodes_active", "vec count"),
        2
    );

    assert_eq!(
        scalar_i64(
            "SELECT count(*) FROM vector_embedding_contracts WHERE profile = 'default'",
            "contract count",
        ),
        1
    );
    assert!(
        scalar_i64(
            "SELECT applied_at FROM vector_embedding_contracts WHERE profile = 'default'",
            "applied_at",
        ) > 0
    );
    assert!(
        !scalar_text(
            "SELECT snapshot_hash FROM vector_embedding_contracts WHERE profile = 'default'",
            "snapshot_hash",
        )
        .is_empty()
    );
    assert_eq!(
        scalar_i64(
            "SELECT contract_format_version FROM vector_embedding_contracts WHERE profile = 'default'",
            "contract_format_version",
        ),
        1
    );
    assert_eq!(
        scalar_i64(
            "SELECT count(*) FROM provenance_events WHERE event_type = 'vector_regeneration_requested' AND subject = 'default'",
            "request audit count",
        ),
        1
    );
    assert_eq!(
        scalar_i64(
            "SELECT count(*) FROM provenance_events WHERE event_type = 'vector_regeneration_apply' AND subject = 'default'",
            "apply audit count",
        ),
        1
    );
    let apply_metadata = scalar_text(
        "SELECT metadata_json FROM provenance_events WHERE event_type = 'vector_regeneration_apply' AND subject = 'default'",
        "apply metadata",
    );
    assert!(apply_metadata.contains(r#""profile":"default""#));
    assert!(apply_metadata.contains(r#""snapshot_hash":"#));
}
7198
/// A generator that exits with a non-zero status must leave existing state alone.
///
/// The database is seeded with an "old" embedding contract and one vector row.
/// After the failing run the test checks that the contract's `model_identity`
/// and `snapshot_hash` are still the seeded values, that the vector row count
/// is still one, and that exactly one `vector_regeneration_failed` provenance
/// event was recorded with the `generator nonzero exit` failure class.
#[cfg(feature = "sqlite-vec")]
#[test]
fn regenerate_vector_embeddings_failure_leaves_contract_and_vec_rows_unchanged() {
    let db = NamedTempFile::new().expect("temp file");
    let schema = Arc::new(SchemaManager::new());
    let temp_dir = tempfile::tempdir().expect("temp dir");
    let script_path = temp_dir.path().join("vector-generator-fail.sh");

    let failing_script = concat!(
        "#!/usr/bin/env bash\n",
        "set -euo pipefail\n",
        "echo 'generator boom' >&2\n",
        "exit 17\n",
    );
    fs::write(&script_path, failing_script).expect("write failing script");
    set_file_mode(&script_path, 0o755);

    // Seed pre-existing state through a connection that is closed before the service runs.
    let seed_conn = crate::sqlite::open_connection_with_vec(db.path()).expect("vec conn");
    schema.bootstrap(&seed_conn).expect("bootstrap");
    seed_conn
        .execute(
            "INSERT INTO nodes (row_id, logical_id, kind, properties, created_at, source_ref) \
             VALUES ('row-1', 'doc-1', 'Document', '{}', 100, 'source-1')",
            [],
        )
        .expect("insert node");
    seed_conn
        .execute(
            "INSERT INTO chunks (id, node_logical_id, text_content, created_at) \
             VALUES ('chunk-1', 'doc-1', 'budget discussion', 100)",
            [],
        )
        .expect("insert chunk");
    schema
        .ensure_vector_profile(&seed_conn, "default", "vec_nodes_active", 4)
        .expect("ensure vec profile");
    let contract_insert_sql = r"
        INSERT INTO vector_embedding_contracts (
            profile,
            table_name,
            model_identity,
            model_version,
            dimension,
            normalization_policy,
            chunking_policy,
            preprocessing_policy,
            generator_command_json,
            applied_at,
            snapshot_hash
        ) VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9, ?10, ?11)
        ";
    seed_conn
        .execute(
            contract_insert_sql,
            rusqlite::params![
                "default",
                "vec_nodes_active",
                "old-model",
                "0.9.0",
                4,
                "l2",
                "per_chunk",
                "trim",
                r#"["/bin/echo"]"#,
                111,
                "old-snapshot"
            ],
        )
        .expect("seed contract");
    seed_conn
        .execute(
            "INSERT INTO vec_nodes_active (chunk_id, embedding) VALUES ('chunk-1', zeroblob(16))",
            [],
        )
        .expect("seed vec row");
    drop(seed_conn);

    let service = AdminService::new(db.path(), Arc::clone(&schema));
    let config = VectorRegenerationConfig {
        profile: String::from("default"),
        table_name: String::from("vec_nodes_active"),
        model_identity: String::from("new-model"),
        model_version: String::from("1.0.0"),
        dimension: 4,
        normalization_policy: String::from("l2"),
        chunking_policy: String::from("per_chunk"),
        preprocessing_policy: String::from("trim"),
        generator_command: vec![script_path.to_string_lossy().into_owned()],
    };
    let policy = VectorGeneratorPolicy::default();
    let error = service
        .regenerate_vector_embeddings_with_policy(&config, &policy)
        .expect_err("generator should fail");

    assert!(error.to_string().contains("generator nonzero exit"));

    // Each helper reads the first column of a single-row query.
    let conn = crate::sqlite::open_connection_with_vec(db.path()).expect("vec conn");
    let scalar_text = |sql: &str, label: &str| -> String {
        conn.query_row(sql, [], |row| row.get::<_, String>(0))
            .expect(label)
    };
    let scalar_i64 = |sql: &str, label: &str| -> i64 {
        conn.query_row(sql, [], |row| row.get::<_, i64>(0))
            .expect(label)
    };

    assert_eq!(
        scalar_text(
            "SELECT model_identity FROM vector_embedding_contracts WHERE profile = 'default'",
            "model identity",
        ),
        "old-model"
    );
    assert_eq!(
        scalar_text(
            "SELECT snapshot_hash FROM vector_embedding_contracts WHERE profile = 'default'",
            "snapshot hash",
        ),
        "old-snapshot"
    );
    assert_eq!(
        scalar_i64("SELECT count(*) FROM vec_nodes_active", "vec count"),
        1
    );
    assert_eq!(
        scalar_i64(
            "SELECT count(*) FROM provenance_events WHERE event_type = 'vector_regeneration_failed' AND subject = 'default'",
            "failure count",
        ),
        1
    );
    let failure_metadata = scalar_text(
        "SELECT metadata_json FROM provenance_events WHERE event_type = 'vector_regeneration_failed' AND subject = 'default'",
        "failure metadata",
    );
    assert!(failure_metadata.contains(r#""failure_class":"generator nonzero exit""#));
}
7330
/// Snapshot drift during generation must be reported as retryable and must not mutate state.
///
/// The generator script inserts an extra chunk into the database while it is
/// running, so the chunk set changes between the request and the apply step.
/// The test expects an error mentioning `vector regeneration snapshot drift:`
/// and `[retryable]`, no contract rows, no vector rows, and exactly one
/// `vector_regeneration_failed` provenance event.
#[cfg(feature = "sqlite-vec")]
#[test]
fn regenerate_vector_embeddings_snapshot_drift_is_retryable_and_non_mutating() {
    let db = NamedTempFile::new().expect("temp file");
    let schema = Arc::new(SchemaManager::new());
    let temp_dir = tempfile::tempdir().expect("temp dir");
    let script_path = temp_dir.path().join("vector-generator-drift.sh");
    let db_path = db.path().to_string_lossy().into_owned();

    // `{db_path:?}` renders the path as a quoted literal inside the Python source.
    let script_body = format!(
        r#"#!/usr/bin/env bash
set -euo pipefail
python3 -c 'import json, sqlite3, sys
payload = json.load(sys.stdin)
conn = sqlite3.connect({db_path:?})
conn.execute("INSERT INTO chunks (id, node_logical_id, text_content, created_at) VALUES (?, ?, ?, ?)", ("chunk-2", "doc-1", "late arriving text", 101))
conn.commit()
conn.close()
embeddings = [{{"chunk_id": chunk["chunk_id"], "embedding": [1.0, 0.0, 0.0, 0.0]}} for chunk in payload["chunks"]]
json.dump({{"embeddings": embeddings}}, sys.stdout)'
"#
    );
    fs::write(&script_path, script_body).expect("write drift script");
    set_file_mode(&script_path, 0o755);

    // Seed through a connection that is closed before the service runs.
    let seed_conn = crate::sqlite::open_connection_with_vec(db.path()).expect("vec conn");
    schema.bootstrap(&seed_conn).expect("bootstrap");
    let seed_statements = [
        (
            "INSERT INTO nodes (row_id, logical_id, kind, properties, created_at, source_ref) \
             VALUES ('row-1', 'doc-1', 'Document', '{}', 100, 'source-1')",
            "insert node",
        ),
        (
            "INSERT INTO chunks (id, node_logical_id, text_content, created_at) \
             VALUES ('chunk-1', 'doc-1', 'budget discussion', 100)",
            "insert chunk",
        ),
    ];
    for (sql, label) in seed_statements {
        seed_conn.execute(sql, []).expect(label);
    }
    schema
        .ensure_vector_profile(&seed_conn, "default", "vec_nodes_active", 4)
        .expect("ensure vec profile");
    drop(seed_conn);

    let service = AdminService::new(db.path(), Arc::clone(&schema));
    let config = VectorRegenerationConfig {
        profile: String::from("default"),
        table_name: String::from("vec_nodes_active"),
        model_identity: String::from("test-model"),
        model_version: String::from("1.0.0"),
        dimension: 4,
        normalization_policy: String::from("l2"),
        chunking_policy: String::from("per_chunk"),
        preprocessing_policy: String::from("trim"),
        generator_command: vec![script_path.to_string_lossy().into_owned()],
    };
    let policy = VectorGeneratorPolicy::default();
    let error = service
        .regenerate_vector_embeddings_with_policy(&config, &policy)
        .expect_err("snapshot drift should fail");

    let message = error.to_string();
    assert!(message.contains("vector regeneration snapshot drift:"));
    assert!(message.contains("[retryable]"));

    let conn = crate::sqlite::open_connection_with_vec(db.path()).expect("vec conn");
    let scalar_i64 = |sql: &str, label: &str| -> i64 {
        conn.query_row(sql, [], |row| row.get::<_, i64>(0))
            .expect(label)
    };
    assert_eq!(
        scalar_i64(
            "SELECT count(*) FROM vector_embedding_contracts",
            "contract count",
        ),
        0
    );
    assert_eq!(
        scalar_i64("SELECT count(*) FROM vec_nodes_active", "vec count"),
        0
    );
    assert_eq!(
        scalar_i64(
            "SELECT count(*) FROM provenance_events WHERE event_type = 'vector_regeneration_failed' AND subject = 'default'",
            "failure count",
        ),
        1
    );
}
7428
/// A generator that sleeps for one second under a 50 ms timeout must fail with
/// an error mentioning `generator timeout`.
#[cfg(feature = "sqlite-vec")]
#[test]
fn regenerate_vector_embeddings_times_out_and_kills_generator() {
    let (_db, service) = setup();
    let temp_dir = tempfile::tempdir().expect("temp dir");
    let script_path = temp_dir.path().join("vector-generator-timeout.sh");

    let slow_script = concat!(
        "#!/usr/bin/env bash\n",
        "set -euo pipefail\n",
        "sleep 1\n",
        "printf '{\"embeddings\":[]}'\n",
    );
    fs::write(&script_path, slow_script).expect("write timeout script");
    set_file_mode(&script_path, 0o755);

    let config = VectorRegenerationConfig {
        profile: String::from("default"),
        table_name: String::from("vec_nodes_active"),
        model_identity: String::from("model"),
        model_version: String::from("1.0.0"),
        dimension: 4,
        normalization_policy: String::from("l2"),
        chunking_policy: String::from("per_chunk"),
        preprocessing_policy: String::from("trim"),
        generator_command: vec![script_path.to_string_lossy().into_owned()],
    };
    // The timeout is far shorter than the script's sleep, so the deadline always elapses first.
    let policy = VectorGeneratorPolicy {
        timeout_ms: 50,
        max_stdout_bytes: 1024,
        max_stderr_bytes: 1024,
        max_input_bytes: 1024,
        max_chunks: 10,
        require_absolute_executable: true,
        reject_world_writable_executable: true,
        allowed_executable_roots: Vec::new(),
        preserve_env_vars: Vec::new(),
    };

    let error = service
        .regenerate_vector_embeddings_with_policy(&config, &policy)
        .expect_err("generator should time out");
    assert!(error.to_string().contains("generator timeout"));
}
7471
/// A generator that writes 5000 bytes to stdout under a 128-byte stdout limit
/// must fail with an error mentioning `stdout overflow`.
#[cfg(feature = "sqlite-vec")]
#[test]
fn regenerate_vector_embeddings_rejects_oversized_stdout() {
    let (_db, service) = setup();
    let temp_dir = tempfile::tempdir().expect("temp dir");
    let script_path = temp_dir.path().join("vector-generator-stdout.sh");

    let noisy_script = concat!(
        "#!/usr/bin/env bash\n",
        "set -euo pipefail\n",
        "python3 -c 'import sys; sys.stdout.write(\"x\" * 5000)'\n",
    );
    fs::write(&script_path, noisy_script).expect("write stdout script");
    set_file_mode(&script_path, 0o755);

    let config = VectorRegenerationConfig {
        profile: String::from("default"),
        table_name: String::from("vec_nodes_active"),
        model_identity: String::from("model"),
        model_version: String::from("1.0.0"),
        dimension: 4,
        normalization_policy: String::from("l2"),
        chunking_policy: String::from("per_chunk"),
        preprocessing_policy: String::from("trim"),
        generator_command: vec![script_path.to_string_lossy().into_owned()],
    };
    // Only the stdout budget is tight; every other limit is generous.
    let policy = VectorGeneratorPolicy {
        timeout_ms: 1000,
        max_stdout_bytes: 128,
        max_stderr_bytes: 1024,
        max_input_bytes: 1024,
        max_chunks: 10,
        require_absolute_executable: true,
        reject_world_writable_executable: true,
        allowed_executable_roots: Vec::new(),
        preserve_env_vars: Vec::new(),
    };

    let error = service
        .regenerate_vector_embeddings_with_policy(&config, &policy)
        .expect_err("generator stdout should overflow");
    assert!(error.to_string().contains("stdout overflow"));
}
7514
/// A generator that writes 5000 bytes to stderr (then exits with status 7)
/// under a 128-byte stderr limit must fail with an error mentioning
/// `stderr overflow`.
#[cfg(feature = "sqlite-vec")]
#[test]
fn regenerate_vector_embeddings_rejects_oversized_stderr() {
    let (_db, service) = setup();
    let temp_dir = tempfile::tempdir().expect("temp dir");
    let script_path = temp_dir.path().join("vector-generator-stderr.sh");

    let noisy_script = concat!(
        "#!/usr/bin/env bash\n",
        "set -euo pipefail\n",
        "python3 -c 'import sys; sys.stderr.write(\"e\" * 5000); sys.exit(7)'\n",
    );
    fs::write(&script_path, noisy_script).expect("write stderr script");
    set_file_mode(&script_path, 0o755);

    let config = VectorRegenerationConfig {
        profile: String::from("default"),
        table_name: String::from("vec_nodes_active"),
        model_identity: String::from("model"),
        model_version: String::from("1.0.0"),
        dimension: 4,
        normalization_policy: String::from("l2"),
        chunking_policy: String::from("per_chunk"),
        preprocessing_policy: String::from("trim"),
        generator_command: vec![script_path.to_string_lossy().into_owned()],
    };
    // Only the stderr budget is tight; every other limit is generous.
    let policy = VectorGeneratorPolicy {
        timeout_ms: 1000,
        max_stdout_bytes: 1024,
        max_stderr_bytes: 128,
        max_input_bytes: 1024,
        max_chunks: 10,
        require_absolute_executable: true,
        reject_world_writable_executable: true,
        allowed_executable_roots: Vec::new(),
        preserve_env_vars: Vec::new(),
    };

    let error = service
        .regenerate_vector_embeddings_with_policy(&config, &policy)
        .expect_err("generator stderr should overflow");
    assert!(error.to_string().contains("stderr overflow"));
}
7557
/// With `max_input_bytes` set to 32 and a seeded chunk whose text is longer
/// than that, regeneration must fail with an error mentioning
/// `payload too large`.
#[cfg(feature = "sqlite-vec")]
#[test]
fn regenerate_vector_embeddings_rejects_oversized_input_before_spawn() {
    let db = NamedTempFile::new().expect("temp file");
    let schema = Arc::new(SchemaManager::new());

    // Seed through a connection that is closed before the service runs.
    let seed_conn = crate::sqlite::open_connection_with_vec(db.path()).expect("vec conn");
    schema.bootstrap(&seed_conn).expect("bootstrap");
    let seed_statements = [
        (
            "INSERT INTO nodes (row_id, logical_id, kind, properties, created_at, source_ref) \
             VALUES ('row-1', 'doc-1', 'Document', '{}', 100, 'source-1')",
            "insert node",
        ),
        (
            "INSERT INTO chunks (id, node_logical_id, text_content, created_at) \
             VALUES ('chunk-1', 'doc-1', 'this chunk is intentionally long to exceed the configured input limit', 100)",
            "insert chunk",
        ),
    ];
    for (sql, label) in seed_statements {
        seed_conn.execute(sql, []).expect(label);
    }
    drop(seed_conn);

    let service = AdminService::new(db.path(), Arc::clone(&schema));
    let config = VectorRegenerationConfig {
        profile: String::from("default"),
        table_name: String::from("vec_nodes_active"),
        model_identity: String::from("model"),
        model_version: String::from("1.0.0"),
        dimension: 4,
        normalization_policy: String::from("l2"),
        chunking_policy: String::from("per_chunk"),
        preprocessing_policy: String::from("trim"),
        generator_command: vec![String::from("/bin/echo")],
    };
    // Only the input budget is tight; every other limit is generous.
    let policy = VectorGeneratorPolicy {
        timeout_ms: 1000,
        max_stdout_bytes: 1024,
        max_stderr_bytes: 1024,
        max_input_bytes: 32,
        max_chunks: 10,
        require_absolute_executable: true,
        reject_world_writable_executable: true,
        allowed_executable_roots: Vec::new(),
        preserve_env_vars: Vec::new(),
    };

    let error = service
        .regenerate_vector_embeddings_with_policy(&config, &policy)
        .expect_err("input size should be rejected before spawn");
    assert!(error.to_string().contains("payload too large"));
}
7609
/// With `max_chunks` set to 1 and two chunks seeded, regeneration must fail
/// with an error mentioning `payload too large`.
#[cfg(feature = "sqlite-vec")]
#[test]
fn regenerate_vector_embeddings_rejects_excessive_chunk_count_before_spawn() {
    let db = NamedTempFile::new().expect("temp file");
    let schema = Arc::new(SchemaManager::new());

    // Seed through a connection that is closed before the service runs.
    let seed_conn = crate::sqlite::open_connection_with_vec(db.path()).expect("vec conn");
    schema.bootstrap(&seed_conn).expect("bootstrap");
    let seed_statements = [
        (
            "INSERT INTO nodes (row_id, logical_id, kind, properties, created_at, source_ref) \
             VALUES ('row-1', 'doc-1', 'Document', '{}', 100, 'source-1')",
            "insert node",
        ),
        (
            "INSERT INTO chunks (id, node_logical_id, text_content, created_at) VALUES ('chunk-1', 'doc-1', 'a', 100)",
            "insert chunk 1",
        ),
        (
            "INSERT INTO chunks (id, node_logical_id, text_content, created_at) VALUES ('chunk-2', 'doc-1', 'b', 101)",
            "insert chunk 2",
        ),
    ];
    for (sql, label) in seed_statements {
        seed_conn.execute(sql, []).expect(label);
    }
    drop(seed_conn);

    let service = AdminService::new(db.path(), Arc::clone(&schema));
    let config = VectorRegenerationConfig {
        profile: String::from("default"),
        table_name: String::from("vec_nodes_active"),
        model_identity: String::from("model"),
        model_version: String::from("1.0.0"),
        dimension: 4,
        normalization_policy: String::from("l2"),
        chunking_policy: String::from("per_chunk"),
        preprocessing_policy: String::from("trim"),
        generator_command: vec![String::from("/bin/echo")],
    };
    // Only the chunk-count budget is tight; the byte budget is large enough for both chunks.
    let policy = VectorGeneratorPolicy {
        timeout_ms: 1000,
        max_stdout_bytes: 1024,
        max_stderr_bytes: 1024,
        max_input_bytes: 2048,
        max_chunks: 1,
        require_absolute_executable: true,
        reject_world_writable_executable: true,
        allowed_executable_roots: Vec::new(),
        preserve_env_vars: Vec::new(),
    };

    let error = service
        .regenerate_vector_embeddings_with_policy(&config, &policy)
        .expect_err("chunk count should be rejected before spawn");
    assert!(error.to_string().contains("payload too large"));
}
7665
/// A generator that prints non-JSON output must leave existing state alone.
///
/// The database is seeded with an "old" embedding contract and one vector row.
/// After the failing run the test checks that the error mentions
/// `decode generator output`, that the contract's `model_identity` and
/// `snapshot_hash` still hold the seeded values, that the vector row count is
/// still one, and that exactly one `vector_regeneration_failed` provenance
/// event was recorded.
#[cfg(feature = "sqlite-vec")]
#[test]
fn regenerate_vector_embeddings_malformed_json_leaves_contract_and_vec_rows_unchanged() {
    let db = NamedTempFile::new().expect("temp file");
    let schema = Arc::new(SchemaManager::new());
    let temp_dir = tempfile::tempdir().expect("temp dir");
    let script_path = temp_dir.path().join("vector-generator-bad-json.sh");

    fs::write(
        &script_path,
        "#!/usr/bin/env bash\nset -euo pipefail\nprintf 'not-json'\n",
    )
    .expect("write bad json script");
    set_file_mode(&script_path, 0o755);

    // Seed pre-existing state; the scope closes the connection before the service runs.
    {
        let conn = crate::sqlite::open_connection_with_vec(db.path()).expect("vec conn");
        schema.bootstrap(&conn).expect("bootstrap");
        conn.execute(
            "INSERT INTO nodes (row_id, logical_id, kind, properties, created_at, source_ref) \
             VALUES ('row-1', 'doc-1', 'Document', '{}', 100, 'source-1')",
            [],
        )
        .expect("insert node");
        conn.execute(
            "INSERT INTO chunks (id, node_logical_id, text_content, created_at) \
             VALUES ('chunk-1', 'doc-1', 'budget discussion', 100)",
            [],
        )
        .expect("insert chunk");
        schema
            .ensure_vector_profile(&conn, "default", "vec_nodes_active", 4)
            .expect("ensure vec profile");
        conn.execute(
            r"
            INSERT INTO vector_embedding_contracts (
                profile,
                table_name,
                model_identity,
                model_version,
                dimension,
                normalization_policy,
                chunking_policy,
                preprocessing_policy,
                generator_command_json,
                applied_at,
                snapshot_hash
            ) VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9, ?10, ?11)
            ",
            rusqlite::params![
                "default",
                "vec_nodes_active",
                "old-model",
                "0.9.0",
                4,
                "l2",
                "per_chunk",
                "trim",
                "[\"/bin/echo\"]",
                111,
                "old-snapshot"
            ],
        )
        .expect("seed contract");
        conn.execute(
            "INSERT INTO vec_nodes_active (chunk_id, embedding) VALUES ('chunk-1', zeroblob(16))",
            [],
        )
        .expect("seed vec row");
    }

    let service = AdminService::new(db.path(), Arc::clone(&schema));
    let error = service
        .regenerate_vector_embeddings_with_policy(
            &VectorRegenerationConfig {
                profile: "default".to_owned(),
                table_name: "vec_nodes_active".to_owned(),
                model_identity: "new-model".to_owned(),
                model_version: "1.0.0".to_owned(),
                dimension: 4,
                normalization_policy: "l2".to_owned(),
                chunking_policy: "per_chunk".to_owned(),
                preprocessing_policy: "trim".to_owned(),
                generator_command: vec![script_path.to_string_lossy().to_string()],
            },
            &VectorGeneratorPolicy::default(),
        )
        .expect_err("bad json should fail");

    assert!(error.to_string().contains("decode generator output"));

    let conn = crate::sqlite::open_connection_with_vec(db.path()).expect("vec conn");
    let model_identity: String = conn
        .query_row(
            "SELECT model_identity FROM vector_embedding_contracts WHERE profile = 'default'",
            [],
            |row| row.get(0),
        )
        .expect("model identity");
    assert_eq!(model_identity, "old-model");
    // The snapshot hash is part of the contract too; check it as the sibling
    // non-zero-exit test does, so "contract unchanged" covers more than one column.
    let snapshot_hash: String = conn
        .query_row(
            "SELECT snapshot_hash FROM vector_embedding_contracts WHERE profile = 'default'",
            [],
            |row| row.get(0),
        )
        .expect("snapshot hash");
    assert_eq!(snapshot_hash, "old-snapshot");
    let vec_count: i64 = conn
        .query_row("SELECT count(*) FROM vec_nodes_active", [], |row| {
            row.get(0)
        })
        .expect("vec count");
    assert_eq!(vec_count, 1);
    let failure_count: i64 = conn
        .query_row(
            "SELECT count(*) FROM provenance_events WHERE event_type = 'vector_regeneration_failed' AND subject = 'default'",
            [],
            |row| row.get(0),
        )
        .expect("failure count");
    assert_eq!(failure_count, 1);
}
7781
/// A whitespace-only profile name must be rejected with an error mentioning
/// `invalid contract`, leaving no contract rows and no provenance events.
#[cfg(feature = "sqlite-vec")]
#[test]
fn regenerate_vector_embeddings_rejects_whitespace_only_profile_before_mutation() {
    let db = NamedTempFile::new().expect("temp file");
    let schema = Arc::new(SchemaManager::new());

    // Seed through a connection that is closed before the service runs.
    let seed_conn = crate::sqlite::open_connection_with_vec(db.path()).expect("vec conn");
    schema.bootstrap(&seed_conn).expect("bootstrap");
    let seed_statements = [
        (
            "INSERT INTO nodes (row_id, logical_id, kind, properties, created_at, source_ref) \
             VALUES ('row-1', 'doc-1', 'Document', '{}', 100, 'source-1')",
            "insert node",
        ),
        (
            "INSERT INTO chunks (id, node_logical_id, text_content, created_at) \
             VALUES ('chunk-1', 'doc-1', 'budget discussion', 100)",
            "insert chunk",
        ),
    ];
    for (sql, label) in seed_statements {
        seed_conn.execute(sql, []).expect(label);
    }
    drop(seed_conn);

    let service = AdminService::new(db.path(), Arc::clone(&schema));
    let config = VectorRegenerationConfig {
        profile: String::from(" "),
        table_name: String::from("vec_nodes_active"),
        model_identity: String::from("test-model"),
        model_version: String::from("1.0.0"),
        dimension: 4,
        normalization_policy: String::from("l2"),
        chunking_policy: String::from("per_chunk"),
        preprocessing_policy: String::from("trim"),
        generator_command: vec![String::from("/bin/echo")],
    };
    let error = service
        .regenerate_vector_embeddings(&config)
        .expect_err("whitespace profile should be rejected");

    assert!(error.to_string().contains("invalid contract"));

    let conn = crate::sqlite::open_connection_with_vec(db.path()).expect("vec conn");
    let scalar_i64 = |sql: &str, label: &str| -> i64 {
        conn.query_row(sql, [], |row| row.get::<_, i64>(0))
            .expect(label)
    };
    assert_eq!(
        scalar_i64(
            "SELECT count(*) FROM vector_embedding_contracts",
            "contract count",
        ),
        0
    );
    assert_eq!(
        scalar_i64("SELECT count(*) FROM provenance_events", "provenance count"),
        0
    );
}
7836
/// A generator script with mode `0o777`, run under the default policy, must
/// fail with an error mentioning `world-writable executable`.
#[cfg(feature = "sqlite-vec")]
#[test]
fn regenerate_vector_embeddings_rejects_world_writable_executable_when_policy_requires_it() {
    let (_db, service) = setup();
    let temp_dir = tempfile::tempdir().expect("temp dir");
    let script_path = temp_dir.path().join("vector-generator-world-writable.sh");

    let script = concat!(
        "#!/usr/bin/env bash\n",
        "set -euo pipefail\n",
        "printf '{\"embeddings\":[]}'\n",
    );
    fs::write(&script_path, script).expect("write script");
    // World-writable on purpose: this is the condition under test.
    set_file_mode(&script_path, 0o777);

    let config = VectorRegenerationConfig {
        profile: String::from("default"),
        table_name: String::from("vec_nodes_active"),
        model_identity: String::from("model"),
        model_version: String::from("1.0.0"),
        dimension: 4,
        normalization_policy: String::from("l2"),
        chunking_policy: String::from("per_chunk"),
        preprocessing_policy: String::from("trim"),
        generator_command: vec![script_path.to_string_lossy().into_owned()],
    };
    let policy = VectorGeneratorPolicy::default();
    let error = service
        .regenerate_vector_embeddings_with_policy(&config, &policy)
        .expect_err("world-writable executable should be rejected");

    assert!(error.to_string().contains("world-writable executable"));
}
7870
/// A generator script located outside the single allow-listed root directory
/// must fail with an error mentioning `outside allowed executable roots`.
#[cfg(feature = "sqlite-vec")]
#[test]
fn regenerate_vector_embeddings_rejects_executable_outside_allowlisted_roots() {
    let (_db, service) = setup();
    let temp_dir = tempfile::tempdir().expect("temp dir");
    let allowed_dir = tempfile::tempdir().expect("allowed dir");
    // The script lives in `temp_dir`, while only `allowed_dir` is allow-listed below.
    let script_path = temp_dir.path().join("vector-generator-outside-root.sh");

    let script = concat!(
        "#!/usr/bin/env bash\n",
        "set -euo pipefail\n",
        "printf '{\"embeddings\":[]}'\n",
    );
    fs::write(&script_path, script).expect("write script");
    set_file_mode(&script_path, 0o755);

    let config = VectorRegenerationConfig {
        profile: String::from("default"),
        table_name: String::from("vec_nodes_active"),
        model_identity: String::from("model"),
        model_version: String::from("1.0.0"),
        dimension: 4,
        normalization_policy: String::from("l2"),
        chunking_policy: String::from("per_chunk"),
        preprocessing_policy: String::from("trim"),
        generator_command: vec![script_path.to_string_lossy().into_owned()],
    };
    let policy = VectorGeneratorPolicy {
        timeout_ms: 1000,
        max_stdout_bytes: 1024,
        max_stderr_bytes: 1024,
        max_input_bytes: 1024,
        max_chunks: 10,
        require_absolute_executable: true,
        reject_world_writable_executable: true,
        allowed_executable_roots: vec![allowed_dir.path().to_string_lossy().into_owned()],
        preserve_env_vars: Vec::new(),
    };

    let error = service
        .regenerate_vector_embeddings_with_policy(&config, &policy)
        .expect_err("disallowed root should be rejected");

    let message = error.to_string();
    assert!(message.contains("outside allowed executable roots"));
}
7921
/// When the stored contract carries `contract_format_version = 99`,
/// regeneration must fail with an error mentioning both `unsupported` and
/// `format version`.
#[cfg(feature = "sqlite-vec")]
#[test]
fn regenerate_vector_embeddings_rejects_future_contract_format_version() {
    let db = NamedTempFile::new().expect("temp file");
    let schema = Arc::new(SchemaManager::new());

    // Seed through a connection that is closed before the service runs.
    let seed_conn = crate::sqlite::open_connection_with_vec(db.path()).expect("vec conn");
    schema.bootstrap(&seed_conn).expect("bootstrap");
    let seed_statements = [
        (
            "INSERT INTO nodes (row_id, logical_id, kind, properties, created_at, source_ref) \
             VALUES ('row-1', 'doc-1', 'Document', '{}', 100, 'source-1')",
            "insert node",
        ),
        (
            "INSERT INTO chunks (id, node_logical_id, text_content, created_at) \
             VALUES ('chunk-1', 'doc-1', 'budget discussion', 100)",
            "insert chunk",
        ),
    ];
    for (sql, label) in seed_statements {
        seed_conn.execute(sql, []).expect(label);
    }
    // The twelfth bound value (99) is the contract format version under test.
    let future_contract_sql = r"
        INSERT INTO vector_embedding_contracts (
            profile,
            table_name,
            model_identity,
            model_version,
            dimension,
            normalization_policy,
            chunking_policy,
            preprocessing_policy,
            generator_command_json,
            applied_at,
            snapshot_hash,
            contract_format_version,
            updated_at
        ) VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9, ?10, ?11, ?12, ?13)
        ";
    seed_conn
        .execute(
            future_contract_sql,
            rusqlite::params![
                "default",
                "vec_nodes_active",
                "old-model",
                "0.9.0",
                4,
                "l2",
                "per_chunk",
                "trim",
                r#"["/bin/echo"]"#,
                111,
                "old-snapshot",
                99,
                111,
            ],
        )
        .expect("seed future contract");
    drop(seed_conn);

    let service = AdminService::new(db.path(), Arc::clone(&schema));
    let config = VectorRegenerationConfig {
        profile: String::from("default"),
        table_name: String::from("vec_nodes_active"),
        model_identity: String::from("test-model"),
        model_version: String::from("1.0.0"),
        dimension: 4,
        normalization_policy: String::from("l2"),
        chunking_policy: String::from("per_chunk"),
        preprocessing_policy: String::from("trim"),
        generator_command: vec![String::from("/bin/echo")],
    };
    let error = service
        .regenerate_vector_embeddings(&config)
        .expect_err("future contract version should be rejected");

    let message = error.to_string();
    assert!(message.contains("unsupported"));
    assert!(message.contains("format version"));
}
7997
    /// Runs a generator script twice with `VECTOR_TEST_SECRET` set in the test
    /// process: once under the default policy (expected to fail with a
    /// "nonzero exit" error) and once with the variable listed in
    /// `preserve_env_vars` (expected to regenerate exactly one row).
    #[cfg(feature = "sqlite-vec")]
    #[test]
    fn regenerate_vector_embeddings_clears_environment_except_preserved_vars() {
        let db = NamedTempFile::new().expect("temp file");
        let schema = Arc::new(SchemaManager::new());
        let temp_dir = tempfile::tempdir().expect("temp dir");
        let script_path = temp_dir.path().join("vector-generator-env.sh");
        {
            // Seed one node with one chunk; the connection is dropped at the
            // end of this scope, before the service is used.
            let conn = crate::sqlite::open_connection_with_vec(db.path()).expect("vec conn");
            schema.bootstrap(&conn).expect("bootstrap");
            conn.execute(
                "INSERT INTO nodes (row_id, logical_id, kind, properties, created_at, source_ref) \
                 VALUES ('row-1', 'doc-1', 'Document', '{}', 100, 'source-1')",
                [],
            )
            .expect("insert node");
            conn.execute(
                "INSERT INTO chunks (id, node_logical_id, text_content, created_at) \
                 VALUES ('chunk-1', 'doc-1', 'budget discussion', 100)",
                [],
            )
            .expect("insert chunk");
        }

        // The script exits with status 9 unless VECTOR_TEST_SECRET equals
        // "expected"; otherwise it reads a JSON payload from stdin and writes a
        // single 4-element embedding for the first chunk to stdout.
        fs::write(
            &script_path,
            r#"#!/usr/bin/env bash
set -euo pipefail
if [[ "${VECTOR_TEST_SECRET:-}" != "expected" ]]; then
  echo "missing secret" >&2
  exit 9
fi
python3 -c 'import json, sys
payload = json.load(sys.stdin)
json.dump({"embeddings": [{"chunk_id": payload["chunks"][0]["chunk_id"], "embedding": [1.0, 0.0, 0.0, 0.0]}]}, sys.stdout)'
"#,
        )
        .expect("write script");
        set_file_mode(&script_path, 0o755);

        let service = AdminService::new(db.path(), Arc::clone(&schema));
        // NOTE(review): mutating the process environment is only sound when no
        // other thread reads or writes it concurrently; the test harness may
        // run tests in parallel — confirm nothing else touches the environment.
        unsafe {
            std::env::set_var("VECTOR_TEST_SECRET", "expected");
        }
        // First run: default policy. The variable is set in this process, yet
        // the script is expected to fail, i.e. the variable is not forwarded.
        let missing_env = service
            .regenerate_vector_embeddings_with_policy(
                &VectorRegenerationConfig {
                    profile: "default".to_owned(),
                    table_name: "vec_nodes_active".to_owned(),
                    model_identity: "model".to_owned(),
                    model_version: "1.0.0".to_owned(),
                    dimension: 4,
                    normalization_policy: "l2".to_owned(),
                    chunking_policy: "per_chunk".to_owned(),
                    preprocessing_policy: "trim".to_owned(),
                    generator_command: vec![script_path.to_string_lossy().to_string()],
                },
                &VectorGeneratorPolicy::default(),
            )
            .expect_err("non-preserved env var should be dropped");
        assert!(missing_env.to_string().contains("nonzero exit"));

        // Second run: identical config, but the policy names the variable in
        // `preserve_env_vars`, so the script is expected to succeed.
        let report = service
            .regenerate_vector_embeddings_with_policy(
                &VectorRegenerationConfig {
                    profile: "default".to_owned(),
                    table_name: "vec_nodes_active".to_owned(),
                    model_identity: "model".to_owned(),
                    model_version: "1.0.0".to_owned(),
                    dimension: 4,
                    normalization_policy: "l2".to_owned(),
                    chunking_policy: "per_chunk".to_owned(),
                    preprocessing_policy: "trim".to_owned(),
                    generator_command: vec![script_path.to_string_lossy().to_string()],
                },
                &VectorGeneratorPolicy {
                    timeout_ms: 1000,
                    max_stdout_bytes: 1024,
                    max_stderr_bytes: 1024,
                    max_input_bytes: 4096,
                    max_chunks: 10,
                    require_absolute_executable: true,
                    reject_world_writable_executable: true,
                    allowed_executable_roots: vec![],
                    preserve_env_vars: vec!["VECTOR_TEST_SECRET".to_owned()],
                },
            )
            .expect("preserved env var should allow success");
        assert_eq!(report.regenerated_rows, 1);
        // NOTE(review): this cleanup is skipped if any assertion above panics,
        // leaving the variable set for the remainder of the test process.
        unsafe {
            std::env::remove_var("VECTOR_TEST_SECRET");
        }
    }
8091
8092 #[test]
8093 fn check_semantics_detects_orphaned_chunk() {
8094 let (db, service) = setup();
8095 {
8096 let conn = sqlite::open_connection(db.path()).expect("conn");
8098 conn.execute(
8099 "INSERT INTO chunks (id, node_logical_id, text_content, created_at) \
8100 VALUES ('c1', 'ghost-node', 'text', 100)",
8101 [],
8102 )
8103 .expect("insert orphaned chunk");
8104 }
8105 let report = service.check_semantics().expect("semantics check");
8106 assert_eq!(report.orphaned_chunks, 1);
8107 }
8108
8109 #[test]
8110 fn check_semantics_detects_null_source_ref() {
8111 let (db, service) = setup();
8112 {
8113 let conn = sqlite::open_connection(db.path()).expect("conn");
8114 conn.execute(
8115 "INSERT INTO nodes (row_id, logical_id, kind, properties, created_at) \
8116 VALUES ('r1', 'lg1', 'Meeting', '{}', 100)",
8117 [],
8118 )
8119 .expect("insert node with null source_ref");
8120 }
8121 let report = service.check_semantics().expect("semantics check");
8122 assert_eq!(report.null_source_ref_nodes, 1);
8123 }
8124
8125 #[test]
8126 fn check_semantics_detects_broken_step_fk() {
8127 let (db, service) = setup();
8128 {
8129 let conn = sqlite::open_connection(db.path()).expect("conn");
8132 conn.execute_batch("PRAGMA foreign_keys = OFF;")
8133 .expect("disable FK");
8134 conn.execute(
8135 "INSERT INTO steps (id, run_id, kind, status, properties, created_at) \
8136 VALUES ('s1', 'ghost-run', 'llm', 'completed', '{}', 100)",
8137 [],
8138 )
8139 .expect("insert step with ghost run_id");
8140 }
8141 let report = service.check_semantics().expect("semantics check");
8142 assert_eq!(report.broken_step_fk, 1);
8143 }
8144
8145 #[test]
8146 fn check_semantics_detects_broken_action_fk() {
8147 let (db, service) = setup();
8148 {
8149 let conn = sqlite::open_connection(db.path()).expect("conn");
8150 conn.execute_batch("PRAGMA foreign_keys = OFF;")
8151 .expect("disable FK");
8152 conn.execute(
8153 "INSERT INTO actions (id, step_id, kind, status, properties, created_at) \
8154 VALUES ('a1', 'ghost-step', 'emit', 'completed', '{}', 100)",
8155 [],
8156 )
8157 .expect("insert action with ghost step_id");
8158 }
8159 let report = service.check_semantics().expect("semantics check");
8160 assert_eq!(report.broken_action_fk, 1);
8161 }
8162
8163 #[test]
8164 fn check_semantics_detects_stale_fts_rows() {
8165 let (db, service) = setup();
8166 {
8167 let conn = sqlite::open_connection(db.path()).expect("conn");
8168 conn.execute(
8171 "INSERT INTO fts_nodes (chunk_id, node_logical_id, kind, text_content) \
8172 VALUES ('ghost-chunk', 'any-node', 'Meeting', 'stale content')",
8173 [],
8174 )
8175 .expect("insert stale FTS row");
8176 }
8177 let report = service.check_semantics().expect("semantics check");
8178 assert_eq!(report.stale_fts_rows, 1);
8179 }
8180
8181 #[test]
8182 fn check_semantics_detects_fts_rows_for_superseded_nodes() {
8183 let (db, service) = setup();
8184 {
8185 let conn = sqlite::open_connection(db.path()).expect("conn");
8186 conn.execute(
8188 "INSERT INTO nodes (row_id, logical_id, kind, properties, created_at, superseded_at, source_ref) \
8189 VALUES ('r1', 'lg-sup', 'Meeting', '{}', 100, 200, 'src-1')",
8190 [],
8191 )
8192 .expect("insert superseded node");
8193 conn.execute(
8195 "INSERT INTO fts_nodes (chunk_id, node_logical_id, kind, text_content) \
8196 VALUES ('ck-x', 'lg-sup', 'Meeting', 'superseded content')",
8197 [],
8198 )
8199 .expect("insert FTS row for superseded node");
8200 }
8201 let report = service.check_semantics().expect("semantics check");
8202 assert_eq!(report.fts_rows_for_superseded_nodes, 1);
8203 }
8204
8205 #[test]
8206 fn check_semantics_detects_dangling_edges() {
8207 let (db, service) = setup();
8208 {
8209 let conn = sqlite::open_connection(db.path()).expect("conn");
8210 conn.execute_batch("PRAGMA foreign_keys = OFF;")
8211 .expect("disable FK");
8212 conn.execute(
8214 "INSERT INTO nodes (row_id, logical_id, kind, properties, created_at, source_ref) \
8215 VALUES ('r1', 'lg-src', 'Meeting', '{}', 100, 'src-1')",
8216 [],
8217 )
8218 .expect("insert source node");
8219 conn.execute(
8220 "INSERT INTO edges \
8221 (row_id, logical_id, source_logical_id, target_logical_id, kind, properties, created_at, source_ref) \
8222 VALUES ('e1', 'edge-1', 'lg-src', 'ghost-target', 'LINKS', '{}', 100, 'src-1')",
8223 [],
8224 )
8225 .expect("insert dangling edge");
8226 }
8227 let report = service.check_semantics().expect("semantics check");
8228 assert_eq!(report.dangling_edges, 1);
8229 }
8230
8231 #[test]
8232 fn check_semantics_detects_orphaned_supersession_chains() {
8233 let (db, service) = setup();
8234 {
8235 let conn = sqlite::open_connection(db.path()).expect("conn");
8236 conn.execute(
8238 "INSERT INTO nodes (row_id, logical_id, kind, properties, created_at, superseded_at, source_ref) \
8239 VALUES ('r1', 'lg-orphaned', 'Meeting', '{}', 100, 200, 'src-1')",
8240 [],
8241 )
8242 .expect("insert fully superseded node");
8243 }
8244 let report = service.check_semantics().expect("semantics check");
8245 assert_eq!(report.orphaned_supersession_chains, 1);
8246 }
8247
8248 #[test]
8249 fn check_semantics_detects_mismatched_kind_property_fts_rows() {
8250 let (db, service) = setup();
8251 {
8252 let conn = sqlite::open_connection(db.path()).expect("conn");
8253 conn.execute(
8255 "INSERT INTO nodes (row_id, logical_id, kind, properties, created_at, source_ref) \
8256 VALUES ('r1', 'goal-1', 'Goal', '{\"name\":\"Ship v2\"}', 100, 'src-1')",
8257 [],
8258 )
8259 .expect("insert node");
8260 conn.execute(
8262 "INSERT INTO fts_node_properties (node_logical_id, kind, text_content) \
8263 VALUES ('goal-1', 'WrongKind', 'Ship v2')",
8264 [],
8265 )
8266 .expect("insert mismatched property FTS row");
8267 }
8268 let report = service.check_semantics().expect("semantics check");
8269 assert_eq!(report.mismatched_kind_property_fts_rows, 1);
8270 }
8271
8272 #[test]
8273 fn check_semantics_detects_duplicate_property_fts_rows() {
8274 let (db, service) = setup();
8275 {
8276 let conn = sqlite::open_connection(db.path()).expect("conn");
8277 conn.execute(
8278 "INSERT INTO nodes (row_id, logical_id, kind, properties, created_at, source_ref) \
8279 VALUES ('r1', 'goal-1', 'Goal', '{\"name\":\"Ship v2\"}', 100, 'src-1')",
8280 [],
8281 )
8282 .expect("insert node");
8283 conn.execute(
8285 "INSERT INTO fts_node_properties (node_logical_id, kind, text_content) \
8286 VALUES ('goal-1', 'Goal', 'Ship v2')",
8287 [],
8288 )
8289 .expect("insert first property FTS row");
8290 conn.execute(
8291 "INSERT INTO fts_node_properties (node_logical_id, kind, text_content) \
8292 VALUES ('goal-1', 'Goal', 'Ship v2 duplicate')",
8293 [],
8294 )
8295 .expect("insert duplicate property FTS row");
8296 }
8297 let report = service.check_semantics().expect("semantics check");
8298 assert_eq!(report.duplicate_property_fts_rows, 1);
8299 }
8300
8301 #[test]
8302 fn check_semantics_detects_drifted_property_fts_text() {
8303 let (db, service) = setup();
8304 {
8305 let conn = sqlite::open_connection(db.path()).expect("conn");
8306 conn.execute(
8307 "INSERT INTO fts_property_schemas (kind, property_paths_json, separator) \
8308 VALUES ('Goal', '[\"$.name\"]', ' ')",
8309 [],
8310 )
8311 .expect("register schema");
8312 conn.execute(
8313 "INSERT INTO nodes (row_id, logical_id, kind, properties, created_at, source_ref) \
8314 VALUES ('r1', 'goal-1', 'Goal', '{\"name\":\"Current name\"}', 100, 'src-1')",
8315 [],
8316 )
8317 .expect("insert node");
8318 conn.execute(
8320 "INSERT INTO fts_node_properties (node_logical_id, kind, text_content) \
8321 VALUES ('goal-1', 'Goal', 'Old stale name')",
8322 [],
8323 )
8324 .expect("insert stale property FTS row");
8325 }
8326 let report = service.check_semantics().expect("semantics check");
8327 assert_eq!(report.drifted_property_fts_rows, 1);
8328 }
8329
8330 #[test]
8331 fn check_semantics_detects_property_fts_row_that_should_not_exist() {
8332 let (db, service) = setup();
8333 {
8334 let conn = sqlite::open_connection(db.path()).expect("conn");
8335 conn.execute(
8336 "INSERT INTO fts_property_schemas (kind, property_paths_json, separator) \
8337 VALUES ('Goal', '[\"$.searchable\"]', ' ')",
8338 [],
8339 )
8340 .expect("register schema");
8341 conn.execute(
8343 "INSERT INTO nodes (row_id, logical_id, kind, properties, created_at, source_ref) \
8344 VALUES ('r1', 'goal-1', 'Goal', '{\"other\":\"field\"}', 100, 'src-1')",
8345 [],
8346 )
8347 .expect("insert node");
8348 conn.execute(
8350 "INSERT INTO fts_node_properties (node_logical_id, kind, text_content) \
8351 VALUES ('goal-1', 'Goal', 'phantom text')",
8352 [],
8353 )
8354 .expect("insert phantom property FTS row");
8355 }
8356 let report = service.check_semantics().expect("semantics check");
8357 assert_eq!(
8358 report.drifted_property_fts_rows, 1,
8359 "row that should not exist must be counted as drifted"
8360 );
8361 }
8362
8363 #[test]
8364 fn safe_export_writes_manifest_with_sha256() {
8365 let (_db, service) = setup();
8366 let export_dir = tempfile::TempDir::new().expect("temp dir");
8367 let export_path = export_dir.path().join("backup.db");
8368
8369 let manifest = service
8370 .safe_export(
8371 &export_path,
8372 SafeExportOptions {
8373 force_checkpoint: false,
8374 },
8375 )
8376 .expect("export");
8377
8378 assert!(export_path.exists(), "exported db should exist");
8379 let manifest_path = export_dir.path().join("backup.db.export-manifest.json");
8380 assert!(
8381 manifest_path.exists(),
8382 "manifest file should exist at {}",
8383 manifest_path.display()
8384 );
8385 assert_eq!(manifest.sha256.len(), 64, "sha256 should be 64 hex chars");
8386 assert!(
8387 manifest.exported_at > 0,
8388 "exported_at should be a unix timestamp"
8389 );
8390 assert_eq!(
8391 manifest.schema_version,
8392 SchemaManager::new().current_version().0,
8393 "schema_version should match the live schema version"
8394 );
8395 assert_eq!(manifest.protocol_version, 1, "protocol_version should be 1");
8396 assert!(manifest.page_count > 0, "page_count should be positive");
8397 }
8398
8399 #[test]
8400 fn safe_export_preserves_operational_validation_contracts() {
8401 let (_db, service) = setup();
8402 let validation_json = r#"{"format_version":1,"mode":"enforce","additional_properties":false,"fields":[{"name":"status","type":"string","required":true,"enum":["ok","failed"]}]}"#;
8403 service
8404 .register_operational_collection(&OperationalRegisterRequest {
8405 name: "connector_health".to_owned(),
8406 kind: OperationalCollectionKind::LatestState,
8407 schema_json: "{}".to_owned(),
8408 retention_json: "{}".to_owned(),
8409 filter_fields_json: "[]".to_owned(),
8410 validation_json: validation_json.to_owned(),
8411 secondary_indexes_json: "[]".to_owned(),
8412 format_version: 1,
8413 })
8414 .expect("register collection");
8415
8416 let export_dir = tempfile::TempDir::new().expect("temp dir");
8417 let export_path = export_dir.path().join("backup.db");
8418 service
8419 .safe_export(
8420 &export_path,
8421 SafeExportOptions {
8422 force_checkpoint: false,
8423 },
8424 )
8425 .expect("export");
8426
8427 let exported = sqlite::open_connection(&export_path).expect("exported conn");
8428 let exported_validation_json: String = exported
8429 .query_row(
8430 "SELECT validation_json FROM operational_collections WHERE name = 'connector_health'",
8431 [],
8432 |row| row.get(0),
8433 )
8434 .expect("validation_json");
8435 assert_eq!(exported_validation_json, validation_json);
8436 }
8437
8438 #[test]
8439 fn safe_export_force_checkpoint_false_skips_wal_pragma() {
8440 let (_db, service) = setup();
8441 let export_dir = tempfile::TempDir::new().expect("temp dir");
8442 let export_path = export_dir.path().join("no-wal.db");
8443
8444 let manifest = service
8446 .safe_export(
8447 &export_path,
8448 SafeExportOptions {
8449 force_checkpoint: false,
8450 },
8451 )
8452 .expect("export with no checkpoint");
8453
8454 assert!(
8455 manifest.page_count > 0,
8456 "page_count must be populated regardless of checkpoint mode"
8457 );
8458 assert_eq!(
8459 manifest.schema_version,
8460 SchemaManager::new().current_version().0
8461 );
8462 assert_eq!(manifest.protocol_version, 1);
8463 }
8464
8465 #[test]
8466 fn safe_export_force_checkpoint_false_still_captures_wal_backed_changes() {
8467 let (db, service) = setup();
8468 let conn = sqlite::open_connection(db.path()).expect("conn");
8469 let journal_mode: String = conn
8470 .query_row("PRAGMA journal_mode=WAL", [], |row| row.get(0))
8471 .expect("enable wal");
8472 assert_eq!(journal_mode.to_lowercase(), "wal");
8473 let auto_checkpoint_pages: i64 = conn
8474 .query_row("PRAGMA wal_autocheckpoint=0", [], |row| row.get(0))
8475 .expect("disable auto checkpoint");
8476 assert_eq!(auto_checkpoint_pages, 0);
8477 conn.execute(
8478 "INSERT INTO nodes (row_id, logical_id, kind, properties, created_at, source_ref) \
8479 VALUES ('r-wal', 'lg-wal', 'Meeting', '{}', 100, 'src-wal')",
8480 [],
8481 )
8482 .expect("insert wal-backed node");
8483
8484 let export_dir = tempfile::TempDir::new().expect("temp dir");
8485 let export_path = export_dir.path().join("wal-backed.db");
8486 service
8487 .safe_export(
8488 &export_path,
8489 SafeExportOptions {
8490 force_checkpoint: false,
8491 },
8492 )
8493 .expect("export wal-backed db");
8494
8495 let exported = sqlite::open_connection(&export_path).expect("open exported db");
8496 let exported_count: i64 = exported
8497 .query_row(
8498 "SELECT count(*) FROM nodes WHERE logical_id = 'lg-wal'",
8499 [],
8500 |row| row.get(0),
8501 )
8502 .expect("count exported nodes");
8503 assert_eq!(
8504 exported_count, 1,
8505 "safe_export must include committed rows that are still resident in the WAL"
8506 );
8507 }
8508
8509 #[test]
8510 fn excise_source_removes_searchable_content_after_excision() {
8511 let (db, service) = setup();
8512 {
8513 let conn = sqlite::open_connection(db.path()).expect("conn");
8514 conn.execute(
8515 "INSERT INTO nodes (row_id, logical_id, kind, properties, created_at, superseded_at, source_ref) \
8516 VALUES ('r1', 'lg1', 'Meeting', '{}', 100, 200, 'source-1')",
8517 [],
8518 )
8519 .expect("insert v1");
8520 conn.execute(
8521 "INSERT INTO nodes (row_id, logical_id, kind, properties, created_at, source_ref) \
8522 VALUES ('r2', 'lg1', 'Meeting', '{}', 200, 'source-2')",
8523 [],
8524 )
8525 .expect("insert v2");
8526 conn.execute(
8527 "INSERT INTO chunks (id, node_logical_id, text_content, created_at) \
8528 VALUES ('ck1', 'lg1', 'hello world', 100)",
8529 [],
8530 )
8531 .expect("insert chunk");
8532 }
8533 service.excise_source("source-2").expect("excise");
8534 {
8535 let conn = sqlite::open_connection(db.path()).expect("conn");
8536 let fts_count: i64 = conn
8537 .query_row(
8538 "SELECT count(*) FROM fts_nodes WHERE chunk_id = 'ck1'",
8539 [],
8540 |row| row.get(0),
8541 )
8542 .expect("fts count");
8543 assert_eq!(
8544 fts_count, 0,
8545 "excised content should not remain searchable after excise"
8546 );
8547 }
8548 }
8549
8550 #[cfg(feature = "sqlite-vec")]
8551 #[test]
8552 fn excise_source_cleans_chunks_and_vec_rows_for_excised_version() {
8553 let (db, service) = setup();
8554 {
8555 let conn = crate::sqlite::open_connection_with_vec(db.path()).expect("vec conn");
8556 service
8557 .schema_manager
8558 .ensure_vector_profile(&conn, "default", "vec_nodes_active", 4)
8559 .expect("ensure vec profile");
8560 conn.execute(
8561 "INSERT INTO nodes (row_id, logical_id, kind, properties, created_at, superseded_at, source_ref) \
8562 VALUES ('r1', 'lg1', 'Meeting', '{}', 100, 200, 'source-1')",
8563 [],
8564 )
8565 .expect("insert v1");
8566 conn.execute(
8567 "INSERT INTO nodes (row_id, logical_id, kind, properties, created_at, source_ref) \
8568 VALUES ('r2', 'lg1', 'Meeting', '{}', 200, 'source-2')",
8569 [],
8570 )
8571 .expect("insert v2");
8572 conn.execute(
8573 "INSERT INTO chunks (id, node_logical_id, text_content, created_at) \
8574 VALUES ('ck1', 'lg1', 'new content', 200)",
8575 [],
8576 )
8577 .expect("insert chunk");
8578 conn.execute(
8579 "INSERT INTO vec_nodes_active (chunk_id, embedding) VALUES ('ck1', zeroblob(16))",
8580 [],
8581 )
8582 .expect("insert vec row");
8583 }
8584
8585 service.excise_source("source-2").expect("excise");
8586
8587 let conn = crate::sqlite::open_connection_with_vec(db.path()).expect("vec conn");
8588 let active_row: String = conn
8589 .query_row(
8590 "SELECT row_id FROM nodes WHERE logical_id = 'lg1' AND superseded_at IS NULL",
8591 [],
8592 |row| row.get(0),
8593 )
8594 .expect("restored active row");
8595 assert_eq!(active_row, "r1");
8596 let chunk_count: i64 = conn
8597 .query_row(
8598 "SELECT count(*) FROM chunks WHERE node_logical_id = 'lg1'",
8599 [],
8600 |row| row.get(0),
8601 )
8602 .expect("chunk count");
8603 assert_eq!(
8604 chunk_count, 0,
8605 "excised source content must not survive as chunks"
8606 );
8607 let vec_count: i64 = conn
8608 .query_row("SELECT count(*) FROM vec_nodes_active", [], |row| {
8609 row.get(0)
8610 })
8611 .expect("vec count");
8612 assert_eq!(vec_count, 0, "excised source vec rows must be removed");
8613 let fts_count: i64 = conn
8614 .query_row(
8615 "SELECT count(*) FROM fts_nodes WHERE node_logical_id = 'lg1'",
8616 [],
8617 |row| row.get(0),
8618 )
8619 .expect("fts count");
8620 assert_eq!(
8621 fts_count, 0,
8622 "excised source content must not remain searchable"
8623 );
8624 }
8625
8626 #[test]
8627 fn export_page_count_matches_exported_file() {
8628 let (_db, service) = setup();
8629 let export_dir = tempfile::TempDir::new().expect("temp dir");
8630 let export_path = export_dir.path().join("page-count.db");
8631
8632 let manifest = service
8633 .safe_export(
8634 &export_path,
8635 SafeExportOptions {
8636 force_checkpoint: false,
8637 },
8638 )
8639 .expect("export");
8640
8641 let exported = sqlite::open_connection(&export_path).expect("open exported db");
8642 let actual_page_count: u64 = exported
8643 .query_row("PRAGMA page_count", [], |row| row.get(0))
8644 .expect("page_count from exported file");
8645
8646 assert_eq!(
8647 manifest.page_count, actual_page_count,
8648 "manifest page_count must match the exported file's PRAGMA page_count"
8649 );
8650 }
8651
8652 #[test]
8653 fn no_temp_file_after_successful_export() {
8654 let (_db, service) = setup();
8655 let export_dir = tempfile::TempDir::new().expect("temp dir");
8656 let export_path = export_dir.path().join("no-tmp.db");
8657
8658 service
8659 .safe_export(
8660 &export_path,
8661 SafeExportOptions {
8662 force_checkpoint: false,
8663 },
8664 )
8665 .expect("export");
8666
8667 let tmp_files: Vec<_> = fs::read_dir(export_dir.path())
8668 .expect("read export dir")
8669 .filter_map(Result::ok)
8670 .filter(|e| e.path().extension().is_some_and(|ext| ext == "tmp"))
8671 .collect();
8672
8673 assert!(
8674 tmp_files.is_empty(),
8675 "no .tmp files should remain after a successful export, found: {tmp_files:?}"
8676 );
8677 }
8678
8679 #[test]
8680 fn export_manifest_is_valid_json() {
8681 let (_db, service) = setup();
8682 let export_dir = tempfile::TempDir::new().expect("temp dir");
8683 let export_path = export_dir.path().join("valid-json.db");
8684
8685 service
8686 .safe_export(
8687 &export_path,
8688 SafeExportOptions {
8689 force_checkpoint: false,
8690 },
8691 )
8692 .expect("export");
8693
8694 let manifest_path = export_dir.path().join("valid-json.db.export-manifest.json");
8695 let manifest_contents = fs::read_to_string(&manifest_path).expect("read manifest");
8696 let parsed: serde_json::Value =
8697 serde_json::from_str(&manifest_contents).expect("manifest must be valid JSON");
8698
8699 assert!(
8700 parsed.get("exported_at").is_some(),
8701 "manifest must contain exported_at"
8702 );
8703 assert!(
8704 parsed.get("sha256").is_some(),
8705 "manifest must contain sha256"
8706 );
8707 assert!(
8708 parsed.get("schema_version").is_some(),
8709 "manifest must contain schema_version"
8710 );
8711 assert!(
8712 parsed.get("protocol_version").is_some(),
8713 "manifest must contain protocol_version"
8714 );
8715 assert!(
8716 parsed.get("page_count").is_some(),
8717 "manifest must contain page_count"
8718 );
8719 }
8720
8721 #[test]
8722 fn provenance_purge_dry_run_reports_counts() {
8723 let (db, service) = setup();
8724 {
8725 let conn = sqlite::open_connection(db.path()).expect("conn");
8726 conn.execute(
8727 "INSERT INTO provenance_events (id, event_type, subject, source_ref, created_at) \
8728 VALUES ('p1', 'node_insert', 'lg1', 'src-1', 100)",
8729 [],
8730 )
8731 .expect("insert p1");
8732 conn.execute(
8733 "INSERT INTO provenance_events (id, event_type, subject, source_ref, created_at) \
8734 VALUES ('p2', 'node_insert', 'lg2', 'src-1', 200)",
8735 [],
8736 )
8737 .expect("insert p2");
8738 conn.execute(
8739 "INSERT INTO provenance_events (id, event_type, subject, source_ref, created_at) \
8740 VALUES ('p3', 'excise', 'lg3', 'src-1', 300)",
8741 [],
8742 )
8743 .expect("insert p3");
8744 }
8745
8746 let options = super::ProvenancePurgeOptions {
8747 dry_run: true,
8748 preserve_event_types: Vec::new(),
8749 };
8750 let report = service
8751 .purge_provenance_events(250, &options)
8752 .expect("dry run purge");
8753
8754 assert_eq!(report.events_deleted, 2);
8755 assert_eq!(report.events_preserved, 1);
8756 assert!(report.oldest_remaining.is_some());
8757
8758 let conn = sqlite::open_connection(db.path()).expect("conn");
8759 let total: i64 = conn
8760 .query_row("SELECT count(*) FROM provenance_events", [], |row| {
8761 row.get(0)
8762 })
8763 .expect("count");
8764 assert_eq!(total, 3, "dry_run must not delete any events");
8765 }
8766
8767 #[test]
8768 fn provenance_purge_deletes_old_events() {
8769 let (db, service) = setup();
8770 {
8771 let conn = sqlite::open_connection(db.path()).expect("conn");
8772 conn.execute(
8773 "INSERT INTO provenance_events (id, event_type, subject, source_ref, created_at) \
8774 VALUES ('p1', 'node_insert', 'lg1', 'src-1', 100)",
8775 [],
8776 )
8777 .expect("insert p1");
8778 conn.execute(
8779 "INSERT INTO provenance_events (id, event_type, subject, source_ref, created_at) \
8780 VALUES ('p2', 'node_insert', 'lg2', 'src-1', 200)",
8781 [],
8782 )
8783 .expect("insert p2");
8784 }
8785
8786 let options = super::ProvenancePurgeOptions {
8787 dry_run: false,
8788 preserve_event_types: Vec::new(),
8789 };
8790 let report = service
8791 .purge_provenance_events(150, &options)
8792 .expect("purge");
8793
8794 assert_eq!(report.events_deleted, 1);
8795 assert_eq!(report.events_preserved, 1);
8796 assert_eq!(report.oldest_remaining, Some(200));
8797
8798 let conn = sqlite::open_connection(db.path()).expect("conn");
8799 let remaining: i64 = conn
8800 .query_row("SELECT count(*) FROM provenance_events", [], |row| {
8801 row.get(0)
8802 })
8803 .expect("count");
8804 assert_eq!(remaining, 1);
8805 }
8806
8807 #[test]
8808 fn provenance_purge_preserves_specified_types() {
8809 let (db, service) = setup();
8810 {
8811 let conn = sqlite::open_connection(db.path()).expect("conn");
8812 conn.execute(
8813 "INSERT INTO provenance_events (id, event_type, subject, source_ref, created_at) \
8814 VALUES ('p1', 'excise', 'lg1', 'src-1', 100)",
8815 [],
8816 )
8817 .expect("insert p1");
8818 conn.execute(
8819 "INSERT INTO provenance_events (id, event_type, subject, source_ref, created_at) \
8820 VALUES ('p2', 'node_insert', 'lg2', 'src-1', 100)",
8821 [],
8822 )
8823 .expect("insert p2");
8824 conn.execute(
8825 "INSERT INTO provenance_events (id, event_type, subject, source_ref, created_at) \
8826 VALUES ('p3', 'node_insert', 'lg3', 'src-1', 100)",
8827 [],
8828 )
8829 .expect("insert p3");
8830 }
8831
8832 let options = super::ProvenancePurgeOptions {
8833 dry_run: false,
8834 preserve_event_types: Vec::new(),
8835 };
8836 let report = service
8837 .purge_provenance_events(500, &options)
8838 .expect("purge");
8839
8840 assert_eq!(report.events_deleted, 2);
8841 assert_eq!(report.events_preserved, 1);
8842
8843 let conn = sqlite::open_connection(db.path()).expect("conn");
8844 let remaining_type: String = conn
8845 .query_row("SELECT event_type FROM provenance_events", [], |row| {
8846 row.get(0)
8847 })
8848 .expect("remaining event type");
8849 assert_eq!(remaining_type, "excise");
8850 }
8851
8852 #[test]
8853 fn provenance_purge_noop_with_zero_timestamp() {
8854 let (db, service) = setup();
8855 {
8856 let conn = sqlite::open_connection(db.path()).expect("conn");
8857 conn.execute(
8858 "INSERT INTO provenance_events (id, event_type, subject, source_ref, created_at) \
8859 VALUES ('p1', 'node_insert', 'lg1', 'src-1', 100)",
8860 [],
8861 )
8862 .expect("insert p1");
8863 }
8864
8865 let options = super::ProvenancePurgeOptions {
8866 dry_run: false,
8867 preserve_event_types: Vec::new(),
8868 };
8869 let report = service.purge_provenance_events(0, &options).expect("purge");
8870
8871 assert_eq!(report.events_deleted, 0);
8872 assert_eq!(report.events_preserved, 1);
8873 assert_eq!(report.oldest_remaining, Some(100));
8874 }
8875
8876 #[test]
8877 fn restore_skips_edge_when_counterpart_purged() {
8878 let (db, service) = setup();
8879 {
8880 let conn = sqlite::open_connection(db.path()).expect("conn");
8881 conn.execute(
8883 "INSERT INTO nodes (row_id, logical_id, kind, properties, created_at, source_ref) \
8884 VALUES ('node-row-a', 'doc-1', 'Document', '{}', 100, 'seed')",
8885 [],
8886 )
8887 .expect("insert node A");
8888 conn.execute(
8889 "INSERT INTO nodes (row_id, logical_id, kind, properties, created_at, source_ref) \
8890 VALUES ('node-row-b', 'doc-2', 'Document', '{}', 100, 'seed')",
8891 [],
8892 )
8893 .expect("insert node B");
8894 conn.execute(
8896 "INSERT INTO edges \
8897 (row_id, logical_id, source_logical_id, target_logical_id, kind, properties, created_at, source_ref) \
8898 VALUES ('edge-row-1', 'edge-1', 'doc-1', 'doc-2', 'RELATED', '{}', 100, 'seed')",
8899 [],
8900 )
8901 .expect("insert edge");
8902 conn.execute(
8904 "INSERT INTO provenance_events (id, event_type, subject, source_ref, created_at, metadata_json) \
8905 VALUES ('evt-retire-a', 'node_retire', 'doc-1', 'forget-1', 200, '')",
8906 [],
8907 )
8908 .expect("insert retire event A");
8909 conn.execute(
8910 "INSERT INTO provenance_events (id, event_type, subject, source_ref, created_at, metadata_json) \
8911 VALUES ('evt-edge-retire', 'edge_retire', 'edge-1', 'forget-1', 200, '')",
8912 [],
8913 )
8914 .expect("insert edge retire event");
8915 conn.execute(
8916 "UPDATE nodes SET superseded_at = 200 WHERE logical_id = 'doc-1'",
8917 [],
8918 )
8919 .expect("retire node A");
8920 conn.execute(
8921 "UPDATE nodes SET superseded_at = 200 WHERE logical_id = 'doc-2'",
8922 [],
8923 )
8924 .expect("retire node B");
8925 conn.execute(
8926 "UPDATE edges SET superseded_at = 200 WHERE logical_id = 'edge-1'",
8927 [],
8928 )
8929 .expect("retire edge");
8930 conn.execute("DELETE FROM nodes WHERE logical_id = 'doc-2'", [])
8933 .expect("purge node B rows");
8934 }
8935
8936 let report = service.restore_logical_id("doc-1").expect("restore A");
8938 assert!(!report.was_noop);
8939 assert_eq!(report.restored_node_rows, 1);
8940 assert_eq!(report.restored_edge_rows, 0, "edge should not be restored");
8941 assert_eq!(report.skipped_edges.len(), 1);
8942 assert_eq!(report.skipped_edges[0].edge_logical_id, "edge-1");
8943 assert_eq!(report.skipped_edges[0].missing_endpoint, "doc-2");
8944
8945 let conn = sqlite::open_connection(db.path()).expect("conn");
8947 let active_edge_count: i64 = conn
8948 .query_row(
8949 "SELECT count(*) FROM edges WHERE logical_id = 'edge-1' AND superseded_at IS NULL",
8950 [],
8951 |row| row.get(0),
8952 )
8953 .expect("active edge count");
8954 assert_eq!(active_edge_count, 0, "edge must remain retired");
8955 }
8956
/// Restoring retired node `doc-1` must also reactivate retired edge `edge-1`
/// when the edge's other endpoint (`doc-2`) was never retired.
#[test]
fn restore_restores_edges_to_active_nodes() {
    let (db, service) = setup();

    // Fixture statements, applied in order: two nodes joined by one edge, the
    // retire provenance events for `doc-1` and `edge-1`, then the retirement
    // itself. `doc-2` is left untouched, i.e. still active.
    let fixture: [(&str, &str); 7] = [
        (
            "insert node A",
            "INSERT INTO nodes (row_id, logical_id, kind, properties, created_at, source_ref) \
             VALUES ('node-row-a', 'doc-1', 'Document', '{}', 100, 'seed')",
        ),
        (
            "insert node B",
            "INSERT INTO nodes (row_id, logical_id, kind, properties, created_at, source_ref) \
             VALUES ('node-row-b', 'doc-2', 'Document', '{}', 100, 'seed')",
        ),
        (
            "insert edge",
            "INSERT INTO edges \
             (row_id, logical_id, source_logical_id, target_logical_id, kind, properties, created_at, source_ref) \
             VALUES ('edge-row-1', 'edge-1', 'doc-1', 'doc-2', 'RELATED', '{}', 100, 'seed')",
        ),
        (
            "insert retire event A",
            "INSERT INTO provenance_events (id, event_type, subject, source_ref, created_at, metadata_json) \
             VALUES ('evt-retire-a', 'node_retire', 'doc-1', 'forget-1', 200, '')",
        ),
        (
            "insert edge retire event",
            "INSERT INTO provenance_events (id, event_type, subject, source_ref, created_at, metadata_json) \
             VALUES ('evt-edge-retire', 'edge_retire', 'edge-1', 'forget-1', 200, '')",
        ),
        (
            "retire node A",
            "UPDATE nodes SET superseded_at = 200 WHERE logical_id = 'doc-1'",
        ),
        (
            "retire edge",
            "UPDATE edges SET superseded_at = 200 WHERE logical_id = 'edge-1'",
        ),
    ];
    {
        // Scoped so the seeding connection is dropped before the restore call.
        let seed_conn = sqlite::open_connection(db.path()).expect("conn");
        for &(label, sql) in &fixture {
            seed_conn.execute(sql, []).expect(label);
        }
    }

    let outcome = service.restore_logical_id("doc-1").expect("restore A");
    assert!(!outcome.was_noop);
    assert_eq!(outcome.restored_node_rows, 1);
    assert!(outcome.restored_edge_rows > 0, "edge should be restored");
    assert!(
        outcome.skipped_edges.is_empty(),
        "no edges should be skipped"
    );

    // Confirm directly in the table that the edge has an active row again.
    let verify_conn = sqlite::open_connection(db.path()).expect("conn");
    let live_edges = verify_conn
        .query_row(
            "SELECT count(*) FROM edges WHERE logical_id = 'edge-1' AND superseded_at IS NULL",
            [],
            |row| row.get::<_, i64>(0),
        )
        .expect("active edge count");
    assert_eq!(live_edges, 1, "edge must be active");
}
9028
/// When both endpoints were retired, restoring `doc-2` first and `doc-1`
/// second must leave `edge-1` active, with no skipped edges reported by the
/// second restore.
#[test]
fn restore_restores_edges_when_both_restored() {
    let (db, service) = setup();

    // Fixture statements, applied in order: two nodes and their edge, the
    // retire provenance events for all three, then the retirement updates.
    let fixture: [(&str, &str); 9] = [
        (
            "insert node A",
            "INSERT INTO nodes (row_id, logical_id, kind, properties, created_at, source_ref) \
             VALUES ('node-row-a', 'doc-1', 'Document', '{}', 100, 'seed')",
        ),
        (
            "insert node B",
            "INSERT INTO nodes (row_id, logical_id, kind, properties, created_at, source_ref) \
             VALUES ('node-row-b', 'doc-2', 'Document', '{}', 100, 'seed')",
        ),
        (
            "insert edge",
            "INSERT INTO edges \
             (row_id, logical_id, source_logical_id, target_logical_id, kind, properties, created_at, source_ref) \
             VALUES ('edge-row-1', 'edge-1', 'doc-1', 'doc-2', 'RELATED', '{}', 100, 'seed')",
        ),
        (
            "insert retire event A",
            "INSERT INTO provenance_events (id, event_type, subject, source_ref, created_at, metadata_json) \
             VALUES ('evt-retire-a', 'node_retire', 'doc-1', 'forget-1', 200, '')",
        ),
        (
            "insert retire event B",
            "INSERT INTO provenance_events (id, event_type, subject, source_ref, created_at, metadata_json) \
             VALUES ('evt-retire-b', 'node_retire', 'doc-2', 'forget-1', 200, '')",
        ),
        (
            "insert edge retire event",
            "INSERT INTO provenance_events (id, event_type, subject, source_ref, created_at, metadata_json) \
             VALUES ('evt-edge-retire', 'edge_retire', 'edge-1', 'forget-1', 200, '')",
        ),
        (
            "retire node A",
            "UPDATE nodes SET superseded_at = 200 WHERE logical_id = 'doc-1'",
        ),
        (
            "retire node B",
            "UPDATE nodes SET superseded_at = 200 WHERE logical_id = 'doc-2'",
        ),
        (
            "retire edge",
            "UPDATE edges SET superseded_at = 200 WHERE logical_id = 'edge-1'",
        ),
    ];
    {
        // Scoped so the seeding connection is dropped before the restore calls.
        let seed_conn = sqlite::open_connection(db.path()).expect("conn");
        for &(label, sql) in &fixture {
            seed_conn.execute(sql, []).expect(label);
        }
    }

    // Bring back the far endpoint first ...
    let outcome_b = service.restore_logical_id("doc-2").expect("restore B");
    assert!(!outcome_b.was_noop);

    // ... then the near endpoint, whose report is the one under test.
    let outcome_a = service.restore_logical_id("doc-1").expect("restore A");
    assert!(!outcome_a.was_noop);
    assert_eq!(outcome_a.restored_node_rows, 1);
    assert!(
        outcome_a.restored_edge_rows > 0,
        "edge should be restored when both endpoints active"
    );
    assert!(
        outcome_a.skipped_edges.is_empty(),
        "no edges should be skipped"
    );

    let verify_conn = sqlite::open_connection(db.path()).expect("conn");
    let live_edges = verify_conn
        .query_row(
            "SELECT count(*) FROM edges WHERE logical_id = 'edge-1' AND superseded_at IS NULL",
            [],
            |row| row.get::<_, i64>(0),
        )
        .expect("active edge count");
    assert_eq!(
        live_edges, 1,
        "edge must be active after both endpoints restored"
    );
}
9121
/// Walks an FTS property schema for kind `Meeting` through register,
/// describe, list, re-register (update), and remove, checking each result.
#[test]
fn fts_property_schema_crud_round_trip() {
    let (_db, service) = setup();

    // Create: no separator is supplied; the record is expected to report " ".
    let initial_paths = ["$.title".to_owned(), "$.summary".to_owned()];
    let created = service
        .register_fts_property_schema("Meeting", &initial_paths, None)
        .expect("register");
    assert_eq!(created.kind, "Meeting");
    assert_eq!(created.property_paths, vec!["$.title", "$.summary"]);
    assert_eq!(created.separator, " ");
    assert_eq!(created.format_version, 1);

    // Read back: describing the kind must return the same record.
    let fetched = service
        .describe_fts_property_schema("Meeting")
        .expect("describe");
    assert_eq!(fetched.expect("should exist"), created);

    // Describing an unregistered kind succeeds with `None`.
    let absent = service
        .describe_fts_property_schema("NoSuchKind")
        .expect("describe missing");
    assert!(absent.is_none());

    // List: exactly the one schema registered above.
    let all_schemas = service.list_fts_property_schemas().expect("list");
    assert_eq!(all_schemas.len(), 1);
    assert_eq!(all_schemas[0].kind, "Meeting");

    // Update: registering the same kind again replaces paths and separator.
    let replacement_paths = ["$.title".to_owned(), "$.notes".to_owned()];
    let replaced = service
        .register_fts_property_schema("Meeting", &replacement_paths, Some("\n"))
        .expect("update");
    assert_eq!(replaced.property_paths, vec!["$.title", "$.notes"]);
    assert_eq!(replaced.separator, "\n");

    // Delete: afterwards the kind is no longer described.
    service
        .remove_fts_property_schema("Meeting")
        .expect("remove");
    let after_remove = service
        .describe_fts_property_schema("Meeting")
        .expect("describe after remove");
    assert!(after_remove.is_none());

    // Removing a schema that is already gone must be reported as an error.
    let second_removal = service.remove_fts_property_schema("Meeting");
    assert!(second_removal.is_err());
}
9183
/// After a retired node's FTS rows have been cleared, restoring the node must
/// put back its single property FTS row with the expected text.
#[test]
fn restore_reestablishes_property_fts_visibility() {
    let (db, service) = setup();

    // Fixture statements, applied in order: a property schema for `Document`,
    // one node with a chunk and a property FTS row, its retirement, and
    // finally the wiping of both FTS tables.
    let fixture: [(&str, &str); 8] = [
        (
            "register schema",
            "INSERT INTO fts_property_schemas (kind, property_paths_json, separator) \
             VALUES ('Document', '[\"$.title\", \"$.body\"]', ' ')",
        ),
        (
            "insert node",
            "INSERT INTO nodes (row_id, logical_id, kind, properties, created_at, source_ref) \
             VALUES ('row-1', 'doc-1', 'Document', '{\"title\":\"Budget\",\"body\":\"Q3 forecast\"}', 100, 'seed')",
        ),
        (
            "insert chunk",
            "INSERT INTO chunks (id, node_logical_id, text_content, created_at) \
             VALUES ('chunk-1', 'doc-1', 'budget text', 100)",
        ),
        (
            "insert property fts",
            "INSERT INTO fts_node_properties (node_logical_id, kind, text_content) \
             VALUES ('doc-1', 'Document', 'Budget Q3 forecast')",
        ),
        (
            "retire event",
            "INSERT INTO provenance_events (id, event_type, subject, source_ref, created_at, metadata_json) \
             VALUES ('evt-retire', 'node_retire', 'doc-1', 'forget-1', 200, '')",
        ),
        (
            "supersede",
            "UPDATE nodes SET superseded_at = 200 WHERE logical_id = 'doc-1'",
        ),
        ("clear chunk fts", "DELETE FROM fts_nodes"),
        ("clear property fts", "DELETE FROM fts_node_properties"),
    ];
    {
        // Scoped so the seeding connection is dropped before the restore call.
        let seed_conn = sqlite::open_connection(db.path()).expect("conn");
        for &(label, sql) in &fixture {
            seed_conn.execute(sql, []).expect(label);
        }
    }

    let outcome = service.restore_logical_id("doc-1").expect("restore");
    assert_eq!(outcome.restored_property_fts_rows, 1);

    let verify_conn = sqlite::open_connection(db.path()).expect("conn");

    // Exactly one property FTS row exists for the node again ...
    let restored_rows = verify_conn
        .query_row(
            "SELECT count(*) FROM fts_node_properties WHERE node_logical_id = 'doc-1'",
            [],
            |row| row.get::<_, i64>(0),
        )
        .expect("prop fts count");
    assert_eq!(restored_rows, 1, "property FTS must be restored");

    // ... and it carries the title and body joined by the schema separator.
    let restored_text = verify_conn
        .query_row(
            "SELECT text_content FROM fts_node_properties WHERE node_logical_id = 'doc-1'",
            [],
            |row| row.get::<_, String>(0),
        )
        .expect("prop fts text");
    assert_eq!(restored_text, "Budget Q3 forecast");
}
9258
/// A schema registered before `safe_export` must be readable, with its
/// property paths intact, from the exported database file.
#[test]
fn safe_export_preserves_fts_property_schemas() {
    let (_db, service) = setup();

    let goal_paths = ["$.name".to_owned(), "$.rationale".to_owned()];
    service
        .register_fts_property_schema("Goal", &goal_paths, None)
        .expect("register schema");

    // Export into a throwaway directory.
    let export_dir = tempfile::TempDir::new().expect("temp dir");
    let export_path = export_dir.path().join("backup.db");
    let options = SafeExportOptions {
        force_checkpoint: false,
    };
    service.safe_export(&export_path, options).expect("export");

    // Inspect the exported file with a plain connection, bypassing the service.
    let exported_conn = rusqlite::Connection::open(&export_path).expect("open exported db");

    let exported_kind = exported_conn
        .query_row(
            "SELECT kind FROM fts_property_schemas WHERE kind = 'Goal'",
            [],
            |row| row.get::<_, String>(0),
        )
        .expect("schema must exist in export");
    assert_eq!(exported_kind, "Goal");

    let exported_paths_json = exported_conn
        .query_row(
            "SELECT property_paths_json FROM fts_property_schemas WHERE kind = 'Goal'",
            [],
            |row| row.get::<_, String>(0),
        )
        .expect("paths must exist");
    let exported_paths: Vec<String> =
        serde_json::from_str(&exported_paths_json).expect("valid json");
    assert_eq!(exported_paths, vec!["$.name", "$.rationale"]);
}
9301
/// Exports a database, damages the property FTS rows in the export (one
/// replaced with wrong text, one deleted), then checks that an FTS rebuild on
/// the export makes search and the integrity/semantics checks clean again.
#[test]
#[allow(clippy::too_many_lines)]
fn export_recovery_rebuilds_property_fts_from_canonical_state() {
    let (db, service) = setup();
    service
        .register_fts_property_schema("Goal", &["$.name".to_owned()], None)
        .expect("register");

    // Two `Goal` nodes, each with a matching property FTS row.
    let seed: [(&str, &str); 4] = [
        (
            "insert node 1",
            "INSERT INTO nodes (row_id, logical_id, kind, properties, created_at, source_ref) \
             VALUES ('row-1', 'goal-1', 'Goal', '{\"name\":\"Ship v2\"}', 100, 'seed')",
        ),
        (
            "insert property FTS row 1",
            "INSERT INTO fts_node_properties (node_logical_id, kind, text_content) \
             VALUES ('goal-1', 'Goal', 'Ship v2')",
        ),
        (
            "insert node 2",
            "INSERT INTO nodes (row_id, logical_id, kind, properties, created_at, source_ref) \
             VALUES ('row-2', 'goal-2', 'Goal', '{\"name\":\"Launch redesign\"}', 100, 'seed')",
        ),
        (
            "insert property FTS row 2",
            "INSERT INTO fts_node_properties (node_logical_id, kind, text_content) \
             VALUES ('goal-2', 'Goal', 'Launch redesign')",
        ),
    ];
    {
        // Scoped so the seeding connection is dropped before the export.
        let seed_conn = sqlite::open_connection(db.path()).expect("conn");
        for &(label, sql) in &seed {
            seed_conn.execute(sql, []).expect(label);
        }
    }

    let export_dir = tempfile::TempDir::new().expect("temp dir");
    let export_path = export_dir.path().join("backup.db");
    let options = SafeExportOptions {
        force_checkpoint: false,
    };
    service.safe_export(&export_path, options).expect("export");

    // Damage the export only: goal-1's row gets wrong text, goal-2's row is
    // removed outright.
    let damage: [(&str, &str); 3] = [
        (
            "delete old row",
            "DELETE FROM fts_node_properties WHERE node_logical_id = 'goal-1'",
        ),
        (
            "insert corrupted row",
            "INSERT INTO fts_node_properties (node_logical_id, kind, text_content) \
             VALUES ('goal-1', 'Goal', 'completely wrong stale text')",
        ),
        (
            "delete goal-2 row",
            "DELETE FROM fts_node_properties WHERE node_logical_id = 'goal-2'",
        ),
    ];
    {
        let export_conn = rusqlite::Connection::open(&export_path).expect("open export");
        for &(label, sql) in &damage {
            export_conn.execute(sql, []).expect(label);
        }
    }

    // Rebuild the FTS projection through an admin service opened on the export.
    let schema = Arc::new(SchemaManager::new());
    let exported_service = AdminService::new(&export_path, Arc::clone(&schema));
    exported_service
        .rebuild_projections(ProjectionTarget::Fts)
        .expect("rebuild");

    let coordinator = ExecutionCoordinator::open(
        &export_path,
        Arc::clone(&schema),
        None,
        1,
        Arc::new(TelemetryCounters::default()),
    )
    .expect("coordinator");

    // Runs a text search over `Goal` nodes in the export for the given term.
    let search_goals = |term: &str| {
        let compiled = QueryBuilder::nodes("Goal")
            .text_search(term, 10)
            .limit(10)
            .compile()
            .expect("compile");
        coordinator
            .execute_compiled_read(&compiled)
            .expect("execute read")
    };

    // The corrupted row is searchable by its real text again.
    let ship_hits = search_goals("Ship");
    assert_eq!(ship_hits.nodes.len(), 1);
    assert_eq!(ship_hits.nodes[0].logical_id, "goal-1");

    // The deleted row has been recreated.
    let redesign_hits = search_goals("redesign");
    assert_eq!(redesign_hits.nodes.len(), 1);
    assert_eq!(redesign_hits.nodes[0].logical_id, "goal-2");

    // The injected wrong text is gone.
    let stale_hits = search_goals("stale");
    assert_eq!(
        stale_hits.nodes.len(),
        0,
        "corrupted text must not appear in search after rebuild"
    );

    let integrity = exported_service.check_integrity().expect("integrity");
    assert_eq!(integrity.missing_property_fts_rows, 0);
    let semantics = exported_service.check_semantics().expect("semantics");
    assert_eq!(semantics.drifted_property_fts_rows, 0);
    assert_eq!(semantics.orphaned_property_fts_rows, 0);
    assert_eq!(semantics.duplicate_property_fts_rows, 0);
}
9437
/// A node whose properties contain none of the schema's paths has no property
/// FTS row, and the integrity check must not count that as missing.
#[test]
fn check_integrity_no_false_positives_for_empty_extraction() {
    let (db, service) = setup();
    {
        // Scoped so the seeding connection is dropped before the check runs.
        let seed_conn = sqlite::open_connection(db.path()).expect("conn");
        let apply = |label: &str, sql: &str| {
            seed_conn.execute(sql, []).expect(label);
        };

        // The schema indexes `$.searchable` ...
        apply(
            "register schema",
            "INSERT INTO fts_property_schemas (kind, property_paths_json, separator) \
             VALUES ('Ticket', '[\"$.searchable\"]', ' ')",
        );
        // ... but the node only carries `status`.
        apply(
            "insert node",
            "INSERT INTO nodes (row_id, logical_id, kind, properties, created_at, source_ref) \
             VALUES ('row-1', 'ticket-1', 'Ticket', '{\"status\":\"open\"}', 100, 'seed')",
        );
    }

    let integrity = service.check_integrity().expect("integrity");
    assert_eq!(
        integrity.missing_property_fts_rows, 0,
        "node with no extractable values must not be counted as missing"
    );
}
9466
/// A node that does carry a schema path but has no property FTS row must be
/// reported as one missing row by the integrity check.
#[test]
fn check_integrity_detects_genuinely_missing_property_fts_rows() {
    let (db, service) = setup();
    {
        // Scoped so the seeding connection is dropped before the check runs.
        let seed_conn = sqlite::open_connection(db.path()).expect("conn");
        let apply = |label: &str, sql: &str| {
            seed_conn.execute(sql, []).expect(label);
        };

        apply(
            "register schema",
            "INSERT INTO fts_property_schemas (kind, property_paths_json, separator) \
             VALUES ('Ticket', '[\"$.title\"]', ' ')",
        );
        // The node has a `title`, yet no fts_node_properties row is inserted.
        apply(
            "insert node",
            "INSERT INTO nodes (row_id, logical_id, kind, properties, created_at, source_ref) \
             VALUES ('row-1', 'ticket-1', 'Ticket', '{\"title\":\"fix login bug\"}', 100, 'seed')",
        );
    }

    let integrity = service.check_integrity().expect("integrity");
    assert_eq!(
        integrity.missing_property_fts_rows, 1,
        "node with extractable values but no property FTS row must be detected"
    );
}
9493
/// With a schema and a node present but no property FTS row, a full FTS
/// rebuild must create the row with the extracted text.
#[test]
fn rebuild_projections_fts_restores_missing_property_fts_rows() {
    let (db, service) = setup();
    {
        // Scoped so the seeding connection is dropped before the rebuild.
        let seed_conn = sqlite::open_connection(db.path()).expect("conn");
        let apply = |label: &str, sql: &str| {
            seed_conn.execute(sql, []).expect(label);
        };

        apply(
            "register schema",
            "INSERT INTO fts_property_schemas (kind, property_paths_json, separator) \
             VALUES ('Goal', '[\"$.name\"]', ' ')",
        );
        apply(
            "insert node",
            "INSERT INTO nodes (row_id, logical_id, kind, properties, created_at, source_ref) \
             VALUES ('row-1', 'goal-1', 'Goal', '{\"name\":\"Ship v2\"}', 100, 'seed')",
        );
    }

    let rebuild = service
        .rebuild_projections(ProjectionTarget::Fts)
        .expect("rebuild");
    assert!(
        rebuild.rebuilt_rows >= 1,
        "rebuild must insert at least one property FTS row"
    );

    let verify_conn = sqlite::open_connection(db.path()).expect("conn");
    let rebuilt_text = verify_conn
        .query_row(
            "SELECT text_content FROM fts_node_properties WHERE node_logical_id = 'goal-1'",
            [],
            |row| row.get::<_, String>(0),
        )
        .expect("property FTS row must exist after rebuild");
    assert_eq!(rebuilt_text, "Ship v2");
}
9532
/// A property FTS row that was inserted and then deleted must be recreated,
/// exactly once, by `rebuild_missing_projections`.
#[test]
fn rebuild_missing_projections_fills_gap_for_deleted_property_fts_row() {
    let (db, service) = setup();

    // Fixture statements, applied in order; the last one opens the gap.
    let fixture: [(&str, &str); 4] = [
        (
            "register schema",
            "INSERT INTO fts_property_schemas (kind, property_paths_json, separator) \
             VALUES ('Goal', '[\"$.name\"]', ' ')",
        ),
        (
            "insert node",
            "INSERT INTO nodes (row_id, logical_id, kind, properties, created_at, source_ref) \
             VALUES ('row-1', 'goal-1', 'Goal', '{\"name\":\"Ship v2\"}', 100, 'seed')",
        ),
        (
            "insert property fts",
            "INSERT INTO fts_node_properties (node_logical_id, kind, text_content) \
             VALUES ('goal-1', 'Goal', 'Ship v2')",
        ),
        (
            "delete property fts",
            "DELETE FROM fts_node_properties WHERE node_logical_id = 'goal-1'",
        ),
    ];
    {
        // Scoped so the seeding connection is dropped before the rebuild.
        let seed_conn = sqlite::open_connection(db.path()).expect("conn");
        for &(label, sql) in &fixture {
            seed_conn.execute(sql, []).expect(label);
        }
    }

    let rebuild = service
        .rebuild_missing_projections()
        .expect("rebuild missing");
    assert!(
        rebuild.rebuilt_rows >= 1,
        "missing rebuild must insert the gap-fill row"
    );

    let verify_conn = sqlite::open_connection(db.path()).expect("conn");
    let row_count = verify_conn
        .query_row(
            "SELECT count(*) FROM fts_node_properties WHERE node_logical_id = 'goal-1'",
            [],
            |row| row.get::<_, i64>(0),
        )
        .expect("count");
    assert_eq!(
        row_count, 1,
        "gap-fill must restore exactly one property FTS row"
    );
}
9585
/// Once a schema is removed, its leftover property FTS row must show up as
/// orphaned in the semantics check and be deleted by a full FTS rebuild.
#[test]
fn remove_schema_then_rebuild_cleans_stale_property_fts_rows() {
    let (db, service) = setup();
    service
        .register_fts_property_schema("Goal", &["$.name".to_owned()], None)
        .expect("register");
    {
        // Scoped so the seeding connection is dropped before the service calls.
        let seed_conn = sqlite::open_connection(db.path()).expect("conn");
        let apply = |label: &str, sql: &str| {
            seed_conn.execute(sql, []).expect(label);
        };

        apply(
            "insert node",
            "INSERT INTO nodes (row_id, logical_id, kind, properties, created_at, source_ref) \
             VALUES ('row-1', 'goal-1', 'Goal', '{\"name\":\"Ship v2\"}', 100, 'seed')",
        );
        apply(
            "insert property fts",
            "INSERT INTO fts_node_properties (node_logical_id, kind, text_content) \
             VALUES ('goal-1', 'Goal', 'Ship v2')",
        );
    }

    service.remove_fts_property_schema("Goal").expect("remove");

    // Before any rebuild, the leftover row is reported as orphaned.
    let semantics = service.check_semantics().expect("semantics");
    assert_eq!(
        semantics.orphaned_property_fts_rows, 1,
        "stale property FTS rows must be detected after schema removal"
    );

    service
        .rebuild_projections(ProjectionTarget::Fts)
        .expect("rebuild");

    // After the rebuild, no row for the node remains.
    let verify_conn = sqlite::open_connection(db.path()).expect("conn");
    let row_count = verify_conn
        .query_row(
            "SELECT count(*) FROM fts_node_properties WHERE node_logical_id = 'goal-1'",
            [],
            |row| row.get::<_, i64>(0),
        )
        .expect("count");
    assert_eq!(
        row_count, 0,
        "rebuild after schema removal must delete stale property FTS rows"
    );
}
9637
9638 mod validate_fts_property_paths_tests {
9639 use super::super::validate_fts_property_paths;
9640
9641 #[test]
9642 fn valid_simple_path() {
9643 assert!(validate_fts_property_paths(&["$.name".to_owned()]).is_ok());
9644 }
9645
9646 #[test]
9647 fn valid_nested_path() {
9648 assert!(validate_fts_property_paths(&["$.address.city".to_owned()]).is_ok());
9649 }
9650
9651 #[test]
9652 fn valid_underscore_segment() {
9653 assert!(validate_fts_property_paths(&["$.a_b".to_owned()]).is_ok());
9654 }
9655
9656 #[test]
9657 fn rejects_bare_prefix() {
9658 let result = validate_fts_property_paths(&["$.".to_owned()]);
9659 assert!(result.is_err(), "path '$.' must be rejected");
9660 }
9661
9662 #[test]
9663 fn rejects_double_dot() {
9664 let result = validate_fts_property_paths(&["$..x".to_owned()]);
9665 assert!(result.is_err(), "path '$..x' must be rejected");
9666 }
9667
9668 #[test]
9669 fn rejects_trailing_dot() {
9670 let result = validate_fts_property_paths(&["$.foo.".to_owned()]);
9671 assert!(result.is_err(), "path '$.foo.' must be rejected");
9672 }
9673
9674 #[test]
9675 fn rejects_space_in_segment() {
9676 let result = validate_fts_property_paths(&["$.foo bar".to_owned()]);
9677 assert!(result.is_err(), "path '$.foo bar' must be rejected");
9678 }
9679
9680 #[test]
9681 fn rejects_bracket_syntax() {
9682 let result = validate_fts_property_paths(&["$.foo[0]".to_owned()]);
9683 assert!(result.is_err(), "path '$.foo[0]' must be rejected");
9684 }
9685
9686 #[test]
9687 fn rejects_duplicates() {
9688 let result = validate_fts_property_paths(&["$.name".to_owned(), "$.name".to_owned()]);
9689 assert!(result.is_err(), "duplicate paths must be rejected");
9690 }
9691
9692 #[test]
9693 fn rejects_empty_list() {
9694 let result = validate_fts_property_paths(&[]);
9695 assert!(result.is_err(), "empty path list must be rejected");
9696 }
9697 }
9698}