fathomdb_query/search.rs
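//! Result and plan types for adaptive text search.
//!
//! Phase 1 wired a strict-only execution path through the coordinator. The
//! result types exposed here were designed to be forward-compatible with
//! later phases that add a relaxed branch, match-mode attribution, and
//! recursive property extraction. Fields that were reserved for those phases
//! are present and documented; where an item says a field is populated with a
//! default "in Phase 1", that statement describes the Phase 1 behavior.
//!
//! Items introduced by later phases (the compiled search carriers, the vector
//! search carrier, and the unified retrieval plan) name the phase that added
//! them in their own documentation.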
8
9use crate::{Predicate, TextQuery};
10
/// Identifies which branch of the adaptive text-search policy produced a
/// given result set, or which branch a given [`CompiledSearch`] was built
/// for.
///
/// Phase 3 runs the strict branch first, then conditionally runs a relaxed
/// branch derived from the same user query (see
/// [`crate::derive_relaxed`]). The coordinator tags each in-flight branch
/// with this enum so that merge, dedup, and counts stay straightforward.
///
/// The enum is `Copy` and `Hash`, so it can be used directly as a key in
/// hash-based collections (for example, to tally hits per branch).
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
pub enum SearchBranch {
    /// The strict branch: the user's query as written.
    Strict,
    /// The relaxed fallback branch derived from the strict query.
    Relaxed,
}
25
/// Identifies which index surface a [`SearchHit`] was read from.
///
/// The enum is `Copy` and `Hash`, so it can be used directly as a key in
/// hash-based collections (for example, to group hits by source).
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
pub enum SearchHitSource {
    /// The hit came from the chunk-backed full-text index (`fts_nodes`).
    Chunk,
    /// The hit came from the property-backed full-text index
    /// (`fts_node_properties`).
    Property,
    // NOTE(review): `CompiledVectorSearch` in this module documents a Phase 11
    // vector path; confirm whether hits from that path carry this variant and
    // whether the "reserved" wording below is still accurate.
    /// Reserved for future vector-search attribution.
    ///
    /// No Phase 1 code path emits this variant; it is exported so that future
    /// vector wiring can be added without a breaking change to consumers that
    /// exhaustively match on [`SearchHitSource`].
    Vector,
}
41
/// Records whether a [`SearchHit`] was produced by the strict user query or
/// by a relaxed (Phase 2+) fallback branch.
///
/// The enum is `Copy` and `Hash`, so it can be used directly as a key in
/// hash-based collections (for example, to count hits per match mode).
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
pub enum SearchMatchMode {
    /// The hit matched the user's query exactly as written.
    Strict,
    /// The hit matched only after the query was relaxed by an adaptive
    /// fallback pass. No Phase 1 code path emits this variant; it only
    /// becomes reachable once a relaxed branch exists (see
    /// [`CompiledSearchPlan::relaxed`]).
    Relaxed,
}
52
/// Coarse classifier for the retrieval modality behind a [`SearchHit`].
///
/// Phase 10 adds this field to the result surface so that future phases
/// which introduce a vector retrieval branch can tag their hits without a
/// breaking change to consumers. Every hit produced by the text execution
/// paths is tagged [`RetrievalModality::Text`].
///
/// The enum is `Copy` and `Hash`, so it can be used directly as a key in
/// hash-based collections (for example, to bucket hits by modality).
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
pub enum RetrievalModality {
    /// The hit came from a text retrieval branch (chunk or property FTS).
    Text,
    // NOTE(review): `CompiledVectorSearch` in this module documents a Phase 11
    // vector path; confirm whether the "reserved" wording below still holds.
    /// The hit came from a vector retrieval branch. Reserved — documented
    /// as not emitted by any execution path as of Phase 10.
    Vector,
}
67
/// Attribution payload attached to a single hit by the (Phase 5) match
/// attributor.
///
/// Phase 1 exports the struct purely to fix the shape of
/// [`SearchHit::attribution`]; the Phase 1 execution path never fills it in,
/// so every hit carries `attribution: None` until Phase 5 wires the
/// attributor.
#[derive(Debug, Default, Clone, PartialEq, Eq)]
pub struct HitAttribution {
    /// The property paths that contributed to the match, or
    /// `"text_content"` when the hit came from a chunk. Empty in Phase 1.
    pub matched_paths: Vec<String>,
}
80
81/// A single result row emitted by adaptive text search.
82#[derive(Clone, Debug, PartialEq)]
83pub struct SearchHit {
84 /// The matched node, projected in the same shape the flat query surface
85 /// uses.
86 pub node: NodeRowLite,
87 /// Raw engine score used for ordering within a block. Higher is always
88 /// better, across every modality and every source:
89 /// - Text hits: the FTS5 bm25 score with its sign flipped (`-bm25(...)`),
90 /// so higher score corresponds to stronger lexical relevance.
91 /// - Vector hits: a negated distance (`-vector_distance`) for distance
92 /// metrics, or a direct similarity value for similarity metrics.
93 ///
94 /// Scores are **ordering-only within a block**. Scores from different
95 /// blocks — and in particular text scores vs. vector scores — are not
96 /// on a shared scale. The engine does not normalize across blocks, and
97 /// callers must not compare or arithmetically combine scores across
98 /// blocks.
99 pub score: f64,
100 /// Coarse retrieval-modality classifier. Every hit produced by the
101 /// current text execution paths is tagged
102 /// [`RetrievalModality::Text`]; future phases that wire vector
103 /// retrieval will tag those hits [`RetrievalModality::Vector`].
104 pub modality: RetrievalModality,
105 /// Which FTS surface produced the hit.
106 pub source: SearchHitSource,
107 /// Whether this hit came from the strict or relaxed branch. `Some`
108 /// for every text hit; reserved as `None` for future vector hits,
109 /// which have no strict/relaxed notion.
110 pub match_mode: Option<SearchMatchMode>,
111 /// Short context snippet for display. `Some` for at least the chunk path
112 /// (`SQLite`'s `snippet(...)`) and a trimmed window of `text_content` for
113 /// the property path.
114 pub snippet: Option<String>,
115 /// Wall-clock timestamp (unix seconds) at which the *active* version of
116 /// the node was written.
117 ///
118 /// Under fathomdb's soft-delete supersession model, nodes are versioned:
119 /// each edit creates a new active row and marks the prior row
120 /// superseded. `written_at` reflects when the **current** active row was
121 /// inserted, which is "when the text that just matched was written," not
122 /// "when the `logical_id` was first created." A node created two years ago
123 /// but updated yesterday will show yesterday's timestamp, because
124 /// yesterday's text is what the FTS index scored against.
125 ///
126 /// This is deliberately distinct from `superseded_at` (only populated on
127 /// dead rows), `node_access_metadata.last_accessed_at` (an explicit touch,
128 /// not a write), and `provenance_events.created_at` (audit event time).
129 pub written_at: i64,
130 /// Opaque identifier of the underlying projection row (e.g. `chunks.id`
131 /// for chunk hits, or `fts_node_properties.rowid` for property hits).
132 /// Useful for debugging and for future attribution paths.
133 pub projection_row_id: Option<String>,
134 /// Raw vector distance or similarity for vector hits. `None` for text
135 /// hits.
136 ///
137 /// Stable public API: this field ships in v1 and is documented as
138 /// modality-specific diagnostic data. Callers may read it for display
139 /// or internal reranking but must **not** compare it against text-hit
140 /// `score` values or use it arithmetically alongside text scores — the
141 /// two are not on a shared scale.
142 ///
143 /// For distance metrics the raw distance is preserved (lower = closer
144 /// match); callers that want a "higher is better" ordering value should
145 /// read `score` instead, which is already negated appropriately for
146 /// intra-block ranking.
147 pub vector_distance: Option<f64>,
148 /// Reserved: match-attribution payload. Always `None` in Phase 1.
149 pub attribution: Option<HitAttribution>,
150}
151
/// Lightweight node projection carried by every [`SearchHit`].
///
/// The field set deliberately mirrors `fathomdb_engine::NodeRow` while
/// avoiding a dependency on the engine crate. The engine-side
/// `execute_compiled_search` builds `SearchHit` values from its own
/// `NodeRow` type, so the facade crate converts between the two.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct NodeRowLite {
    /// Physical row ID.
    pub row_id: String,
    /// Logical ID of the node.
    pub logical_id: String,
    /// Node kind.
    pub kind: String,
    /// Node properties, encoded as JSON.
    pub properties: String,
    /// URI referencing external content, when one is set.
    pub content_ref: Option<String>,
    /// Unix timestamp of the last access, when access is tracked.
    pub last_accessed_at: Option<i64>,
}
173
174/// Result set returned by an adaptive text-search execution.
175#[derive(Clone, Debug, Default, PartialEq)]
176pub struct SearchRows {
177 /// Matched hits in descending score order.
178 pub hits: Vec<SearchHit>,
179 /// Count of strict-branch hits (Phase 1: equals `hits.len()`).
180 pub strict_hit_count: usize,
181 /// Count of relaxed-branch hits (Phase 1: always 0).
182 pub relaxed_hit_count: usize,
183 /// Count of vector-branch hits. Always `0` after Phase 10 because no
184 /// vector execution path exists yet; reserved so that when vector
185 /// retrieval lands in a later phase, the wire shape already has the
186 /// counter and consumers do not need a breaking change.
187 pub vector_hit_count: usize,
188 /// Whether the relaxed fallback branch fired (Phase 1: always `false`).
189 pub fallback_used: bool,
190 /// Whether a capability miss caused the query to degrade to an empty
191 /// result set (mirrors `QueryRows::was_degraded`).
192 pub was_degraded: bool,
193}
194
195/// A compiled adaptive-search plan ready for the coordinator to execute.
196///
197/// Phase 2 splits the filter pipeline into two sets: `fusable_filters`
198/// (pushed into the `search_hits` CTE so the CTE `LIMIT` applies after
199/// filtering) and `residual_filters` (evaluated in the outer `WHERE`). The
200/// coordinator emits SQL for it directly rather than reusing
201/// [`crate::compile_query`], because the search SELECT projects a different
202/// row shape (score, source, snippet, projection id) than the flat query
203/// path.
204#[derive(Clone, Debug, PartialEq, Eq)]
205pub struct CompiledSearch {
206 /// Root kind the caller built the query against.
207 pub root_kind: String,
208 /// Parsed text-search intent, to be lowered into safe FTS5 syntax.
209 pub text_query: TextQuery,
210 /// Maximum number of candidate hits to retrieve from the FTS indexes.
211 pub limit: usize,
212 /// Fusable predicates pushed into the `search_hits` CTE by the coordinator.
213 /// These evaluate against columns directly available on the `nodes` table
214 /// joined inside the CTE (`kind`, `logical_id`, `source_ref`,
215 /// `content_ref`).
216 pub fusable_filters: Vec<Predicate>,
217 /// Residual predicates applied in the outer `WHERE` after the CTE
218 /// materializes. Currently limited to JSON-property predicates
219 /// (`json_extract` on `n.properties`).
220 pub residual_filters: Vec<Predicate>,
221 /// Whether the caller requested per-hit match attribution. Phase 5: when
222 /// `true`, the coordinator populates [`SearchHit::attribution`] on every
223 /// hit by resolving FTS5 match positions against the Phase 4 position
224 /// map. When `false` (the default), the position map is not read at all
225 /// and `attribution` stays `None`.
226 pub attribution_requested: bool,
227}
228
229/// A compiled vector-only search plan ready for the coordinator to execute.
230///
231/// Phase 11 delivers a standalone vector retrieval path parallel to
232/// [`CompiledSearch`]. It is intentionally structurally distinct: the vector
233/// path has no [`TextQuery`], no relaxed branch, and no [`SearchMatchMode`] —
234/// vector hits always carry `match_mode: None` per addendum 1. The
235/// coordinator consumes this carrier via
236/// `ExecutionCoordinator::execute_compiled_vector_search`, which emits SQL
237/// against the `vec_nodes_active` virtual table joined to `nodes`, and
238/// returns a [`SearchRows`] with a single vector block (or an empty result
239/// with `was_degraded = true` when the sqlite-vec capability is absent).
240#[derive(Clone, Debug, PartialEq, Eq)]
241pub struct CompiledVectorSearch {
242 /// Root kind the caller built the query against. May be empty for
243 /// kind-agnostic callers, mirroring the text path.
244 pub root_kind: String,
245 /// Raw vector query text passed to sqlite-vec via the `embedding MATCH`
246 /// operator. This is a serialized JSON float array (e.g.
247 /// `"[0.1, 0.2, 0.3, 0.4]"`) at the time the coordinator binds it.
248 pub query_text: String,
249 /// Maximum number of candidate hits to retrieve from the vec0 KNN scan.
250 pub limit: usize,
251 /// Fusable predicates pushed into the vector-search CTE by the
252 /// coordinator. Evaluated against columns directly available on the
253 /// `nodes` table joined inside the CTE.
254 pub fusable_filters: Vec<Predicate>,
255 /// Residual predicates applied in the outer `WHERE` after the CTE
256 /// materializes. Currently limited to JSON-property predicates.
257 pub residual_filters: Vec<Predicate>,
258 /// Whether the caller requested per-hit match attribution. Per addendum
259 /// 1 §Attribution on vector hits, vector hits under this flag carry
260 /// `Some(HitAttribution { matched_paths: vec![] })` — an empty
261 /// matched-paths list, not `None`.
262 pub attribution_requested: bool,
263}
264
265/// A two-branch compiled search plan ready for the coordinator to execute.
266///
267/// Phase 6 factors the strict+relaxed retrieval pair into a small carrier so
268/// that the adaptive [`crate::compile_search`] path and the narrow
269/// `fallback_search(strict, relaxed)` helper share a single coordinator
270/// routine. Both branches carry fully compiled [`CompiledSearch`] values —
271/// including the same fused/residual filter chain and the same
272/// `attribution_requested` flag — so merge/dedup stays branch-agnostic.
273#[derive(Clone, Debug, PartialEq, Eq)]
274pub struct CompiledSearchPlan {
275 /// The strict branch — always runs first.
276 pub strict: CompiledSearch,
277 /// The relaxed branch, or `None` when the caller did not request a
278 /// fallback shape. When `None`, the coordinator runs strict only and
279 /// never triggers the fallback policy.
280 pub relaxed: Option<CompiledSearch>,
281 /// Set when the plan originated from [`crate::derive_relaxed`] and its
282 /// alternatives list was truncated past [`crate::RELAXED_BRANCH_CAP`].
283 /// The `fallback_search` path always sets this to `false` because the
284 /// relaxed shape is caller-provided and not subject to the cap.
285 pub was_degraded_at_plan_time: bool,
286}
287
288/// A compiled unified retrieval plan for the Phase 12 `search()` entry point.
289///
290/// `CompiledRetrievalPlan` carries the bounded set of branches the engine-owned
291/// retrieval planner may run on behalf of a single `search(query, limit)` call:
292/// the text strict + optional text relaxed pair (carried structurally as the
293/// existing Phase 6 [`CompiledSearchPlan`]) and an optional vector branch.
294///
295/// **v1 scope (Phase 12)**: the planner's `vector` branch slot is structurally
296/// supported so that the coordinator's three-block fusion path is fully wired,
297/// but [`crate::compile_retrieval_plan`] always sets `vector` to `None`. Read-
298/// time embedding of natural-language queries is not wired into the engine in
299/// v1; callers that want vector retrieval through the unified `search()`
300/// entry point will get text-only results until a future phase wires the
301/// embedding generator into the read path. Callers who want explicit vector
302/// retrieval today use the advanced `vector_search()` override (Phase 11),
303/// which takes a caller-provided vector literal.
304///
305/// `CompiledRetrievalPlan` is intentionally distinct from
306/// [`CompiledSearchPlan`]: `CompiledSearchPlan` is the text-only carrier
307/// consumed by `text_search()` and `fallback_search()`, and the two paths
308/// remain separate so the text-only call sites do not pay any vector-branch
309/// cost. The Phase 12 unified planner is a sibling, not a replacement.
310#[derive(Clone, Debug, PartialEq, Eq)]
311pub struct CompiledRetrievalPlan {
312 /// The text branches (strict + optional relaxed) of the unified plan.
313 /// Always present — every `search()` call produces at least a strict
314 /// text branch (which may itself short-circuit to empty when the query
315 /// is `Empty` or a top-level `Not`).
316 pub text: CompiledSearchPlan,
317 /// The vector branch slot. Always `None` in v1 per the Phase 12 scope
318 /// constraint above.
319 pub vector: Option<CompiledVectorSearch>,
320 /// Mirrors [`CompiledSearchPlan::was_degraded_at_plan_time`] for the
321 /// text branches: set when the relaxed branch's alternatives list was
322 /// truncated past [`crate::RELAXED_BRANCH_CAP`] at plan-construction
323 /// time. Propagated to the result's `was_degraded` flag if and only if
324 /// the relaxed branch actually fires at execution time.
325 pub was_degraded_at_plan_time: bool,
326}
327
#[cfg(test)]
// `expect` is acceptable in tests: a failed expectation is the test failure.
#[allow(clippy::expect_used)]
mod tests {
    use super::*;
    use crate::{CompileError, Predicate, QueryBuilder, compile_search, compile_vector_search};

    /// Guards the reserved `Vector` variant of `SearchHitSource`.
    #[test]
    fn search_hit_source_has_vector_variant_reserved() {
        // The match below names every variant without a wildcard arm, so it
        // doubles as a compile-time check: removing or renaming `Vector`
        // breaks the build instead of silently breaking consumers that rely
        // on the reserved variant.
        let probe = SearchHitSource::Chunk;
        match probe {
            SearchHitSource::Chunk | SearchHitSource::Property | SearchHitSource::Vector => {}
        }
    }

    /// Guards the two variants of `SearchMatchMode` the same way.
    #[test]
    fn search_match_mode_has_strict_and_relaxed() {
        let probe = SearchMatchMode::Strict;
        match probe {
            SearchMatchMode::Strict | SearchMatchMode::Relaxed => {}
        }
    }

    /// An AST with filters but no text-search step must not compile as a
    /// text search.
    #[test]
    fn compile_search_rejects_ast_without_text_search_step() {
        let filter_only = QueryBuilder::nodes("Goal")
            .filter_kind_eq("Goal")
            .into_ast();

        let result = compile_search(&filter_only);

        assert!(
            matches!(result, Err(CompileError::MissingTextSearchStep)),
            "expected MissingTextSearchStep, got {result:?}"
        );
    }

    /// A text-search step followed by a kind filter compiles, and the kind
    /// filter lands in the fusable set.
    #[test]
    fn compile_search_accepts_text_search_step_with_filters() {
        let text_ast = QueryBuilder::nodes("Goal")
            .text_search("quarterly docs", 7)
            .filter_kind_eq("Goal")
            .into_ast();

        let search = compile_search(&text_ast).expect("compiles");

        assert_eq!(search.root_kind, "Goal");
        assert_eq!(search.limit, 7);
        assert_eq!(search.fusable_filters.len(), 1);
        assert!(search.residual_filters.is_empty());
    }

    /// An AST with filters but no vector-search step must not compile as a
    /// vector search.
    #[test]
    fn compile_vector_search_rejects_ast_without_vector_search_step() {
        let filter_only = QueryBuilder::nodes("Goal")
            .filter_kind_eq("Goal")
            .into_ast();

        let result = compile_vector_search(&filter_only);

        assert!(
            matches!(result, Err(CompileError::MissingVectorSearchStep)),
            "expected MissingVectorSearchStep, got {result:?}"
        );
    }

    /// A vector-search step with a kind filter and a JSON-property filter
    /// compiles into one fusable and one residual predicate.
    #[test]
    fn compile_vector_search_accepts_vector_search_step_with_filters() {
        let vector_ast = QueryBuilder::nodes("Goal")
            .vector_search("[0.1, 0.2, 0.3, 0.4]", 7)
            .filter_kind_eq("Goal")
            .filter_json_text_eq("$.status", "active")
            .into_ast();

        let search = compile_vector_search(&vector_ast).expect("compiles");

        assert_eq!(search.root_kind, "Goal");
        assert_eq!(search.query_text, "[0.1, 0.2, 0.3, 0.4]");
        assert_eq!(search.limit, 7);

        // The kind filter is the single fusable predicate.
        assert_eq!(search.fusable_filters.len(), 1);
        assert!(matches!(
            search.fusable_filters.first(),
            Some(Predicate::KindEq(k)) if k == "Goal"
        ));

        // The JSON-property filter stays residual.
        assert_eq!(search.residual_filters.len(), 1);
        assert!(!search.attribution_requested);
    }
}