Skip to main content

fathomdb_query/
search.rs

1//! Result-type surface for adaptive text search.
2//!
3//! Phase 1 wires a strict-only execution path through the coordinator. The
4//! types exposed here are intentionally forward-compatible with later phases
5//! that will add a relaxed branch, match-mode attribution, and recursive
6//! property extraction. Fields that are reserved for those phases are present
7//! and documented but populated with defaults in Phase 1.
8
9use crate::{Predicate, TextQuery};
10
/// Identifies the branch of the adaptive text-search policy that produced a
/// result set, or for which a given [`CompiledSearch`] was constructed.
///
/// Phase 3 executes the strict branch first and then, conditionally, a
/// relaxed branch derived from the same user query (see
/// [`crate::derive_relaxed`]). The coordinator tags every in-flight branch
/// with this enum so that merge, dedup, and counts stay straightforward.
///
/// `Hash` is derived so the tag can be used directly as a `HashMap` /
/// `HashSet` key (for example, for per-branch tallies).
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
pub enum SearchBranch {
    /// Strict branch: the user's query exactly as written.
    Strict,
    /// Relaxed fallback branch, derived from the strict query.
    Relaxed,
}
25
/// Identifies which part of the FTS surface a [`SearchHit`] originated from.
///
/// `Hash` is derived so the source can be used as a `HashMap` / `HashSet`
/// key.
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
pub enum SearchHitSource {
    /// Hit produced by the chunk-backed full-text index (`fts_nodes`).
    Chunk,
    /// Hit produced by the property-backed full-text index
    /// (`fts_node_properties`).
    Property,
    /// Reserved for future vector-search attribution.
    ///
    /// No Phase 1 code path emits this variant. It is exported up front so
    /// that vector wiring can be added later without a breaking change for
    /// consumers that exhaustively match on [`SearchHitSource`].
    Vector,
}
41
/// Records whether a [`SearchHit`] was produced by the strict user query or
/// by a relaxed (Phase 2+) fallback branch.
///
/// `Hash` is derived so the mode can be used as a `HashMap` / `HashSet` key.
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
pub enum SearchMatchMode {
    /// The hit matched the user's query exactly as written.
    Strict,
    /// Reserved: the hit matched only once an adaptive fallback pass had
    /// relaxed the query. No Phase 1 code path emits this variant.
    Relaxed,
}
52
/// Coarse classifier for the retrieval modality behind a [`SearchHit`].
///
/// Phase 10 adds this field to the result surface so that later phases
/// introducing a vector retrieval branch can tag their hits without a
/// breaking change to consumers. Every hit produced by the current
/// (text-only) execution paths is tagged [`RetrievalModality::Text`].
///
/// `Hash` is derived so the modality can be used as a `HashMap` / `HashSet`
/// key.
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
pub enum RetrievalModality {
    /// The hit came from a text retrieval branch (chunk or property FTS).
    Text,
    /// The hit came from a vector retrieval branch. Reserved — no current
    /// execution path emits this variant.
    Vector,
}
67
/// Attribution details for a single hit, as produced by the (Phase 5) match
/// attributor.
///
/// Exported from Phase 1 onward to pin down the shape of
/// [`SearchHit::attribution`]. The current execution path never fills it in:
/// every hit carries `attribution: None` until Phase 5 wires the attributor.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct HitAttribution {
    /// Property paths that contributed to the match; chunk hits use
    /// `"text_content"` instead of a property path. Empty in Phase 1.
    pub matched_paths: Vec<String>,
}
80
81/// A single result row emitted by adaptive text search.
82#[derive(Clone, Debug, PartialEq)]
83pub struct SearchHit {
84    /// The matched node, projected in the same shape the flat query surface
85    /// uses.
86    pub node: NodeRowLite,
87    /// Raw engine score used for ordering within a block. Higher is always
88    /// better, across every modality and every source:
89    /// - Text hits: the FTS5 bm25 score with its sign flipped (`-bm25(...)`),
90    ///   so higher score corresponds to stronger lexical relevance.
91    /// - Vector hits: a negated distance (`-vector_distance`) for distance
92    ///   metrics, or a direct similarity value for similarity metrics.
93    ///
94    /// Scores are **ordering-only within a block**. Scores from different
95    /// blocks — and in particular text scores vs. vector scores — are not
96    /// on a shared scale. The engine does not normalize across blocks, and
97    /// callers must not compare or arithmetically combine scores across
98    /// blocks.
99    pub score: f64,
100    /// Coarse retrieval-modality classifier. Every hit produced by the
101    /// current text execution paths is tagged
102    /// [`RetrievalModality::Text`]; future phases that wire vector
103    /// retrieval will tag those hits [`RetrievalModality::Vector`].
104    pub modality: RetrievalModality,
105    /// Which FTS surface produced the hit.
106    pub source: SearchHitSource,
107    /// Whether this hit came from the strict or relaxed branch. `Some`
108    /// for every text hit; reserved as `None` for future vector hits,
109    /// which have no strict/relaxed notion.
110    pub match_mode: Option<SearchMatchMode>,
111    /// Short context snippet for display. `Some` for at least the chunk path
112    /// (`SQLite`'s `snippet(...)`) and a trimmed window of `text_content` for
113    /// the property path.
114    pub snippet: Option<String>,
115    /// Wall-clock timestamp (unix seconds) at which the *active* version of
116    /// the node was written.
117    ///
118    /// Under fathomdb's soft-delete supersession model, nodes are versioned:
119    /// each edit creates a new active row and marks the prior row
120    /// superseded. `written_at` reflects when the **current** active row was
121    /// inserted, which is "when the text that just matched was written," not
122    /// "when the `logical_id` was first created." A node created two years ago
123    /// but updated yesterday will show yesterday's timestamp, because
124    /// yesterday's text is what the FTS index scored against.
125    ///
126    /// This is deliberately distinct from `superseded_at` (only populated on
127    /// dead rows), `node_access_metadata.last_accessed_at` (an explicit touch,
128    /// not a write), and `provenance_events.created_at` (audit event time).
129    pub written_at: i64,
130    /// Opaque identifier of the underlying projection row (e.g. `chunks.id`
131    /// for chunk hits, or `fts_node_properties.rowid` for property hits).
132    /// Useful for debugging and for future attribution paths.
133    pub projection_row_id: Option<String>,
134    /// Raw vector distance or similarity for vector hits. `None` for text
135    /// hits.
136    ///
137    /// Stable public API: this field ships in v1 and is documented as
138    /// modality-specific diagnostic data. Callers may read it for display
139    /// or internal reranking but must **not** compare it against text-hit
140    /// `score` values or use it arithmetically alongside text scores — the
141    /// two are not on a shared scale.
142    ///
143    /// For distance metrics the raw distance is preserved (lower = closer
144    /// match); callers that want a "higher is better" ordering value should
145    /// read `score` instead, which is already negated appropriately for
146    /// intra-block ranking.
147    pub vector_distance: Option<f64>,
148    /// Reserved: match-attribution payload. Always `None` in Phase 1.
149    pub attribution: Option<HitAttribution>,
150}
151
/// Minimal node-shaped projection carried by every [`SearchHit`].
///
/// The fields intentionally mirror `fathomdb_engine::NodeRow` while avoiding
/// a dependency on the engine crate. The engine-side
/// `execute_compiled_search` materializes `SearchHit` values with its own
/// `NodeRow` type, so the facade crate converts between the two.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct NodeRowLite {
    /// Physical row ID.
    pub row_id: String,
    /// Logical ID of the node.
    pub logical_id: String,
    /// Node kind.
    pub kind: String,
    /// Node properties, JSON-encoded.
    pub properties: String,
    /// Optional URI pointing at external content.
    pub content_ref: Option<String>,
    /// Unix timestamp of the last access, when tracked.
    pub last_accessed_at: Option<i64>,
}
173
174/// Result set returned by an adaptive text-search execution.
175#[derive(Clone, Debug, Default, PartialEq)]
176pub struct SearchRows {
177    /// Matched hits in descending score order.
178    pub hits: Vec<SearchHit>,
179    /// Count of strict-branch hits (Phase 1: equals `hits.len()`).
180    pub strict_hit_count: usize,
181    /// Count of relaxed-branch hits (Phase 1: always 0).
182    pub relaxed_hit_count: usize,
183    /// Count of vector-branch hits. Always `0` after Phase 10 because no
184    /// vector execution path exists yet; reserved so that when vector
185    /// retrieval lands in a later phase, the wire shape already has the
186    /// counter and consumers do not need a breaking change.
187    pub vector_hit_count: usize,
188    /// Whether the relaxed fallback branch fired (Phase 1: always `false`).
189    pub fallback_used: bool,
190    /// Whether a capability miss caused the query to degrade to an empty
191    /// result set (mirrors `QueryRows::was_degraded`).
192    pub was_degraded: bool,
193}
194
195/// A compiled adaptive-search plan ready for the coordinator to execute.
196///
197/// Phase 2 splits the filter pipeline into two sets: `fusable_filters`
198/// (pushed into the `search_hits` CTE so the CTE `LIMIT` applies after
199/// filtering) and `residual_filters` (evaluated in the outer `WHERE`). The
200/// coordinator emits SQL for it directly rather than reusing
201/// [`crate::compile_query`], because the search SELECT projects a different
202/// row shape (score, source, snippet, projection id) than the flat query
203/// path.
204#[derive(Clone, Debug, PartialEq, Eq)]
205pub struct CompiledSearch {
206    /// Root kind the caller built the query against.
207    pub root_kind: String,
208    /// Parsed text-search intent, to be lowered into safe FTS5 syntax.
209    pub text_query: TextQuery,
210    /// Maximum number of candidate hits to retrieve from the FTS indexes.
211    pub limit: usize,
212    /// Fusable predicates pushed into the `search_hits` CTE by the coordinator.
213    /// These evaluate against columns directly available on the `nodes` table
214    /// joined inside the CTE (`kind`, `logical_id`, `source_ref`,
215    /// `content_ref`).
216    pub fusable_filters: Vec<Predicate>,
217    /// Residual predicates applied in the outer `WHERE` after the CTE
218    /// materializes. Currently limited to JSON-property predicates
219    /// (`json_extract` on `n.properties`).
220    pub residual_filters: Vec<Predicate>,
221    /// Whether the caller requested per-hit match attribution. Phase 5: when
222    /// `true`, the coordinator populates [`SearchHit::attribution`] on every
223    /// hit by resolving FTS5 match positions against the Phase 4 position
224    /// map. When `false` (the default), the position map is not read at all
225    /// and `attribution` stays `None`.
226    pub attribution_requested: bool,
227}
228
229/// A compiled vector-only search plan ready for the coordinator to execute.
230///
231/// Phase 11 delivers a standalone vector retrieval path parallel to
232/// [`CompiledSearch`]. It is intentionally structurally distinct: the vector
233/// path has no [`TextQuery`], no relaxed branch, and no [`SearchMatchMode`] —
234/// vector hits always carry `match_mode: None` per addendum 1. The
235/// coordinator consumes this carrier via
236/// `ExecutionCoordinator::execute_compiled_vector_search`, which emits SQL
237/// against the `vec_nodes_active` virtual table joined to `nodes`, and
238/// returns a [`SearchRows`] with a single vector block (or an empty result
239/// with `was_degraded = true` when the sqlite-vec capability is absent).
240#[derive(Clone, Debug, PartialEq, Eq)]
241pub struct CompiledVectorSearch {
242    /// Root kind the caller built the query against. May be empty for
243    /// kind-agnostic callers, mirroring the text path.
244    pub root_kind: String,
245    /// Raw vector query text passed to sqlite-vec via the `embedding MATCH`
246    /// operator. This is a serialized JSON float array (e.g.
247    /// `"[0.1, 0.2, 0.3, 0.4]"`) at the time the coordinator binds it.
248    pub query_text: String,
249    /// Maximum number of candidate hits to retrieve from the vec0 KNN scan.
250    pub limit: usize,
251    /// Fusable predicates pushed into the vector-search CTE by the
252    /// coordinator. Evaluated against columns directly available on the
253    /// `nodes` table joined inside the CTE.
254    pub fusable_filters: Vec<Predicate>,
255    /// Residual predicates applied in the outer `WHERE` after the CTE
256    /// materializes. Currently limited to JSON-property predicates.
257    pub residual_filters: Vec<Predicate>,
258    /// Whether the caller requested per-hit match attribution. Per addendum
259    /// 1 §Attribution on vector hits, vector hits under this flag carry
260    /// `Some(HitAttribution { matched_paths: vec![] })` — an empty
261    /// matched-paths list, not `None`.
262    pub attribution_requested: bool,
263}
264
265/// A two-branch compiled search plan ready for the coordinator to execute.
266///
267/// Phase 6 factors the strict+relaxed retrieval pair into a small carrier so
268/// that the adaptive [`crate::compile_search`] path and the narrow
269/// `fallback_search(strict, relaxed)` helper share a single coordinator
270/// routine. Both branches carry fully compiled [`CompiledSearch`] values —
271/// including the same fused/residual filter chain and the same
272/// `attribution_requested` flag — so merge/dedup stays branch-agnostic.
273#[derive(Clone, Debug, PartialEq, Eq)]
274pub struct CompiledSearchPlan {
275    /// The strict branch — always runs first.
276    pub strict: CompiledSearch,
277    /// The relaxed branch, or `None` when the caller did not request a
278    /// fallback shape. When `None`, the coordinator runs strict only and
279    /// never triggers the fallback policy.
280    pub relaxed: Option<CompiledSearch>,
281    /// Set when the plan originated from [`crate::derive_relaxed`] and its
282    /// alternatives list was truncated past [`crate::RELAXED_BRANCH_CAP`].
283    /// The `fallback_search` path always sets this to `false` because the
284    /// relaxed shape is caller-provided and not subject to the cap.
285    pub was_degraded_at_plan_time: bool,
286}
287
288/// A compiled unified retrieval plan for the Phase 12 `search()` entry point.
289///
290/// `CompiledRetrievalPlan` carries the bounded set of branches the engine-owned
291/// retrieval planner may run on behalf of a single `search(query, limit)` call:
292/// the text strict + optional text relaxed pair (carried structurally as the
293/// existing Phase 6 [`CompiledSearchPlan`]) and an optional vector branch.
294///
295/// **v1 scope (Phase 12)**: the planner's `vector` branch slot is structurally
296/// supported so that the coordinator's three-block fusion path is fully wired,
297/// but [`crate::compile_retrieval_plan`] always sets `vector` to `None`. Read-
298/// time embedding of natural-language queries is not wired into the engine in
299/// v1; callers that want vector retrieval through the unified `search()`
300/// entry point will get text-only results until a future phase wires the
301/// embedding generator into the read path. Callers who want explicit vector
302/// retrieval today use the advanced `vector_search()` override (Phase 11),
303/// which takes a caller-provided vector literal.
304///
305/// `CompiledRetrievalPlan` is intentionally distinct from
306/// [`CompiledSearchPlan`]: `CompiledSearchPlan` is the text-only carrier
307/// consumed by `text_search()` and `fallback_search()`, and the two paths
308/// remain separate so the text-only call sites do not pay any vector-branch
309/// cost. The Phase 12 unified planner is a sibling, not a replacement.
310#[derive(Clone, Debug, PartialEq, Eq)]
311pub struct CompiledRetrievalPlan {
312    /// The text branches (strict + optional relaxed) of the unified plan.
313    /// Always present — every `search()` call produces at least a strict
314    /// text branch (which may itself short-circuit to empty when the query
315    /// is `Empty` or a top-level `Not`).
316    pub text: CompiledSearchPlan,
317    /// The vector branch slot. Always `None` in v1 per the Phase 12 scope
318    /// constraint above.
319    pub vector: Option<CompiledVectorSearch>,
320    /// Mirrors [`CompiledSearchPlan::was_degraded_at_plan_time`] for the
321    /// text branches: set when the relaxed branch's alternatives list was
322    /// truncated past [`crate::RELAXED_BRANCH_CAP`] at plan-construction
323    /// time. Propagated to the result's `was_degraded` flag if and only if
324    /// the relaxed branch actually fires at execution time.
325    pub was_degraded_at_plan_time: bool,
326}
327
#[cfg(test)]
// `expect` is used below to unwrap compile results; a panic there simply
// fails the test.
#[allow(clippy::expect_used)]
mod tests {
    use super::*;

    #[test]
    fn search_hit_source_has_vector_variant_reserved() {
        // Compile-time check: the match names every variant and has no
        // wildcard arm, so removing or renaming `Vector` stops this test
        // from compiling instead of silently breaking consumers that rely
        // on the reserved variant.
        let hit_source = SearchHitSource::Chunk;
        match hit_source {
            SearchHitSource::Chunk | SearchHitSource::Property | SearchHitSource::Vector => {}
        }
    }

    #[test]
    fn search_match_mode_has_strict_and_relaxed() {
        // Same compile-time guard for the two match modes.
        let match_mode = SearchMatchMode::Strict;
        match match_mode {
            SearchMatchMode::Strict | SearchMatchMode::Relaxed => {}
        }
    }

    #[test]
    fn compile_search_rejects_ast_without_text_search_step() {
        use crate::{CompileError, QueryBuilder, compile_search};

        // A filter-only AST carries no text-search step.
        let filter_only_ast = QueryBuilder::nodes("Goal")
            .filter_kind_eq("Goal")
            .into_ast();

        let result = compile_search(&filter_only_ast);
        assert!(
            matches!(result, Err(CompileError::MissingTextSearchStep)),
            "expected MissingTextSearchStep, got {result:?}"
        );
    }

    #[test]
    fn compile_search_accepts_text_search_step_with_filters() {
        use crate::{QueryBuilder, compile_search};

        let search_ast = QueryBuilder::nodes("Goal")
            .text_search("quarterly docs", 7)
            .filter_kind_eq("Goal")
            .into_ast();

        let compiled = compile_search(&search_ast).expect("compiles");

        assert_eq!(compiled.root_kind, "Goal");
        assert_eq!(compiled.limit, 7);
        // The kind filter lands in the fusable set; nothing is residual.
        assert_eq!(compiled.fusable_filters.len(), 1);
        assert!(compiled.residual_filters.is_empty());
    }

    #[test]
    fn compile_vector_search_rejects_ast_without_vector_search_step() {
        use crate::{CompileError, QueryBuilder, compile_vector_search};

        // A filter-only AST carries no vector-search step.
        let filter_only_ast = QueryBuilder::nodes("Goal")
            .filter_kind_eq("Goal")
            .into_ast();

        let result = compile_vector_search(&filter_only_ast);
        assert!(
            matches!(result, Err(CompileError::MissingVectorSearchStep)),
            "expected MissingVectorSearchStep, got {result:?}"
        );
    }

    #[test]
    fn compile_vector_search_accepts_vector_search_step_with_filters() {
        use crate::{Predicate, QueryBuilder, compile_vector_search};

        let vector_ast = QueryBuilder::nodes("Goal")
            .vector_search("[0.1, 0.2, 0.3, 0.4]", 7)
            .filter_kind_eq("Goal")
            .filter_json_text_eq("$.status", "active")
            .into_ast();

        let compiled = compile_vector_search(&vector_ast).expect("compiles");

        assert_eq!(compiled.root_kind, "Goal");
        assert_eq!(compiled.query_text, "[0.1, 0.2, 0.3, 0.4]");
        assert_eq!(compiled.limit, 7);

        // The kind filter is the single fusable predicate ...
        assert_eq!(compiled.fusable_filters.len(), 1);
        assert!(matches!(
            &compiled.fusable_filters[0],
            Predicate::KindEq(kind) if kind == "Goal"
        ));
        // ... and the JSON-property filter is the single residual one.
        assert_eq!(compiled.residual_filters.len(), 1);
        assert!(!compiled.attribution_requested);
    }
}