Skip to main content

pylon_storage/
search.rs

1//! Native faceted full-text search for Pylon entities.
2//!
3//! The design goal is Meilisearch-class latency at 1M-row catalogs
4//! without running a second service. Two ideas carry the performance:
5//!
6//! 1. **SQLite FTS5 for text matching.** A shadow contentless table
7//!    indexes declared text fields. Matches come back as a stream of
8//!    rowids in a few ms regardless of catalog size.
9//!
10//! 2. **Roaring bitmaps for facets.** For every declared facet field,
11//!    we keep a bitmap per distinct value: the set of entity rowids
12//!    that carry that value. On insert/update/delete we flip bits in
13//!    the same transaction as the row change. Facet counts at query
14//!    time collapse to `popcount(match ∩ filter ∩ facet_bitmap)` —
15//!    single-digit microseconds per facet value, no matter how wide
16//!    the match.
17//!
18//! The shadow tables Pylon creates when an entity declares `search:`:
19//!
20//! ```sql
21//! -- One FTS5 virtual table per searchable entity.
22//! CREATE VIRTUAL TABLE "_fts_<Entity>" USING fts5(
23//!     rowid UNINDEXED,       -- external rowid (entity.id)
24//!     <text_field1>,
25//!     <text_field2>,
26//!     ...,
27//!     tokenize = 'unicode61 remove_diacritics 2'
28//! );
29//!
30//! -- One row per (entity, facet, value) — bitmap of matching rowids.
31//! CREATE TABLE "_facet_bitmap" (
32//!   entity    TEXT NOT NULL,
33//!   facet     TEXT NOT NULL,
34//!   value     TEXT NOT NULL,
35//!   bitmap    BLOB NOT NULL,   -- Roaring-serialized
36//!   row_count INTEGER NOT NULL,
37//!   PRIMARY KEY (entity, facet, value)
38//! );
39//!
40//! -- Mapping from entity rowid → numeric rowid-in-bitmap. Bitmaps
41//! -- operate on u32 so we need a compact numeric id; ULIDs are too
42//! -- wide. The rowid column of the entity's SQLite table is perfect.
43//! ```
44//!
45//! Rowid strategy: we treat SQLite's implicit `rowid` as the bitmap
46//! identifier. On insert we remember the rowid in a side-car (the
47//! entity's row already carries it); on facet-bitmap ops we read the
48//! rowid from `"_rowid_of_id"` lookup. No extra surface for users.
49
50use std::collections::{BTreeMap, HashMap};
51
52use roaring::RoaringBitmap;
53use serde::{Deserialize, Serialize};
54
55use crate::StorageError;
56
57// ---------------------------------------------------------------------------
58// Config — re-exported from pylon-kernel
59// ---------------------------------------------------------------------------
60
61/// Storage-side alias for `pylon_kernel::ManifestSearchConfig`. The
62/// shape lives in the kernel because the manifest is what every layer
63/// reads (runtime, storage, router all share it). We re-export here
64/// so storage callers don't have to double-import.
65pub use pylon_kernel::ManifestSearchConfig as SearchConfig;
66
67// ---------------------------------------------------------------------------
68// Query + result shapes (what clients send / get back)
69// ---------------------------------------------------------------------------
70
71/// A single-entity search query. Client sends this; server returns
72/// `SearchResult`. Intentionally small — filter parsing lives on top
73/// of `FilterExpr`, not as free-form JSON.
74#[derive(Debug, Clone, Default, Serialize, Deserialize)]
75pub struct SearchQuery {
76    /// Free-text match across the declared `text` fields.
77    #[serde(default)]
78    pub query: String,
79
80    /// Filter expression: `{ field: value }` = equality. Combined with
81    /// match using AND. Range + IN + boolean ops live on FilterExpr.
82    #[serde(default)]
83    pub filters: HashMap<String, serde_json::Value>,
84
85    /// Facets to compute counts for. If empty, use all declared facets.
86    #[serde(default)]
87    pub facets: Vec<String>,
88
89    /// Sort spec: `(field, "asc" | "desc")`. Must be in the entity's
90    /// `sortable` list.
91    #[serde(default)]
92    pub sort: Option<(String, String)>,
93
94    /// Zero-indexed page.
95    #[serde(default)]
96    pub page: usize,
97
98    /// Page size. Default 20, hard-cap at 100 to keep result payloads
99    /// predictable for subscriptions.
100    #[serde(default = "default_page_size")]
101    pub page_size: usize,
102}
103
104fn default_page_size() -> usize {
105    20
106}
107
108/// What the server hands back. Wire format uses camelCase
109/// (`facetCounts`, `tookMs`) to match the typed-client SDKs and
110/// `ctx.db.search` TS surface — Rust callers still see snake_case
111/// field names because the rename is only on the serde layer.
112#[derive(Debug, Clone, Serialize, Deserialize)]
113#[serde(rename_all = "camelCase")]
114pub struct SearchResult {
115    /// Hit rows in ranked (or sorted) order. Each is a plain JSON row.
116    pub hits: Vec<serde_json::Value>,
117
118    /// `{facet_name: {value: count}}`. Values sorted by count desc in
119    /// the server's serialization.
120    pub facet_counts: BTreeMap<String, BTreeMap<String, u64>>,
121
122    /// Total hit count (before pagination).
123    pub total: u64,
124
125    /// Milliseconds spent in the query engine, for client-side perf
126    /// instrumentation and snappy-feeling dashboards.
127    pub took_ms: u64,
128}
129
130// ---------------------------------------------------------------------------
131// Cross-backend helpers — used by both SQLite (`search_maintenance`)
132// and Postgres (`pg_search`). Living here keeps the two backends from
133// drifting on the same row-merge / facet-stringify rules.
134// ---------------------------------------------------------------------------
135
136/// Shallow merge: `patch` overwrites corresponding fields on `old_row`.
137/// Both backends use this when rebuilding the FTS row after a partial
138/// UPDATE — the new tsvector reflects the post-update state.
139pub fn merge_row(old_row: &serde_json::Value, patch: &serde_json::Value) -> serde_json::Value {
140    let mut merged = old_row.as_object().cloned().unwrap_or_default();
141    if let Some(obj) = patch.as_object() {
142        for (k, v) in obj {
143            merged.insert(k.clone(), v.clone());
144        }
145    }
146    serde_json::Value::Object(merged)
147}
148
149/// Coerce a JSON value into the canonical string used as a facet
150/// bitmap key (SQLite) or facet equality value (Postgres). Numbers
151/// drop trailing zeros so `4.50` and `4.5` map to the same value;
152/// nulls + complex types become `None` so they don't get a bucket.
153pub fn stringify_facet(value: &serde_json::Value) -> Option<String> {
154    match value {
155        serde_json::Value::Null => None,
156        serde_json::Value::Bool(b) => Some(b.to_string()),
157        serde_json::Value::Number(n) => Some(n.to_string()),
158        serde_json::Value::String(s) => Some(s.clone()),
159        serde_json::Value::Array(_) | serde_json::Value::Object(_) => None,
160    }
161}
162
163// ---------------------------------------------------------------------------
164// Facet bitmap storage
165// ---------------------------------------------------------------------------
166
167/// Serialize a Roaring bitmap to bytes for storage in the
168/// `_facet_bitmap.bitmap` BLOB column.
169pub fn serialize_bitmap(b: &RoaringBitmap) -> Result<Vec<u8>, StorageError> {
170    let mut out = Vec::with_capacity(b.serialized_size());
171    b.serialize_into(&mut out)
172        .map_err(|e| StorageError::new("BITMAP_SERIALIZE_FAILED", &e.to_string()))?;
173    Ok(out)
174}
175
176/// Inverse of `serialize_bitmap`.
177pub fn deserialize_bitmap(bytes: &[u8]) -> Result<RoaringBitmap, StorageError> {
178    RoaringBitmap::deserialize_from(bytes)
179        .map_err(|e| StorageError::new("BITMAP_DESERIALIZE_FAILED", &e.to_string()))
180}
181
182// ---------------------------------------------------------------------------
183// SQL generation — table setup + maintenance
184// ---------------------------------------------------------------------------
185
186/// Generate the `CREATE VIRTUAL TABLE _fts_<Entity>` statement for a
187/// given entity + config. Called once during schema push.
188pub fn create_fts_table_sql(entity: &str, config: &SearchConfig) -> Option<String> {
189    if config.text.is_empty() {
190        return None;
191    }
192    let cols = config
193        .text
194        .iter()
195        .map(|f| format!("\"{f}\""))
196        .collect::<Vec<_>>()
197        .join(", ");
198    Some(format!(
199        "CREATE VIRTUAL TABLE IF NOT EXISTS \"_fts_{entity}\" USING fts5(\
200          entity_id UNINDEXED, {cols}, \
201          tokenize = 'unicode61 remove_diacritics 2'\
202         );"
203    ))
204}
205
206/// One-time table for all facet bitmaps across all entities. Shared so
207/// ops like "warm cache for all facets" can page through a single
208/// table. Keyed by (entity, facet, value).
209pub fn create_facet_table_sql() -> &'static str {
210    "CREATE TABLE IF NOT EXISTS \"_facet_bitmap\" (\
211       entity    TEXT NOT NULL,\
212       facet     TEXT NOT NULL,\
213       value     TEXT NOT NULL,\
214       bitmap    BLOB NOT NULL,\
215       row_count INTEGER NOT NULL,\
216       PRIMARY KEY (entity, facet, value)\
217     );"
218}
219
220// ---------------------------------------------------------------------------
221// Tests
222// ---------------------------------------------------------------------------
223
224#[cfg(test)]
225mod tests {
226    use super::*;
227
228    #[test]
229    fn bitmap_roundtrip() {
230        let mut b = RoaringBitmap::new();
231        for i in 0..10_000 {
232            if i % 3 == 0 {
233                b.insert(i);
234            }
235        }
236        let bytes = serialize_bitmap(&b).unwrap();
237        let round = deserialize_bitmap(&bytes).unwrap();
238        assert_eq!(b, round);
239    }
240
241    #[test]
242    fn fts_sql_skipped_when_no_text_fields() {
243        let cfg = SearchConfig {
244            text: vec![],
245            facets: vec!["brand".into()],
246            sortable: vec![],
247            language: None,
248        };
249        assert!(create_fts_table_sql("Product", &cfg).is_none());
250    }
251
252    #[test]
253    fn fts_sql_lists_declared_text_columns() {
254        let cfg = SearchConfig {
255            text: vec!["name".into(), "description".into()],
256            facets: vec![],
257            sortable: vec![],
258            language: None,
259        };
260        let sql = create_fts_table_sql("Product", &cfg).unwrap();
261        assert!(sql.contains("\"_fts_Product\""));
262        assert!(sql.contains("\"name\""));
263        assert!(sql.contains("\"description\""));
264        assert!(sql.contains("unicode61"));
265    }
266
267    #[test]
268    fn bitmap_intersect_popcount_is_facet_count() {
269        // Proves the core performance move: after ANDing a match bitmap
270        // with a facet bitmap, counting the cardinality is a single
271        // popcount, not a table scan.
272        let mut matches = RoaringBitmap::new();
273        matches.insert_range(0..1_000_000u32);
274
275        let mut brand_nike = RoaringBitmap::new();
276        for i in (0..1_000_000u32).step_by(7) {
277            brand_nike.insert(i);
278        }
279
280        let and = &matches & &brand_nike;
281        assert_eq!(and.len(), brand_nike.len());
282    }
283}