pylon_storage/search.rs
1//! Native faceted full-text search for Pylon entities.
2//!
3//! The design goal is Meilisearch-class latency at 1M-row catalogs
4//! without running a second service. Two ideas carry the performance:
5//!
6//! 1. **SQLite FTS5 for text matching.** A shadow contentless table
7//! indexes declared text fields. Matches come back as a stream of
8//! rowids in a few ms regardless of catalog size.
9//!
10//! 2. **Roaring bitmaps for facets.** For every declared facet field,
11//! we keep a bitmap per distinct value: the set of entity rowids
12//! that carry that value. On insert/update/delete we flip bits in
13//! the same transaction as the row change. Facet counts at query
14//! time collapse to `popcount(match ∩ filter ∩ facet_bitmap)` —
15//! single-digit microseconds per facet value, no matter how wide
16//! the match.
17//!
18//! The shadow tables Pylon creates when an entity declares `search:`:
19//!
20//! ```sql
21//! -- One FTS5 virtual table per searchable entity.
22//! CREATE VIRTUAL TABLE "_fts_<Entity>" USING fts5(
23//! rowid UNINDEXED, -- external rowid (entity.id)
24//! <text_field1>,
25//! <text_field2>,
26//! ...,
27//! tokenize = 'unicode61 remove_diacritics 2'
28//! );
29//!
30//! -- One row per (entity, facet, value) — bitmap of matching rowids.
31//! CREATE TABLE "_facet_bitmap" (
32//! entity TEXT NOT NULL,
33//! facet TEXT NOT NULL,
34//! value TEXT NOT NULL,
35//! bitmap BLOB NOT NULL, -- Roaring-serialized
36//! row_count INTEGER NOT NULL,
37//! PRIMARY KEY (entity, facet, value)
38//! );
39//!
40//! -- Mapping from entity rowid → numeric rowid-in-bitmap. Bitmaps
41//! -- operate on u32 so we need a compact numeric id; ULIDs are too
42//! -- wide. The rowid column of the entity's SQLite table is perfect.
43//! ```
44//!
45//! Rowid strategy: we treat SQLite's implicit `rowid` as the bitmap
46//! identifier. On insert we remember the rowid in a side-car (the
47//! entity's row already carries it); on facet-bitmap ops we read the
48//! rowid from `"_rowid_of_id"` lookup. No extra surface for users.
49
50use std::collections::{BTreeMap, HashMap};
51
52use roaring::RoaringBitmap;
53use serde::{Deserialize, Serialize};
54
55use crate::StorageError;
56
57// ---------------------------------------------------------------------------
58// Config — re-exported from pylon-kernel
59// ---------------------------------------------------------------------------
60
61/// Storage-side alias for `pylon_kernel::ManifestSearchConfig`. The
62/// shape lives in the kernel because the manifest is what every layer
63/// reads (runtime, storage, router all share it). We re-export here
64/// so storage callers don't have to double-import.
65pub use pylon_kernel::ManifestSearchConfig as SearchConfig;
66
67// ---------------------------------------------------------------------------
68// Query + result shapes (what clients send / get back)
69// ---------------------------------------------------------------------------
70
71/// A single-entity search query. Client sends this; server returns
72/// `SearchResult`. Intentionally small — filter parsing lives on top
73/// of `FilterExpr`, not as free-form JSON.
74#[derive(Debug, Clone, Default, Serialize, Deserialize)]
75pub struct SearchQuery {
76 /// Free-text match across the declared `text` fields.
77 #[serde(default)]
78 pub query: String,
79
80 /// Filter expression: `{ field: value }` = equality. Combined with
81 /// match using AND. Range + IN + boolean ops live on FilterExpr.
82 #[serde(default)]
83 pub filters: HashMap<String, serde_json::Value>,
84
85 /// Facets to compute counts for. If empty, use all declared facets.
86 #[serde(default)]
87 pub facets: Vec<String>,
88
89 /// Sort spec: `(field, "asc" | "desc")`. Must be in the entity's
90 /// `sortable` list.
91 #[serde(default)]
92 pub sort: Option<(String, String)>,
93
94 /// Zero-indexed page.
95 #[serde(default)]
96 pub page: usize,
97
98 /// Page size. Default 20, hard-cap at 100 to keep result payloads
99 /// predictable for subscriptions.
100 #[serde(default = "default_page_size")]
101 pub page_size: usize,
102}
103
104fn default_page_size() -> usize {
105 20
106}
107
108/// What the server hands back. Wire format uses camelCase
109/// (`facetCounts`, `tookMs`) to match the typed-client SDKs and
110/// `ctx.db.search` TS surface — Rust callers still see snake_case
111/// field names because the rename is only on the serde layer.
112#[derive(Debug, Clone, Serialize, Deserialize)]
113#[serde(rename_all = "camelCase")]
114pub struct SearchResult {
115 /// Hit rows in ranked (or sorted) order. Each is a plain JSON row.
116 pub hits: Vec<serde_json::Value>,
117
118 /// `{facet_name: {value: count}}`. Values sorted by count desc in
119 /// the server's serialization.
120 pub facet_counts: BTreeMap<String, BTreeMap<String, u64>>,
121
122 /// Total hit count (before pagination).
123 pub total: u64,
124
125 /// Milliseconds spent in the query engine, for client-side perf
126 /// instrumentation and snappy-feeling dashboards.
127 pub took_ms: u64,
128}
129
130// ---------------------------------------------------------------------------
131// Cross-backend helpers — used by both SQLite (`search_maintenance`)
132// and Postgres (`pg_search`). Living here keeps the two backends from
133// drifting on the same row-merge / facet-stringify rules.
134// ---------------------------------------------------------------------------
135
136/// Shallow merge: `patch` overwrites corresponding fields on `old_row`.
137/// Both backends use this when rebuilding the FTS row after a partial
138/// UPDATE — the new tsvector reflects the post-update state.
139pub fn merge_row(old_row: &serde_json::Value, patch: &serde_json::Value) -> serde_json::Value {
140 let mut merged = old_row.as_object().cloned().unwrap_or_default();
141 if let Some(obj) = patch.as_object() {
142 for (k, v) in obj {
143 merged.insert(k.clone(), v.clone());
144 }
145 }
146 serde_json::Value::Object(merged)
147}
148
149/// Coerce a JSON value into the canonical string used as a facet
150/// bitmap key (SQLite) or facet equality value (Postgres). Numbers
151/// drop trailing zeros so `4.50` and `4.5` map to the same value;
152/// nulls + complex types become `None` so they don't get a bucket.
153pub fn stringify_facet(value: &serde_json::Value) -> Option<String> {
154 match value {
155 serde_json::Value::Null => None,
156 serde_json::Value::Bool(b) => Some(b.to_string()),
157 serde_json::Value::Number(n) => Some(n.to_string()),
158 serde_json::Value::String(s) => Some(s.clone()),
159 serde_json::Value::Array(_) | serde_json::Value::Object(_) => None,
160 }
161}
162
163// ---------------------------------------------------------------------------
164// Facet bitmap storage
165// ---------------------------------------------------------------------------
166
167/// Serialize a Roaring bitmap to bytes for storage in the
168/// `_facet_bitmap.bitmap` BLOB column.
169pub fn serialize_bitmap(b: &RoaringBitmap) -> Result<Vec<u8>, StorageError> {
170 let mut out = Vec::with_capacity(b.serialized_size());
171 b.serialize_into(&mut out)
172 .map_err(|e| StorageError::new("BITMAP_SERIALIZE_FAILED", &e.to_string()))?;
173 Ok(out)
174}
175
176/// Inverse of `serialize_bitmap`.
177pub fn deserialize_bitmap(bytes: &[u8]) -> Result<RoaringBitmap, StorageError> {
178 RoaringBitmap::deserialize_from(bytes)
179 .map_err(|e| StorageError::new("BITMAP_DESERIALIZE_FAILED", &e.to_string()))
180}
181
182// ---------------------------------------------------------------------------
183// SQL generation — table setup + maintenance
184// ---------------------------------------------------------------------------
185
186/// Generate the `CREATE VIRTUAL TABLE _fts_<Entity>` statement for a
187/// given entity + config. Called once during schema push.
188pub fn create_fts_table_sql(entity: &str, config: &SearchConfig) -> Option<String> {
189 if config.text.is_empty() {
190 return None;
191 }
192 let cols = config
193 .text
194 .iter()
195 .map(|f| format!("\"{f}\""))
196 .collect::<Vec<_>>()
197 .join(", ");
198 Some(format!(
199 "CREATE VIRTUAL TABLE IF NOT EXISTS \"_fts_{entity}\" USING fts5(\
200 entity_id UNINDEXED, {cols}, \
201 tokenize = 'unicode61 remove_diacritics 2'\
202 );"
203 ))
204}
205
206/// One-time table for all facet bitmaps across all entities. Shared so
207/// ops like "warm cache for all facets" can page through a single
208/// table. Keyed by (entity, facet, value).
209pub fn create_facet_table_sql() -> &'static str {
210 "CREATE TABLE IF NOT EXISTS \"_facet_bitmap\" (\
211 entity TEXT NOT NULL,\
212 facet TEXT NOT NULL,\
213 value TEXT NOT NULL,\
214 bitmap BLOB NOT NULL,\
215 row_count INTEGER NOT NULL,\
216 PRIMARY KEY (entity, facet, value)\
217 );"
218}
219
220// ---------------------------------------------------------------------------
221// Tests
222// ---------------------------------------------------------------------------
223
224#[cfg(test)]
225mod tests {
226 use super::*;
227
228 #[test]
229 fn bitmap_roundtrip() {
230 let mut b = RoaringBitmap::new();
231 for i in 0..10_000 {
232 if i % 3 == 0 {
233 b.insert(i);
234 }
235 }
236 let bytes = serialize_bitmap(&b).unwrap();
237 let round = deserialize_bitmap(&bytes).unwrap();
238 assert_eq!(b, round);
239 }
240
241 #[test]
242 fn fts_sql_skipped_when_no_text_fields() {
243 let cfg = SearchConfig {
244 text: vec![],
245 facets: vec!["brand".into()],
246 sortable: vec![],
247 language: None,
248 };
249 assert!(create_fts_table_sql("Product", &cfg).is_none());
250 }
251
252 #[test]
253 fn fts_sql_lists_declared_text_columns() {
254 let cfg = SearchConfig {
255 text: vec!["name".into(), "description".into()],
256 facets: vec![],
257 sortable: vec![],
258 language: None,
259 };
260 let sql = create_fts_table_sql("Product", &cfg).unwrap();
261 assert!(sql.contains("\"_fts_Product\""));
262 assert!(sql.contains("\"name\""));
263 assert!(sql.contains("\"description\""));
264 assert!(sql.contains("unicode61"));
265 }
266
267 #[test]
268 fn bitmap_intersect_popcount_is_facet_count() {
269 // Proves the core performance move: after ANDing a match bitmap
270 // with a facet bitmap, counting the cardinality is a single
271 // popcount, not a table scan.
272 let mut matches = RoaringBitmap::new();
273 matches.insert_range(0..1_000_000u32);
274
275 let mut brand_nike = RoaringBitmap::new();
276 for i in (0..1_000_000u32).step_by(7) {
277 brand_nike.insert(i);
278 }
279
280 let and = &matches & &brand_nike;
281 assert_eq!(and.len(), brand_nike.len());
282 }
283}