pylon_storage/search.rs
1//! Native faceted full-text search for Pylon entities.
2//!
3//! The design goal is Meilisearch-class latency at 1M-row catalogs
4//! without running a second service. Two ideas carry the performance:
5//!
6//! 1. **SQLite FTS5 for text matching.** A shadow contentless table
7//! indexes declared text fields. Matches come back as a stream of
8//! rowids in a few ms regardless of catalog size.
9//!
10//! 2. **Roaring bitmaps for facets.** For every declared facet field,
11//! we keep a bitmap per distinct value: the set of entity rowids
12//! that carry that value. On insert/update/delete we flip bits in
13//! the same transaction as the row change. Facet counts at query
14//! time collapse to `popcount(match ∩ filter ∩ facet_bitmap)` —
15//! single-digit microseconds per facet value, no matter how wide
16//! the match.
17//!
18//! The shadow tables Pylon creates when an entity declares `search:`:
19//!
20//! ```sql
21//! -- One FTS5 virtual table per searchable entity.
22//! CREATE VIRTUAL TABLE "_fts_<Entity>" USING fts5(
23//! rowid UNINDEXED, -- external rowid (entity.id)
24//! <text_field1>,
25//! <text_field2>,
26//! ...,
27//! tokenize = 'unicode61 remove_diacritics 2'
28//! );
29//!
30//! -- One row per (entity, facet, value) — bitmap of matching rowids.
31//! CREATE TABLE "_facet_bitmap" (
32//! entity TEXT NOT NULL,
33//! facet TEXT NOT NULL,
34//! value TEXT NOT NULL,
35//! bitmap BLOB NOT NULL, -- Roaring-serialized
36//! row_count INTEGER NOT NULL,
37//! PRIMARY KEY (entity, facet, value)
38//! );
39//!
40//! -- Mapping from entity rowid → numeric rowid-in-bitmap. Bitmaps
41//! -- operate on u32 so we need a compact numeric id; ULIDs are too
42//! -- wide. The rowid column of the entity's SQLite table is perfect.
43//! ```
44//!
45//! Rowid strategy: we treat SQLite's implicit `rowid` as the bitmap
46//! identifier. On insert we remember the rowid in a side-car (the
47//! entity's row already carries it); on facet-bitmap ops we read the
48//! rowid from `"_rowid_of_id"` lookup. No extra surface for users.
49
50use std::collections::{BTreeMap, HashMap};
51
52use roaring::RoaringBitmap;
53use serde::{Deserialize, Serialize};
54
55use crate::StorageError;
56
57// ---------------------------------------------------------------------------
58// Config — re-exported from pylon-kernel
59// ---------------------------------------------------------------------------
60
61/// Storage-side alias for `pylon_kernel::ManifestSearchConfig`. The
62/// shape lives in the kernel because the manifest is what every layer
63/// reads (runtime, storage, router all share it). We re-export here
64/// so storage callers don't have to double-import.
65pub use pylon_kernel::ManifestSearchConfig as SearchConfig;
66
67// ---------------------------------------------------------------------------
68// Query + result shapes (what clients send / get back)
69// ---------------------------------------------------------------------------
70
71/// A single-entity search query. Client sends this; server returns
72/// `SearchResult`. Intentionally small — filter parsing lives on top
73/// of `FilterExpr`, not as free-form JSON.
74#[derive(Debug, Clone, Default, Serialize, Deserialize)]
75pub struct SearchQuery {
76 /// Free-text match across the declared `text` fields.
77 #[serde(default)]
78 pub query: String,
79
80 /// Filter expression: `{ field: value }` = equality. Combined with
81 /// match using AND. Range + IN + boolean ops live on FilterExpr.
82 #[serde(default)]
83 pub filters: HashMap<String, serde_json::Value>,
84
85 /// Facets to compute counts for. If empty, use all declared facets.
86 #[serde(default)]
87 pub facets: Vec<String>,
88
89 /// Sort spec: `(field, "asc" | "desc")`. Must be in the entity's
90 /// `sortable` list.
91 #[serde(default)]
92 pub sort: Option<(String, String)>,
93
94 /// Zero-indexed page.
95 #[serde(default)]
96 pub page: usize,
97
98 /// Page size. Default 20, hard-cap at 100 to keep result payloads
99 /// predictable for subscriptions.
100 #[serde(default = "default_page_size")]
101 pub page_size: usize,
102}
103
104fn default_page_size() -> usize {
105 20
106}
107
108/// What the server hands back.
109#[derive(Debug, Clone, Serialize, Deserialize)]
110pub struct SearchResult {
111 /// Hit rows in ranked (or sorted) order. Each is a plain JSON row.
112 pub hits: Vec<serde_json::Value>,
113
114 /// `{facet_name: {value: count}}`. Values sorted by count desc in
115 /// the server's serialization.
116 pub facet_counts: BTreeMap<String, BTreeMap<String, u64>>,
117
118 /// Total hit count (before pagination).
119 pub total: u64,
120
121 /// Milliseconds spent in the query engine, for client-side perf
122 /// instrumentation and snappy-feeling dashboards.
123 pub took_ms: u64,
124}
125
126// ---------------------------------------------------------------------------
127// Facet bitmap storage
128// ---------------------------------------------------------------------------
129
130/// Serialize a Roaring bitmap to bytes for storage in the
131/// `_facet_bitmap.bitmap` BLOB column.
132pub fn serialize_bitmap(b: &RoaringBitmap) -> Result<Vec<u8>, StorageError> {
133 let mut out = Vec::with_capacity(b.serialized_size());
134 b.serialize_into(&mut out)
135 .map_err(|e| StorageError::new("BITMAP_SERIALIZE_FAILED", &e.to_string()))?;
136 Ok(out)
137}
138
139/// Inverse of `serialize_bitmap`.
140pub fn deserialize_bitmap(bytes: &[u8]) -> Result<RoaringBitmap, StorageError> {
141 RoaringBitmap::deserialize_from(bytes)
142 .map_err(|e| StorageError::new("BITMAP_DESERIALIZE_FAILED", &e.to_string()))
143}
144
145// ---------------------------------------------------------------------------
146// SQL generation — table setup + maintenance
147// ---------------------------------------------------------------------------
148
149/// Generate the `CREATE VIRTUAL TABLE _fts_<Entity>` statement for a
150/// given entity + config. Called once during schema push.
151pub fn create_fts_table_sql(entity: &str, config: &SearchConfig) -> Option<String> {
152 if config.text.is_empty() {
153 return None;
154 }
155 let cols = config
156 .text
157 .iter()
158 .map(|f| format!("\"{f}\""))
159 .collect::<Vec<_>>()
160 .join(", ");
161 Some(format!(
162 "CREATE VIRTUAL TABLE IF NOT EXISTS \"_fts_{entity}\" USING fts5(\
163 entity_id UNINDEXED, {cols}, \
164 tokenize = 'unicode61 remove_diacritics 2'\
165 );"
166 ))
167}
168
169/// One-time table for all facet bitmaps across all entities. Shared so
170/// ops like "warm cache for all facets" can page through a single
171/// table. Keyed by (entity, facet, value).
172pub fn create_facet_table_sql() -> &'static str {
173 "CREATE TABLE IF NOT EXISTS \"_facet_bitmap\" (\
174 entity TEXT NOT NULL,\
175 facet TEXT NOT NULL,\
176 value TEXT NOT NULL,\
177 bitmap BLOB NOT NULL,\
178 row_count INTEGER NOT NULL,\
179 PRIMARY KEY (entity, facet, value)\
180 );"
181}
182
183// ---------------------------------------------------------------------------
184// Tests
185// ---------------------------------------------------------------------------
186
187#[cfg(test)]
188mod tests {
189 use super::*;
190
191 #[test]
192 fn bitmap_roundtrip() {
193 let mut b = RoaringBitmap::new();
194 for i in 0..10_000 {
195 if i % 3 == 0 {
196 b.insert(i);
197 }
198 }
199 let bytes = serialize_bitmap(&b).unwrap();
200 let round = deserialize_bitmap(&bytes).unwrap();
201 assert_eq!(b, round);
202 }
203
204 #[test]
205 fn fts_sql_skipped_when_no_text_fields() {
206 let cfg = SearchConfig {
207 text: vec![],
208 facets: vec!["brand".into()],
209 sortable: vec![],
210 };
211 assert!(create_fts_table_sql("Product", &cfg).is_none());
212 }
213
214 #[test]
215 fn fts_sql_lists_declared_text_columns() {
216 let cfg = SearchConfig {
217 text: vec!["name".into(), "description".into()],
218 facets: vec![],
219 sortable: vec![],
220 };
221 let sql = create_fts_table_sql("Product", &cfg).unwrap();
222 assert!(sql.contains("\"_fts_Product\""));
223 assert!(sql.contains("\"name\""));
224 assert!(sql.contains("\"description\""));
225 assert!(sql.contains("unicode61"));
226 }
227
228 #[test]
229 fn bitmap_intersect_popcount_is_facet_count() {
230 // Proves the core performance move: after ANDing a match bitmap
231 // with a facet bitmap, counting the cardinality is a single
232 // popcount, not a table scan.
233 let mut matches = RoaringBitmap::new();
234 matches.insert_range(0..1_000_000u32);
235
236 let mut brand_nike = RoaringBitmap::new();
237 for i in (0..1_000_000u32).step_by(7) {
238 brand_nike.insert(i);
239 }
240
241 let and = &matches & &brand_nike;
242 assert_eq!(and.len(), brand_nike.len());
243 }
244}