Skip to main content

pylon_storage/
search.rs

1//! Native faceted full-text search for Pylon entities.
2//!
3//! The design goal is Meilisearch-class latency at 1M-row catalogs
4//! without running a second service. Two ideas carry the performance:
5//!
6//! 1. **SQLite FTS5 for text matching.** A shadow contentless table
7//!    indexes declared text fields. Matches come back as a stream of
8//!    rowids in a few ms regardless of catalog size.
9//!
10//! 2. **Roaring bitmaps for facets.** For every declared facet field,
11//!    we keep a bitmap per distinct value: the set of entity rowids
12//!    that carry that value. On insert/update/delete we flip bits in
13//!    the same transaction as the row change. Facet counts at query
14//!    time collapse to `popcount(match ∩ filter ∩ facet_bitmap)` —
15//!    single-digit microseconds per facet value, no matter how wide
16//!    the match.
17//!
18//! The shadow tables Pylon creates when an entity declares `search:`:
19//!
20//! ```sql
21//! -- One FTS5 virtual table per searchable entity.
22//! CREATE VIRTUAL TABLE "_fts_<Entity>" USING fts5(
23//!     rowid UNINDEXED,       -- external rowid (entity.id)
24//!     <text_field1>,
25//!     <text_field2>,
26//!     ...,
27//!     tokenize = 'unicode61 remove_diacritics 2'
28//! );
29//!
30//! -- One row per (entity, facet, value) — bitmap of matching rowids.
31//! CREATE TABLE "_facet_bitmap" (
32//!   entity    TEXT NOT NULL,
33//!   facet     TEXT NOT NULL,
34//!   value     TEXT NOT NULL,
35//!   bitmap    BLOB NOT NULL,   -- Roaring-serialized
36//!   row_count INTEGER NOT NULL,
37//!   PRIMARY KEY (entity, facet, value)
38//! );
39//!
40//! -- Mapping from entity rowid → numeric rowid-in-bitmap. Bitmaps
41//! -- operate on u32 so we need a compact numeric id; ULIDs are too
42//! -- wide. The rowid column of the entity's SQLite table is perfect.
43//! ```
44//!
45//! Rowid strategy: we treat SQLite's implicit `rowid` as the bitmap
46//! identifier. On insert we remember the rowid in a side-car (the
47//! entity's row already carries it); on facet-bitmap ops we read the
48//! rowid from `"_rowid_of_id"` lookup. No extra surface for users.
49
50use std::collections::{BTreeMap, HashMap};
51
52use roaring::RoaringBitmap;
53use serde::{Deserialize, Serialize};
54
55use crate::StorageError;
56
57// ---------------------------------------------------------------------------
58// Config — re-exported from pylon-kernel
59// ---------------------------------------------------------------------------
60
61/// Storage-side alias for `pylon_kernel::ManifestSearchConfig`. The
62/// shape lives in the kernel because the manifest is what every layer
63/// reads (runtime, storage, router all share it). We re-export here
64/// so storage callers don't have to double-import.
65pub use pylon_kernel::ManifestSearchConfig as SearchConfig;
66
67// ---------------------------------------------------------------------------
68// Query + result shapes (what clients send / get back)
69// ---------------------------------------------------------------------------
70
71/// A single-entity search query. Client sends this; server returns
72/// `SearchResult`. Intentionally small — filter parsing lives on top
73/// of `FilterExpr`, not as free-form JSON.
74#[derive(Debug, Clone, Default, Serialize, Deserialize)]
75pub struct SearchQuery {
76    /// Free-text match across the declared `text` fields.
77    #[serde(default)]
78    pub query: String,
79
80    /// Filter expression: `{ field: value }` = equality. Combined with
81    /// match using AND. Range + IN + boolean ops live on FilterExpr.
82    #[serde(default)]
83    pub filters: HashMap<String, serde_json::Value>,
84
85    /// Facets to compute counts for. If empty, use all declared facets.
86    #[serde(default)]
87    pub facets: Vec<String>,
88
89    /// Sort spec: `(field, "asc" | "desc")`. Must be in the entity's
90    /// `sortable` list.
91    #[serde(default)]
92    pub sort: Option<(String, String)>,
93
94    /// Zero-indexed page.
95    #[serde(default)]
96    pub page: usize,
97
98    /// Page size. Default 20, hard-cap at 100 to keep result payloads
99    /// predictable for subscriptions.
100    #[serde(default = "default_page_size")]
101    pub page_size: usize,
102}
103
104fn default_page_size() -> usize {
105    20
106}
107
108/// What the server hands back.
109#[derive(Debug, Clone, Serialize, Deserialize)]
110pub struct SearchResult {
111    /// Hit rows in ranked (or sorted) order. Each is a plain JSON row.
112    pub hits: Vec<serde_json::Value>,
113
114    /// `{facet_name: {value: count}}`. Values sorted by count desc in
115    /// the server's serialization.
116    pub facet_counts: BTreeMap<String, BTreeMap<String, u64>>,
117
118    /// Total hit count (before pagination).
119    pub total: u64,
120
121    /// Milliseconds spent in the query engine, for client-side perf
122    /// instrumentation and snappy-feeling dashboards.
123    pub took_ms: u64,
124}
125
126// ---------------------------------------------------------------------------
127// Facet bitmap storage
128// ---------------------------------------------------------------------------
129
130/// Serialize a Roaring bitmap to bytes for storage in the
131/// `_facet_bitmap.bitmap` BLOB column.
132pub fn serialize_bitmap(b: &RoaringBitmap) -> Result<Vec<u8>, StorageError> {
133    let mut out = Vec::with_capacity(b.serialized_size());
134    b.serialize_into(&mut out)
135        .map_err(|e| StorageError::new("BITMAP_SERIALIZE_FAILED", &e.to_string()))?;
136    Ok(out)
137}
138
139/// Inverse of `serialize_bitmap`.
140pub fn deserialize_bitmap(bytes: &[u8]) -> Result<RoaringBitmap, StorageError> {
141    RoaringBitmap::deserialize_from(bytes)
142        .map_err(|e| StorageError::new("BITMAP_DESERIALIZE_FAILED", &e.to_string()))
143}
144
145// ---------------------------------------------------------------------------
146// SQL generation — table setup + maintenance
147// ---------------------------------------------------------------------------
148
149/// Generate the `CREATE VIRTUAL TABLE _fts_<Entity>` statement for a
150/// given entity + config. Called once during schema push.
151pub fn create_fts_table_sql(entity: &str, config: &SearchConfig) -> Option<String> {
152    if config.text.is_empty() {
153        return None;
154    }
155    let cols = config
156        .text
157        .iter()
158        .map(|f| format!("\"{f}\""))
159        .collect::<Vec<_>>()
160        .join(", ");
161    Some(format!(
162        "CREATE VIRTUAL TABLE IF NOT EXISTS \"_fts_{entity}\" USING fts5(\
163          entity_id UNINDEXED, {cols}, \
164          tokenize = 'unicode61 remove_diacritics 2'\
165         );"
166    ))
167}
168
169/// One-time table for all facet bitmaps across all entities. Shared so
170/// ops like "warm cache for all facets" can page through a single
171/// table. Keyed by (entity, facet, value).
172pub fn create_facet_table_sql() -> &'static str {
173    "CREATE TABLE IF NOT EXISTS \"_facet_bitmap\" (\
174       entity    TEXT NOT NULL,\
175       facet     TEXT NOT NULL,\
176       value     TEXT NOT NULL,\
177       bitmap    BLOB NOT NULL,\
178       row_count INTEGER NOT NULL,\
179       PRIMARY KEY (entity, facet, value)\
180     );"
181}
182
183// ---------------------------------------------------------------------------
184// Tests
185// ---------------------------------------------------------------------------
186
187#[cfg(test)]
188mod tests {
189    use super::*;
190
191    #[test]
192    fn bitmap_roundtrip() {
193        let mut b = RoaringBitmap::new();
194        for i in 0..10_000 {
195            if i % 3 == 0 {
196                b.insert(i);
197            }
198        }
199        let bytes = serialize_bitmap(&b).unwrap();
200        let round = deserialize_bitmap(&bytes).unwrap();
201        assert_eq!(b, round);
202    }
203
204    #[test]
205    fn fts_sql_skipped_when_no_text_fields() {
206        let cfg = SearchConfig {
207            text: vec![],
208            facets: vec!["brand".into()],
209            sortable: vec![],
210        };
211        assert!(create_fts_table_sql("Product", &cfg).is_none());
212    }
213
214    #[test]
215    fn fts_sql_lists_declared_text_columns() {
216        let cfg = SearchConfig {
217            text: vec!["name".into(), "description".into()],
218            facets: vec![],
219            sortable: vec![],
220        };
221        let sql = create_fts_table_sql("Product", &cfg).unwrap();
222        assert!(sql.contains("\"_fts_Product\""));
223        assert!(sql.contains("\"name\""));
224        assert!(sql.contains("\"description\""));
225        assert!(sql.contains("unicode61"));
226    }
227
228    #[test]
229    fn bitmap_intersect_popcount_is_facet_count() {
230        // Proves the core performance move: after ANDing a match bitmap
231        // with a facet bitmap, counting the cardinality is a single
232        // popcount, not a table scan.
233        let mut matches = RoaringBitmap::new();
234        matches.insert_range(0..1_000_000u32);
235
236        let mut brand_nike = RoaringBitmap::new();
237        for i in (0..1_000_000u32).step_by(7) {
238            brand_nike.insert(i);
239        }
240
241        let and = &matches & &brand_nike;
242        assert_eq!(and.len(), brand_nike.len());
243    }
244}