reddb_server/runtime/schema_vocabulary.rs

//! Reverse-index from tokens to schema entities (issue #120).
//!
//! `SchemaVocabulary` holds a `token → Vec<VocabHit>` map. Lookups are
//! O(1) average per token (HashMap probe + slice return). The index is
//! kept current incrementally via `on_ddl(event)` calls from the DDL
//! execution paths in [`super::impl_ddl`], [`super::impl_timeseries`],
//! [`super::impl_queue`], [`super::impl_tree`], and the policy /
//! migration dispatch in [`super::impl_core`].
//!
//! The AskPipeline (slice C, issue #121) uses this index in its Stage 2
//! candidate-narrowing pass *before* spending embedding compute, so the
//! API exposes only the read path that the pipeline needs:
//! `lookup(token) -> &[VocabHit]`. Catalog ownership (mutating writes)
//! stays internal to the runtime.
//!
//! ## Token sources
//!
//! - Collection names
//! - Column names (declared in `CollectionContract`)
//! - Doc-shape `type` / `kind` discriminator values, when surfaced via
//!   the contract's enum metadata
//! - Index names
//! - Policy names
//! - Operator-supplied descriptions when present in the catalog
//!
//! ## Token normalisation
//!
//! 1. NFKD-style decomposition (a hand-rolled Latin subset; see
//!    [`nfkd_decompose`]).
//! 2. Strip combining marks (accent-fold).
//! 3. Lowercase.
//! 4. Strip surrounding non-alphanumerics.
//!
//! `passport`, `PASSPORT`, and `pässpört` all normalise to `passport`.
//! Pinned by `tests::normalisation_*`.
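//!
//! ## Example
//!
//! A minimal sketch of the read path (marked `ignore` because the crate
//! path is assumed from this file's location, not verified here):
//!
//! ```ignore
//! use reddb_server::runtime::schema_vocabulary::{DdlEvent, SchemaVocabulary};
//!
//! let mut vocab = SchemaVocabulary::new();
//! vocab.on_ddl(DdlEvent::CreateCollection {
//!     collection: "passports".to_string(),
//!     columns: vec!["id".to_string(), "country".to_string()],
//!     type_tags: Vec::new(),
//!     description: None,
//! });
//! // `lookup` normalises for us: accents and case fold to the same key.
//! assert_eq!(vocab.lookup("PÄSSPORTS")[0].collection, "passports");
//! ```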

use std::collections::HashMap;
// `HashSet` is only consumed by the `cfg(test)` helpers below, so the
// import is gated the same way to keep non-test builds warning-free.
#[cfg(test)]
use std::collections::HashSet;

/// Stable identifier alias used by issue #120's public API. Matches the
/// `String`-keyed collections used elsewhere in the runtime.
pub type CollectionId = String;
/// Column name alias.
pub type ColumnName = String;

/// One hit returned by a vocabulary lookup.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct VocabHit {
    pub collection: CollectionId,
    /// `None` when the token matched the collection name itself rather
    /// than one of its columns.
    pub column: Option<ColumnName>,
    /// Doc-shape `type` / `kind` discriminator value, when applicable.
    pub type_tag: Option<String>,
}

/// Catalog-shaped DDL events the index reacts to.
///
/// One variant per issue #120 acceptance row. The variants stay flat so
/// the dispatch sites in `impl_ddl` etc. don't need to share AST types.
#[derive(Debug, Clone)]
pub enum DdlEvent {
    /// CREATE TABLE / CREATE collection of any kind. `columns` is the
    /// list of declared column names (may be empty for dynamic
    /// collections); `type_tags` carries any catalog-known discriminator
    /// values (enum variants on a `type` / `kind` column).
    CreateCollection {
        collection: CollectionId,
        columns: Vec<ColumnName>,
        type_tags: Vec<String>,
        description: Option<String>,
    },
    /// ALTER TABLE — replaces the column / discriminator set for the
    /// collection. Implemented as a drop+recreate of the collection's
    /// entries to guarantee invalidation completeness.
    AlterCollection {
        collection: CollectionId,
        columns: Vec<ColumnName>,
        type_tags: Vec<String>,
        description: Option<String>,
    },
    /// DROP TABLE / DROP collection. Removes every entry whose
    /// `collection` field equals the dropped collection's name,
    /// including index / policy entries scoped to it.
    DropCollection { collection: CollectionId },
    /// CREATE INDEX. The token feed is the index name; `columns`
    /// captures the indexed columns so a token match can disambiguate
    /// between the index hit and the column hits.
    CreateIndex {
        collection: CollectionId,
        index: String,
        columns: Vec<ColumnName>,
    },
    /// DROP INDEX. Removes the index-name token entry.
    DropIndex {
        collection: CollectionId,
        index: String,
    },
    /// CREATE POLICY. Policy names are token sources for the AskPipeline
    /// when an operator asks "show me the rls policy that..." —
    /// matching `PolicyName` should resolve back to the table.
    CreatePolicy {
        collection: CollectionId,
        policy: String,
    },
    /// DROP POLICY.
    DropPolicy {
        collection: CollectionId,
        policy: String,
    },
}

/// Reverse index `token -> Vec<VocabHit>`.
///
/// `lookup` is O(1) average per call: one HashMap probe to retrieve the
/// slice, then the caller iterates the (typically small) slice. Returns
/// an empty slice when the token has no match.
#[derive(Debug, Clone, Default)]
pub struct SchemaVocabulary {
    inverted: HashMap<String, Vec<VocabHit>>,
}

impl SchemaVocabulary {
    pub fn new() -> Self {
        Self::default()
    }

    /// Look up a token. The caller is expected to pass the *raw* token;
    /// we normalise before probing so callers don't have to remember
    /// the rule.
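    ///
    /// A small sketch (ignored doctest; `vocab` is assumed to be set up
    /// as in the module-level example):
    ///
    /// ```ignore
    /// // Both probes fold to the key "email", so they hit one bucket.
    /// assert_eq!(vocab.lookup("Email"), vocab.lookup("ëmail"));
    /// ```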
    pub fn lookup(&self, token: &str) -> &[VocabHit] {
        let key = match normalise(token) {
            Some(k) => k,
            None => return &[],
        };
        self.inverted.get(&key).map(|v| v.as_slice()).unwrap_or(&[])
    }

    /// Apply a DDL event to the index.
    pub fn on_ddl(&mut self, event: DdlEvent) {
        match event {
            DdlEvent::CreateCollection {
                collection,
                columns,
                type_tags,
                description,
            } => {
                self.insert_collection(&collection, &columns, &type_tags, description.as_deref());
            }
            DdlEvent::AlterCollection {
                collection,
                columns,
                type_tags,
                description,
            } => {
                // Drop+recreate is the simplest way to keep
                // invalidation complete: stale columns / discriminators
                // never linger after an ALTER ... DROP COLUMN.
                self.purge_collection_entries(&collection);
                self.insert_collection(&collection, &columns, &type_tags, description.as_deref());
            }
            DdlEvent::DropCollection { collection } => {
                self.purge_collection_entries(&collection);
            }
            DdlEvent::CreateIndex {
                collection,
                index,
                columns,
            } => {
                self.insert_index(&collection, &index, &columns);
            }
            DdlEvent::DropIndex { collection, index } => {
                // Tag-match (mirrors DropPolicy) so a token shared with
                // the collection name isn't evicted along with the index.
                let tag = format!("index:{}", index);
                self.remove_token_for(&index, &collection, |hit| {
                    hit.type_tag.as_deref() == Some(tag.as_str())
                });
            }
            DdlEvent::CreatePolicy { collection, policy } => {
                self.insert_token(
                    &policy,
                    VocabHit {
                        collection: collection.clone(),
                        column: None,
                        type_tag: Some(format!("policy:{}", policy)),
                    },
                );
            }
            DdlEvent::DropPolicy { collection, policy } => {
                let tag = format!("policy:{}", policy);
                self.remove_token_for(&policy, &collection, |hit| {
                    hit.type_tag.as_deref() == Some(tag.as_str())
                });
            }
        }
    }

    fn insert_collection(
        &mut self,
        collection: &str,
        columns: &[ColumnName],
        type_tags: &[String],
        description: Option<&str>,
    ) {
        // Collection-name-only hit.
        self.insert_token(
            collection,
            VocabHit {
                collection: collection.to_string(),
                column: None,
                type_tag: None,
            },
        );
        // One hit per column.
        for column in columns {
            self.insert_token(
                column,
                VocabHit {
                    collection: collection.to_string(),
                    column: Some(column.clone()),
                    type_tag: None,
                },
            );
        }
        // One hit per type tag, attached to the collection (no column).
        for tag in type_tags {
            self.insert_token(
                tag,
                VocabHit {
                    collection: collection.to_string(),
                    column: None,
                    type_tag: Some(tag.clone()),
                },
            );
        }
        // Description tokens fan into individual word entries.
        if let Some(text) = description {
            for word in tokenise_description(text) {
                self.insert_token(
                    &word,
                    VocabHit {
                        collection: collection.to_string(),
                        column: None,
                        type_tag: Some("description".to_string()),
                    },
                );
            }
        }
    }

    fn insert_index(&mut self, collection: &str, index: &str, columns: &[ColumnName]) {
        self.insert_token(
            index,
            VocabHit {
                collection: collection.to_string(),
                column: None,
                type_tag: Some(format!("index:{}", index)),
            },
        );
        for column in columns {
            // Surface the column too so a token match still leads back
            // to the collection even if the column wasn't declared in
            // the CollectionContract (e.g. CREATE INDEX over a dynamic
            // doc shape).
            self.insert_token(
                column,
                VocabHit {
                    collection: collection.to_string(),
                    column: Some(column.clone()),
                    type_tag: Some(format!("index:{}", index)),
                },
            );
        }
    }

    fn insert_token(&mut self, raw: &str, hit: VocabHit) {
        let Some(key) = normalise(raw) else { return };
        let bucket = self.inverted.entry(key).or_default();
        if !bucket.iter().any(|existing| existing == &hit) {
            bucket.push(hit);
        }
    }

    /// Remove every entry whose `collection` matches. Empty buckets are
    /// pruned so the HashMap stays compact.
    fn purge_collection_entries(&mut self, collection: &str) {
        self.inverted.retain(|_, bucket| {
            bucket.retain(|hit| hit.collection != collection);
            !bucket.is_empty()
        });
    }

    /// Remove the entries under one token whose hit predicate matches
    /// for the given collection.
    fn remove_token_for<F>(&mut self, raw: &str, collection: &str, predicate: F)
    where
        F: Fn(&VocabHit) -> bool,
    {
        let Some(key) = normalise(raw) else { return };
        let Some(bucket) = self.inverted.get_mut(&key) else {
            return;
        };
        bucket.retain(|hit| !(hit.collection == collection && predicate(hit)));
        if bucket.is_empty() {
            self.inverted.remove(&key);
        }
    }

    /// Number of distinct tokens currently indexed. Test helper.
    #[cfg(test)]
    pub(crate) fn token_count(&self) -> usize {
        self.inverted.len()
    }

    /// Distinct collections covered. Test helper.
    #[cfg(test)]
    pub(crate) fn collections(&self) -> HashSet<String> {
        self.inverted
            .values()
            .flat_map(|bucket| bucket.iter().map(|hit| hit.collection.clone()))
            .collect()
    }
}
/// Normalise a raw token to its index key. Returns `None` when the
/// resulting string would be empty (e.g. caller passed `"---"`).
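///
/// For instance, `normalise("Pässpört")` yields `Some("passport")` and
/// `normalise("---")` yields `None` (both pinned by the tests below).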
pub fn normalise(raw: &str) -> Option<String> {
    let mut buf = String::with_capacity(raw.len());
    for ch in nfkd_decompose(raw) {
        // Strip Unicode combining marks (accents / diacritics). The
        // common Latin set we care about (`U+0300..=U+036F`) covers
        // every diacritic that the issue's test corpus exercises;
        // wider blocks (`U+1AB0..U+1AFF`, `U+1DC0..U+1DFF`,
        // `U+20D0..U+20FF`, `U+FE20..U+FE2F`) are also stripped so
        // exotic inputs don't smuggle invisible marks through.
        if is_combining_mark(ch) {
            continue;
        }
        buf.push(ch);
    }

    let lowered = buf.to_lowercase();
    let trimmed = lowered.trim_matches(|c: char| !c.is_alphanumeric());
    if trimmed.is_empty() {
        None
    } else {
        Some(trimmed.to_string())
    }
}

/// Lazy NFKD-style decomposition without pulling in a new crate. The
/// runtime already lives without `unicode-normalization`, so we
/// hand-roll a minimal pre-composed-Latin folder. It covers every
/// accented letter the issue's `passaporte`/`pässäpörte` test exercises
/// plus the common European diacritics. Anything outside the table
/// passes through unchanged.
fn nfkd_decompose(input: &str) -> impl Iterator<Item = char> + '_ {
    input.chars().flat_map(decompose_char)
}

fn decompose_char(ch: char) -> Vec<char> {
    // Map of pre-composed → (base, combining mark). Only the marks we
    // need for accent-folding common European text. Anything else is
    // emitted verbatim.
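    // Example: 'é' becomes ['e', '\u{0301}']; `normalise` then strips
    // the U+0301 mark via `is_combining_mark`.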
    let pair: Option<(char, char)> = match ch {
        // Lowercase a
        'à' | 'á' | 'â' | 'ã' | 'ä' | 'å' => Some((
            'a',
            match ch {
                'à' => '\u{0300}',
                'á' => '\u{0301}',
                'â' => '\u{0302}',
                'ã' => '\u{0303}',
                'ä' => '\u{0308}',
                'å' => '\u{030A}',
                _ => unreachable!(),
            },
        )),
        // Uppercase A
        'À' | 'Á' | 'Â' | 'Ã' | 'Ä' | 'Å' => Some((
            'A',
            match ch {
                'À' => '\u{0300}',
                'Á' => '\u{0301}',
                'Â' => '\u{0302}',
                'Ã' => '\u{0303}',
                'Ä' => '\u{0308}',
                'Å' => '\u{030A}',
                _ => unreachable!(),
            },
        )),
        // Lowercase e
        'è' | 'é' | 'ê' | 'ë' => Some((
            'e',
            match ch {
                'è' => '\u{0300}',
                'é' => '\u{0301}',
                'ê' => '\u{0302}',
                'ë' => '\u{0308}',
                _ => unreachable!(),
            },
        )),
        'È' | 'É' | 'Ê' | 'Ë' => Some((
            'E',
            match ch {
                'È' => '\u{0300}',
                'É' => '\u{0301}',
                'Ê' => '\u{0302}',
                'Ë' => '\u{0308}',
                _ => unreachable!(),
            },
        )),
        'ì' | 'í' | 'î' | 'ï' => Some((
            'i',
            match ch {
                'ì' => '\u{0300}',
                'í' => '\u{0301}',
                'î' => '\u{0302}',
                'ï' => '\u{0308}',
                _ => unreachable!(),
            },
        )),
        'Ì' | 'Í' | 'Î' | 'Ï' => Some((
            'I',
            match ch {
                'Ì' => '\u{0300}',
                'Í' => '\u{0301}',
                'Î' => '\u{0302}',
                'Ï' => '\u{0308}',
                _ => unreachable!(),
            },
        )),
        'ò' | 'ó' | 'ô' | 'õ' | 'ö' => Some((
            'o',
            match ch {
                'ò' => '\u{0300}',
                'ó' => '\u{0301}',
                'ô' => '\u{0302}',
                'õ' => '\u{0303}',
                'ö' => '\u{0308}',
                _ => unreachable!(),
            },
        )),
        'Ò' | 'Ó' | 'Ô' | 'Õ' | 'Ö' => Some((
            'O',
            match ch {
                'Ò' => '\u{0300}',
                'Ó' => '\u{0301}',
                'Ô' => '\u{0302}',
                'Õ' => '\u{0303}',
                'Ö' => '\u{0308}',
                _ => unreachable!(),
            },
        )),
        'ù' | 'ú' | 'û' | 'ü' => Some((
            'u',
            match ch {
                'ù' => '\u{0300}',
                'ú' => '\u{0301}',
                'û' => '\u{0302}',
                'ü' => '\u{0308}',
                _ => unreachable!(),
            },
        )),
        'Ù' | 'Ú' | 'Û' | 'Ü' => Some((
            'U',
            match ch {
                'Ù' => '\u{0300}',
                'Ú' => '\u{0301}',
                'Û' => '\u{0302}',
                'Ü' => '\u{0308}',
                _ => unreachable!(),
            },
        )),
        'ñ' => Some(('n', '\u{0303}')),
        'Ñ' => Some(('N', '\u{0303}')),
        'ç' => Some(('c', '\u{0327}')),
        'Ç' => Some(('C', '\u{0327}')),
        'ý' | 'ÿ' => Some((
            'y',
            match ch {
                'ý' => '\u{0301}',
                'ÿ' => '\u{0308}',
                _ => unreachable!(),
            },
        )),
        'Ý' => Some(('Y', '\u{0301}')),
        _ => None,
    };
    match pair {
        Some((base, mark)) => vec![base, mark],
        None => vec![ch],
    }
}

fn is_combining_mark(ch: char) -> bool {
    matches!(ch as u32,
        0x0300..=0x036F
        | 0x1AB0..=0x1AFF
        | 0x1DC0..=0x1DFF
        | 0x20D0..=0x20FF
        | 0xFE20..=0xFE2F)
}

/// Split a description into normalised word tokens.
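///
/// E.g. `"Customer accounts and sign-in info"` tokenises to
/// `["customer", "accounts", "and", "sign", "in", "info"]`.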
fn tokenise_description(text: &str) -> Vec<String> {
    text.split(|c: char| !c.is_alphanumeric())
        .filter_map(normalise)
        .collect()
}

#[cfg(test)]
mod tests {
    use super::*;

    fn create_event(name: &str, columns: &[&str]) -> DdlEvent {
        DdlEvent::CreateCollection {
            collection: name.to_string(),
            columns: columns.iter().map(|s| s.to_string()).collect(),
            type_tags: Vec::new(),
            description: None,
        }
    }

    #[test]
    fn normalisation_lowercases() {
        assert_eq!(normalise("PASSPORT").as_deref(), Some("passport"));
        assert_eq!(normalise("Passport").as_deref(), Some("passport"));
    }

    #[test]
    fn normalisation_strips_accents() {
        // Pin: passport ≡ PASSPORT ≡ pässpört.
        let lowered = normalise("passport").unwrap();
        let upper = normalise("PASSPORT").unwrap();
        let accented = normalise("pässpört").unwrap();
        assert_eq!(lowered, "passport");
        assert_eq!(upper, "passport");
        assert_eq!(accented, "passport");
        // Portuguese passaporte vs PASSAPORTE vs pässäpörte all collapse.
        let pt = normalise("passaporte").unwrap();
        let pt_upper = normalise("PASSAPORTE").unwrap();
        let pt_accented = normalise("pässäpörte").unwrap();
        assert_eq!(pt, "passaporte");
        assert_eq!(pt_upper, "passaporte");
        assert_eq!(pt_accented, "passaporte");
    }

    #[test]
    fn normalisation_strips_surrounding_punctuation() {
        assert_eq!(normalise("---email---").as_deref(), Some("email"));
        assert_eq!(normalise("?id?").as_deref(), Some("id"));
        // Inner punctuation is preserved (single-token rule).
        assert_eq!(normalise("snake_case").as_deref(), Some("snake_case"));
    }

    #[test]
    fn normalisation_returns_none_for_empty_or_punct_only() {
        assert!(normalise("").is_none());
        assert!(normalise("---").is_none());
        assert!(normalise("   ").is_none());
    }

    #[test]
    fn lookup_finds_collection_name_hit() {
        let mut vocab = SchemaVocabulary::new();
        vocab.on_ddl(create_event("passports", &["id", "country"]));
        let hits = vocab.lookup("passports");
        assert_eq!(hits.len(), 1);
        assert_eq!(hits[0].collection, "passports");
        assert!(hits[0].column.is_none());
    }

    #[test]
    fn lookup_finds_column_hits_across_collections() {
        let mut vocab = SchemaVocabulary::new();
        vocab.on_ddl(create_event("users", &["id", "email"]));
        vocab.on_ddl(create_event("orders", &["id", "user_id"]));
        let id_hits = vocab.lookup("id");
        let collections: HashSet<_> = id_hits.iter().map(|h| h.collection.as_str()).collect();
        assert!(collections.contains("users"));
        assert!(collections.contains("orders"));
        assert!(id_hits.iter().all(|h| h.column.as_deref() == Some("id")));
    }

    #[test]
    fn lookup_is_accent_fold_aware() {
        let mut vocab = SchemaVocabulary::new();
        vocab.on_ddl(create_event("passaporte", &["id"]));
        // Hits arrive even when the caller passes a different
        // accent / case form than the one the DDL declared.
        let via_accents = vocab.lookup("PÄSSÄPÖRTE");
        assert_eq!(via_accents.len(), 1);
        assert_eq!(via_accents[0].collection, "passaporte");
    }

    #[test]
    fn drop_collection_invalidates_completely() {
        let mut vocab = SchemaVocabulary::new();
        vocab.on_ddl(create_event("users", &["id", "email"]));
        vocab.on_ddl(create_event("orders", &["id"]));
        assert!(!vocab.lookup("users").is_empty());
        vocab.on_ddl(DdlEvent::DropCollection {
            collection: "users".to_string(),
        });
        // No stale entries.
        assert!(vocab.lookup("users").is_empty());
        assert!(vocab.lookup("email").is_empty());
        // Other collection still resolves.
        let id_hits = vocab.lookup("id");
        assert_eq!(id_hits.len(), 1);
        assert_eq!(id_hits[0].collection, "orders");
    }

    #[test]
    fn alter_collection_replaces_column_set() {
        let mut vocab = SchemaVocabulary::new();
        vocab.on_ddl(create_event("users", &["id", "email"]));
        vocab.on_ddl(DdlEvent::AlterCollection {
            collection: "users".to_string(),
            columns: vec!["id".to_string(), "username".to_string()],
            type_tags: Vec::new(),
            description: None,
        });
        // Old column dropped.
        assert!(vocab.lookup("email").is_empty());
        // New column visible.
        let username_hits = vocab.lookup("username");
        assert_eq!(username_hits.len(), 1);
        assert_eq!(username_hits[0].column.as_deref(), Some("username"));
    }

    #[test]
    fn index_create_and_drop_round_trip() {
        let mut vocab = SchemaVocabulary::new();
        vocab.on_ddl(create_event("users", &["email"]));
        vocab.on_ddl(DdlEvent::CreateIndex {
            collection: "users".to_string(),
            index: "idx_users_email".to_string(),
            columns: vec!["email".to_string()],
        });
        let idx_hits = vocab.lookup("idx_users_email");
        assert_eq!(idx_hits.len(), 1);
        assert!(idx_hits[0]
            .type_tag
            .as_deref()
            .map(|t| t.starts_with("index:"))
            .unwrap_or(false));
        vocab.on_ddl(DdlEvent::DropIndex {
            collection: "users".to_string(),
            index: "idx_users_email".to_string(),
        });
        assert!(vocab.lookup("idx_users_email").is_empty());
    }

    #[test]
    fn policy_create_and_drop_round_trip() {
        let mut vocab = SchemaVocabulary::new();
        vocab.on_ddl(create_event("users", &["id"]));
        vocab.on_ddl(DdlEvent::CreatePolicy {
            collection: "users".to_string(),
            policy: "tenant_isolation".to_string(),
        });
        let hits = vocab.lookup("tenant_isolation");
        assert_eq!(hits.len(), 1);
        assert_eq!(hits[0].collection, "users");
        vocab.on_ddl(DdlEvent::DropPolicy {
            collection: "users".to_string(),
            policy: "tenant_isolation".to_string(),
        });
        assert!(vocab.lookup("tenant_isolation").is_empty());
    }

    #[test]
    fn type_tags_resolve_doc_shape_discriminators() {
        let mut vocab = SchemaVocabulary::new();
        vocab.on_ddl(DdlEvent::CreateCollection {
            collection: "events".to_string(),
            columns: vec!["id".to_string(), "type".to_string()],
            type_tags: vec!["payment".to_string(), "refund".to_string()],
            description: None,
        });
        let hits = vocab.lookup("payment");
        assert_eq!(hits.len(), 1);
        assert_eq!(hits[0].collection, "events");
        assert_eq!(hits[0].type_tag.as_deref(), Some("payment"));
    }

    #[test]
    fn description_words_resolve_to_collection() {
        let mut vocab = SchemaVocabulary::new();
        vocab.on_ddl(DdlEvent::CreateCollection {
            collection: "users".to_string(),
            columns: vec!["id".to_string()],
            type_tags: Vec::new(),
            description: Some("Customer accounts and sign-in info".to_string()),
        });
        let hits = vocab.lookup("customer");
        assert_eq!(hits.len(), 1);
        assert_eq!(hits[0].collection, "users");
    }

    // -- Property test (issue #120 acceptance row) --
    //
    // For 256 random (schema, query) pairs: every token the lookup
    // returns must point at a collection that *actually* contains the
    // referenced column. No phantom hits.
    use proptest::prelude::*;

    fn ascii_ident() -> impl Strategy<Value = String> {
        // Compact lowercase identifier (letters, digits, underscores)
        // so the generated schemas stay small and bounded. The regex
        // strategy already yields `String`, so no mapping is needed.
        "[a-z][a-z0-9_]{0,8}"
    }

    fn schema_strategy() -> impl Strategy<Value = Vec<(String, Vec<String>)>> {
        prop::collection::vec(
            (ascii_ident(), prop::collection::vec(ascii_ident(), 0..6)),
            1..6,
        )
    }

    proptest! {
        #![proptest_config(ProptestConfig {
            cases: 256,
            .. ProptestConfig::default()
        })]
        #[test]
        fn property_every_hit_points_at_real_column(
            schema in schema_strategy(),
            query_tokens in prop::collection::vec(ascii_ident(), 1..8),
        ) {
            // Build the fixture from the random schema. Disambiguate
            // collection names so two CREATE TABLE events on the same
            // name don't double-count.
            let mut seen = HashSet::new();
            let mut deduped: Vec<(String, Vec<String>)> = Vec::new();
            for (collection, columns) in schema {
                if seen.insert(collection.clone()) {
                    let mut cols_seen = HashSet::new();
                    let cols: Vec<String> = columns
                        .into_iter()
                        .filter(|c| cols_seen.insert(c.clone()))
                        .collect();
                    deduped.push((collection, cols));
                }
            }
            let mut vocab = SchemaVocabulary::new();
            for (collection, columns) in &deduped {
                vocab.on_ddl(DdlEvent::CreateCollection {
                    collection: collection.clone(),
                    columns: columns.clone(),
                    type_tags: Vec::new(),
                    description: None,
                });
            }
            // Index for verification.
            let by_collection: HashMap<String, HashSet<String>> = deduped
                .iter()
                .map(|(c, cols)| (c.clone(), cols.iter().cloned().collect()))
                .collect();
            for token in query_tokens {
                let hits = vocab.lookup(&token);
                for hit in hits {
                    let cols = by_collection.get(&hit.collection).expect(
                        "lookup returned a collection that was never created",
                    );
                    match &hit.column {
                        Some(column) => {
                            prop_assert!(
                                cols.contains(column),
                                "phantom column {} on {}",
                                column,
                                hit.collection
                            );
                        }
                        None => {
                            // Collection-name hit: token must normalise
                            // to the collection name itself (or be a
                            // type_tag, but the schema strategy doesn't
                            // emit type_tags).
                            let normalised_collection = normalise(&hit.collection)
                                .unwrap_or_default();
                            let normalised_token = normalise(&token).unwrap_or_default();
                            prop_assert_eq!(normalised_collection, normalised_token);
                        }
                    }
                }
            }
        }
    }
}