1use std::collections::{HashMap, HashSet};
37
/// Identifier of a collection (table-like container) in the schema.
pub type CollectionId = String;
/// Name of a column within a collection.
pub type ColumnName = String;
43
/// A single vocabulary match: the schema entity a normalised token resolves to.
///
/// `column` is `Some` when the hit names a column; `None` when the hit refers
/// to the collection itself or to something attached to it (type tag, index,
/// policy, description word — distinguished via `type_tag`).
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct VocabHit {
    /// Collection the token resolved to.
    pub collection: CollectionId,
    /// Matching column, when the token named a column.
    pub column: Option<ColumnName>,
    /// Origin marker: a raw type tag, or a derived tag such as
    /// `"index:<name>"`, `"policy:<name>"`, or `"description"`.
    pub type_tag: Option<String>,
}
54
/// Schema-change notifications consumed by `SchemaVocabulary::on_ddl`.
///
/// Each variant mirrors a DDL statement; the vocabulary reacts by inserting
/// or purging inverted-index entries.
#[derive(Debug, Clone)]
pub enum DdlEvent {
    /// New collection with its columns, document-shape type tags, and an
    /// optional free-text description.
    CreateCollection {
        collection: CollectionId,
        columns: Vec<ColumnName>,
        type_tags: Vec<String>,
        description: Option<String>,
    },
    /// Full replacement of a collection's shape: the fields here are the
    /// complete new set, not a delta (handler purges then re-inserts).
    AlterCollection {
        collection: CollectionId,
        columns: Vec<ColumnName>,
        type_tags: Vec<String>,
        description: Option<String>,
    },
    /// Collection removed; every vocabulary entry for it is purged.
    DropCollection { collection: CollectionId },
    /// Index created over `columns` of `collection`.
    CreateIndex {
        collection: CollectionId,
        index: String,
        columns: Vec<ColumnName>,
    },
    /// Index removed from `collection`.
    DropIndex {
        collection: CollectionId,
        index: String,
    },
    /// Access policy attached to `collection`.
    CreatePolicy {
        collection: CollectionId,
        policy: String,
    },
    /// Access policy removed from `collection`.
    DropPolicy {
        collection: CollectionId,
        policy: String,
    },
}
110
/// Inverted index mapping a normalised token to the schema entities it may
/// refer to. Built and maintained incrementally from `DdlEvent`s.
#[derive(Debug, Clone, Default)]
pub struct SchemaVocabulary {
    // normalised token -> all hits registered for that token
    inverted: HashMap<String, Vec<VocabHit>>,
}
120
121impl SchemaVocabulary {
122 pub fn new() -> Self {
123 Self::default()
124 }
125
126 pub fn lookup(&self, token: &str) -> &[VocabHit] {
130 let key = match normalise(token) {
131 Some(k) => k,
132 None => return &[],
133 };
134 self.inverted.get(&key).map(|v| v.as_slice()).unwrap_or(&[])
135 }
136
137 pub fn on_ddl(&mut self, event: DdlEvent) {
139 match event {
140 DdlEvent::CreateCollection {
141 collection,
142 columns,
143 type_tags,
144 description,
145 } => {
146 self.insert_collection(&collection, &columns, &type_tags, description.as_deref());
147 }
148 DdlEvent::AlterCollection {
149 collection,
150 columns,
151 type_tags,
152 description,
153 } => {
154 self.purge_collection_entries(&collection);
158 self.insert_collection(&collection, &columns, &type_tags, description.as_deref());
159 }
160 DdlEvent::DropCollection { collection } => {
161 self.purge_collection_entries(&collection);
162 }
163 DdlEvent::CreateIndex {
164 collection,
165 index,
166 columns,
167 } => {
168 self.insert_index(&collection, &index, &columns);
169 }
170 DdlEvent::DropIndex { collection, index } => {
171 self.remove_token_for(&index, &collection, |hit| hit.column.is_none());
172 }
173 DdlEvent::CreatePolicy { collection, policy } => {
174 self.insert_token(
175 &policy,
176 VocabHit {
177 collection: collection.clone(),
178 column: None,
179 type_tag: Some(format!("policy:{}", policy)),
180 },
181 );
182 }
183 DdlEvent::DropPolicy { collection, policy } => {
184 let tag = format!("policy:{}", policy);
185 self.remove_token_for(&policy, &collection, |hit| {
186 hit.type_tag.as_deref() == Some(tag.as_str())
187 });
188 }
189 }
190 }
191
192 fn insert_collection(
193 &mut self,
194 collection: &str,
195 columns: &[ColumnName],
196 type_tags: &[String],
197 description: Option<&str>,
198 ) {
199 self.insert_token(
201 collection,
202 VocabHit {
203 collection: collection.to_string(),
204 column: None,
205 type_tag: None,
206 },
207 );
208 for column in columns {
210 self.insert_token(
211 column,
212 VocabHit {
213 collection: collection.to_string(),
214 column: Some(column.clone()),
215 type_tag: None,
216 },
217 );
218 }
219 for tag in type_tags {
221 self.insert_token(
222 tag,
223 VocabHit {
224 collection: collection.to_string(),
225 column: None,
226 type_tag: Some(tag.clone()),
227 },
228 );
229 }
230 if let Some(text) = description {
232 for word in tokenise_description(text) {
233 self.insert_token(
234 &word,
235 VocabHit {
236 collection: collection.to_string(),
237 column: None,
238 type_tag: Some("description".to_string()),
239 },
240 );
241 }
242 }
243 }
244
245 fn insert_index(&mut self, collection: &str, index: &str, columns: &[ColumnName]) {
246 self.insert_token(
247 index,
248 VocabHit {
249 collection: collection.to_string(),
250 column: None,
251 type_tag: Some(format!("index:{}", index)),
252 },
253 );
254 for column in columns {
255 self.insert_token(
260 column,
261 VocabHit {
262 collection: collection.to_string(),
263 column: Some(column.clone()),
264 type_tag: Some(format!("index:{}", index)),
265 },
266 );
267 }
268 }
269
270 fn insert_token(&mut self, raw: &str, hit: VocabHit) {
271 let Some(key) = normalise(raw) else { return };
272 let bucket = self.inverted.entry(key).or_default();
273 if !bucket.iter().any(|existing| existing == &hit) {
274 bucket.push(hit);
275 }
276 }
277
278 fn purge_collection_entries(&mut self, collection: &str) {
281 let mut empty_keys: Vec<String> = Vec::new();
282 for (key, bucket) in self.inverted.iter_mut() {
283 bucket.retain(|hit| hit.collection != collection);
284 if bucket.is_empty() {
285 empty_keys.push(key.clone());
286 }
287 }
288 for key in empty_keys {
289 self.inverted.remove(&key);
290 }
291 }
292
293 fn remove_token_for<F>(&mut self, raw: &str, collection: &str, predicate: F)
296 where
297 F: Fn(&VocabHit) -> bool,
298 {
299 let Some(key) = normalise(raw) else { return };
300 let Some(bucket) = self.inverted.get_mut(&key) else {
301 return;
302 };
303 bucket.retain(|hit| !(hit.collection == collection && predicate(hit)));
304 if bucket.is_empty() {
305 self.inverted.remove(&key);
306 }
307 }
308
309 #[cfg(test)]
311 pub(crate) fn token_count(&self) -> usize {
312 self.inverted.len()
313 }
314
315 #[cfg(test)]
317 pub(crate) fn collections(&self) -> HashSet<String> {
318 self.inverted
319 .values()
320 .flat_map(|bucket| bucket.iter().map(|hit| hit.collection.clone()))
321 .collect()
322 }
323}
324
// `HashSet` is only referenced by `#[cfg(test)]` code above; this no-op keeps
// the top-of-file import from triggering an unused-import warning in
// non-test builds.
#[allow(dead_code)]
fn _force_hashset_use(_: HashSet<String>) {}
330
331pub fn normalise(raw: &str) -> Option<String> {
334 let mut buf = String::with_capacity(raw.len());
335 for ch in nfkd_decompose(raw) {
336 if is_combining_mark(ch) {
343 continue;
344 }
345 buf.push(ch);
346 }
347
348 let lowered = buf.to_lowercase();
349 let trimmed = lowered.trim_matches(|c: char| !c.is_alphanumeric());
350 if trimmed.is_empty() {
351 None
352 } else {
353 Some(trimmed.to_string())
354 }
355}
356
357fn nfkd_decompose(input: &str) -> impl Iterator<Item = char> + '_ {
364 input.chars().flat_map(decompose_char)
365}
366
/// Splits a precomposed Latin-1 accented letter into its base letter plus
/// combining mark (a small hard-coded NFKD subset). Any character outside
/// the table is returned unchanged as a single-element vector.
fn decompose_char(ch: char) -> Vec<char> {
    // One flat arm per precomposed character, instead of grouped arms with
    // an inner match on the same character.
    let (base, mark) = match ch {
        'à' => ('a', '\u{0300}'),
        'á' => ('a', '\u{0301}'),
        'â' => ('a', '\u{0302}'),
        'ã' => ('a', '\u{0303}'),
        'ä' => ('a', '\u{0308}'),
        'å' => ('a', '\u{030A}'),
        'À' => ('A', '\u{0300}'),
        'Á' => ('A', '\u{0301}'),
        'Â' => ('A', '\u{0302}'),
        'Ã' => ('A', '\u{0303}'),
        'Ä' => ('A', '\u{0308}'),
        'Å' => ('A', '\u{030A}'),
        'è' => ('e', '\u{0300}'),
        'é' => ('e', '\u{0301}'),
        'ê' => ('e', '\u{0302}'),
        'ë' => ('e', '\u{0308}'),
        'È' => ('E', '\u{0300}'),
        'É' => ('E', '\u{0301}'),
        'Ê' => ('E', '\u{0302}'),
        'Ë' => ('E', '\u{0308}'),
        'ì' => ('i', '\u{0300}'),
        'í' => ('i', '\u{0301}'),
        'î' => ('i', '\u{0302}'),
        'ï' => ('i', '\u{0308}'),
        'Ì' => ('I', '\u{0300}'),
        'Í' => ('I', '\u{0301}'),
        'Î' => ('I', '\u{0302}'),
        'Ï' => ('I', '\u{0308}'),
        'ò' => ('o', '\u{0300}'),
        'ó' => ('o', '\u{0301}'),
        'ô' => ('o', '\u{0302}'),
        'õ' => ('o', '\u{0303}'),
        'ö' => ('o', '\u{0308}'),
        'Ò' => ('O', '\u{0300}'),
        'Ó' => ('O', '\u{0301}'),
        'Ô' => ('O', '\u{0302}'),
        'Õ' => ('O', '\u{0303}'),
        'Ö' => ('O', '\u{0308}'),
        'ù' => ('u', '\u{0300}'),
        'ú' => ('u', '\u{0301}'),
        'û' => ('u', '\u{0302}'),
        'ü' => ('u', '\u{0308}'),
        'Ù' => ('U', '\u{0300}'),
        'Ú' => ('U', '\u{0301}'),
        'Û' => ('U', '\u{0302}'),
        'Ü' => ('U', '\u{0308}'),
        'ñ' => ('n', '\u{0303}'),
        'Ñ' => ('N', '\u{0303}'),
        'ç' => ('c', '\u{0327}'),
        'Ç' => ('C', '\u{0327}'),
        'ý' => ('y', '\u{0301}'),
        'ÿ' => ('y', '\u{0308}'),
        'Ý' => ('Y', '\u{0301}'),
        _ => return vec![ch],
    };
    vec![base, mark]
}
501
/// True for characters in the Unicode combining-mark blocks this module
/// strips: Combining Diacritical Marks, Extended, Supplement, Combining
/// Marks for Symbols, and Combining Half Marks.
fn is_combining_mark(ch: char) -> bool {
    matches!(
        ch,
        '\u{0300}'..='\u{036F}'
            | '\u{1AB0}'..='\u{1AFF}'
            | '\u{1DC0}'..='\u{1DFF}'
            | '\u{20D0}'..='\u{20FF}'
            | '\u{FE20}'..='\u{FE2F}'
    )
}
510
511fn tokenise_description(text: &str) -> Vec<String> {
513 text.split(|c: char| !c.is_alphanumeric())
514 .filter_map(normalise)
515 .collect()
516}
517
#[cfg(test)]
mod tests {
    use super::*;

    // Convenience: a CreateCollection event with no type tags and no
    // description.
    fn create_event(name: &str, columns: &[&str]) -> DdlEvent {
        DdlEvent::CreateCollection {
            collection: name.to_string(),
            columns: columns.iter().map(|s| s.to_string()).collect(),
            type_tags: Vec::new(),
            description: None,
        }
    }

    // Case folding: upper- and mixed-case input normalise identically.
    #[test]
    fn normalisation_lowercases() {
        assert_eq!(normalise("PASSPORT").as_deref(), Some("passport"));
        assert_eq!(normalise("Passport").as_deref(), Some("passport"));
    }

    // Accent folding: the accented variants collapse to the plain form.
    #[test]
    fn normalisation_strips_accents() {
        let lowered = normalise("passport").unwrap();
        let upper = normalise("PASSPORT").unwrap();
        let accented = normalise("pässpört").unwrap();
        assert_eq!(lowered, "passport");
        assert_eq!(upper, "passport");
        assert_eq!(accented, "passport");
        let pt = normalise("passaporte").unwrap();
        let pt_upper = normalise("PASSAPORTE").unwrap();
        let pt_accented = normalise("pässäpörte").unwrap();
        assert_eq!(pt, "passaporte");
        assert_eq!(pt_upper, "passaporte");
        assert_eq!(pt_accented, "passaporte");
    }

    // Only *surrounding* punctuation is trimmed; interior underscores stay.
    #[test]
    fn normalisation_strips_surrounding_punctuation() {
        assert_eq!(normalise("---email---").as_deref(), Some("email"));
        assert_eq!(normalise("?id?").as_deref(), Some("id"));
        assert_eq!(normalise("snake_case").as_deref(), Some("snake_case"));
    }

    // Inputs with no alphanumeric core normalise to None.
    #[test]
    fn normalisation_returns_none_for_empty_or_punct_only() {
        assert!(normalise("").is_none());
        assert!(normalise("---").is_none());
        assert!(normalise(" ").is_none());
    }

    // A collection-name token resolves to a hit with no column.
    #[test]
    fn lookup_finds_collection_name_hit() {
        let mut vocab = SchemaVocabulary::new();
        vocab.on_ddl(create_event("passports", &["id", "country"]));
        let hits = vocab.lookup("passports");
        assert_eq!(hits.len(), 1);
        assert_eq!(hits[0].collection, "passports");
        assert!(hits[0].column.is_none());
    }

    // A column name shared by two collections yields a hit per collection.
    #[test]
    fn lookup_finds_column_hits_across_collections() {
        let mut vocab = SchemaVocabulary::new();
        vocab.on_ddl(create_event("users", &["id", "email"]));
        vocab.on_ddl(create_event("orders", &["id", "user_id"]));
        let id_hits = vocab.lookup("id");
        let collections: HashSet<_> = id_hits.iter().map(|h| h.collection.as_str()).collect();
        assert!(collections.contains("users"));
        assert!(collections.contains("orders"));
        assert!(id_hits.iter().all(|h| h.column.as_deref() == Some("id")));
    }

    // Lookup normalises the query too: accented/uppercase query matches
    // the plain stored name.
    #[test]
    fn lookup_is_accent_fold_aware() {
        let mut vocab = SchemaVocabulary::new();
        vocab.on_ddl(create_event("passaporte", &["id"]));
        let via_accents = vocab.lookup("PÄSSÄPÖRTE");
        assert_eq!(via_accents.len(), 1);
        assert_eq!(via_accents[0].collection, "passaporte");
    }

    // Dropping a collection purges its name and columns but leaves other
    // collections' hits intact (shared token "id" keeps the survivor).
    #[test]
    fn drop_collection_invalidates_completely() {
        let mut vocab = SchemaVocabulary::new();
        vocab.on_ddl(create_event("users", &["id", "email"]));
        vocab.on_ddl(create_event("orders", &["id"]));
        assert!(!vocab.lookup("users").is_empty());
        vocab.on_ddl(DdlEvent::DropCollection {
            collection: "users".to_string(),
        });
        assert!(vocab.lookup("users").is_empty());
        assert!(vocab.lookup("email").is_empty());
        let id_hits = vocab.lookup("id");
        assert_eq!(id_hits.len(), 1);
        assert_eq!(id_hits[0].collection, "orders");
    }

    // ALTER fully replaces the column set: the removed column no longer
    // resolves, the added one does.
    #[test]
    fn alter_collection_replaces_column_set() {
        let mut vocab = SchemaVocabulary::new();
        vocab.on_ddl(create_event("users", &["id", "email"]));
        vocab.on_ddl(DdlEvent::AlterCollection {
            collection: "users".to_string(),
            columns: vec!["id".to_string(), "username".to_string()],
            type_tags: Vec::new(),
            description: None,
        });
        assert!(vocab.lookup("email").is_empty());
        let username_hits = vocab.lookup("username");
        assert_eq!(username_hits.len(), 1);
        assert_eq!(username_hits[0].column.as_deref(), Some("username"));
    }

    // Index name is searchable (with an "index:" tag) while the index
    // exists, and stops resolving after DropIndex.
    #[test]
    fn index_create_and_drop_round_trip() {
        let mut vocab = SchemaVocabulary::new();
        vocab.on_ddl(create_event("users", &["email"]));
        vocab.on_ddl(DdlEvent::CreateIndex {
            collection: "users".to_string(),
            index: "idx_users_email".to_string(),
            columns: vec!["email".to_string()],
        });
        let idx_hits = vocab.lookup("idx_users_email");
        assert_eq!(idx_hits.len(), 1);
        assert!(idx_hits[0]
            .type_tag
            .as_deref()
            .map(|t| t.starts_with("index:"))
            .unwrap_or(false));
        vocab.on_ddl(DdlEvent::DropIndex {
            collection: "users".to_string(),
            index: "idx_users_email".to_string(),
        });
        assert!(vocab.lookup("idx_users_email").is_empty());
    }

    // Policy name is searchable while the policy exists, and its
    // tag-matched removal leaves nothing behind.
    #[test]
    fn policy_create_and_drop_round_trip() {
        let mut vocab = SchemaVocabulary::new();
        vocab.on_ddl(create_event("users", &["id"]));
        vocab.on_ddl(DdlEvent::CreatePolicy {
            collection: "users".to_string(),
            policy: "tenant_isolation".to_string(),
        });
        let hits = vocab.lookup("tenant_isolation");
        assert_eq!(hits.len(), 1);
        assert_eq!(hits[0].collection, "users");
        vocab.on_ddl(DdlEvent::DropPolicy {
            collection: "users".to_string(),
            policy: "tenant_isolation".to_string(),
        });
        assert!(vocab.lookup("tenant_isolation").is_empty());
    }

    // Type tags resolve to the collection carrying them, with the raw tag
    // recorded in the hit.
    #[test]
    fn type_tags_resolve_doc_shape_discriminators() {
        let mut vocab = SchemaVocabulary::new();
        vocab.on_ddl(DdlEvent::CreateCollection {
            collection: "events".to_string(),
            columns: vec!["id".to_string(), "type".to_string()],
            type_tags: vec!["payment".to_string(), "refund".to_string()],
            description: None,
        });
        let hits = vocab.lookup("payment");
        assert_eq!(hits.len(), 1);
        assert_eq!(hits[0].collection, "events");
        assert_eq!(hits[0].type_tag.as_deref(), Some("payment"));
    }

    // Individual description words become searchable tokens for the
    // collection.
    #[test]
    fn description_words_resolve_to_collection() {
        let mut vocab = SchemaVocabulary::new();
        vocab.on_ddl(DdlEvent::CreateCollection {
            collection: "users".to_string(),
            columns: vec!["id".to_string()],
            type_tags: Vec::new(),
            description: Some("Customer accounts and sign-in info".to_string()),
        });
        let hits = vocab.lookup("customer");
        assert_eq!(hits.len(), 1);
        assert_eq!(hits[0].collection, "users");
    }

    use proptest::prelude::*;

    // Strategy: short lowercase identifiers (letter first, then letters,
    // digits, or underscores).
    fn ascii_ident() -> impl Strategy<Value = String> {
        "[a-z][a-z0-9_]{0,8}".prop_map(|s| s)
    }

    // Strategy: 1-5 collections, each with 0-5 column names (duplicates
    // possible before the dedup pass in the property test).
    fn schema_strategy() -> impl Strategy<Value = Vec<(String, Vec<String>)>> {
        prop::collection::vec(
            (ascii_ident(), prop::collection::vec(ascii_ident(), 0..6)),
            1..6,
        )
    }

    proptest! {
        #![proptest_config(ProptestConfig {
            cases: 256,
            .. ProptestConfig::default()
        })]
        // Property: every hit returned by lookup points at a collection
        // that was actually created, and any column hit names a real
        // column of that collection (no "phantom" entries).
        #[test]
        fn property_every_hit_points_at_real_column(
            schema in schema_strategy(),
            query_tokens in prop::collection::vec(ascii_ident(), 1..8),
        ) {
            // Deduplicate generated collection and column names so the
            // reference map below is unambiguous.
            let mut seen = HashSet::new();
            let mut deduped: Vec<(String, Vec<String>)> = Vec::new();
            for (collection, columns) in schema {
                if seen.insert(collection.clone()) {
                    let mut cols_seen = HashSet::new();
                    let cols: Vec<String> = columns
                        .into_iter()
                        .filter(|c| cols_seen.insert(c.clone()))
                        .collect();
                    deduped.push((collection, cols));
                }
            }
            let mut vocab = SchemaVocabulary::new();
            for (collection, columns) in &deduped {
                vocab.on_ddl(DdlEvent::CreateCollection {
                    collection: collection.clone(),
                    columns: columns.clone(),
                    type_tags: Vec::new(),
                    description: None,
                });
            }
            // Reference model: collection -> set of its real columns.
            let by_collection: HashMap<String, HashSet<String>> = deduped
                .iter()
                .map(|(c, cols)| (c.clone(), cols.iter().cloned().collect()))
                .collect();
            for token in query_tokens {
                let hits = vocab.lookup(&token);
                for hit in hits {
                    let cols = by_collection.get(&hit.collection).expect(
                        "lookup returned a collection that was never created",
                    );
                    match &hit.column {
                        Some(column) => {
                            prop_assert!(
                                cols.contains(column),
                                "phantom column {} on {}",
                                column,
                                hit.collection
                            );
                        }
                        None => {
                            // A column-less hit (no tags/description in
                            // this setup) must be the collection-name
                            // entry itself: the normalised token equals
                            // the normalised collection name.
                            let normalised_collection = normalise(&hit.collection)
                                .unwrap_or_default();
                            let normalised_token = normalise(&token).unwrap_or_default();
                            prop_assert_eq!(normalised_collection, normalised_token);
                        }
                    }
                }
            }
        }
    }
}