graph_api_simplegraph/index/
full_text.rs

1use fastbloom::BloomFilter;
2use rphonetic::{Encoder, Metaphone};
3use std::collections::HashMap;
4use std::hash::Hash;
5
6/// A phonetic full-text search index that uses Bloom filters for efficient text matching
7///
8/// This index provides fuzzy text search capabilities by:
9/// - Using metaphone phonetic encoding to match similar-sounding words
10/// - Employing Bloom filters to efficiently store and query word sets
11/// - Supporting multi-word searches with "AND" semantics
12///
13/// # Type Parameters
14/// - `V`: The type of values/keys to associate with indexed text. Must be `Eq`, `Hash` and `Copy`
15#[derive(Default, Debug)]
16pub struct FullTextIndex<V>
17where
18    V: Eq + Hash + Copy,
19{
20    filters: HashMap<V, BloomFilter>,
21    metaphone: Metaphone,
22}
23
24impl<V> FullTextIndex<V>
25where
26    V: Eq + Hash + Copy,
27{
28    /// Indexes the given text and associates it with the specified key
29    ///
30    /// The text is split into words, each word is phonetically encoded,
31    /// and stored in a Bloom filter optimized for the text length.
32    ///
33    /// # Arguments
34    /// * `key` - The key to associate with this text
35    /// * `text` - The text to index
36    pub(crate) fn insert(&mut self, key: V, text: &str) {
37        let word_count = text.split_whitespace().count();
38        let mut filter = BloomFilter::with_false_pos(0.001).expected_items(word_count);
39
40        for word in text.split_whitespace() {
41            let encoded = self.metaphone.encode(word);
42            filter.insert(&encoded);
43        }
44
45        self.filters.insert(key, filter);
46    }
47
48    /// Searches the index for entries matching the given search text
49    ///
50    /// The search uses AND semantics - all words in the search text must match
51    /// for an entry to be returned. Matching is phonetic, so similar-sounding
52    /// words will match (e.g., "phone" matches "fone").
53    ///
54    /// # Arguments
55    /// * `search` - The text to search for
56    ///
57    /// # Returns
58    /// An iterator over matching keys
59    pub(crate) fn search<'a>(&'a self, search: &str) -> impl Iterator<Item = V> + 'a {
60        let search = search
61            .split_whitespace()
62            .map(|w| self.metaphone.encode(w))
63            .collect::<Vec<_>>();
64        self.filters.iter().filter_map(move |(v, filter)| {
65            for word in &search {
66                if !filter.contains(&word) {
67                    return None;
68                }
69            }
70            Some(*v)
71        })
72    }
73
74    /// Removes an entry from the index
75    ///
76    /// # Arguments
77    /// * `key` - The key of the entry to remove
78    pub(crate) fn remove(&mut self, key: &V) {
79        self.filters.remove(key);
80    }
81}
82#[cfg(test)]
83mod tests {
84    use super::*;
85
86    #[test]
87    fn test_basic_insert_and_search() {
88        let mut index = FullTextIndex::default();
89        index.insert(1, "hello world");
90
91        let results: Vec<_> = index.search("hello").collect();
92        assert_eq!(results, vec![1]);
93
94        let results: Vec<_> = index.search("world").collect();
95        assert_eq!(results, vec![1]);
96    }
97
98    #[test]
99    fn test_phonetic_matching() {
100        let mut index = FullTextIndex::default();
101        index.insert(1, "phone");
102
103        let results: Vec<_> = index.search("fone").collect();
104        assert_eq!(results, vec![1]);
105    }
106
107    #[test]
108    fn test_multiple_entries() {
109        let mut index = FullTextIndex::default();
110        index.insert(1, "hello world");
111        index.insert(2, "hello there");
112        index.insert(3, "goodbye world");
113
114        let results: Vec<_> = index.search("hello").collect();
115        assert_eq!(results.len(), 2);
116        assert!(results.contains(&1));
117        assert!(results.contains(&2));
118    }
119
120    #[test]
121    fn test_remove() {
122        let mut index = FullTextIndex::default();
123        index.insert(1, "hello world");
124        index.remove(&1);
125
126        let results: Vec<_> = index.search("hello").collect();
127        assert!(results.is_empty());
128    }
129
130    #[test]
131    fn test_multi_word_search() {
132        let mut index = FullTextIndex::default();
133        index.insert(1, "hello beautiful world");
134        index.insert(2, "hello ugly world");
135
136        let results: Vec<_> = index.search("hello world").collect();
137        assert_eq!(results.len(), 2);
138
139        let results: Vec<_> = index.search("beautiful world").collect();
140        assert_eq!(results, vec![1]);
141    }
142
143    #[test]
144    fn test_empty_cases() {
145        let mut index = FullTextIndex::<i32>::default();
146        let results: Vec<_> = index.search("anything").collect();
147        assert!(results.is_empty());
148
149        index.insert(1, "");
150        let results: Vec<_> = index.search("").collect();
151        assert_eq!(results, vec![1]);
152    }
153}