fm_index/
frontend.rs

1// This module contains the API exposed to the frontend.
2//
3// It consists of concrete implementations of the search index and search
4// traits; one for count-only and the other supporting locate queries.
5//
6// This wrapper code is very verbose, so written using macros to avoid repetition.
7//
8// It's written around the wrapper module which provides the implementation of
9// the behavior. This module only exists so we can avoid exposing implementation
10// traits.
11
12use crate::character::Character;
13use crate::error::Error;
14use crate::fm_index::FMIndexBackend;
15use crate::multi_pieces::FMIndexMultiPiecesBackend;
16use crate::piece::PieceId;
17use crate::rlfmi::RLFMIndexBackend;
18use crate::suffix_array::discard::DiscardedSuffixArray;
19use crate::suffix_array::sample::SOSampledSuffixArray;
20use crate::text::Text;
21use crate::wrapper::{MatchWrapper, SearchIndexWrapper, SearchWrapper};
22
23/// Trait for searching in an index.
24///
25/// You can use this to search in an index generically.
26pub trait SearchIndex<C> {
27    /// Search for a pattern in the text.
28    ///
29    /// Return a [`Search`] object with information about the search
30    /// result.
31    fn search<K>(&self, pattern: K) -> impl Search<C>
32    where
33        K: AsRef<[C]>;
34
35    /// The size of the text in the index
36    ///
37    /// Note that this includes an ending \0 (terminator) character
38    /// so will be one more than the length of the text.
39    fn len(&self) -> usize;
40
41    /// The size of the data used by this structure on the heap, in bytes.
42    ///
43    /// This does not include non-used pre-allocated space.
44    fn heap_size(&self) -> usize;
45}
46
47/// Trait for searching in an index that supports multiple texts.
48pub trait SearchIndexWithMultiPieces<C>: SearchIndex<C> {
49    /// Search for a pattern that is a prefix of a text.
50    fn search_prefix<K>(&self, pattern: K) -> impl Search<C>
51    where
52        K: AsRef<[C]>;
53
54    /// Search for a pattern that is a suffix of a text.
55    fn search_suffix<K>(&self, pattern: K) -> impl Search<C>
56    where
57        K: AsRef<[C]>;
58
59    /// Search for a pattern that is an exact match of a text.
60    fn search_exact<K>(&self, pattern: K) -> impl Search<C>
61    where
62        K: AsRef<[C]>;
63}
64
65/// The result of a search.
66///
67/// A search result can be refined by adding more characters to the
68/// search pattern.
69/// A search result contains matches, which can be iterated over.
70pub trait Search<'a, C> {
71    /// Associated type for matches.
72    type Match: Match<'a, C>;
73
74    /// Search in the current search result, refining it.
75    ///
76    /// This adds a prefix `pattern` to the existing pattern, and
77    /// looks for those expanded patterns in the text.
78    fn search<K: AsRef<[C]>>(&self, pattern: K) -> Self;
79    /// Count the number of occurrences.
80    fn count(&self) -> usize;
81    /// Get an iterator over all matches.
82    fn iter_matches(&'a self) -> impl Iterator<Item = Self::Match> + 'a;
83}
84
85/// A match in the text.
86pub trait Match<'a, C> {
87    /// Iterate over the characters of the match.
88    fn iter_chars_forward(&self) -> impl Iterator<Item = C> + 'a;
89
90    /// Iterate over the characters of the match in reverse.
91    fn iter_chars_backward(&self) -> impl Iterator<Item = C> + 'a;
92}
93
94/// A match in the text that contains its location on the text.
95pub trait MatchWithLocate<'a, C>: Match<'a, C> {
96    /// Get the location of the match in the text.
97    fn locate(&self) -> usize;
98}
99
100/// A match in the text that contains the ID of the piece where the pattern is found.
101pub trait MatchWithPieceId<'a, C>: Match<'a, C> {
102    /// Get the ID of the text that the character at the matched position belongs to.
103    fn piece_id(&self) -> PieceId;
104}
105
106/// FMIndex, count only.
107///
108/// The FM-Index is both a search index as well as compact representation of
109/// the text.
110pub struct FMIndex<C: Character>(SearchIndexWrapper<FMIndexBackend<C, DiscardedSuffixArray>>);
111/// Search result for FMIndex, count only.
112pub struct FMIndexSearch<'a, C: Character>(
113    SearchWrapper<'a, FMIndexBackend<C, DiscardedSuffixArray>>,
114);
115/// Match in the text for FMIndex.
116pub struct FMIndexMatch<'a, C: Character>(
117    MatchWrapper<'a, FMIndexBackend<C, DiscardedSuffixArray>>,
118);
119
120/// FMIndex with locate support.
121///
122/// This is an FM-Index which uses additional storage to support locate queries.
123pub struct FMIndexWithLocate<C: Character>(
124    SearchIndexWrapper<FMIndexBackend<C, SOSampledSuffixArray>>,
125);
126/// Search result for FMIndex with locate support.
127pub struct FMIndexSearchWithLocate<'a, C: Character>(
128    SearchWrapper<'a, FMIndexBackend<C, SOSampledSuffixArray>>,
129);
130/// Match in the text for FMIndex with locate support.
131pub struct FMIndexMatchWithLocate<'a, C: Character>(
132    MatchWrapper<'a, FMIndexBackend<C, SOSampledSuffixArray>>,
133);
134
135/// RLFMIndex, count only.
136///
137/// This is a version of the FM-Index that uses less space, but is also less efficient.
138pub struct RLFMIndex<C: Character>(SearchIndexWrapper<RLFMIndexBackend<C, DiscardedSuffixArray>>);
139/// Search result for RLFMIndex, count only.
140pub struct RLFMIndexSearch<'a, C: Character>(
141    SearchWrapper<'a, RLFMIndexBackend<C, DiscardedSuffixArray>>,
142);
143/// Match in the text for RLFMIndex.
144pub struct RLFMIndexMatch<'a, C: Character>(
145    MatchWrapper<'a, RLFMIndexBackend<C, DiscardedSuffixArray>>,
146);
147
148/// RLFMIndex with locate support.
149///
150/// This is a version of the FM-Index that uses less space, but is also less efficient.
151/// It uses additional storage to support locate queries.
152pub struct RLFMIndexWithLocate<C: Character>(
153    SearchIndexWrapper<RLFMIndexBackend<C, SOSampledSuffixArray>>,
154);
155/// Search result for RLFMIndex with locate support.
156pub struct RLFMIndexSearchWithLocate<'a, C: Character>(
157    SearchWrapper<'a, RLFMIndexBackend<C, SOSampledSuffixArray>>,
158);
159/// Match in the text for RLFMIndex with locate support.
160pub struct RLFMIndexMatchWithLocate<'a, C: Character>(
161    MatchWrapper<'a, RLFMIndexBackend<C, SOSampledSuffixArray>>,
162);
163
164/// MultiText index, count only.
165///
166/// This is a multi-text version of the FM-Index. It allows \0 separated strings.
167pub struct FMIndexMultiPieces<C: Character>(
168    SearchIndexWrapper<FMIndexMultiPiecesBackend<C, DiscardedSuffixArray>>,
169);
170/// Search result for MultiText index, count only.
171pub struct FMIndexMultiPiecesSearch<'a, C: Character>(
172    SearchWrapper<'a, FMIndexMultiPiecesBackend<C, DiscardedSuffixArray>>,
173);
174/// Match in the text for MultiText index.
175pub struct FMIndexMultiPiecesMatch<'a, C: Character>(
176    MatchWrapper<'a, FMIndexMultiPiecesBackend<C, DiscardedSuffixArray>>,
177);
178
179/// MultiText index with locate support.
180///
181/// This is a multi-text version of the FM-Index. It allows \0 separated strings.
182/// It uses additional storage to support locate queries.
183pub struct FMIndexMultiPiecesWithLocate<C: Character>(
184    SearchIndexWrapper<FMIndexMultiPiecesBackend<C, SOSampledSuffixArray>>,
185);
186/// Search result for MultiText index with locate support.
187pub struct FMIndexMultiPiecesSearchWithLocate<'a, C: Character>(
188    SearchWrapper<'a, FMIndexMultiPiecesBackend<C, SOSampledSuffixArray>>,
189);
190/// Match in the text for MultiText index with locate support.
191pub struct FMIndexMultiPiecesMatchWithLocate<'a, C: Character>(
192    MatchWrapper<'a, FMIndexMultiPiecesBackend<C, SOSampledSuffixArray>>,
193);
194
195impl<C: Character> FMIndex<C> {
196    /// Create a new FMIndex without locate support.
197    pub fn new<T: AsRef<[C]>>(text: &Text<C, T>) -> Result<Self, Error> {
198        Ok(FMIndex(SearchIndexWrapper::new(FMIndexBackend::new(
199            text,
200            |_| DiscardedSuffixArray {},
201        )?)))
202    }
203}
204
205impl<C: Character> FMIndexWithLocate<C> {
206    /// Create a new FMIndex with locate support.
207    ///
208    /// The level argument controls the sampling rate used. Higher levels use
209    /// less storage, at the cost of performance of locate queries. A level of
210    /// 0 means no sampling, and a level of 1 means half of the suffix array is
211    /// sampled, a level of 2 means a quarter of the suffix array is sampled,
212    /// and so on.
213    pub fn new<T: AsRef<[C]>>(text: &Text<C, T>, level: usize) -> Result<Self, Error> {
214        Ok(FMIndexWithLocate(SearchIndexWrapper::new(
215            FMIndexBackend::new(text, |sa| SOSampledSuffixArray::sample(sa, level))?,
216        )))
217    }
218}
219
220impl<C: Character> RLFMIndex<C> {
221    /// Create a new RLFMIndex without locate support.
222    pub fn new<T: AsRef<[C]>>(text: &Text<C, T>) -> Result<Self, Error> {
223        Ok(RLFMIndex(SearchIndexWrapper::new(RLFMIndexBackend::new(
224            text,
225            |_| DiscardedSuffixArray {},
226        )?)))
227    }
228}
229
230impl<C: Character> RLFMIndexWithLocate<C> {
231    /// Create a new RLFMIndex with locate support.
232    ///
233    /// The level argument controls the sampling rate used. Higher levels use
234    /// less storage, at the cost of performance of locate queries. A level of
235    /// 0 means no sampling, and a level of 1 means half of the suffix array is
236    /// sampled, a level of 2 means a quarter of the suffix array is sampled,
237    /// and so on.
238    pub fn new<T: AsRef<[C]>>(text: &Text<C, T>, level: usize) -> Result<Self, Error> {
239        Ok(RLFMIndexWithLocate(SearchIndexWrapper::new(
240            RLFMIndexBackend::new(text, |sa| SOSampledSuffixArray::sample(sa, level))?,
241        )))
242    }
243}
244
245impl<C: Character> FMIndexMultiPieces<C> {
246    /// Create a new FMIndexMultiPieces without locate support.
247    pub fn new<T: AsRef<[C]>>(text: &Text<C, T>) -> Result<Self, Error> {
248        Ok(FMIndexMultiPieces(SearchIndexWrapper::new(
249            FMIndexMultiPiecesBackend::new(text, |_| DiscardedSuffixArray {})?,
250        )))
251    }
252}
253
254impl<C: Character> FMIndexMultiPiecesWithLocate<C> {
255    /// Create a new FMIndexMultiPieces with locate support.
256    ///
257    /// The level argument controls the sampling rate used. Higher levels use
258    /// less storage, at the cost of performance of locate queries. A level of
259    /// 0 means no sampling, and a level of 1 means half of the suffix array is
260    /// sampled, a level of 2 means a quarter of the suffix array is sampled,
261    /// and so on.
262    pub fn new<T: AsRef<[C]>>(text: &Text<C, T>, level: usize) -> Result<Self, Error> {
263        Ok(FMIndexMultiPiecesWithLocate(SearchIndexWrapper::new(
264            FMIndexMultiPiecesBackend::new(text, |sa| SOSampledSuffixArray::sample(sa, level))?,
265        )))
266    }
267}
268
269macro_rules! impl_search_index {
270    ($t:ty, $s:ident, $st:ty) => {
271        impl<C: Character> SearchIndex<C> for $t {
272            fn search<K>(&self, pattern: K) -> impl Search<C>
273            where
274                K: AsRef<[C]>,
275            {
276                $s(self.0.search(pattern))
277            }
278
279            fn len(&self) -> usize {
280                self.0.len()
281            }
282
283            fn heap_size(&self) -> usize {
284                self.0.heap_size()
285            }
286        }
287
288        // inherent
289        impl<C: Character> $t {
290            /// Search for a pattern in the text.
291            pub fn search<K>(&self, pattern: K) -> $st
292            where
293                K: AsRef<[C]>,
294            {
295                $s(self.0.search(pattern))
296            }
297            /// The size of the text in the index
298            pub fn len(&self) -> usize {
299                SearchIndex::len(self)
300            }
301        }
302    };
303}
304
305macro_rules! impl_search_index_with_locate {
306    ($t:ty, $s:ident, $st:ty) => {
307        impl<C: Character> SearchIndex<C> for $t {
308            fn search<K>(&self, pattern: K) -> impl Search<C>
309            where
310                K: AsRef<[C]>,
311            {
312                $s(self.0.search(pattern))
313            }
314
315            fn len(&self) -> usize {
316                self.0.len()
317            }
318
319            fn heap_size(&self) -> usize {
320                self.0.heap_size()
321            }
322        }
323
324        // inherent
325        impl<C: Character> $t {
326            /// Search for a pattern in the text.
327            pub fn search<K>(&self, pattern: K) -> $st
328            where
329                K: AsRef<[C]>,
330            {
331                $s(self.0.search(pattern))
332            }
333            /// The size of the text in the index
334            pub fn len(&self) -> usize {
335                SearchIndex::len(self)
336            }
337        }
338    };
339}
340
341macro_rules! impl_search_index_with_multi_pieces {
342    ($t:ty, $s:ident, $st:ty) => {
343        impl<C: Character> SearchIndexWithMultiPieces<C> for $t {
344            fn search_prefix<K>(&self, pattern: K) -> impl Search<C>
345            where
346                K: AsRef<[C]>,
347            {
348                $s(self.0.search_prefix(pattern))
349            }
350
351            fn search_suffix<K>(&self, pattern: K) -> impl Search<C>
352            where
353                K: AsRef<[C]>,
354            {
355                $s(self.0.search_suffix(pattern))
356            }
357
358            fn search_exact<K>(&self, pattern: K) -> impl Search<C>
359            where
360                K: AsRef<[C]>,
361            {
362                $s(self.0.search_exact(pattern))
363            }
364        }
365
366        // inherent
367        impl<C: Character> $t {
368            /// Search for a pattern that is a prefix of a text.
369            pub fn search_prefix<K>(&self, pattern: K) -> $st
370            where
371                K: AsRef<[C]>,
372            {
373                $s(self.0.search_prefix(pattern))
374            }
375
376            /// Search for a pattern that is a suffix of a text.
377            pub fn search_suffix<K>(&self, pattern: K) -> $st
378            where
379                K: AsRef<[C]>,
380            {
381                $s(self.0.search_suffix(pattern))
382            }
383
384            /// Search for a pattern that is an exact match of a text.
385            pub fn search_exact<K>(&self, pattern: K) -> $st
386            where
387                K: AsRef<[C]>,
388            {
389                $s(self.0.search_exact(pattern))
390            }
391        }
392    };
393}
394
395macro_rules! impl_search {
396    ($t:ty, $m:ident, $mt:ty) => {
397        impl<'a, C: Character> Search<'a, C> for $t {
398            type Match = $mt;
399
400            fn search<K>(&self, pattern: K) -> Self
401            where
402                K: AsRef<[C]>,
403            {
404                Self(self.0.search(pattern))
405            }
406
407            fn count(&self) -> usize {
408                self.0.count()
409            }
410
411            fn iter_matches(&'a self) -> impl Iterator<Item = Self::Match> + 'a {
412                self.0.iter_matches().map(|m| $m(m))
413            }
414        }
415        // inherent
416        impl<'a, C: Character> $t {
417            /// Search in the current search result, refining it.
418            ///
419            /// This adds a prefix `pattern` to the existing pattern, and
420            /// looks for those expanded patterns in the text.
421            pub fn search<K>(&self, pattern: K) -> Self
422            where
423                K: AsRef<[C]>,
424            {
425                Search::search(self, pattern)
426            }
427
428            /// Count the number of occurrences.
429            pub fn count(&self) -> usize {
430                Search::count(self)
431            }
432        }
433    };
434}
435
436macro_rules! impl_match {
437    ($t:ty) => {
438        impl<'a, C: Character> Match<'a, C> for $t {
439            fn iter_chars_forward(&self) -> impl Iterator<Item = C> + 'a {
440                self.0.iter_chars_forward()
441            }
442
443            fn iter_chars_backward(&self) -> impl Iterator<Item = C> + 'a {
444                self.0.iter_chars_backward()
445            }
446        }
447    };
448}
449
450macro_rules! impl_match_locate {
451    ($t:ty) => {
452        impl<'a, C: Character> MatchWithLocate<'a, C> for $t {
453            fn locate(&self) -> usize {
454                self.0.locate()
455            }
456        }
457    };
458}
459
460macro_rules! impl_match_piece_id {
461    ($t:ty) => {
462        impl<'a, C: Character> MatchWithPieceId<'a, C> for $t {
463            fn piece_id(&self) -> PieceId {
464                self.0.piece_id()
465            }
466        }
467    };
468}
469
470impl_search_index!(FMIndex<C>, FMIndexSearch, FMIndexSearch<C>);
471impl_search!(FMIndexSearch<'a, C>, FMIndexMatch, FMIndexMatch<'a, C>);
472impl_match!(FMIndexMatch<'a, C>);
473
474impl_search_index_with_locate!(
475    FMIndexWithLocate<C>,
476    FMIndexSearchWithLocate,
477    FMIndexSearchWithLocate<C>
478);
479impl_search!(
480    FMIndexSearchWithLocate<'a, C>,
481    FMIndexMatchWithLocate,
482    FMIndexMatchWithLocate<'a, C>
483);
484impl_match!(FMIndexMatchWithLocate<'a, C>);
485impl_match_locate!(FMIndexMatchWithLocate<'a, C>);
486
487impl_search_index!(RLFMIndex<C>, RLFMIndexSearch, RLFMIndexSearch<C>);
488impl_search!(
489    RLFMIndexSearch<'a, C>,
490    RLFMIndexMatch,
491    RLFMIndexMatch<'a, C>
492);
493impl_match!(RLFMIndexMatch<'a, C>);
494
495impl_search_index_with_locate!(
496    RLFMIndexWithLocate<C>,
497    RLFMIndexSearchWithLocate,
498    RLFMIndexSearchWithLocate<C>
499);
500impl_search!(
501    RLFMIndexSearchWithLocate<'a, C>,
502    RLFMIndexMatchWithLocate,
503    RLFMIndexMatchWithLocate<'a, C>
504);
505impl_match!(RLFMIndexMatchWithLocate<'a, C>);
506impl_match_locate!(RLFMIndexMatchWithLocate<'a, C>);
507
508impl_search_index!(
509    FMIndexMultiPieces<C>,
510    FMIndexMultiPiecesSearch,
511    FMIndexMultiPiecesSearch<C>
512);
513impl_search_index_with_multi_pieces!(
514    FMIndexMultiPieces<C>,
515    FMIndexMultiPiecesSearch,
516    FMIndexMultiPiecesSearch<C>
517);
518impl_search!(
519    FMIndexMultiPiecesSearch<'a, C>,
520    FMIndexMultiPiecesMatch,
521    FMIndexMultiPiecesMatch<'a, C>
522);
523impl_match!(FMIndexMultiPiecesMatch<'a, C>);
524
525impl_search_index_with_locate!(
526    FMIndexMultiPiecesWithLocate<C>,
527    FMIndexMultiPiecesSearchWithLocate,
528    FMIndexMultiPiecesSearchWithLocate<C>
529);
530impl_search_index_with_multi_pieces!(
531    FMIndexMultiPiecesWithLocate<C>,
532    FMIndexMultiPiecesSearchWithLocate,
533    FMIndexMultiPiecesSearchWithLocate<C>
534);
535impl_search!(
536    FMIndexMultiPiecesSearchWithLocate<'a, C>,
537    FMIndexMultiPiecesMatchWithLocate,
538    FMIndexMultiPiecesMatchWithLocate<'a, C>
539);
540impl_match!(FMIndexMultiPiecesMatchWithLocate<'a, C>);
541impl_match_locate!(FMIndexMultiPiecesMatchWithLocate<'a, C>);
542impl_match_piece_id!(FMIndexMultiPiecesMatchWithLocate<'a, C>);