boyer_moore_magiclen/
byte.rs

1use alloc::{
2    fmt::{self, Debug, Formatter},
3    string::String,
4    vec::Vec,
5};
6use core::{ops::Deref, slice::Iter};
7
8// TODO Searchable
9
10#[allow(clippy::len_without_is_empty)]
11pub trait BMByteSearchable {
12    fn len(&self) -> usize;
13
14    fn value_at(&self, index: usize) -> u8;
15
16    fn iter(&self) -> Iter<'_, u8>;
17}
18
19impl BMByteSearchable for String {
20    #[inline]
21    fn len(&self) -> usize {
22        String::len(self)
23    }
24
25    #[inline]
26    fn value_at(&self, index: usize) -> u8 {
27        self.as_bytes()[index]
28    }
29
30    #[inline]
31    fn iter(&self) -> Iter<'_, u8> {
32        self.as_bytes().iter()
33    }
34}
35
36impl BMByteSearchable for &str {
37    #[inline]
38    fn len(&self) -> usize {
39        str::len(self)
40    }
41
42    #[inline]
43    fn value_at(&self, index: usize) -> u8 {
44        unsafe { (*(*self as *const str as *const [u8]))[index] }
45    }
46
47    #[inline]
48    fn iter(&self) -> Iter<'_, u8> {
49        self.as_bytes().iter()
50    }
51}
52
53impl BMByteSearchable for dyn Deref<Target = [u8]> {
54    #[inline]
55    fn len(&self) -> usize {
56        <[u8]>::len(self)
57    }
58
59    #[inline]
60    fn value_at(&self, index: usize) -> u8 {
61        self[index]
62    }
63
64    #[inline]
65    fn iter(&self) -> Iter<'_, u8> {
66        <[u8]>::iter(self)
67    }
68}
69
70impl BMByteSearchable for Vec<u8> {
71    #[inline]
72    fn len(&self) -> usize {
73        Vec::len(self)
74    }
75
76    #[inline]
77    fn value_at(&self, index: usize) -> u8 {
78        self[index]
79    }
80
81    #[inline]
82    fn iter(&self) -> Iter<'_, u8> {
83        self.as_slice().iter()
84    }
85}
86
87impl<T: BMByteSearchable> BMByteSearchable for &T {
88    #[inline]
89    fn len(&self) -> usize {
90        <dyn BMByteSearchable>::len(*self)
91    }
92
93    #[inline]
94    fn value_at(&self, index: usize) -> u8 {
95        <dyn BMByteSearchable>::value_at(*self, index)
96    }
97
98    #[inline]
99    fn iter(&self) -> Iter<'_, u8> {
100        <dyn BMByteSearchable>::iter(*self)
101    }
102}
103
104// TODO BasCharShiftMap
105
106pub struct BMByteBadCharShiftMap {
107    t: [usize; 256],
108}
109
110impl Debug for BMByteBadCharShiftMap {
111    #[inline]
112    fn fmt(&self, f: &mut Formatter) -> Result<(), fmt::Error> {
113        debug_helper::impl_debug_for_struct!(BMByteBadCharShiftMap, f, self, let .t = self.t.as_ref());
114    }
115}
116
117impl Deref for BMByteBadCharShiftMap {
118    type Target = [usize];
119
120    #[inline]
121    fn deref(&self) -> &[usize] {
122        self.t.as_ref()
123    }
124}
125
126pub struct BMByteBadCharShiftMapRev {
127    t: [usize; 256],
128}
129
130impl Debug for BMByteBadCharShiftMapRev {
131    #[inline]
132    fn fmt(&self, f: &mut Formatter) -> Result<(), fmt::Error> {
133        debug_helper::impl_debug_for_struct!(BMByteBadCharShiftMapRev, f, self, let .t = self.t.as_ref());
134    }
135}
136
137impl Deref for BMByteBadCharShiftMapRev {
138    type Target = [usize];
139
140    #[inline]
141    fn deref(&self) -> &[usize] {
142        self.t.as_ref()
143    }
144}
145
146impl BMByteBadCharShiftMap {
147    pub fn create_bad_char_shift_map<T: BMByteSearchable>(
148        pattern: T,
149    ) -> Option<BMByteBadCharShiftMap> {
150        let pattern_len = pattern.len();
151
152        if pattern_len == 0 {
153            return None;
154        }
155
156        let pattern_len_dec = pattern_len - 1;
157
158        let mut bad_char_shift_map = [pattern_len; 256];
159
160        for (i, c) in pattern.iter().take(pattern_len_dec).map(|&c| c as usize).enumerate() {
161            bad_char_shift_map[c] = pattern_len_dec - i;
162        }
163
164        Some(BMByteBadCharShiftMap {
165            t: bad_char_shift_map
166        })
167    }
168}
169
170impl BMByteBadCharShiftMapRev {
171    pub fn create_bad_char_shift_map<T: BMByteSearchable>(
172        pattern: T,
173    ) -> Option<BMByteBadCharShiftMapRev> {
174        let pattern_len = pattern.len();
175
176        if pattern_len == 0 {
177            return None;
178        }
179
180        let pattern_len_dec = pattern_len - 1;
181
182        let mut bad_char_shift_map = [pattern_len; 256];
183
184        for (i, c) in
185            pattern.iter().enumerate().rev().take(pattern_len_dec).map(|(i, &c)| (i, c as usize))
186        {
187            bad_char_shift_map[c] = i;
188        }
189
190        Some(BMByteBadCharShiftMapRev {
191            t: bad_char_shift_map
192        })
193    }
194}
195
196// TODO BM
197
198/// Using Boyer-Moore-MagicLen to search byte sub-sequences in any byte sequence, including self-synchronizing string encoding data such as UTF-8.
199#[derive(Debug)]
200pub struct BMByte {
201    bad_char_shift_map:     BMByteBadCharShiftMap,
202    bad_char_shift_map_rev: BMByteBadCharShiftMapRev,
203    pattern:                Vec<u8>,
204}
205
206impl BMByte {
207    /// Create a `BMByte` instance from a pattern (the needle).
208    ///
209    /// ```
210    /// use boyer_moore_magiclen::BMByte;
211    ///
212    /// let bmb = BMByte::from("oocoo").unwrap();
213    /// ```
214    pub fn from<T: BMByteSearchable>(pattern: T) -> Option<BMByte> {
215        let bad_char_shift_map = BMByteBadCharShiftMap::create_bad_char_shift_map(&pattern)?;
216        let bad_char_shift_map_rev = BMByteBadCharShiftMapRev::create_bad_char_shift_map(&pattern)?;
217
218        Some(BMByte {
219            bad_char_shift_map,
220            bad_char_shift_map_rev,
221            pattern: pattern.iter().copied().collect(),
222        })
223    }
224}
225
226// TODO Find Full
227
228impl BMByte {
229    /// Find and return the positions of all matched sub-sequences in any text (the haystack).
230    ///
231    /// ```
232    /// use boyer_moore_magiclen::BMByte;
233    ///
234    /// let bmb = BMByte::from("oocoo").unwrap();
235    ///
236    /// assert_eq!(vec![1, 4, 7], bmb.find_full_all_in("coocoocoocoo"));
237    /// ```
238    pub fn find_full_all_in<T: BMByteSearchable>(&self, text: T) -> Vec<usize> {
239        find_full(text, &self.pattern, &self.bad_char_shift_map, 0)
240    }
241
242    /// Find and return the positions of matched sub-sequences in any text (the haystack). If the `limit` is set to `0`, all sub-sequences will be found.
243    ///
244    /// ```
245    /// use boyer_moore_magiclen::BMByte;
246    ///
247    /// let bmb = BMByte::from("oocoo").unwrap();
248    ///
249    /// assert_eq!(vec![1, 4], bmb.find_full_in("coocoocoocoo", 2));
250    /// ```
251    pub fn find_full_in<T: BMByteSearchable>(&self, text: T, limit: usize) -> Vec<usize> {
252        find_full(text, &self.pattern, &self.bad_char_shift_map, limit)
253    }
254}
255
256impl BMByte {
257    /// Find and return the positions of all matched sub-sequences in any text (the haystack) from its tail to its head.
258    ///
259    /// ```
260    /// use boyer_moore_magiclen::BMByte;
261    ///
262    /// let bmb = BMByte::from("oocoo").unwrap();
263    ///
264    /// assert_eq!(vec![7, 4, 1], bmb.rfind_full_all_in("coocoocoocoo"));
265    /// ```
266    pub fn rfind_full_all_in<T: BMByteSearchable>(&self, text: T) -> Vec<usize> {
267        rfind_full(text, &self.pattern, &self.bad_char_shift_map_rev, 0)
268    }
269
270    /// Find and return the positions of matched sub-sequences in any text (the haystack) from its tail to its head. If the `limit` is set to `0`, all sub-sequences will be found.
271    ///
272    /// ```
273    /// use boyer_moore_magiclen::BMByte;
274    ///
275    /// let bmb = BMByte::from("oocoo").unwrap();
276    ///
277    /// assert_eq!(vec![7, 4], bmb.rfind_full_in("coocoocoocoo", 2));
278    /// ```
279    pub fn rfind_full_in<T: BMByteSearchable>(&self, text: T, limit: usize) -> Vec<usize> {
280        rfind_full(text, &self.pattern, &self.bad_char_shift_map_rev, limit)
281    }
282}
283
284pub fn find_full<TT: BMByteSearchable, TP: BMByteSearchable>(
285    text: TT,
286    pattern: TP,
287    bad_char_shift_map: &BMByteBadCharShiftMap,
288    limit: usize,
289) -> Vec<usize> {
290    let text_len = text.len();
291    let pattern_len = pattern.len();
292
293    if text_len == 0 || pattern_len == 0 || text_len < pattern_len {
294        return vec![];
295    }
296
297    let pattern_len_dec = pattern_len - 1;
298
299    let last_pattern_char = pattern.value_at(pattern_len_dec);
300
301    let mut shift = 0;
302
303    let end_index = text_len - pattern_len;
304
305    let mut result = vec![];
306
307    'outer: loop {
308        for (i, pc) in pattern.iter().copied().enumerate().rev() {
309            if text.value_at(shift + i) != pc {
310                let p = shift + pattern_len;
311                if p == text_len {
312                    break 'outer;
313                }
314                shift += bad_char_shift_map[text.value_at(shift + pattern_len_dec) as usize].max({
315                    let c = text.value_at(p);
316
317                    if c == last_pattern_char {
318                        1
319                    } else {
320                        bad_char_shift_map[c as usize] + 1
321                    }
322                });
323                if shift > end_index {
324                    break 'outer;
325                }
326                continue 'outer;
327            }
328        }
329        result.push(shift);
330
331        if shift == end_index {
332            break;
333        }
334
335        if result.len() == limit {
336            break;
337        }
338
339        shift += bad_char_shift_map[text.value_at(shift + pattern_len_dec) as usize].max({
340            let c = text.value_at(shift + pattern_len);
341
342            if c == last_pattern_char {
343                1
344            } else {
345                bad_char_shift_map[c as usize] + 1
346            }
347        });
348        if shift > end_index {
349            break;
350        }
351    }
352
353    result
354}
355
356pub fn rfind_full<TT: BMByteSearchable, TP: BMByteSearchable>(
357    text: TT,
358    pattern: TP,
359    bad_char_shift_map: &BMByteBadCharShiftMapRev,
360    limit: usize,
361) -> Vec<usize> {
362    let text_len = text.len();
363    let pattern_len = pattern.len();
364
365    if text_len == 0 || pattern_len == 0 || text_len < pattern_len {
366        return vec![];
367    }
368
369    let pattern_len_dec = pattern_len - 1;
370
371    let first_pattern_char = pattern.value_at(0);
372
373    let mut shift = text_len - 1;
374
375    let start_index = pattern_len_dec;
376
377    let mut result = vec![];
378
379    'outer: loop {
380        for (i, pc) in pattern.iter().copied().enumerate() {
381            if text.value_at(shift - pattern_len_dec + i) != pc {
382                if shift < pattern_len {
383                    break 'outer;
384                }
385                let s = bad_char_shift_map[text.value_at(shift - pattern_len_dec) as usize].max({
386                    let c = text.value_at(shift - pattern_len);
387
388                    if c == first_pattern_char {
389                        1
390                    } else {
391                        bad_char_shift_map[c as usize] + 1
392                    }
393                });
394                if shift < s {
395                    break 'outer;
396                }
397                shift -= s;
398                if shift < start_index {
399                    break 'outer;
400                }
401                continue 'outer;
402            }
403        }
404        result.push(shift - pattern_len_dec);
405
406        if shift == start_index {
407            break;
408        }
409
410        if result.len() == limit {
411            break;
412        }
413
414        let s = bad_char_shift_map[text.value_at(shift - pattern_len_dec) as usize].max({
415            let c = text.value_at(shift - pattern_len);
416
417            if c == first_pattern_char {
418                1
419            } else {
420                bad_char_shift_map[c as usize] + 1
421            }
422        });
423        if shift < s {
424            break;
425        }
426        shift -= s;
427        if shift < start_index {
428            break;
429        }
430    }
431
432    result
433}
434
435// TODO Find
436
437impl BMByte {
438    /// Find and return the positions of all matched sub-sequences in any text (the haystack) but not including the overlap.
439    ///
440    /// ```
441    /// use boyer_moore_magiclen::BMByte;
442    ///
443    /// let bmb = BMByte::from("oocoo").unwrap();
444    ///
445    /// assert_eq!(vec![1, 7], bmb.find_all_in("coocoocoocoo"));
446    /// ```
447    pub fn find_all_in<T: BMByteSearchable>(&self, text: T) -> Vec<usize> {
448        find(text, &self.pattern, &self.bad_char_shift_map, 0)
449    }
450
451    /// Find and return the position of the first matched sub-sequence in any text (the haystack).
452    ///
453    /// ```
454    /// use boyer_moore_magiclen::BMByte;
455    ///
456    /// let bmb = BMByte::from("oocoo").unwrap();
457    ///
458    /// assert_eq!(Some(1), bmb.find_first_in("coocoocoocoo"));
459    /// ```
460    pub fn find_first_in<T: BMByteSearchable>(&self, text: T) -> Option<usize> {
461        find(text, &self.pattern, &self.bad_char_shift_map, 1).first().copied()
462    }
463
464    /// Find and return the positions of matched sub-sequences in any text (the haystack) but not including the overlap. If the `limit` is set to `0`, all sub-sequences will be found.
465    ///
466    /// ```
467    /// use boyer_moore_magiclen::BMByte;
468    ///
469    /// let bmb = BMByte::from("oocoo").unwrap();
470    ///
471    /// assert_eq!(vec![1], bmb.find_in("coocoocoocoo", 1));
472    /// ```
473    pub fn find_in<T: BMByteSearchable>(&self, text: T, limit: usize) -> Vec<usize> {
474        find(text, &self.pattern, &self.bad_char_shift_map, limit)
475    }
476}
477
478impl BMByte {
479    /// Find and return the positions of all matched sub-sequences in any text (the haystack) but not including the overlap from its tail to its head.
480    ///
481    /// ```
482    /// use boyer_moore_magiclen::BMByte;
483    ///
484    /// let bmb = BMByte::from("oocoo").unwrap();
485    ///
486    /// assert_eq!(vec![7, 1], bmb.rfind_all_in("coocoocoocoo"));
487    /// ```
488    pub fn rfind_all_in<T: BMByteSearchable>(&self, text: T) -> Vec<usize> {
489        rfind(text, &self.pattern, &self.bad_char_shift_map_rev, 0)
490    }
491
492    /// Find and return the position of the first matched sub-sequence in any text (the haystack) from its tail to its head.
493    ///
494    /// ```
495    /// use boyer_moore_magiclen::BMByte;
496    ///
497    /// let bmb = BMByte::from("oocoo").unwrap();
498    ///
499    /// assert_eq!(Some(7), bmb.rfind_first_in("coocoocoocoo"));
500    /// ```
501    pub fn rfind_first_in<T: BMByteSearchable>(&self, text: T) -> Option<usize> {
502        rfind(text, &self.pattern, &self.bad_char_shift_map_rev, 1).first().copied()
503    }
504
505    /// Find and return the positions of matched sub-sequences in any text (the haystack) but not including the overlap from its tail to its head. If the `limit` is set to `0`, all sub-sequences will be found.
506    ///
507    /// ```
508    /// use boyer_moore_magiclen::BMByte;
509    ///
510    /// let bmb = BMByte::from("oocoo").unwrap();
511    ///
512    /// assert_eq!(vec![7], bmb.rfind_in("coocoocoocoo", 1));
513    /// ```
514    pub fn rfind_in<T: BMByteSearchable>(&self, text: T, limit: usize) -> Vec<usize> {
515        rfind(text, &self.pattern, &self.bad_char_shift_map_rev, limit)
516    }
517}
518
519pub fn find<TT: BMByteSearchable, TP: BMByteSearchable>(
520    text: TT,
521    pattern: TP,
522    bad_char_shift_map: &BMByteBadCharShiftMap,
523    limit: usize,
524) -> Vec<usize> {
525    let text_len = text.len();
526    let pattern_len = pattern.len();
527
528    if text_len == 0 || pattern_len == 0 || text_len < pattern_len {
529        return vec![];
530    }
531
532    let pattern_len_dec = pattern_len - 1;
533
534    let last_pattern_char = pattern.value_at(pattern_len_dec);
535
536    let mut shift = 0;
537
538    let end_index = text_len - pattern_len;
539
540    let mut result = vec![];
541
542    'outer: loop {
543        for (i, pc) in pattern.iter().copied().enumerate().rev() {
544            if text.value_at(shift + i) != pc {
545                let p = shift + pattern_len;
546                if p == text_len {
547                    break 'outer;
548                }
549                shift += bad_char_shift_map[text.value_at(shift + pattern_len_dec) as usize].max({
550                    let c = text.value_at(p);
551
552                    if c == last_pattern_char {
553                        1
554                    } else {
555                        bad_char_shift_map[c as usize] + 1
556                    }
557                });
558                if shift > end_index {
559                    break 'outer;
560                }
561                continue 'outer;
562            }
563        }
564        result.push(shift);
565
566        if shift == end_index {
567            break;
568        }
569
570        if result.len() == limit {
571            break;
572        }
573
574        shift += pattern_len;
575        if shift > end_index {
576            break;
577        }
578    }
579
580    result
581}
582
583pub fn rfind<TT: BMByteSearchable, TP: BMByteSearchable>(
584    text: TT,
585    pattern: TP,
586    bad_char_shift_map: &BMByteBadCharShiftMapRev,
587    limit: usize,
588) -> Vec<usize> {
589    let text_len = text.len();
590    let pattern_len = pattern.len();
591
592    if text_len == 0 || pattern_len == 0 || text_len < pattern_len {
593        return vec![];
594    }
595
596    let pattern_len_dec = pattern_len - 1;
597
598    let first_pattern_char = pattern.value_at(0);
599
600    let mut shift = text_len - 1;
601
602    let start_index = pattern_len_dec;
603
604    let mut result = vec![];
605
606    'outer: loop {
607        for (i, pc) in pattern.iter().copied().enumerate() {
608            if text.value_at(shift - pattern_len_dec + i) != pc {
609                if shift < pattern_len {
610                    break 'outer;
611                }
612                let s = bad_char_shift_map[text.value_at(shift - pattern_len_dec) as usize].max({
613                    let c = text.value_at(shift - pattern_len);
614
615                    if c == first_pattern_char {
616                        1
617                    } else {
618                        bad_char_shift_map[c as usize] + 1
619                    }
620                });
621                if shift < s {
622                    break 'outer;
623                }
624                shift -= s;
625                if shift < start_index {
626                    break 'outer;
627                }
628                continue 'outer;
629            }
630        }
631        result.push(shift - pattern_len_dec);
632
633        if shift == start_index {
634            break;
635        }
636
637        if result.len() == limit {
638            break;
639        }
640
641        shift -= pattern_len;
642        if shift < start_index {
643            break;
644        }
645    }
646
647    result
648}