stringzilla/
lib.rs

1#![cfg_attr(not(test), no_std)]
2
3/// The `sz` module provides a collection of string searching and manipulation functionality,
4/// designed for high efficiency and compatibility with no_std environments. This module offers
5/// various utilities for byte string manipulation, including search, reverse search, and
6/// edit-distance calculations, suitable for a wide range of applications from basic string
7/// processing to complex text analysis tasks.
8
9pub mod sz {
10
11    use core::ffi::c_void;
12
13    // Import the functions from the StringZilla C library.
14    extern "C" {
15        fn sz_find(
16            haystack: *const c_void,
17            haystack_length: usize,
18            needle: *const c_void,
19            needle_length: usize,
20        ) -> *const c_void;
21
22        fn sz_rfind(
23            haystack: *const c_void,
24            haystack_length: usize,
25            needle: *const c_void,
26            needle_length: usize,
27        ) -> *const c_void;
28
29        fn sz_find_char_from(
30            haystack: *const c_void,
31            haystack_length: usize,
32            needle: *const c_void,
33            needle_length: usize,
34        ) -> *const c_void;
35
36        fn sz_rfind_char_from(
37            haystack: *const c_void,
38            haystack_length: usize,
39            needle: *const c_void,
40            needle_length: usize,
41        ) -> *const c_void;
42
43        fn sz_find_char_not_from(
44            haystack: *const c_void,
45            haystack_length: usize,
46            needle: *const c_void,
47            needle_length: usize,
48        ) -> *const c_void;
49
50        fn sz_rfind_char_not_from(
51            haystack: *const c_void,
52            haystack_length: usize,
53            needle: *const c_void,
54            needle_length: usize,
55        ) -> *const c_void;
56
57        fn sz_edit_distance(
58            haystack1: *const c_void,
59            haystack1_length: usize,
60            haystack2: *const c_void,
61            haystack2_length: usize,
62            bound: usize,
63            allocator: *const c_void,
64        ) -> usize;
65
66        fn sz_edit_distance_utf8(
67            haystack1: *const c_void,
68            haystack1_length: usize,
69            haystack2: *const c_void,
70            haystack2_length: usize,
71            bound: usize,
72            allocator: *const c_void,
73        ) -> usize;
74
75        fn sz_hamming_distance(
76            haystack1: *const c_void,
77            haystack1_length: usize,
78            haystack2: *const c_void,
79            haystack2_length: usize,
80            bound: usize,
81        ) -> usize;
82
83        fn sz_hamming_distance_utf8(
84            haystack1: *const c_void,
85            haystack1_length: usize,
86            haystack2: *const c_void,
87            haystack2_length: usize,
88            bound: usize,
89        ) -> usize;
90
91        fn sz_alignment_score(
92            haystack1: *const c_void,
93            haystack1_length: usize,
94            haystack2: *const c_void,
95            haystack2_length: usize,
96            matrix: *const c_void,
97            gap: i8,
98            allocator: *const c_void,
99        ) -> isize;
100
101        // type RandomGeneratorT = fn(*mut c_void) -> u64;
102
103        fn sz_generate(
104            alphabet: *const c_void,
105            alphabet_size: usize,
106            text: *mut c_void,
107            length: usize,
108            generate: *const c_void,
109            generator: *mut c_void,
110        );
111    }
112
113    /// Locates the first matching substring within `haystack` that equals `needle`.
114    /// This function is similar to the `memmem()` function in LibC, but, unlike `strstr()`,
115    /// it requires the length of both haystack and needle to be known beforehand.
116    ///
117    /// # Arguments
118    ///
119    /// * `haystack`: The byte slice to search.
120    /// * `needle`: The byte slice to find within the haystack.
121    ///
122    /// # Returns
123    ///
124    /// An `Option<usize>` representing the starting index of the first occurrence of `needle`
125    /// within `haystack` if found, otherwise `None`.
126    pub fn find<H, N>(haystack: H, needle: N) -> Option<usize>
127    where
128        H: AsRef<[u8]>,
129        N: AsRef<[u8]>,
130    {
131        let haystack_ref = haystack.as_ref();
132        let needle_ref = needle.as_ref();
133        let haystack_pointer = haystack_ref.as_ptr() as _;
134        let haystack_length = haystack_ref.len();
135        let needle_pointer = needle_ref.as_ptr() as _;
136        let needle_length = needle_ref.len();
137        let result = unsafe {
138            sz_find(
139                haystack_pointer,
140                haystack_length,
141                needle_pointer,
142                needle_length,
143            )
144        };
145
146        if result.is_null() {
147            None
148        } else {
149            Some(unsafe { result.offset_from(haystack_pointer) } as usize)
150        }
151    }
152
153    /// Locates the last matching substring within `haystack` that equals `needle`.
154    /// This function is useful for finding the most recent or last occurrence of a pattern
155    /// within a byte slice.
156    ///
157    /// # Arguments
158    ///
159    /// * `haystack`: The byte slice to search.
160    /// * `needle`: The byte slice to find within the haystack.
161    ///
162    /// # Returns
163    ///
164    /// An `Option<usize>` representing the starting index of the last occurrence of `needle`
165    /// within `haystack` if found, otherwise `None`.
166    pub fn rfind<H, N>(haystack: H, needle: N) -> Option<usize>
167    where
168        H: AsRef<[u8]>,
169        N: AsRef<[u8]>,
170    {
171        let haystack_ref = haystack.as_ref();
172        let needle_ref = needle.as_ref();
173        let haystack_pointer = haystack_ref.as_ptr() as _;
174        let haystack_length = haystack_ref.len();
175        let needle_pointer = needle_ref.as_ptr() as _;
176        let needle_length = needle_ref.len();
177        let result = unsafe {
178            sz_rfind(
179                haystack_pointer,
180                haystack_length,
181                needle_pointer,
182                needle_length,
183            )
184        };
185
186        if result.is_null() {
187            None
188        } else {
189            Some(unsafe { result.offset_from(haystack_pointer) } as usize)
190        }
191    }
192
193    /// Finds the index of the first character in `haystack` that is also present in `needles`.
194    /// This function is particularly useful for parsing and tokenization tasks where a set of
195    /// delimiter characters is used.
196    ///
197    /// # Arguments
198    ///
199    /// * `haystack`: The byte slice to search.
200    /// * `needles`: The set of bytes to search for within the haystack.
201    ///
202    /// # Returns
203    ///
204    /// An `Option<usize>` representing the index of the first occurrence of any byte from
205    /// `needles` within `haystack`, if found, otherwise `None`.
206    pub fn find_char_from<H, N>(haystack: H, needles: N) -> Option<usize>
207    where
208        H: AsRef<[u8]>,
209        N: AsRef<[u8]>,
210    {
211        let haystack_ref = haystack.as_ref();
212        let needles_ref = needles.as_ref();
213        let haystack_pointer = haystack_ref.as_ptr() as _;
214        let haystack_length = haystack_ref.len();
215        let needles_pointer = needles_ref.as_ptr() as _;
216        let needles_length = needles_ref.len();
217        let result = unsafe {
218            sz_find_char_from(
219                haystack_pointer,
220                haystack_length,
221                needles_pointer,
222                needles_length,
223            )
224        };
225        if result.is_null() {
226            None
227        } else {
228            Some(unsafe { result.offset_from(haystack_pointer) } as usize)
229        }
230    }
231
232    /// Finds the index of the last character in `haystack` that is also present in `needles`.
233    /// This can be used to find the last occurrence of any character from a specified set,
234    /// useful in parsing scenarios such as finding the last delimiter in a string.
235    ///
236    /// # Arguments
237    ///
238    /// * `haystack`: The byte slice to search.
239    /// * `needles`: The set of bytes to search for within the haystack.
240    ///
241    /// # Returns
242    ///
243    /// An `Option<usize>` representing the index of the last occurrence of any byte from
244    /// `needles` within `haystack`, if found, otherwise `None`.
245    pub fn rfind_char_from<H, N>(haystack: H, needles: N) -> Option<usize>
246    where
247        H: AsRef<[u8]>,
248        N: AsRef<[u8]>,
249    {
250        let haystack_ref = haystack.as_ref();
251        let needles_ref = needles.as_ref();
252        let haystack_pointer = haystack_ref.as_ptr() as _;
253        let haystack_length = haystack_ref.len();
254        let needles_pointer = needles_ref.as_ptr() as _;
255        let needles_length = needles_ref.len();
256        let result = unsafe {
257            sz_rfind_char_from(
258                haystack_pointer,
259                haystack_length,
260                needles_pointer,
261                needles_length,
262            )
263        };
264        if result.is_null() {
265            None
266        } else {
267            Some(unsafe { result.offset_from(haystack_pointer) } as usize)
268        }
269    }
270
271    /// Finds the index of the first character in `haystack` that is not present in `needles`.
272    /// This function is useful for skipping over a known set of characters and finding the
273    /// first character that does not belong to that set.
274    ///
275    /// # Arguments
276    ///
277    /// * `haystack`: The byte slice to search.
278    /// * `needles`: The set of bytes that should not be matched within the haystack.
279    ///
280    /// # Returns
281    ///
282    /// An `Option<usize>` representing the index of the first occurrence of any byte not in
283    /// `needles` within `haystack`, if found, otherwise `None`.
284    pub fn find_char_not_from<H, N>(haystack: H, needles: N) -> Option<usize>
285    where
286        H: AsRef<[u8]>,
287        N: AsRef<[u8]>,
288    {
289        let haystack_ref = haystack.as_ref();
290        let needles_ref = needles.as_ref();
291        let haystack_pointer = haystack_ref.as_ptr() as _;
292        let haystack_length = haystack_ref.len();
293        let needles_pointer = needles_ref.as_ptr() as _;
294        let needles_length = needles_ref.len();
295        let result = unsafe {
296            sz_find_char_not_from(
297                haystack_pointer,
298                haystack_length,
299                needles_pointer,
300                needles_length,
301            )
302        };
303        if result.is_null() {
304            None
305        } else {
306            Some(unsafe { result.offset_from(haystack_pointer) } as usize)
307        }
308    }
309
310    /// Finds the index of the last character in `haystack` that is not present in `needles`.
311    /// Useful for text processing tasks such as trimming trailing characters that belong to
312    /// a specified set.
313    ///
314    /// # Arguments
315    ///
316    /// * `haystack`: The byte slice to search.
317    /// * `needles`: The set of bytes that should not be matched within the haystack.
318    ///
319    /// # Returns
320    ///
321    /// An `Option<usize>` representing the index of the last occurrence of any byte not in
322    /// `needles` within `haystack`, if found, otherwise `None`.
323    pub fn rfind_char_not_from<H, N>(haystack: H, needles: N) -> Option<usize>
324    where
325        H: AsRef<[u8]>,
326        N: AsRef<[u8]>,
327    {
328        let haystack_ref = haystack.as_ref();
329        let needles_ref = needles.as_ref();
330        let haystack_pointer = haystack_ref.as_ptr() as _;
331        let haystack_length = haystack_ref.len();
332        let needles_pointer = needles_ref.as_ptr() as _;
333        let needles_length = needles_ref.len();
334        let result = unsafe {
335            sz_rfind_char_not_from(
336                haystack_pointer,
337                haystack_length,
338                needles_pointer,
339                needles_length,
340            )
341        };
342        if result.is_null() {
343            None
344        } else {
345            Some(unsafe { result.offset_from(haystack_pointer) } as usize)
346        }
347    }
348
349    /// Computes the Levenshtein edit distance between two strings, using the Wagner-Fisher
350    /// algorithm. This measure is widely used in applications like spell-checking, DNA sequence
351    /// analysis.
352    ///
353    /// # Arguments
354    ///
355    /// * `first`: The first byte slice.
356    /// * `second`: The second byte slice.
357    /// * `bound`: The maximum distance to compute, allowing for early exit.
358    ///
359    /// # Returns
360    ///
361    /// A `usize` representing the minimum number of single-character edits (insertions,
362    /// deletions, or substitutions) required to change `first` into `second`.
363    pub fn edit_distance_bounded<F, S>(first: F, second: S, bound: usize) -> usize
364    where
365        F: AsRef<[u8]>,
366        S: AsRef<[u8]>,
367    {
368        let first_ref = first.as_ref();
369        let second_ref = second.as_ref();
370        let first_length = first_ref.len();
371        let second_length = second_ref.len();
372        let first_pointer = first_ref.as_ptr() as _;
373        let second_pointer = second_ref.as_ptr() as _;
374        unsafe {
375            sz_edit_distance(
376                first_pointer,
377                first_length,
378                second_pointer,
379                second_length,
380                // Upper bound on the distance, that allows us to exit early. If zero is
381                // passed, the maximum possible distance will be equal to the length of
382                // the longer input.
383                bound,
384                // Uses the default allocator
385                core::ptr::null(),
386            )
387        }
388    }
389
390    /// Computes the Levenshtein edit distance between two UTF8 strings, using the Wagner-Fisher
391    /// algorithm. This measure is widely used in applications like spell-checking.
392    ///
393    /// # Arguments
394    ///
395    /// * `first`: The first byte slice.
396    /// * `second`: The second byte slice.
397    /// * `bound`: The maximum distance to compute, allowing for early exit.
398    ///
399    /// # Returns
400    ///
401    /// A `usize` representing the minimum number of single-character edits (insertions,
402    /// deletions, or substitutions) required to change `first` into `second`.
403    pub fn edit_distance_utf8_bounded<F, S>(first: F, second: S, bound: usize) -> usize
404    where
405        F: AsRef<[u8]>,
406        S: AsRef<[u8]>,
407    {
408        let first_ref = first.as_ref();
409        let second_ref = second.as_ref();
410        let first_length = first_ref.len();
411        let second_length = second_ref.len();
412        let first_pointer = first_ref.as_ptr() as _;
413        let second_pointer = second_ref.as_ptr() as _;
414        unsafe {
415            sz_edit_distance_utf8(
416                first_pointer,
417                first_length,
418                second_pointer,
419                second_length,
420                // Upper bound on the distance, that allows us to exit early. If zero is
421                // passed, the maximum possible distance will be equal to the length of
422                // the longer input.
423                bound,
424                // Uses the default allocator
425                core::ptr::null(),
426            )
427        }
428    }
429
430    /// Computes the Levenshtein edit distance between two strings, using the Wagner-Fisher
431    /// algorithm. This measure is widely used in applications like spell-checking, DNA sequence
432    /// analysis.
433    ///
434    /// # Arguments
435    ///
436    /// * `first`: The first byte slice.
437    /// * `second`: The second byte slice.
438    ///
439    /// # Returns
440    ///
441    /// A `usize` representing the minimum number of single-character edits (insertions,
442    /// deletions, or substitutions) required to change `first` into `second`.
443    pub fn edit_distance<F, S>(first: F, second: S) -> usize
444    where
445        F: AsRef<[u8]>,
446        S: AsRef<[u8]>,
447    {
448        edit_distance_bounded(first, second, 0)
449    }
450
451    /// Computes the Levenshtein edit distance between two UTF8 strings, using the Wagner-Fisher
452    /// algorithm. This measure is widely used in applications like spell-checking.
453    ///
454    /// # Arguments
455    ///
456    /// * `first`: The first byte slice.
457    /// * `second`: The second byte slice.
458    ///
459    /// # Returns
460    ///
461    /// A `usize` representing the minimum number of single-character edits (insertions,
462    /// deletions, or substitutions) required to change `first` into `second`.
463    pub fn edit_distance_utf8<F, S>(first: F, second: S) -> usize
464    where
465        F: AsRef<[u8]>,
466        S: AsRef<[u8]>,
467    {
468        edit_distance_utf8_bounded(first, second, 0)
469    }
470
471    /// Computes the Hamming edit distance between two strings, counting the number of substituted characters.
472    /// Difference in length is added to the result as well.
473    ///
474    /// # Arguments
475    ///
476    /// * `first`: The first byte slice.
477    /// * `second`: The second byte slice.
478    /// * `bound`: The maximum distance to compute, allowing for early exit.
479    ///
480    /// # Returns
481    ///
482    /// A `usize` representing the minimum number of single-character edits (substitutions) required to
483    /// change `first` into `second`.
484    pub fn hamming_distance_bounded<F, S>(first: F, second: S, bound: usize) -> usize
485    where
486        F: AsRef<[u8]>,
487        S: AsRef<[u8]>,
488    {
489        let first_ref = first.as_ref();
490        let second_ref = second.as_ref();
491        let first_length = first_ref.len();
492        let second_length = second_ref.len();
493        let first_pointer = first_ref.as_ptr() as _;
494        let second_pointer = second_ref.as_ptr() as _;
495        unsafe {
496            sz_hamming_distance(
497                first_pointer,
498                first_length,
499                second_pointer,
500                second_length,
501                // Upper bound on the distance, that allows us to exit early. If zero is
502                // passed, the maximum possible distance will be equal to the length of
503                // the longer input.
504                bound,
505            )
506        }
507    }
508
509    /// Computes the Hamming edit distance between two UTF8 strings, counting the number of substituted
510    /// variable-length characters. Difference in length is added to the result as well.
511    ///
512    /// # Arguments
513    ///
514    /// * `first`: The first byte slice.
515    /// * `second`: The second byte slice.
516    /// * `bound`: The maximum distance to compute, allowing for early exit.
517    ///
518    /// # Returns
519    ///
520    /// A `usize` representing the minimum number of single-character edits (substitutions) required to
521    /// change `first` into `second`.
522    pub fn hamming_distance_utf8_bounded<F, S>(first: F, second: S, bound: usize) -> usize
523    where
524        F: AsRef<[u8]>,
525        S: AsRef<[u8]>,
526    {
527        let first_ref = first.as_ref();
528        let second_ref = second.as_ref();
529        let first_length = first_ref.len();
530        let second_length = second_ref.len();
531        let first_pointer = first_ref.as_ptr() as _;
532        let second_pointer = second_ref.as_ptr() as _;
533        unsafe {
534            sz_hamming_distance_utf8(
535                first_pointer,
536                first_length,
537                second_pointer,
538                second_length,
539                // Upper bound on the distance, that allows us to exit early. If zero is
540                // passed, the maximum possible distance will be equal to the length of
541                // the longer input.
542                bound,
543            )
544        }
545    }
546
547    /// Computes the Hamming edit distance between two strings, counting the number of substituted characters.
548    /// Difference in length is added to the result as well.
549    ///
550    /// # Arguments
551    ///
552    /// * `first`: The first byte slice.
553    /// * `second`: The second byte slice.
554    ///
555    /// # Returns
556    ///
557    /// A `usize` representing the minimum number of single-character edits (substitutions) required to
558    /// change `first` into `second`.
559    pub fn hamming_distance<F, S>(first: F, second: S) -> usize
560    where
561        F: AsRef<[u8]>,
562        S: AsRef<[u8]>,
563    {
564        hamming_distance_bounded(first, second, 0)
565    }
566
567    /// Computes the Hamming edit distance between two UTF8 strings, counting the number of substituted
568    /// variable-length characters. Difference in length is added to the result as well.
569    ///
570    /// # Arguments
571    ///
572    /// * `first`: The first byte slice.
573    /// * `second`: The second byte slice.
574    ///
575    /// # Returns
576    ///
577    /// A `usize` representing the minimum number of single-character edits (substitutions) required to
578    /// change `first` into `second`.
579    pub fn hamming_distance_utf8<F, S>(first: F, second: S) -> usize
580    where
581        F: AsRef<[u8]>,
582        S: AsRef<[u8]>,
583    {
584        hamming_distance_utf8_bounded(first, second, 0)
585    }
586
587    /// Computes the Needleman-Wunsch alignment score for two strings. This function is
588    /// particularly used in bioinformatics for sequence alignment but is also applicable in
589    /// other domains requiring detailed comparison between two strings, including gap and
590    /// substitution penalties.
591    ///
592    /// # Arguments
593    ///
594    /// * `first`: The first byte slice to align.
595    /// * `second`: The second byte slice to align.
596    /// * `matrix`: The substitution matrix used for scoring.
597    /// * `gap`: The penalty for each gap introduced during alignment.
598    ///
599    /// # Returns
600    ///
601    /// An `isize` representing the total alignment score, where higher scores indicate better
602    /// alignment between the two strings, considering the specified gap penalties and
603    /// substitution matrix.
604    pub fn alignment_score<F, S>(first: F, second: S, matrix: [[i8; 256]; 256], gap: i8) -> isize
605    where
606        F: AsRef<[u8]>,
607        S: AsRef<[u8]>,
608    {
609        let first_ref = first.as_ref();
610        let second_ref = second.as_ref();
611        let first_length = first_ref.len();
612        let second_length = second_ref.len();
613        let first_pointer = first_ref.as_ptr() as _;
614        let second_pointer = second_ref.as_ptr() as _;
615        unsafe {
616            sz_alignment_score(
617                first_pointer,
618                first_length,
619                second_pointer,
620                second_length,
621                matrix.as_ptr() as _,
622                gap,
623                core::ptr::null(),
624            )
625        }
626    }
627
628    /// Generates a default substitution matrix for use with the Needleman-Wunsch
629    /// alignment algorithm. This matrix is initialized such that diagonal entries
630    /// (representing matching characters) are zero, and off-diagonal entries
631    /// (representing mismatches) are -1. This setup effectively produces distances
632    /// equal to the negative Levenshtein edit distance, suitable for basic sequence
633    /// alignment tasks where all mismatches are penalized equally and there are no
634    /// rewards for matches.
635    ///
636    /// # Returns
637    ///
638    /// A 256x256 array of `i8`, where each element represents the substitution cost
639    /// between two characters (byte values). Matching characters are assigned a cost
640    /// of 0, and non-matching characters are assigned a cost of -1.
641    pub fn unary_substitution_costs() -> [[i8; 256]; 256] {
642        let mut result = [[0; 256]; 256];
643
644        for i in 0..256 {
645            for j in 0..256 {
646                result[i][j] = if i == j { 0 } else { -1 };
647            }
648        }
649
650        result
651    }
652
653    /// Randomizes the contents of a given byte slice `text` using characters from
654    /// a specified `alphabet`. This function mutates `text` in place, replacing each
655    /// byte with a random one from `alphabet`. It is designed for situations where
656    /// you need to generate random strings or data sequences based on a specific set
657    /// of characters, such as generating random DNA sequences or testing inputs.
658    ///
659    /// # Type Parameters
660    ///
661    /// * `T`: The type of the text to be randomized. Must be mutable and convertible to a byte slice.
662    /// * `A`: The type of the alphabet. Must be convertible to a byte slice.
663    ///
664    /// # Arguments
665    ///
666    /// * `text`: A mutable reference to the data to randomize. This data will be mutated in place.
667    /// * `alphabet`: A reference to the byte slice representing the alphabet to use for randomization.
668    ///
669    /// # Examples
670    ///
671    /// ```
672    /// use stringzilla::sz;
673    /// let mut my_text = vec![0; 10]; // A buffer to randomize
674    /// let alphabet = b"ACTG"; // Using a DNA alphabet
675    /// sz::randomize(&mut my_text, &alphabet);
676    /// ```
677    ///
678    /// After than,  `my_text` is filled with random 'A', 'C', 'T', or 'G' values.
679    pub fn randomize<T, A>(text: &mut T, alphabet: &A)
680    where
681        T: AsMut<[u8]> + ?Sized, // Allows for mutable references to dynamically sized types.
682        A: AsRef<[u8]> + ?Sized, // Allows for references to dynamically sized types.
683    {
684        let text_slice = text.as_mut();
685        let alphabet_slice = alphabet.as_ref();
686        unsafe {
687            sz_generate(
688                alphabet_slice.as_ptr() as *const c_void,
689                alphabet_slice.len(),
690                text_slice.as_mut_ptr() as *mut c_void,
691                text_slice.len(),
692                core::ptr::null(),
693                core::ptr::null_mut(),
694            );
695        }
696    }
697}
698
699pub trait Matcher<'a> {
700    fn find(&self, haystack: &'a [u8]) -> Option<usize>;
701    fn needle_length(&self) -> usize;
702    fn skip_length(&self, include_overlaps: bool, is_reverse: bool) -> usize;
703}
704
705pub enum MatcherType<'a> {
706    Find(&'a [u8]),
707    RFind(&'a [u8]),
708    FindFirstOf(&'a [u8]),
709    FindLastOf(&'a [u8]),
710    FindFirstNotOf(&'a [u8]),
711    FindLastNotOf(&'a [u8]),
712}
713
714impl<'a> Matcher<'a> for MatcherType<'a> {
715    fn find(&self, haystack: &'a [u8]) -> Option<usize> {
716        match self {
717            MatcherType::Find(needle) => sz::find(haystack, needle),
718            MatcherType::RFind(needle) => sz::rfind(haystack, needle),
719            MatcherType::FindFirstOf(needles) => sz::find_char_from(haystack, needles),
720            MatcherType::FindLastOf(needles) => sz::rfind_char_from(haystack, needles),
721            MatcherType::FindFirstNotOf(needles) => sz::find_char_not_from(haystack, needles),
722            MatcherType::FindLastNotOf(needles) => sz::rfind_char_not_from(haystack, needles),
723        }
724    }
725
726    fn needle_length(&self) -> usize {
727        match self {
728            MatcherType::Find(needle) | MatcherType::RFind(needle) => needle.len(),
729            _ => 1,
730        }
731    }
732
733    fn skip_length(&self, include_overlaps: bool, is_reverse: bool) -> usize {
734        match (include_overlaps, is_reverse) {
735            (true, true) => self.needle_length().saturating_sub(1),
736            (true, false) => 1,
737            (false, true) => 0,
738            (false, false) => self.needle_length(),
739        }
740    }
741}
742
743/// An iterator over non-overlapping matches of a pattern in a string slice.
744/// This iterator yields the matched substrings in the order they are found.
745///
746/// # Examples
747///
748/// ```
749/// use stringzilla::{sz, MatcherType, RangeMatches};
750///
751/// let haystack = b"abababa";
752/// let matcher = MatcherType::Find(b"aba");
753/// let matches: Vec<&[u8]> = RangeMatches::new(haystack, matcher, false).collect();
754/// assert_eq!(matches, vec![b"aba", b"aba"]);
755/// ```
756pub struct RangeMatches<'a> {
757    haystack: &'a [u8],
758    matcher: MatcherType<'a>,
759    position: usize,
760    include_overlaps: bool,
761}
762
763impl<'a> RangeMatches<'a> {
764    pub fn new(haystack: &'a [u8], matcher: MatcherType<'a>, include_overlaps: bool) -> Self {
765        Self {
766            haystack,
767            matcher,
768            position: 0,
769            include_overlaps,
770        }
771    }
772}
773
774impl<'a> Iterator for RangeMatches<'a> {
775    type Item = &'a [u8];
776
777    fn next(&mut self) -> Option<Self::Item> {
778        if self.position >= self.haystack.len() {
779            return None;
780        }
781
782        if let Some(index) = self.matcher.find(&self.haystack[self.position..]) {
783            let start = self.position + index;
784            let end = start + self.matcher.needle_length();
785            self.position = start + self.matcher.skip_length(self.include_overlaps, false);
786            Some(&self.haystack[start..end])
787        } else {
788            self.position = self.haystack.len();
789            None
790        }
791    }
792}
793
794/// An iterator over non-overlapping splits of a string slice by a pattern.
795/// This iterator yields the substrings between the matches of the pattern.
796///
797/// # Examples
798///
799/// ```
800/// use stringzilla::{sz, MatcherType, RangeSplits};
801///
802/// let haystack = b"a,b,c,d";
803/// let matcher = MatcherType::Find(b",");
804/// let splits: Vec<&[u8]> = RangeSplits::new(haystack, matcher).collect();
805/// assert_eq!(splits, vec![b"a", b"b", b"c", b"d"]);
806/// ```
807pub struct RangeSplits<'a> {
808    haystack: &'a [u8],
809    matcher: MatcherType<'a>,
810    position: usize,
811    last_match: Option<usize>,
812}
813
814impl<'a> RangeSplits<'a> {
815    pub fn new(haystack: &'a [u8], matcher: MatcherType<'a>) -> Self {
816        Self {
817            haystack,
818            matcher,
819            position: 0,
820            last_match: None,
821        }
822    }
823}
824
825impl<'a> Iterator for RangeSplits<'a> {
826    type Item = &'a [u8];
827
828    fn next(&mut self) -> Option<Self::Item> {
829        if self.position > self.haystack.len() {
830            return None;
831        }
832
833        if let Some(index) = self.matcher.find(&self.haystack[self.position..]) {
834            let start = self.position;
835            let end = self.position + index;
836            self.position = end + self.matcher.needle_length();
837            self.last_match = Some(end);
838            Some(&self.haystack[start..end])
839        } else if self.position < self.haystack.len() || self.last_match.is_some() {
840            let start = self.position;
841            self.position = self.haystack.len() + 1;
842            Some(&self.haystack[start..])
843        } else {
844            None
845        }
846    }
847}
848
849/// An iterator over non-overlapping matches of a pattern in a string slice, searching from the end.
850/// This iterator yields the matched substrings in reverse order.
851///
852/// # Examples
853///
854/// ```
855/// use stringzilla::{sz, MatcherType, RangeRMatches};
856///
857/// let haystack = b"abababa";
858/// let matcher = MatcherType::RFind(b"aba");
859/// let matches: Vec<&[u8]> = RangeRMatches::new(haystack, matcher, false).collect();
860/// assert_eq!(matches, vec![b"aba", b"aba"]);
861/// ```
862pub struct RangeRMatches<'a> {
863    haystack: &'a [u8],
864    matcher: MatcherType<'a>,
865    position: usize,
866    include_overlaps: bool,
867}
868
869impl<'a> RangeRMatches<'a> {
870    pub fn new(haystack: &'a [u8], matcher: MatcherType<'a>, include_overlaps: bool) -> Self {
871        Self {
872            haystack,
873            matcher,
874            position: haystack.len(),
875            include_overlaps,
876        }
877    }
878}
879
880impl<'a> Iterator for RangeRMatches<'a> {
881    type Item = &'a [u8];
882
883    fn next(&mut self) -> Option<Self::Item> {
884        if self.position == 0 {
885            return None;
886        }
887
888        let search_area = &self.haystack[..self.position];
889        if let Some(index) = self.matcher.find(search_area) {
890            let start = index;
891            let end = start + self.matcher.needle_length();
892            let result = Some(&self.haystack[start..end]);
893
894            let skip = self.matcher.skip_length(self.include_overlaps, true);
895            self.position = start + skip;
896
897            result
898        } else {
899            None
900        }
901    }
902}
903
904/// An iterator over non-overlapping splits of a string slice by a pattern, searching from the end.
905/// This iterator yields the substrings between the matches of the pattern in reverse order.
906///
907/// # Examples
908///
909/// ```
910/// use stringzilla::{sz, MatcherType, RangeRSplits};
911///
912/// let haystack = b"a,b,c,d";
913/// let matcher = MatcherType::RFind(b",");
914/// let splits: Vec<&[u8]> = RangeRSplits::new(haystack, matcher).collect();
915/// assert_eq!(splits, vec![b"d", b"c", b"b", b"a"]);
916/// ```
917pub struct RangeRSplits<'a> {
918    haystack: &'a [u8],
919    matcher: MatcherType<'a>,
920    position: usize,
921}
922
923impl<'a> RangeRSplits<'a> {
924    pub fn new(haystack: &'a [u8], matcher: MatcherType<'a>) -> Self {
925        Self {
926            haystack,
927            matcher,
928            position: haystack.len(),
929        }
930    }
931}
932
933impl<'a> Iterator for RangeRSplits<'a> {
934    type Item = &'a [u8];
935
936    fn next(&mut self) -> Option<Self::Item> {
937        if self.position == 0 {
938            return None;
939        }
940
941        let search_area = &self.haystack[..self.position];
942        if let Some(index) = self.matcher.find(search_area) {
943            let end = self.position;
944            let start = index + self.matcher.needle_length();
945            let result = Some(&self.haystack[start..end]);
946
947            self.position = index;
948
949            result
950        } else {
951            let result = Some(&self.haystack[..self.position]);
952            self.position = 0;
953            result
954        }
955    }
956}
957
958/// Provides extensions for string searching and manipulation functionalities
959/// on types that can reference byte slices ([u8]). This trait extends the capability
960/// of any type implementing `AsRef<[u8]>`, allowing easy integration of SIMD-accelerated
961/// string processing functions.
962///
963/// # Examples
964///
965/// Basic usage on a `Vec<u8>`:
966///
967/// ```
968/// use stringzilla::StringZilla;
969///
970/// let haystack: &[u8] = &[b'a', b'b', b'c', b'd', b'e'];
971/// let needle: &[u8] = &[b'c', b'd'];
972///
973/// assert_eq!(haystack.sz_find(needle.as_ref()), Some(2));
974/// ```
975///
976/// Searching in a string slice:
977///
978/// ```
979/// use stringzilla::StringZilla;
980///
981/// let haystack = "abcdef";
982/// let needle = "cd";
983///
984/// assert_eq!(haystack.sz_find(needle.as_bytes()), Some(2));
985/// ```
986pub trait StringZilla<'a, N>
987where
988    N: AsRef<[u8]> + 'a,
989{
990    /// Searches for the first occurrence of `needle` in `self`.
991    ///
992    /// # Examples
993    ///
994    /// ```
995    /// use stringzilla::StringZilla;
996    ///
997    /// let haystack = "Hello, world!";
998    /// assert_eq!(haystack.sz_find("world".as_bytes()), Some(7));
999    /// ```
1000    fn sz_find(&self, needle: N) -> Option<usize>;
1001
1002    /// Searches for the last occurrence of `needle` in `self`.
1003    ///
1004    /// # Examples
1005    ///
1006    /// ```
1007    /// use stringzilla::StringZilla;
1008    ///
1009    /// let haystack = "Hello, world, world!";
1010    /// assert_eq!(haystack.sz_rfind("world".as_bytes()), Some(14));
1011    /// ```
1012    fn sz_rfind(&self, needle: N) -> Option<usize>;
1013
1014    /// Finds the index of the first character in `self` that is also present in `needles`.
1015    ///
1016    /// # Examples
1017    ///
1018    /// ```
1019    /// use stringzilla::StringZilla;
1020    ///
1021    /// let haystack = "Hello, world!";
1022    /// assert_eq!(haystack.sz_find_char_from("aeiou".as_bytes()), Some(1));
1023    /// ```
1024    fn sz_find_char_from(&self, needles: N) -> Option<usize>;
1025
1026    /// Finds the index of the last character in `self` that is also present in `needles`.
1027    ///
1028    /// # Examples
1029    ///
1030    /// ```
1031    /// use stringzilla::StringZilla;
1032    ///
1033    /// let haystack = "Hello, world!";
1034    /// assert_eq!(haystack.sz_rfind_char_from("aeiou".as_bytes()), Some(8));
1035    /// ```
1036    fn sz_rfind_char_from(&self, needles: N) -> Option<usize>;
1037
1038    /// Finds the index of the first character in `self` that is not present in `needles`.
1039    ///
1040    /// # Examples
1041    ///
1042    /// ```
1043    /// use stringzilla::StringZilla;
1044    ///
1045    /// let haystack = "Hello, world!";
1046    /// assert_eq!(haystack.sz_find_char_not_from("aeiou".as_bytes()), Some(0));
1047    /// ```
1048    fn sz_find_char_not_from(&self, needles: N) -> Option<usize>;
1049
1050    /// Finds the index of the last character in `self` that is not present in `needles`.
1051    ///
1052    /// # Examples
1053    ///
1054    /// ```
1055    /// use stringzilla::StringZilla;
1056    ///
1057    /// let haystack = "Hello, world!";
1058    /// assert_eq!(haystack.sz_rfind_char_not_from("aeiou".as_bytes()), Some(12));
1059    /// ```
1060    fn sz_rfind_char_not_from(&self, needles: N) -> Option<usize>;
1061
1062    /// Computes the Levenshtein edit distance between `self` and `other`.
1063    ///
1064    /// # Examples
1065    ///
1066    /// ```
1067    /// use stringzilla::StringZilla;
1068    ///
1069    /// let first = "kitten";
1070    /// let second = "sitting";
1071    /// assert_eq!(first.sz_edit_distance(second.as_bytes()), 3);
1072    /// ```
1073    fn sz_edit_distance(&self, other: N) -> usize;
1074
1075    /// Computes the alignment score between `self` and `other` using the specified
1076    /// substitution matrix and gap penalty.
1077    ///
1078    /// # Examples
1079    ///
1080    /// ```
1081    /// use stringzilla::{sz, StringZilla};
1082    ///
1083    /// let first = "kitten";
1084    /// let second = "sitting";
1085    /// let matrix = sz::unary_substitution_costs();
1086    /// let gap_penalty = -1;
1087    /// assert_eq!(first.sz_alignment_score(second.as_bytes(), matrix, gap_penalty), -3);
1088    /// ```
1089    fn sz_alignment_score(&self, other: N, matrix: [[i8; 256]; 256], gap: i8) -> isize;
1090
1091    /// Returns an iterator over all non-overlapping matches of the given `needle` in `self`.
1092    ///
1093    /// # Arguments
1094    ///
1095    /// * `needle`: The byte slice to search for within `self`.
1096    ///
1097    /// # Examples
1098    ///
1099    /// ```
1100    /// use stringzilla::StringZilla;
1101    ///
1102    /// let haystack = b"abababa";
1103    /// let needle = b"aba";
1104    /// let matches: Vec<&[u8]> = haystack.sz_matches(needle).collect();
1105    /// assert_eq!(matches, vec![b"aba", b"aba", b"aba"]);
1106    /// ```
1107    fn sz_matches(&'a self, needle: &'a N) -> RangeMatches<'a>;
1108
1109    /// Returns an iterator over all non-overlapping matches of the given `needle` in `self`, searching from the end.
1110    ///
1111    /// # Arguments
1112    ///
1113    /// * `needle`: The byte slice to search for within `self`.
1114    ///
1115    /// # Examples
1116    ///
1117    /// ```
1118    /// use stringzilla::StringZilla;
1119    ///
1120    /// let haystack = b"abababa";
1121    /// let needle = b"aba";
1122    /// let matches: Vec<&[u8]> = haystack.sz_rmatches(needle).collect();
1123    /// assert_eq!(matches, vec![b"aba", b"aba", b"aba"]);
1124    /// ```
1125    fn sz_rmatches(&'a self, needle: &'a N) -> RangeRMatches<'a>;
1126
1127    /// Returns an iterator over the substrings of `self` that are separated by the given `needle`.
1128    ///
1129    /// # Arguments
1130    ///
1131    /// * `needle`: The byte slice to split `self` by.
1132    ///
1133    /// # Examples
1134    ///
1135    /// ```
1136    /// use stringzilla::StringZilla;
1137    ///
1138    /// let haystack = b"a,b,c,d";
1139    /// let needle = b",";
1140    /// let splits: Vec<&[u8]> = haystack.sz_splits(needle).collect();
1141    /// assert_eq!(splits, vec![b"a", b"b", b"c", b"d"]);
1142    /// ```
1143    fn sz_splits(&'a self, needle: &'a N) -> RangeSplits<'a>;
1144
1145    /// Returns an iterator over the substrings of `self` that are separated by the given `needle`, searching from the end.
1146    ///
1147    /// # Arguments
1148    ///
1149    /// * `needle`: The byte slice to split `self` by.
1150    ///
1151    /// # Examples
1152    ///
1153    /// ```
1154    /// use stringzilla::StringZilla;
1155    ///
1156    /// let haystack = b"a,b,c,d";
1157    /// let needle = b",";
1158    /// let splits: Vec<&[u8]> = haystack.sz_rsplits(needle).collect();
1159    /// assert_eq!(splits, vec![b"d", b"c", b"b", b"a"]);
1160    /// ```
1161    fn sz_rsplits(&'a self, needle: &'a N) -> RangeRSplits<'a>;
1162
1163    /// Returns an iterator over all non-overlapping matches of any of the bytes in `needles` within `self`.
1164    ///
1165    /// # Arguments
1166    ///
1167    /// * `needles`: The set of bytes to search for within `self`.
1168    ///
1169    /// # Examples
1170    ///
1171    /// ```
1172    /// use stringzilla::StringZilla;
1173    ///
1174    /// let haystack = b"Hello, world!";
1175    /// let needles = b"aeiou";
1176    /// let matches: Vec<&[u8]> = haystack.sz_find_first_of(needles).collect();
1177    /// assert_eq!(matches, vec![b"e", b"o", b"o"]);
1178    /// ```
1179    fn sz_find_first_of(&'a self, needles: &'a N) -> RangeMatches<'a>;
1180
1181    /// Returns an iterator over all non-overlapping matches of any of the bytes in `needles` within `self`, searching from the end.
1182    ///
1183    /// # Arguments
1184    ///
1185    /// * `needles`: The set of bytes to search for within `self`.
1186    ///
1187    /// # Examples
1188    ///
1189    /// ```
1190    /// use stringzilla::StringZilla;
1191    ///
1192    /// let haystack = b"Hello, world!";
1193    /// let needles = b"aeiou";
1194    /// let matches: Vec<&[u8]> = haystack.sz_find_last_of(needles).collect();
1195    /// assert_eq!(matches, vec![b"o", b"o", b"e"]);
1196    /// ```
1197    fn sz_find_last_of(&'a self, needles: &'a N) -> RangeRMatches<'a>;
1198
1199    /// Returns an iterator over all non-overlapping matches of any byte not in `needles` within `self`.
1200    ///
1201    /// # Arguments
1202    ///
1203    /// * `needles`: The set of bytes that should not be matched within `self`.
1204    ///
1205    /// # Examples
1206    ///
1207    /// ```
1208    /// use stringzilla::StringZilla;
1209    ///
1210    /// let haystack = b"Hello, world!";
1211    /// let needles = b"aeiou";
1212    /// let matches: Vec<&[u8]> = haystack.sz_find_first_not_of(needles).collect();
1213    /// assert_eq!(matches, vec![b"H", b"l", b"l", b",", b" ", b"w", b"r", b"l", b"d", b"!"]);
1214    /// ```
1215    fn sz_find_first_not_of(&'a self, needles: &'a N) -> RangeMatches<'a>;
1216
1217    /// Returns an iterator over all non-overlapping matches of any byte not in `needles` within `self`, searching from the end.
1218    ///
1219    /// # Arguments
1220    ///
1221    /// * `needles`: The set of bytes that should not be matched within `self`.
1222    ///q
1223    /// # Examples
1224    ///
1225    /// ```
1226    /// use stringzilla::StringZilla;
1227    ///
1228    /// let haystack = b"Hello, world!";
1229    /// let needles = b"aeiou";
1230    /// let matches: Vec<&[u8]> = haystack.sz_find_last_not_of(needles).collect();
1231    /// assert_eq!(matches, vec![b"!", b"d", b"l", b"r", b"w", b" ", b",", b"l", b"l", b"H"]);
1232    /// ```
1233    fn sz_find_last_not_of(&'a self, needles: &'a N) -> RangeRMatches<'a>;
1234
1235}
1236
1237impl<'a, T, N> StringZilla<'a, N> for T
1238where
1239    T: AsRef<[u8]> + ?Sized,
1240    N: AsRef<[u8]> + 'a,
1241{
1242    fn sz_find(&self, needle: N) -> Option<usize> {
1243        sz::find(self, needle)
1244    }
1245
1246    fn sz_rfind(&self, needle: N) -> Option<usize> {
1247        sz::rfind(self, needle)
1248    }
1249
1250    fn sz_find_char_from(&self, needles: N) -> Option<usize> {
1251        sz::find_char_from(self, needles)
1252    }
1253
1254    fn sz_rfind_char_from(&self, needles: N) -> Option<usize> {
1255        sz::rfind_char_from(self, needles)
1256    }
1257
1258    fn sz_find_char_not_from(&self, needles: N) -> Option<usize> {
1259        sz::find_char_not_from(self, needles)
1260    }
1261
1262    fn sz_rfind_char_not_from(&self, needles: N) -> Option<usize> {
1263        sz::rfind_char_not_from(self, needles)
1264    }
1265
1266    fn sz_edit_distance(&self, other: N) -> usize {
1267        sz::edit_distance(self, other)
1268    }
1269
1270    fn sz_alignment_score(&self, other: N, matrix: [[i8; 256]; 256], gap: i8) -> isize {
1271        sz::alignment_score(self, other, matrix, gap)
1272    }
1273
1274    fn sz_matches(&'a self, needle: &'a N) -> RangeMatches<'a> {
1275        RangeMatches::new(self.as_ref(), MatcherType::Find(needle.as_ref()), true)
1276    }
1277
1278    fn sz_rmatches(&'a self, needle: &'a N) -> RangeRMatches<'a> {
1279        RangeRMatches::new(self.as_ref(), MatcherType::RFind(needle.as_ref()), true)
1280    }
1281
1282    fn sz_splits(&'a self, needle: &'a N) -> RangeSplits<'a> {
1283        RangeSplits::new(self.as_ref(), MatcherType::Find(needle.as_ref()))
1284    }
1285
1286    fn sz_rsplits(&'a self, needle: &'a N) -> RangeRSplits<'a> {
1287        RangeRSplits::new(self.as_ref(), MatcherType::RFind(needle.as_ref()))
1288    }
1289
1290    fn sz_find_first_of(&'a self, needles: &'a N) -> RangeMatches<'a> {
1291        RangeMatches::new(
1292            self.as_ref(),
1293            MatcherType::FindFirstOf(needles.as_ref()),
1294            true,
1295        )
1296    }
1297
1298    fn sz_find_last_of(&'a self, needles: &'a N) -> RangeRMatches<'a> {
1299        RangeRMatches::new(
1300            self.as_ref(),
1301            MatcherType::FindLastOf(needles.as_ref()),
1302            true,
1303        )
1304    }
1305
1306    fn sz_find_first_not_of(&'a self, needles: &'a N) -> RangeMatches<'a> {
1307        RangeMatches::new(
1308            self.as_ref(),
1309            MatcherType::FindFirstNotOf(needles.as_ref()),
1310            true,
1311        )
1312    }
1313
1314    fn sz_find_last_not_of(&'a self, needles: &'a N) -> RangeRMatches<'a> {
1315        RangeRMatches::new(
1316            self.as_ref(),
1317            MatcherType::FindLastNotOf(needles.as_ref()),
1318            true,
1319        )
1320    }
1321}
1322
1323/// Provides a tool for mutating a byte slice by filling it with random data from a specified alphabet.
1324/// This trait is especially useful for types that need to be mutable and can reference or be converted to byte slices.
1325///
1326/// # Examples
1327///
1328/// Filling a mutable byte buffer with random ASCII letters:
1329///
1330/// ```
1331/// use stringzilla::MutableStringZilla;
1332///
1333/// let mut buffer = vec![0u8; 10]; // A buffer to randomize
1334/// let alphabet = b"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"; // Alphabet to use
1335/// buffer.sz_randomize(alphabet);
1336///
1337/// println!("Random buffer: {:?}", buffer);
1338/// // The buffer will now contain random ASCII letters.
1339/// ```
1340pub trait MutableStringZilla<A>
1341where
1342    A: AsRef<[u8]>,
1343{
1344    /// Fills the implementing byte slice with random bytes from the specified `alphabet`.
1345    ///
1346    /// # Examples
1347    ///
1348    /// ```
1349    /// use stringzilla::MutableStringZilla;
1350    ///
1351    /// let mut text = vec![0; 1000]; // A buffer to randomize
1352    /// let alphabet = b"AGTC"; // Using a DNA alphabet
1353    /// text.sz_randomize(alphabet);
1354    ///
1355    /// // `text` is now filled with random 'A', 'G', 'T', or 'C' values.
1356    /// ```
1357    fn sz_randomize(&mut self, alphabet: A);
1358}
1359
1360impl<T, A> MutableStringZilla<A> for T
1361where
1362    T: AsMut<[u8]>,
1363    A: AsRef<[u8]>,
1364{
1365    fn sz_randomize(&mut self, alphabet: A) {
1366        let self_mut = self.as_mut();
1367        let alphabet_ref = alphabet.as_ref();
1368        sz::randomize(self_mut, alphabet_ref);
1369    }
1370}
1371
1372#[cfg(test)]
1373mod tests {
1374    use std::borrow::Cow;
1375
1376    use crate::sz;
1377    use crate::MutableStringZilla;
1378    use crate::StringZilla;
1379
1380    #[test]
1381    fn hamming() {
1382        assert_eq!(sz::hamming_distance("hello", "hello"), 0);
1383        assert_eq!(sz::hamming_distance("hello", "hell"), 1);
1384        assert_eq!(sz::hamming_distance("abc", "adc"), 1);
1385
1386        assert_eq!(sz::hamming_distance_bounded("abcdefgh", "ABCDEFGH", 2), 2);
1387        assert_eq!(sz::hamming_distance_utf8("αβγδ", "αγγδ"), 1);
1388    }
1389
1390    #[test]
1391    fn levenshtein() {
1392        assert_eq!(sz::edit_distance("hello", "hell"), 1);
1393        assert_eq!(sz::edit_distance("hello", "hell"), 1);
1394        assert_eq!(sz::edit_distance("abc", ""), 3);
1395        assert_eq!(sz::edit_distance("abc", "ac"), 1);
1396        assert_eq!(sz::edit_distance("abc", "a_bc"), 1);
1397        assert_eq!(sz::edit_distance("abc", "adc"), 1);
1398        assert_eq!(sz::edit_distance("fitting", "kitty"), 4);
1399        assert_eq!(sz::edit_distance("smitten", "mitten"), 1);
1400        assert_eq!(sz::edit_distance("ggbuzgjux{}l", "gbuzgjux{}l"), 1);
1401        assert_eq!(sz::edit_distance("abcdefgABCDEFG", "ABCDEFGabcdefg"), 14);
1402
1403        assert_eq!(sz::edit_distance_bounded("fitting", "kitty", 2), 2);
1404        assert_eq!(sz::edit_distance_utf8("façade", "facade"), 1);
1405    }
1406
1407    #[test]
1408    fn needleman() {
1409        let costs_vector = sz::unary_substitution_costs();
1410        assert_eq!(
1411            sz::alignment_score("listen", "silent", costs_vector, -1),
1412            -4
1413        );
1414        assert_eq!(
1415            sz::alignment_score("abcdefgABCDEFG", "ABCDEFGabcdefg", costs_vector, -1),
1416            -14
1417        );
1418        assert_eq!(sz::alignment_score("hello", "hello", costs_vector, -1), 0);
1419        assert_eq!(sz::alignment_score("hello", "hell", costs_vector, -1), -1);
1420    }
1421
1422    #[test]
1423    fn search() {
1424        let my_string: String = String::from("Hello, world!");
1425        let my_str: &str = my_string.as_str();
1426        let my_cow_str: Cow<'_, str> = Cow::from(&my_string);
1427
1428        // Identical to `memchr::memmem::find` and `memchr::memmem::rfind` functions
1429        assert_eq!(sz::find("Hello, world!", "world"), Some(7));
1430        assert_eq!(sz::rfind("Hello, world!", "world"), Some(7));
1431
1432        // Use the generic function with a String
1433        assert_eq!(my_string.sz_find("world"), Some(7));
1434        assert_eq!(my_string.sz_rfind("world"), Some(7));
1435        assert_eq!(my_string.sz_find_char_from("world"), Some(2));
1436        assert_eq!(my_string.sz_rfind_char_from("world"), Some(11));
1437        assert_eq!(my_string.sz_find_char_not_from("world"), Some(0));
1438        assert_eq!(my_string.sz_rfind_char_not_from("world"), Some(12));
1439
1440        // Use the generic function with a &str
1441        assert_eq!(my_str.sz_find("world"), Some(7));
1442        assert_eq!(my_str.sz_find("world"), Some(7));
1443        assert_eq!(my_str.sz_find_char_from("world"), Some(2));
1444        assert_eq!(my_str.sz_rfind_char_from("world"), Some(11));
1445        assert_eq!(my_str.sz_find_char_not_from("world"), Some(0));
1446        assert_eq!(my_str.sz_rfind_char_not_from("world"), Some(12));
1447
1448        // Use the generic function with a Cow<'_, str>
1449        assert_eq!(my_cow_str.as_ref().sz_find("world"), Some(7));
1450        assert_eq!(my_cow_str.as_ref().sz_find("world"), Some(7));
1451        assert_eq!(my_cow_str.as_ref().sz_find_char_from("world"), Some(2));
1452        assert_eq!(my_cow_str.as_ref().sz_rfind_char_from("world"), Some(11));
1453        assert_eq!(my_cow_str.as_ref().sz_find_char_not_from("world"), Some(0));
1454        assert_eq!(
1455            my_cow_str.as_ref().sz_rfind_char_not_from("world"),
1456            Some(12)
1457        );
1458    }
1459
1460    #[test]
1461    fn randomize() {
1462        let mut text: Vec<u8> = vec![0; 10]; // A buffer of ten zeros
1463        let alphabet: &[u8] = b"abcd"; // A byte slice alphabet
1464        text.sz_randomize(alphabet);
1465
1466        // Iterate throught text and check that it only contains letters from the alphabet
1467        assert!(text
1468            .iter()
1469            .all(|&b| b == b'd' || b == b'c' || b == b'b' || b == b'a'));
1470    }
1471
1472    mod search_split_iterators {
1473        use super::*;
1474        use crate::{MatcherType, RangeMatches, RangeRMatches};
1475
1476        #[test]
1477        fn test_matches() {
1478            let haystack = b"hello world hello universe";
1479            let needle = b"hello";
1480            let matches: Vec<_> = haystack.sz_matches(needle).collect();
1481            assert_eq!(matches, vec![b"hello", b"hello"]);
1482        }
1483
1484        #[test]
1485        fn test_rmatches() {
1486            let haystack = b"hello world hello universe";
1487            let needle = b"hello";
1488            let matches: Vec<_> = haystack.sz_rmatches(needle).collect();
1489            assert_eq!(matches, vec![b"hello", b"hello"]);
1490        }
1491
1492        #[test]
1493        fn test_splits() {
1494            let haystack = b"alpha,beta;gamma";
1495            let needle = b",";
1496            let splits: Vec<_> = haystack.sz_splits(needle).collect();
1497            assert_eq!(splits, vec![&b"alpha"[..], &b"beta;gamma"[..]]);
1498        }
1499
1500        #[test]
1501        fn test_rsplits() {
1502            let haystack = b"alpha,beta;gamma";
1503            let needle = b";";
1504            let splits: Vec<_> = haystack.sz_rsplits(needle).collect();
1505            assert_eq!(splits, vec![&b"gamma"[..], &b"alpha,beta"[..]]);
1506        }
1507
1508        #[test]
1509        fn test_splits_with_empty_parts() {
1510            let haystack = b"a,,b,";
1511            let needle = b",";
1512            let splits: Vec<_> = haystack.sz_splits(needle).collect();
1513            assert_eq!(splits, vec![b"a", &b""[..], b"b", &b""[..]]);
1514        }
1515
1516        #[test]
1517        fn test_matches_with_overlaps() {
1518            let haystack = b"aaaa";
1519            let needle = b"aa";
1520            let matches: Vec<_> = haystack.sz_matches(needle).collect();
1521            assert_eq!(matches, vec![b"aa", b"aa", b"aa"]);
1522        }
1523
1524        #[test]
1525        fn test_splits_with_utf8() {
1526            let haystack = "こんにちは,世界".as_bytes();
1527            let needle = b",";
1528            let splits: Vec<_> = haystack.sz_splits(needle).collect();
1529            assert_eq!(splits, vec!["こんにちは".as_bytes(), "世界".as_bytes()]);
1530        }
1531
1532        #[test]
1533        fn test_find_first_of() {
1534            let haystack = b"hello world";
1535            let needles = b"or";
1536            let matches: Vec<_> = haystack.sz_find_first_of(needles).collect();
1537            assert_eq!(matches, vec![b"o", b"o", b"r"]);
1538        }
1539
1540        #[test]
1541        fn test_find_last_of() {
1542            let haystack = b"hello world";
1543            let needles = b"or";
1544            let matches: Vec<_> = haystack.sz_find_last_of(needles).collect();
1545            assert_eq!(matches, vec![b"r", b"o", b"o"]);
1546        }
1547
1548        #[test]
1549        fn test_find_first_not_of() {
1550            let haystack = b"aabbbcccd";
1551            let needles = b"ab";
1552            let matches: Vec<_> = haystack.sz_find_first_not_of(needles).collect();
1553            assert_eq!(matches, vec![b"c", b"c", b"c", b"d"]);
1554        }
1555
1556        #[test]
1557        fn test_find_last_not_of() {
1558            let haystack = b"aabbbcccd";
1559            let needles = b"cd";
1560            let matches: Vec<_> = haystack.sz_find_last_not_of(needles).collect();
1561            assert_eq!(matches, vec![b"b", b"b", b"b", b"a", b"a"]);
1562        }
1563
1564        #[test]
1565        fn test_find_first_of_empty_needles() {
1566            let haystack = b"hello world";
1567            let needles = b"";
1568            let matches: Vec<_> = haystack.sz_find_first_of(needles).collect();
1569            assert_eq!(matches, Vec::<&[u8]>::new());
1570        }
1571
1572        #[test]
1573        fn test_find_last_of_empty_haystack() {
1574            let haystack = b"";
1575            let needles = b"abc";
1576            let matches: Vec<_> = haystack.sz_find_last_of(needles).collect();
1577            assert_eq!(matches, Vec::<&[u8]>::new());
1578        }
1579
1580        #[test]
1581        fn test_find_first_not_of_all_matching() {
1582            let haystack = b"aaabbbccc";
1583            let needles = b"abc";
1584            let matches: Vec<_> = haystack.sz_find_first_not_of(needles).collect();
1585            assert_eq!(matches, Vec::<&[u8]>::new());
1586        }
1587
1588        #[test]
1589        fn test_find_last_not_of_all_not_matching() {
1590            let haystack = b"hello world";
1591            let needles = b"xyz";
1592            let matches: Vec<_> = haystack.sz_find_last_not_of(needles).collect();
1593            assert_eq!(
1594                matches,
1595                vec![b"d", b"l", b"r", b"o", b"w", b" ", b"o", b"l", b"l", b"e", b"h"]
1596            );
1597        }
1598
1599        #[test]
1600        fn test_range_matches_overlapping() {
1601            let haystack = b"aaaa";
1602            let matcher = MatcherType::Find(b"aa");
1603            let matches: Vec<_> = RangeMatches::new(haystack, matcher, true).collect();
1604            assert_eq!(matches, vec![&b"aa"[..], &b"aa"[..], &b"aa"[..]]);
1605        }
1606
1607        #[test]
1608        fn test_range_matches_non_overlapping() {
1609            let haystack = b"aaaa";
1610            let matcher = MatcherType::Find(b"aa");
1611            let matches: Vec<_> = RangeMatches::new(haystack, matcher, false).collect();
1612            assert_eq!(matches, vec![&b"aa"[..], &b"aa"[..]]);
1613        }
1614
1615        #[test]
1616        fn test_range_rmatches_overlapping() {
1617            let haystack = b"aaaa";
1618            let matcher = MatcherType::RFind(b"aa");
1619            let matches: Vec<_> = RangeRMatches::new(haystack, matcher, true).collect();
1620            assert_eq!(matches, vec![&b"aa"[..], &b"aa"[..], &b"aa"[..]]);
1621        }
1622
1623        #[test]
1624        fn test_range_rmatches_non_overlapping() {
1625            let haystack = b"aaaa";
1626            let matcher = MatcherType::RFind(b"aa");
1627            let matches: Vec<_> = RangeRMatches::new(haystack, matcher, false).collect();
1628            assert_eq!(matches, vec![&b"aa"[..], &b"aa"[..]]);
1629        }
1630    }
1631}
stringzilla/lib.rs

stringzilla/
lib.rs