stringzilla/lib.rs
1#![cfg_attr(not(test), no_std)]
2
3/// The `sz` module provides a collection of string searching and manipulation functionality,
4/// designed for high efficiency and compatibility with no_std environments. This module offers
5/// various utilities for byte string manipulation, including search, reverse search, and
6/// edit-distance calculations, suitable for a wide range of applications from basic string
7/// processing to complex text analysis tasks.
8
9pub mod sz {
10
11 use core::ffi::c_void;
12
13 // Import the functions from the StringZilla C library.
14 extern "C" {
15 fn sz_find(
16 haystack: *const c_void,
17 haystack_length: usize,
18 needle: *const c_void,
19 needle_length: usize,
20 ) -> *const c_void;
21
22 fn sz_rfind(
23 haystack: *const c_void,
24 haystack_length: usize,
25 needle: *const c_void,
26 needle_length: usize,
27 ) -> *const c_void;
28
29 fn sz_find_char_from(
30 haystack: *const c_void,
31 haystack_length: usize,
32 needle: *const c_void,
33 needle_length: usize,
34 ) -> *const c_void;
35
36 fn sz_rfind_char_from(
37 haystack: *const c_void,
38 haystack_length: usize,
39 needle: *const c_void,
40 needle_length: usize,
41 ) -> *const c_void;
42
43 fn sz_find_char_not_from(
44 haystack: *const c_void,
45 haystack_length: usize,
46 needle: *const c_void,
47 needle_length: usize,
48 ) -> *const c_void;
49
50 fn sz_rfind_char_not_from(
51 haystack: *const c_void,
52 haystack_length: usize,
53 needle: *const c_void,
54 needle_length: usize,
55 ) -> *const c_void;
56
57 fn sz_edit_distance(
58 haystack1: *const c_void,
59 haystack1_length: usize,
60 haystack2: *const c_void,
61 haystack2_length: usize,
62 bound: usize,
63 allocator: *const c_void,
64 ) -> usize;
65
66 fn sz_edit_distance_utf8(
67 haystack1: *const c_void,
68 haystack1_length: usize,
69 haystack2: *const c_void,
70 haystack2_length: usize,
71 bound: usize,
72 allocator: *const c_void,
73 ) -> usize;
74
75 fn sz_hamming_distance(
76 haystack1: *const c_void,
77 haystack1_length: usize,
78 haystack2: *const c_void,
79 haystack2_length: usize,
80 bound: usize,
81 ) -> usize;
82
83 fn sz_hamming_distance_utf8(
84 haystack1: *const c_void,
85 haystack1_length: usize,
86 haystack2: *const c_void,
87 haystack2_length: usize,
88 bound: usize,
89 ) -> usize;
90
91 fn sz_alignment_score(
92 haystack1: *const c_void,
93 haystack1_length: usize,
94 haystack2: *const c_void,
95 haystack2_length: usize,
96 matrix: *const c_void,
97 gap: i8,
98 allocator: *const c_void,
99 ) -> isize;
100
101 // type RandomGeneratorT = fn(*mut c_void) -> u64;
102
103 fn sz_generate(
104 alphabet: *const c_void,
105 alphabet_size: usize,
106 text: *mut c_void,
107 length: usize,
108 generate: *const c_void,
109 generator: *mut c_void,
110 );
111 }
112
113 /// Locates the first matching substring within `haystack` that equals `needle`.
114 /// This function is similar to the `memmem()` function in LibC, but, unlike `strstr()`,
115 /// it requires the length of both haystack and needle to be known beforehand.
116 ///
117 /// # Arguments
118 ///
119 /// * `haystack`: The byte slice to search.
120 /// * `needle`: The byte slice to find within the haystack.
121 ///
122 /// # Returns
123 ///
124 /// An `Option<usize>` representing the starting index of the first occurrence of `needle`
125 /// within `haystack` if found, otherwise `None`.
126 pub fn find<H, N>(haystack: H, needle: N) -> Option<usize>
127 where
128 H: AsRef<[u8]>,
129 N: AsRef<[u8]>,
130 {
131 let haystack_ref = haystack.as_ref();
132 let needle_ref = needle.as_ref();
133 let haystack_pointer = haystack_ref.as_ptr() as _;
134 let haystack_length = haystack_ref.len();
135 let needle_pointer = needle_ref.as_ptr() as _;
136 let needle_length = needle_ref.len();
137 let result = unsafe {
138 sz_find(
139 haystack_pointer,
140 haystack_length,
141 needle_pointer,
142 needle_length,
143 )
144 };
145
146 if result.is_null() {
147 None
148 } else {
149 Some(unsafe { result.offset_from(haystack_pointer) } as usize)
150 }
151 }
152
153 /// Locates the last matching substring within `haystack` that equals `needle`.
154 /// This function is useful for finding the most recent or last occurrence of a pattern
155 /// within a byte slice.
156 ///
157 /// # Arguments
158 ///
159 /// * `haystack`: The byte slice to search.
160 /// * `needle`: The byte slice to find within the haystack.
161 ///
162 /// # Returns
163 ///
164 /// An `Option<usize>` representing the starting index of the last occurrence of `needle`
165 /// within `haystack` if found, otherwise `None`.
166 pub fn rfind<H, N>(haystack: H, needle: N) -> Option<usize>
167 where
168 H: AsRef<[u8]>,
169 N: AsRef<[u8]>,
170 {
171 let haystack_ref = haystack.as_ref();
172 let needle_ref = needle.as_ref();
173 let haystack_pointer = haystack_ref.as_ptr() as _;
174 let haystack_length = haystack_ref.len();
175 let needle_pointer = needle_ref.as_ptr() as _;
176 let needle_length = needle_ref.len();
177 let result = unsafe {
178 sz_rfind(
179 haystack_pointer,
180 haystack_length,
181 needle_pointer,
182 needle_length,
183 )
184 };
185
186 if result.is_null() {
187 None
188 } else {
189 Some(unsafe { result.offset_from(haystack_pointer) } as usize)
190 }
191 }
192
193 /// Finds the index of the first character in `haystack` that is also present in `needles`.
194 /// This function is particularly useful for parsing and tokenization tasks where a set of
195 /// delimiter characters is used.
196 ///
197 /// # Arguments
198 ///
199 /// * `haystack`: The byte slice to search.
200 /// * `needles`: The set of bytes to search for within the haystack.
201 ///
202 /// # Returns
203 ///
204 /// An `Option<usize>` representing the index of the first occurrence of any byte from
205 /// `needles` within `haystack`, if found, otherwise `None`.
206 pub fn find_char_from<H, N>(haystack: H, needles: N) -> Option<usize>
207 where
208 H: AsRef<[u8]>,
209 N: AsRef<[u8]>,
210 {
211 let haystack_ref = haystack.as_ref();
212 let needles_ref = needles.as_ref();
213 let haystack_pointer = haystack_ref.as_ptr() as _;
214 let haystack_length = haystack_ref.len();
215 let needles_pointer = needles_ref.as_ptr() as _;
216 let needles_length = needles_ref.len();
217 let result = unsafe {
218 sz_find_char_from(
219 haystack_pointer,
220 haystack_length,
221 needles_pointer,
222 needles_length,
223 )
224 };
225 if result.is_null() {
226 None
227 } else {
228 Some(unsafe { result.offset_from(haystack_pointer) } as usize)
229 }
230 }
231
232 /// Finds the index of the last character in `haystack` that is also present in `needles`.
233 /// This can be used to find the last occurrence of any character from a specified set,
234 /// useful in parsing scenarios such as finding the last delimiter in a string.
235 ///
236 /// # Arguments
237 ///
238 /// * `haystack`: The byte slice to search.
239 /// * `needles`: The set of bytes to search for within the haystack.
240 ///
241 /// # Returns
242 ///
243 /// An `Option<usize>` representing the index of the last occurrence of any byte from
244 /// `needles` within `haystack`, if found, otherwise `None`.
245 pub fn rfind_char_from<H, N>(haystack: H, needles: N) -> Option<usize>
246 where
247 H: AsRef<[u8]>,
248 N: AsRef<[u8]>,
249 {
250 let haystack_ref = haystack.as_ref();
251 let needles_ref = needles.as_ref();
252 let haystack_pointer = haystack_ref.as_ptr() as _;
253 let haystack_length = haystack_ref.len();
254 let needles_pointer = needles_ref.as_ptr() as _;
255 let needles_length = needles_ref.len();
256 let result = unsafe {
257 sz_rfind_char_from(
258 haystack_pointer,
259 haystack_length,
260 needles_pointer,
261 needles_length,
262 )
263 };
264 if result.is_null() {
265 None
266 } else {
267 Some(unsafe { result.offset_from(haystack_pointer) } as usize)
268 }
269 }
270
271 /// Finds the index of the first character in `haystack` that is not present in `needles`.
272 /// This function is useful for skipping over a known set of characters and finding the
273 /// first character that does not belong to that set.
274 ///
275 /// # Arguments
276 ///
277 /// * `haystack`: The byte slice to search.
278 /// * `needles`: The set of bytes that should not be matched within the haystack.
279 ///
280 /// # Returns
281 ///
282 /// An `Option<usize>` representing the index of the first occurrence of any byte not in
283 /// `needles` within `haystack`, if found, otherwise `None`.
284 pub fn find_char_not_from<H, N>(haystack: H, needles: N) -> Option<usize>
285 where
286 H: AsRef<[u8]>,
287 N: AsRef<[u8]>,
288 {
289 let haystack_ref = haystack.as_ref();
290 let needles_ref = needles.as_ref();
291 let haystack_pointer = haystack_ref.as_ptr() as _;
292 let haystack_length = haystack_ref.len();
293 let needles_pointer = needles_ref.as_ptr() as _;
294 let needles_length = needles_ref.len();
295 let result = unsafe {
296 sz_find_char_not_from(
297 haystack_pointer,
298 haystack_length,
299 needles_pointer,
300 needles_length,
301 )
302 };
303 if result.is_null() {
304 None
305 } else {
306 Some(unsafe { result.offset_from(haystack_pointer) } as usize)
307 }
308 }
309
310 /// Finds the index of the last character in `haystack` that is not present in `needles`.
311 /// Useful for text processing tasks such as trimming trailing characters that belong to
312 /// a specified set.
313 ///
314 /// # Arguments
315 ///
316 /// * `haystack`: The byte slice to search.
317 /// * `needles`: The set of bytes that should not be matched within the haystack.
318 ///
319 /// # Returns
320 ///
321 /// An `Option<usize>` representing the index of the last occurrence of any byte not in
322 /// `needles` within `haystack`, if found, otherwise `None`.
323 pub fn rfind_char_not_from<H, N>(haystack: H, needles: N) -> Option<usize>
324 where
325 H: AsRef<[u8]>,
326 N: AsRef<[u8]>,
327 {
328 let haystack_ref = haystack.as_ref();
329 let needles_ref = needles.as_ref();
330 let haystack_pointer = haystack_ref.as_ptr() as _;
331 let haystack_length = haystack_ref.len();
332 let needles_pointer = needles_ref.as_ptr() as _;
333 let needles_length = needles_ref.len();
334 let result = unsafe {
335 sz_rfind_char_not_from(
336 haystack_pointer,
337 haystack_length,
338 needles_pointer,
339 needles_length,
340 )
341 };
342 if result.is_null() {
343 None
344 } else {
345 Some(unsafe { result.offset_from(haystack_pointer) } as usize)
346 }
347 }
348
349 /// Computes the Levenshtein edit distance between two strings, using the Wagner-Fisher
350 /// algorithm. This measure is widely used in applications like spell-checking, DNA sequence
351 /// analysis.
352 ///
353 /// # Arguments
354 ///
355 /// * `first`: The first byte slice.
356 /// * `second`: The second byte slice.
357 /// * `bound`: The maximum distance to compute, allowing for early exit.
358 ///
359 /// # Returns
360 ///
361 /// A `usize` representing the minimum number of single-character edits (insertions,
362 /// deletions, or substitutions) required to change `first` into `second`.
363 pub fn edit_distance_bounded<F, S>(first: F, second: S, bound: usize) -> usize
364 where
365 F: AsRef<[u8]>,
366 S: AsRef<[u8]>,
367 {
368 let first_ref = first.as_ref();
369 let second_ref = second.as_ref();
370 let first_length = first_ref.len();
371 let second_length = second_ref.len();
372 let first_pointer = first_ref.as_ptr() as _;
373 let second_pointer = second_ref.as_ptr() as _;
374 unsafe {
375 sz_edit_distance(
376 first_pointer,
377 first_length,
378 second_pointer,
379 second_length,
380 // Upper bound on the distance, that allows us to exit early. If zero is
381 // passed, the maximum possible distance will be equal to the length of
382 // the longer input.
383 bound,
384 // Uses the default allocator
385 core::ptr::null(),
386 )
387 }
388 }
389
390 /// Computes the Levenshtein edit distance between two UTF8 strings, using the Wagner-Fisher
391 /// algorithm. This measure is widely used in applications like spell-checking.
392 ///
393 /// # Arguments
394 ///
395 /// * `first`: The first byte slice.
396 /// * `second`: The second byte slice.
397 /// * `bound`: The maximum distance to compute, allowing for early exit.
398 ///
399 /// # Returns
400 ///
401 /// A `usize` representing the minimum number of single-character edits (insertions,
402 /// deletions, or substitutions) required to change `first` into `second`.
403 pub fn edit_distance_utf8_bounded<F, S>(first: F, second: S, bound: usize) -> usize
404 where
405 F: AsRef<[u8]>,
406 S: AsRef<[u8]>,
407 {
408 let first_ref = first.as_ref();
409 let second_ref = second.as_ref();
410 let first_length = first_ref.len();
411 let second_length = second_ref.len();
412 let first_pointer = first_ref.as_ptr() as _;
413 let second_pointer = second_ref.as_ptr() as _;
414 unsafe {
415 sz_edit_distance_utf8(
416 first_pointer,
417 first_length,
418 second_pointer,
419 second_length,
420 // Upper bound on the distance, that allows us to exit early. If zero is
421 // passed, the maximum possible distance will be equal to the length of
422 // the longer input.
423 bound,
424 // Uses the default allocator
425 core::ptr::null(),
426 )
427 }
428 }
429
430 /// Computes the Levenshtein edit distance between two strings, using the Wagner-Fisher
431 /// algorithm. This measure is widely used in applications like spell-checking, DNA sequence
432 /// analysis.
433 ///
434 /// # Arguments
435 ///
436 /// * `first`: The first byte slice.
437 /// * `second`: The second byte slice.
438 ///
439 /// # Returns
440 ///
441 /// A `usize` representing the minimum number of single-character edits (insertions,
442 /// deletions, or substitutions) required to change `first` into `second`.
443 pub fn edit_distance<F, S>(first: F, second: S) -> usize
444 where
445 F: AsRef<[u8]>,
446 S: AsRef<[u8]>,
447 {
448 edit_distance_bounded(first, second, 0)
449 }
450
451 /// Computes the Levenshtein edit distance between two UTF8 strings, using the Wagner-Fisher
452 /// algorithm. This measure is widely used in applications like spell-checking.
453 ///
454 /// # Arguments
455 ///
456 /// * `first`: The first byte slice.
457 /// * `second`: The second byte slice.
458 ///
459 /// # Returns
460 ///
461 /// A `usize` representing the minimum number of single-character edits (insertions,
462 /// deletions, or substitutions) required to change `first` into `second`.
463 pub fn edit_distance_utf8<F, S>(first: F, second: S) -> usize
464 where
465 F: AsRef<[u8]>,
466 S: AsRef<[u8]>,
467 {
468 edit_distance_utf8_bounded(first, second, 0)
469 }
470
471 /// Computes the Hamming edit distance between two strings, counting the number of substituted characters.
472 /// Difference in length is added to the result as well.
473 ///
474 /// # Arguments
475 ///
476 /// * `first`: The first byte slice.
477 /// * `second`: The second byte slice.
478 /// * `bound`: The maximum distance to compute, allowing for early exit.
479 ///
480 /// # Returns
481 ///
482 /// A `usize` representing the minimum number of single-character edits (substitutions) required to
483 /// change `first` into `second`.
484 pub fn hamming_distance_bounded<F, S>(first: F, second: S, bound: usize) -> usize
485 where
486 F: AsRef<[u8]>,
487 S: AsRef<[u8]>,
488 {
489 let first_ref = first.as_ref();
490 let second_ref = second.as_ref();
491 let first_length = first_ref.len();
492 let second_length = second_ref.len();
493 let first_pointer = first_ref.as_ptr() as _;
494 let second_pointer = second_ref.as_ptr() as _;
495 unsafe {
496 sz_hamming_distance(
497 first_pointer,
498 first_length,
499 second_pointer,
500 second_length,
501 // Upper bound on the distance, that allows us to exit early. If zero is
502 // passed, the maximum possible distance will be equal to the length of
503 // the longer input.
504 bound,
505 )
506 }
507 }
508
509 /// Computes the Hamming edit distance between two UTF8 strings, counting the number of substituted
510 /// variable-length characters. Difference in length is added to the result as well.
511 ///
512 /// # Arguments
513 ///
514 /// * `first`: The first byte slice.
515 /// * `second`: The second byte slice.
516 /// * `bound`: The maximum distance to compute, allowing for early exit.
517 ///
518 /// # Returns
519 ///
520 /// A `usize` representing the minimum number of single-character edits (substitutions) required to
521 /// change `first` into `second`.
522 pub fn hamming_distance_utf8_bounded<F, S>(first: F, second: S, bound: usize) -> usize
523 where
524 F: AsRef<[u8]>,
525 S: AsRef<[u8]>,
526 {
527 let first_ref = first.as_ref();
528 let second_ref = second.as_ref();
529 let first_length = first_ref.len();
530 let second_length = second_ref.len();
531 let first_pointer = first_ref.as_ptr() as _;
532 let second_pointer = second_ref.as_ptr() as _;
533 unsafe {
534 sz_hamming_distance_utf8(
535 first_pointer,
536 first_length,
537 second_pointer,
538 second_length,
539 // Upper bound on the distance, that allows us to exit early. If zero is
540 // passed, the maximum possible distance will be equal to the length of
541 // the longer input.
542 bound,
543 )
544 }
545 }
546
547 /// Computes the Hamming edit distance between two strings, counting the number of substituted characters.
548 /// Difference in length is added to the result as well.
549 ///
550 /// # Arguments
551 ///
552 /// * `first`: The first byte slice.
553 /// * `second`: The second byte slice.
554 ///
555 /// # Returns
556 ///
557 /// A `usize` representing the minimum number of single-character edits (substitutions) required to
558 /// change `first` into `second`.
559 pub fn hamming_distance<F, S>(first: F, second: S) -> usize
560 where
561 F: AsRef<[u8]>,
562 S: AsRef<[u8]>,
563 {
564 hamming_distance_bounded(first, second, 0)
565 }
566
567 /// Computes the Hamming edit distance between two UTF8 strings, counting the number of substituted
568 /// variable-length characters. Difference in length is added to the result as well.
569 ///
570 /// # Arguments
571 ///
572 /// * `first`: The first byte slice.
573 /// * `second`: The second byte slice.
574 ///
575 /// # Returns
576 ///
577 /// A `usize` representing the minimum number of single-character edits (substitutions) required to
578 /// change `first` into `second`.
579 pub fn hamming_distance_utf8<F, S>(first: F, second: S) -> usize
580 where
581 F: AsRef<[u8]>,
582 S: AsRef<[u8]>,
583 {
584 hamming_distance_utf8_bounded(first, second, 0)
585 }
586
587 /// Computes the Needleman-Wunsch alignment score for two strings. This function is
588 /// particularly used in bioinformatics for sequence alignment but is also applicable in
589 /// other domains requiring detailed comparison between two strings, including gap and
590 /// substitution penalties.
591 ///
592 /// # Arguments
593 ///
594 /// * `first`: The first byte slice to align.
595 /// * `second`: The second byte slice to align.
596 /// * `matrix`: The substitution matrix used for scoring.
597 /// * `gap`: The penalty for each gap introduced during alignment.
598 ///
599 /// # Returns
600 ///
601 /// An `isize` representing the total alignment score, where higher scores indicate better
602 /// alignment between the two strings, considering the specified gap penalties and
603 /// substitution matrix.
604 pub fn alignment_score<F, S>(first: F, second: S, matrix: [[i8; 256]; 256], gap: i8) -> isize
605 where
606 F: AsRef<[u8]>,
607 S: AsRef<[u8]>,
608 {
609 let first_ref = first.as_ref();
610 let second_ref = second.as_ref();
611 let first_length = first_ref.len();
612 let second_length = second_ref.len();
613 let first_pointer = first_ref.as_ptr() as _;
614 let second_pointer = second_ref.as_ptr() as _;
615 unsafe {
616 sz_alignment_score(
617 first_pointer,
618 first_length,
619 second_pointer,
620 second_length,
621 matrix.as_ptr() as _,
622 gap,
623 core::ptr::null(),
624 )
625 }
626 }
627
628 /// Generates a default substitution matrix for use with the Needleman-Wunsch
629 /// alignment algorithm. This matrix is initialized such that diagonal entries
630 /// (representing matching characters) are zero, and off-diagonal entries
631 /// (representing mismatches) are -1. This setup effectively produces distances
632 /// equal to the negative Levenshtein edit distance, suitable for basic sequence
633 /// alignment tasks where all mismatches are penalized equally and there are no
634 /// rewards for matches.
635 ///
636 /// # Returns
637 ///
638 /// A 256x256 array of `i8`, where each element represents the substitution cost
639 /// between two characters (byte values). Matching characters are assigned a cost
640 /// of 0, and non-matching characters are assigned a cost of -1.
641 pub fn unary_substitution_costs() -> [[i8; 256]; 256] {
642 let mut result = [[0; 256]; 256];
643
644 for i in 0..256 {
645 for j in 0..256 {
646 result[i][j] = if i == j { 0 } else { -1 };
647 }
648 }
649
650 result
651 }
652
653 /// Randomizes the contents of a given byte slice `text` using characters from
654 /// a specified `alphabet`. This function mutates `text` in place, replacing each
655 /// byte with a random one from `alphabet`. It is designed for situations where
656 /// you need to generate random strings or data sequences based on a specific set
657 /// of characters, such as generating random DNA sequences or testing inputs.
658 ///
659 /// # Type Parameters
660 ///
661 /// * `T`: The type of the text to be randomized. Must be mutable and convertible to a byte slice.
662 /// * `A`: The type of the alphabet. Must be convertible to a byte slice.
663 ///
664 /// # Arguments
665 ///
666 /// * `text`: A mutable reference to the data to randomize. This data will be mutated in place.
667 /// * `alphabet`: A reference to the byte slice representing the alphabet to use for randomization.
668 ///
669 /// # Examples
670 ///
671 /// ```
672 /// use stringzilla::sz;
673 /// let mut my_text = vec![0; 10]; // A buffer to randomize
674 /// let alphabet = b"ACTG"; // Using a DNA alphabet
675 /// sz::randomize(&mut my_text, &alphabet);
676 /// ```
677 ///
678 /// After than, `my_text` is filled with random 'A', 'C', 'T', or 'G' values.
679 pub fn randomize<T, A>(text: &mut T, alphabet: &A)
680 where
681 T: AsMut<[u8]> + ?Sized, // Allows for mutable references to dynamically sized types.
682 A: AsRef<[u8]> + ?Sized, // Allows for references to dynamically sized types.
683 {
684 let text_slice = text.as_mut();
685 let alphabet_slice = alphabet.as_ref();
686 unsafe {
687 sz_generate(
688 alphabet_slice.as_ptr() as *const c_void,
689 alphabet_slice.len(),
690 text_slice.as_mut_ptr() as *mut c_void,
691 text_slice.len(),
692 core::ptr::null(),
693 core::ptr::null_mut(),
694 );
695 }
696 }
697}
698
699pub trait Matcher<'a> {
700 fn find(&self, haystack: &'a [u8]) -> Option<usize>;
701 fn needle_length(&self) -> usize;
702 fn skip_length(&self, include_overlaps: bool, is_reverse: bool) -> usize;
703}
704
705pub enum MatcherType<'a> {
706 Find(&'a [u8]),
707 RFind(&'a [u8]),
708 FindFirstOf(&'a [u8]),
709 FindLastOf(&'a [u8]),
710 FindFirstNotOf(&'a [u8]),
711 FindLastNotOf(&'a [u8]),
712}
713
714impl<'a> Matcher<'a> for MatcherType<'a> {
715 fn find(&self, haystack: &'a [u8]) -> Option<usize> {
716 match self {
717 MatcherType::Find(needle) => sz::find(haystack, needle),
718 MatcherType::RFind(needle) => sz::rfind(haystack, needle),
719 MatcherType::FindFirstOf(needles) => sz::find_char_from(haystack, needles),
720 MatcherType::FindLastOf(needles) => sz::rfind_char_from(haystack, needles),
721 MatcherType::FindFirstNotOf(needles) => sz::find_char_not_from(haystack, needles),
722 MatcherType::FindLastNotOf(needles) => sz::rfind_char_not_from(haystack, needles),
723 }
724 }
725
726 fn needle_length(&self) -> usize {
727 match self {
728 MatcherType::Find(needle) | MatcherType::RFind(needle) => needle.len(),
729 _ => 1,
730 }
731 }
732
733 fn skip_length(&self, include_overlaps: bool, is_reverse: bool) -> usize {
734 match (include_overlaps, is_reverse) {
735 (true, true) => self.needle_length().saturating_sub(1),
736 (true, false) => 1,
737 (false, true) => 0,
738 (false, false) => self.needle_length(),
739 }
740 }
741}
742
743/// An iterator over non-overlapping matches of a pattern in a string slice.
744/// This iterator yields the matched substrings in the order they are found.
745///
746/// # Examples
747///
748/// ```
749/// use stringzilla::{sz, MatcherType, RangeMatches};
750///
751/// let haystack = b"abababa";
752/// let matcher = MatcherType::Find(b"aba");
753/// let matches: Vec<&[u8]> = RangeMatches::new(haystack, matcher, false).collect();
754/// assert_eq!(matches, vec![b"aba", b"aba"]);
755/// ```
756pub struct RangeMatches<'a> {
757 haystack: &'a [u8],
758 matcher: MatcherType<'a>,
759 position: usize,
760 include_overlaps: bool,
761}
762
763impl<'a> RangeMatches<'a> {
764 pub fn new(haystack: &'a [u8], matcher: MatcherType<'a>, include_overlaps: bool) -> Self {
765 Self {
766 haystack,
767 matcher,
768 position: 0,
769 include_overlaps,
770 }
771 }
772}
773
774impl<'a> Iterator for RangeMatches<'a> {
775 type Item = &'a [u8];
776
777 fn next(&mut self) -> Option<Self::Item> {
778 if self.position >= self.haystack.len() {
779 return None;
780 }
781
782 if let Some(index) = self.matcher.find(&self.haystack[self.position..]) {
783 let start = self.position + index;
784 let end = start + self.matcher.needle_length();
785 self.position = start + self.matcher.skip_length(self.include_overlaps, false);
786 Some(&self.haystack[start..end])
787 } else {
788 self.position = self.haystack.len();
789 None
790 }
791 }
792}
793
794/// An iterator over non-overlapping splits of a string slice by a pattern.
795/// This iterator yields the substrings between the matches of the pattern.
796///
797/// # Examples
798///
799/// ```
800/// use stringzilla::{sz, MatcherType, RangeSplits};
801///
802/// let haystack = b"a,b,c,d";
803/// let matcher = MatcherType::Find(b",");
804/// let splits: Vec<&[u8]> = RangeSplits::new(haystack, matcher).collect();
805/// assert_eq!(splits, vec![b"a", b"b", b"c", b"d"]);
806/// ```
807pub struct RangeSplits<'a> {
808 haystack: &'a [u8],
809 matcher: MatcherType<'a>,
810 position: usize,
811 last_match: Option<usize>,
812}
813
814impl<'a> RangeSplits<'a> {
815 pub fn new(haystack: &'a [u8], matcher: MatcherType<'a>) -> Self {
816 Self {
817 haystack,
818 matcher,
819 position: 0,
820 last_match: None,
821 }
822 }
823}
824
825impl<'a> Iterator for RangeSplits<'a> {
826 type Item = &'a [u8];
827
828 fn next(&mut self) -> Option<Self::Item> {
829 if self.position > self.haystack.len() {
830 return None;
831 }
832
833 if let Some(index) = self.matcher.find(&self.haystack[self.position..]) {
834 let start = self.position;
835 let end = self.position + index;
836 self.position = end + self.matcher.needle_length();
837 self.last_match = Some(end);
838 Some(&self.haystack[start..end])
839 } else if self.position < self.haystack.len() || self.last_match.is_some() {
840 let start = self.position;
841 self.position = self.haystack.len() + 1;
842 Some(&self.haystack[start..])
843 } else {
844 None
845 }
846 }
847}
848
849/// An iterator over non-overlapping matches of a pattern in a string slice, searching from the end.
850/// This iterator yields the matched substrings in reverse order.
851///
852/// # Examples
853///
854/// ```
855/// use stringzilla::{sz, MatcherType, RangeRMatches};
856///
857/// let haystack = b"abababa";
858/// let matcher = MatcherType::RFind(b"aba");
859/// let matches: Vec<&[u8]> = RangeRMatches::new(haystack, matcher, false).collect();
860/// assert_eq!(matches, vec![b"aba", b"aba"]);
861/// ```
862pub struct RangeRMatches<'a> {
863 haystack: &'a [u8],
864 matcher: MatcherType<'a>,
865 position: usize,
866 include_overlaps: bool,
867}
868
869impl<'a> RangeRMatches<'a> {
870 pub fn new(haystack: &'a [u8], matcher: MatcherType<'a>, include_overlaps: bool) -> Self {
871 Self {
872 haystack,
873 matcher,
874 position: haystack.len(),
875 include_overlaps,
876 }
877 }
878}
879
880impl<'a> Iterator for RangeRMatches<'a> {
881 type Item = &'a [u8];
882
883 fn next(&mut self) -> Option<Self::Item> {
884 if self.position == 0 {
885 return None;
886 }
887
888 let search_area = &self.haystack[..self.position];
889 if let Some(index) = self.matcher.find(search_area) {
890 let start = index;
891 let end = start + self.matcher.needle_length();
892 let result = Some(&self.haystack[start..end]);
893
894 let skip = self.matcher.skip_length(self.include_overlaps, true);
895 self.position = start + skip;
896
897 result
898 } else {
899 None
900 }
901 }
902}
903
904/// An iterator over non-overlapping splits of a string slice by a pattern, searching from the end.
905/// This iterator yields the substrings between the matches of the pattern in reverse order.
906///
907/// # Examples
908///
909/// ```
910/// use stringzilla::{sz, MatcherType, RangeRSplits};
911///
912/// let haystack = b"a,b,c,d";
913/// let matcher = MatcherType::RFind(b",");
914/// let splits: Vec<&[u8]> = RangeRSplits::new(haystack, matcher).collect();
915/// assert_eq!(splits, vec![b"d", b"c", b"b", b"a"]);
916/// ```
917pub struct RangeRSplits<'a> {
918 haystack: &'a [u8],
919 matcher: MatcherType<'a>,
920 position: usize,
921}
922
923impl<'a> RangeRSplits<'a> {
924 pub fn new(haystack: &'a [u8], matcher: MatcherType<'a>) -> Self {
925 Self {
926 haystack,
927 matcher,
928 position: haystack.len(),
929 }
930 }
931}
932
933impl<'a> Iterator for RangeRSplits<'a> {
934 type Item = &'a [u8];
935
936 fn next(&mut self) -> Option<Self::Item> {
937 if self.position == 0 {
938 return None;
939 }
940
941 let search_area = &self.haystack[..self.position];
942 if let Some(index) = self.matcher.find(search_area) {
943 let end = self.position;
944 let start = index + self.matcher.needle_length();
945 let result = Some(&self.haystack[start..end]);
946
947 self.position = index;
948
949 result
950 } else {
951 let result = Some(&self.haystack[..self.position]);
952 self.position = 0;
953 result
954 }
955 }
956}
957
958/// Provides extensions for string searching and manipulation functionalities
959/// on types that can reference byte slices ([u8]). This trait extends the capability
960/// of any type implementing `AsRef<[u8]>`, allowing easy integration of SIMD-accelerated
961/// string processing functions.
962///
963/// # Examples
964///
965/// Basic usage on a `Vec<u8>`:
966///
967/// ```
968/// use stringzilla::StringZilla;
969///
970/// let haystack: &[u8] = &[b'a', b'b', b'c', b'd', b'e'];
971/// let needle: &[u8] = &[b'c', b'd'];
972///
973/// assert_eq!(haystack.sz_find(needle.as_ref()), Some(2));
974/// ```
975///
976/// Searching in a string slice:
977///
978/// ```
979/// use stringzilla::StringZilla;
980///
981/// let haystack = "abcdef";
982/// let needle = "cd";
983///
984/// assert_eq!(haystack.sz_find(needle.as_bytes()), Some(2));
985/// ```
986pub trait StringZilla<'a, N>
987where
988 N: AsRef<[u8]> + 'a,
989{
990 /// Searches for the first occurrence of `needle` in `self`.
991 ///
992 /// # Examples
993 ///
994 /// ```
995 /// use stringzilla::StringZilla;
996 ///
997 /// let haystack = "Hello, world!";
998 /// assert_eq!(haystack.sz_find("world".as_bytes()), Some(7));
999 /// ```
1000 fn sz_find(&self, needle: N) -> Option<usize>;
1001
1002 /// Searches for the last occurrence of `needle` in `self`.
1003 ///
1004 /// # Examples
1005 ///
1006 /// ```
1007 /// use stringzilla::StringZilla;
1008 ///
1009 /// let haystack = "Hello, world, world!";
1010 /// assert_eq!(haystack.sz_rfind("world".as_bytes()), Some(14));
1011 /// ```
1012 fn sz_rfind(&self, needle: N) -> Option<usize>;
1013
1014 /// Finds the index of the first character in `self` that is also present in `needles`.
1015 ///
1016 /// # Examples
1017 ///
1018 /// ```
1019 /// use stringzilla::StringZilla;
1020 ///
1021 /// let haystack = "Hello, world!";
1022 /// assert_eq!(haystack.sz_find_char_from("aeiou".as_bytes()), Some(1));
1023 /// ```
1024 fn sz_find_char_from(&self, needles: N) -> Option<usize>;
1025
1026 /// Finds the index of the last character in `self` that is also present in `needles`.
1027 ///
1028 /// # Examples
1029 ///
1030 /// ```
1031 /// use stringzilla::StringZilla;
1032 ///
1033 /// let haystack = "Hello, world!";
1034 /// assert_eq!(haystack.sz_rfind_char_from("aeiou".as_bytes()), Some(8));
1035 /// ```
1036 fn sz_rfind_char_from(&self, needles: N) -> Option<usize>;
1037
1038 /// Finds the index of the first character in `self` that is not present in `needles`.
1039 ///
1040 /// # Examples
1041 ///
1042 /// ```
1043 /// use stringzilla::StringZilla;
1044 ///
1045 /// let haystack = "Hello, world!";
1046 /// assert_eq!(haystack.sz_find_char_not_from("aeiou".as_bytes()), Some(0));
1047 /// ```
1048 fn sz_find_char_not_from(&self, needles: N) -> Option<usize>;
1049
1050 /// Finds the index of the last character in `self` that is not present in `needles`.
1051 ///
1052 /// # Examples
1053 ///
1054 /// ```
1055 /// use stringzilla::StringZilla;
1056 ///
1057 /// let haystack = "Hello, world!";
1058 /// assert_eq!(haystack.sz_rfind_char_not_from("aeiou".as_bytes()), Some(12));
1059 /// ```
1060 fn sz_rfind_char_not_from(&self, needles: N) -> Option<usize>;
1061
1062 /// Computes the Levenshtein edit distance between `self` and `other`.
1063 ///
1064 /// # Examples
1065 ///
1066 /// ```
1067 /// use stringzilla::StringZilla;
1068 ///
1069 /// let first = "kitten";
1070 /// let second = "sitting";
1071 /// assert_eq!(first.sz_edit_distance(second.as_bytes()), 3);
1072 /// ```
1073 fn sz_edit_distance(&self, other: N) -> usize;
1074
1075 /// Computes the alignment score between `self` and `other` using the specified
1076 /// substitution matrix and gap penalty.
1077 ///
1078 /// # Examples
1079 ///
1080 /// ```
1081 /// use stringzilla::{sz, StringZilla};
1082 ///
1083 /// let first = "kitten";
1084 /// let second = "sitting";
1085 /// let matrix = sz::unary_substitution_costs();
1086 /// let gap_penalty = -1;
1087 /// assert_eq!(first.sz_alignment_score(second.as_bytes(), matrix, gap_penalty), -3);
1088 /// ```
1089 fn sz_alignment_score(&self, other: N, matrix: [[i8; 256]; 256], gap: i8) -> isize;
1090
1091 /// Returns an iterator over all non-overlapping matches of the given `needle` in `self`.
1092 ///
1093 /// # Arguments
1094 ///
1095 /// * `needle`: The byte slice to search for within `self`.
1096 ///
1097 /// # Examples
1098 ///
1099 /// ```
1100 /// use stringzilla::StringZilla;
1101 ///
1102 /// let haystack = b"abababa";
1103 /// let needle = b"aba";
1104 /// let matches: Vec<&[u8]> = haystack.sz_matches(needle).collect();
1105 /// assert_eq!(matches, vec![b"aba", b"aba", b"aba"]);
1106 /// ```
1107 fn sz_matches(&'a self, needle: &'a N) -> RangeMatches<'a>;
1108
1109 /// Returns an iterator over all non-overlapping matches of the given `needle` in `self`, searching from the end.
1110 ///
1111 /// # Arguments
1112 ///
1113 /// * `needle`: The byte slice to search for within `self`.
1114 ///
1115 /// # Examples
1116 ///
1117 /// ```
1118 /// use stringzilla::StringZilla;
1119 ///
1120 /// let haystack = b"abababa";
1121 /// let needle = b"aba";
1122 /// let matches: Vec<&[u8]> = haystack.sz_rmatches(needle).collect();
1123 /// assert_eq!(matches, vec![b"aba", b"aba", b"aba"]);
1124 /// ```
1125 fn sz_rmatches(&'a self, needle: &'a N) -> RangeRMatches<'a>;
1126
1127 /// Returns an iterator over the substrings of `self` that are separated by the given `needle`.
1128 ///
1129 /// # Arguments
1130 ///
1131 /// * `needle`: The byte slice to split `self` by.
1132 ///
1133 /// # Examples
1134 ///
1135 /// ```
1136 /// use stringzilla::StringZilla;
1137 ///
1138 /// let haystack = b"a,b,c,d";
1139 /// let needle = b",";
1140 /// let splits: Vec<&[u8]> = haystack.sz_splits(needle).collect();
1141 /// assert_eq!(splits, vec![b"a", b"b", b"c", b"d"]);
1142 /// ```
1143 fn sz_splits(&'a self, needle: &'a N) -> RangeSplits<'a>;
1144
1145 /// Returns an iterator over the substrings of `self` that are separated by the given `needle`, searching from the end.
1146 ///
1147 /// # Arguments
1148 ///
1149 /// * `needle`: The byte slice to split `self` by.
1150 ///
1151 /// # Examples
1152 ///
1153 /// ```
1154 /// use stringzilla::StringZilla;
1155 ///
1156 /// let haystack = b"a,b,c,d";
1157 /// let needle = b",";
1158 /// let splits: Vec<&[u8]> = haystack.sz_rsplits(needle).collect();
1159 /// assert_eq!(splits, vec![b"d", b"c", b"b", b"a"]);
1160 /// ```
1161 fn sz_rsplits(&'a self, needle: &'a N) -> RangeRSplits<'a>;
1162
1163 /// Returns an iterator over all non-overlapping matches of any of the bytes in `needles` within `self`.
1164 ///
1165 /// # Arguments
1166 ///
1167 /// * `needles`: The set of bytes to search for within `self`.
1168 ///
1169 /// # Examples
1170 ///
1171 /// ```
1172 /// use stringzilla::StringZilla;
1173 ///
1174 /// let haystack = b"Hello, world!";
1175 /// let needles = b"aeiou";
1176 /// let matches: Vec<&[u8]> = haystack.sz_find_first_of(needles).collect();
1177 /// assert_eq!(matches, vec![b"e", b"o", b"o"]);
1178 /// ```
1179 fn sz_find_first_of(&'a self, needles: &'a N) -> RangeMatches<'a>;
1180
1181 /// Returns an iterator over all non-overlapping matches of any of the bytes in `needles` within `self`, searching from the end.
1182 ///
1183 /// # Arguments
1184 ///
1185 /// * `needles`: The set of bytes to search for within `self`.
1186 ///
1187 /// # Examples
1188 ///
1189 /// ```
1190 /// use stringzilla::StringZilla;
1191 ///
1192 /// let haystack = b"Hello, world!";
1193 /// let needles = b"aeiou";
1194 /// let matches: Vec<&[u8]> = haystack.sz_find_last_of(needles).collect();
1195 /// assert_eq!(matches, vec![b"o", b"o", b"e"]);
1196 /// ```
1197 fn sz_find_last_of(&'a self, needles: &'a N) -> RangeRMatches<'a>;
1198
1199 /// Returns an iterator over all non-overlapping matches of any byte not in `needles` within `self`.
1200 ///
1201 /// # Arguments
1202 ///
1203 /// * `needles`: The set of bytes that should not be matched within `self`.
1204 ///
1205 /// # Examples
1206 ///
1207 /// ```
1208 /// use stringzilla::StringZilla;
1209 ///
1210 /// let haystack = b"Hello, world!";
1211 /// let needles = b"aeiou";
1212 /// let matches: Vec<&[u8]> = haystack.sz_find_first_not_of(needles).collect();
1213 /// assert_eq!(matches, vec![b"H", b"l", b"l", b",", b" ", b"w", b"r", b"l", b"d", b"!"]);
1214 /// ```
1215 fn sz_find_first_not_of(&'a self, needles: &'a N) -> RangeMatches<'a>;
1216
1217 /// Returns an iterator over all non-overlapping matches of any byte not in `needles` within `self`, searching from the end.
1218 ///
1219 /// # Arguments
1220 ///
1221 /// * `needles`: The set of bytes that should not be matched within `self`.
1222 ///q
1223 /// # Examples
1224 ///
1225 /// ```
1226 /// use stringzilla::StringZilla;
1227 ///
1228 /// let haystack = b"Hello, world!";
1229 /// let needles = b"aeiou";
1230 /// let matches: Vec<&[u8]> = haystack.sz_find_last_not_of(needles).collect();
1231 /// assert_eq!(matches, vec![b"!", b"d", b"l", b"r", b"w", b" ", b",", b"l", b"l", b"H"]);
1232 /// ```
1233 fn sz_find_last_not_of(&'a self, needles: &'a N) -> RangeRMatches<'a>;
1234
1235}
1236
1237impl<'a, T, N> StringZilla<'a, N> for T
1238where
1239 T: AsRef<[u8]> + ?Sized,
1240 N: AsRef<[u8]> + 'a,
1241{
1242 fn sz_find(&self, needle: N) -> Option<usize> {
1243 sz::find(self, needle)
1244 }
1245
1246 fn sz_rfind(&self, needle: N) -> Option<usize> {
1247 sz::rfind(self, needle)
1248 }
1249
1250 fn sz_find_char_from(&self, needles: N) -> Option<usize> {
1251 sz::find_char_from(self, needles)
1252 }
1253
1254 fn sz_rfind_char_from(&self, needles: N) -> Option<usize> {
1255 sz::rfind_char_from(self, needles)
1256 }
1257
1258 fn sz_find_char_not_from(&self, needles: N) -> Option<usize> {
1259 sz::find_char_not_from(self, needles)
1260 }
1261
1262 fn sz_rfind_char_not_from(&self, needles: N) -> Option<usize> {
1263 sz::rfind_char_not_from(self, needles)
1264 }
1265
1266 fn sz_edit_distance(&self, other: N) -> usize {
1267 sz::edit_distance(self, other)
1268 }
1269
1270 fn sz_alignment_score(&self, other: N, matrix: [[i8; 256]; 256], gap: i8) -> isize {
1271 sz::alignment_score(self, other, matrix, gap)
1272 }
1273
1274 fn sz_matches(&'a self, needle: &'a N) -> RangeMatches<'a> {
1275 RangeMatches::new(self.as_ref(), MatcherType::Find(needle.as_ref()), true)
1276 }
1277
1278 fn sz_rmatches(&'a self, needle: &'a N) -> RangeRMatches<'a> {
1279 RangeRMatches::new(self.as_ref(), MatcherType::RFind(needle.as_ref()), true)
1280 }
1281
1282 fn sz_splits(&'a self, needle: &'a N) -> RangeSplits<'a> {
1283 RangeSplits::new(self.as_ref(), MatcherType::Find(needle.as_ref()))
1284 }
1285
1286 fn sz_rsplits(&'a self, needle: &'a N) -> RangeRSplits<'a> {
1287 RangeRSplits::new(self.as_ref(), MatcherType::RFind(needle.as_ref()))
1288 }
1289
1290 fn sz_find_first_of(&'a self, needles: &'a N) -> RangeMatches<'a> {
1291 RangeMatches::new(
1292 self.as_ref(),
1293 MatcherType::FindFirstOf(needles.as_ref()),
1294 true,
1295 )
1296 }
1297
1298 fn sz_find_last_of(&'a self, needles: &'a N) -> RangeRMatches<'a> {
1299 RangeRMatches::new(
1300 self.as_ref(),
1301 MatcherType::FindLastOf(needles.as_ref()),
1302 true,
1303 )
1304 }
1305
1306 fn sz_find_first_not_of(&'a self, needles: &'a N) -> RangeMatches<'a> {
1307 RangeMatches::new(
1308 self.as_ref(),
1309 MatcherType::FindFirstNotOf(needles.as_ref()),
1310 true,
1311 )
1312 }
1313
1314 fn sz_find_last_not_of(&'a self, needles: &'a N) -> RangeRMatches<'a> {
1315 RangeRMatches::new(
1316 self.as_ref(),
1317 MatcherType::FindLastNotOf(needles.as_ref()),
1318 true,
1319 )
1320 }
1321}
1322
1323/// Provides a tool for mutating a byte slice by filling it with random data from a specified alphabet.
1324/// This trait is especially useful for types that need to be mutable and can reference or be converted to byte slices.
1325///
1326/// # Examples
1327///
1328/// Filling a mutable byte buffer with random ASCII letters:
1329///
1330/// ```
1331/// use stringzilla::MutableStringZilla;
1332///
1333/// let mut buffer = vec![0u8; 10]; // A buffer to randomize
1334/// let alphabet = b"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"; // Alphabet to use
1335/// buffer.sz_randomize(alphabet);
1336///
1337/// println!("Random buffer: {:?}", buffer);
1338/// // The buffer will now contain random ASCII letters.
1339/// ```
1340pub trait MutableStringZilla<A>
1341where
1342 A: AsRef<[u8]>,
1343{
1344 /// Fills the implementing byte slice with random bytes from the specified `alphabet`.
1345 ///
1346 /// # Examples
1347 ///
1348 /// ```
1349 /// use stringzilla::MutableStringZilla;
1350 ///
1351 /// let mut text = vec![0; 1000]; // A buffer to randomize
1352 /// let alphabet = b"AGTC"; // Using a DNA alphabet
1353 /// text.sz_randomize(alphabet);
1354 ///
1355 /// // `text` is now filled with random 'A', 'G', 'T', or 'C' values.
1356 /// ```
1357 fn sz_randomize(&mut self, alphabet: A);
1358}
1359
1360impl<T, A> MutableStringZilla<A> for T
1361where
1362 T: AsMut<[u8]>,
1363 A: AsRef<[u8]>,
1364{
1365 fn sz_randomize(&mut self, alphabet: A) {
1366 let self_mut = self.as_mut();
1367 let alphabet_ref = alphabet.as_ref();
1368 sz::randomize(self_mut, alphabet_ref);
1369 }
1370}
1371
1372#[cfg(test)]
1373mod tests {
1374 use std::borrow::Cow;
1375
1376 use crate::sz;
1377 use crate::MutableStringZilla;
1378 use crate::StringZilla;
1379
1380 #[test]
1381 fn hamming() {
1382 assert_eq!(sz::hamming_distance("hello", "hello"), 0);
1383 assert_eq!(sz::hamming_distance("hello", "hell"), 1);
1384 assert_eq!(sz::hamming_distance("abc", "adc"), 1);
1385
1386 assert_eq!(sz::hamming_distance_bounded("abcdefgh", "ABCDEFGH", 2), 2);
1387 assert_eq!(sz::hamming_distance_utf8("αβγδ", "αγγδ"), 1);
1388 }
1389
1390 #[test]
1391 fn levenshtein() {
1392 assert_eq!(sz::edit_distance("hello", "hell"), 1);
1393 assert_eq!(sz::edit_distance("hello", "hell"), 1);
1394 assert_eq!(sz::edit_distance("abc", ""), 3);
1395 assert_eq!(sz::edit_distance("abc", "ac"), 1);
1396 assert_eq!(sz::edit_distance("abc", "a_bc"), 1);
1397 assert_eq!(sz::edit_distance("abc", "adc"), 1);
1398 assert_eq!(sz::edit_distance("fitting", "kitty"), 4);
1399 assert_eq!(sz::edit_distance("smitten", "mitten"), 1);
1400 assert_eq!(sz::edit_distance("ggbuzgjux{}l", "gbuzgjux{}l"), 1);
1401 assert_eq!(sz::edit_distance("abcdefgABCDEFG", "ABCDEFGabcdefg"), 14);
1402
1403 assert_eq!(sz::edit_distance_bounded("fitting", "kitty", 2), 2);
1404 assert_eq!(sz::edit_distance_utf8("façade", "facade"), 1);
1405 }
1406
1407 #[test]
1408 fn needleman() {
1409 let costs_vector = sz::unary_substitution_costs();
1410 assert_eq!(
1411 sz::alignment_score("listen", "silent", costs_vector, -1),
1412 -4
1413 );
1414 assert_eq!(
1415 sz::alignment_score("abcdefgABCDEFG", "ABCDEFGabcdefg", costs_vector, -1),
1416 -14
1417 );
1418 assert_eq!(sz::alignment_score("hello", "hello", costs_vector, -1), 0);
1419 assert_eq!(sz::alignment_score("hello", "hell", costs_vector, -1), -1);
1420 }
1421
1422 #[test]
1423 fn search() {
1424 let my_string: String = String::from("Hello, world!");
1425 let my_str: &str = my_string.as_str();
1426 let my_cow_str: Cow<'_, str> = Cow::from(&my_string);
1427
1428 // Identical to `memchr::memmem::find` and `memchr::memmem::rfind` functions
1429 assert_eq!(sz::find("Hello, world!", "world"), Some(7));
1430 assert_eq!(sz::rfind("Hello, world!", "world"), Some(7));
1431
1432 // Use the generic function with a String
1433 assert_eq!(my_string.sz_find("world"), Some(7));
1434 assert_eq!(my_string.sz_rfind("world"), Some(7));
1435 assert_eq!(my_string.sz_find_char_from("world"), Some(2));
1436 assert_eq!(my_string.sz_rfind_char_from("world"), Some(11));
1437 assert_eq!(my_string.sz_find_char_not_from("world"), Some(0));
1438 assert_eq!(my_string.sz_rfind_char_not_from("world"), Some(12));
1439
1440 // Use the generic function with a &str
1441 assert_eq!(my_str.sz_find("world"), Some(7));
1442 assert_eq!(my_str.sz_find("world"), Some(7));
1443 assert_eq!(my_str.sz_find_char_from("world"), Some(2));
1444 assert_eq!(my_str.sz_rfind_char_from("world"), Some(11));
1445 assert_eq!(my_str.sz_find_char_not_from("world"), Some(0));
1446 assert_eq!(my_str.sz_rfind_char_not_from("world"), Some(12));
1447
1448 // Use the generic function with a Cow<'_, str>
1449 assert_eq!(my_cow_str.as_ref().sz_find("world"), Some(7));
1450 assert_eq!(my_cow_str.as_ref().sz_find("world"), Some(7));
1451 assert_eq!(my_cow_str.as_ref().sz_find_char_from("world"), Some(2));
1452 assert_eq!(my_cow_str.as_ref().sz_rfind_char_from("world"), Some(11));
1453 assert_eq!(my_cow_str.as_ref().sz_find_char_not_from("world"), Some(0));
1454 assert_eq!(
1455 my_cow_str.as_ref().sz_rfind_char_not_from("world"),
1456 Some(12)
1457 );
1458 }
1459
1460 #[test]
1461 fn randomize() {
1462 let mut text: Vec<u8> = vec![0; 10]; // A buffer of ten zeros
1463 let alphabet: &[u8] = b"abcd"; // A byte slice alphabet
1464 text.sz_randomize(alphabet);
1465
1466 // Iterate throught text and check that it only contains letters from the alphabet
1467 assert!(text
1468 .iter()
1469 .all(|&b| b == b'd' || b == b'c' || b == b'b' || b == b'a'));
1470 }
1471
1472 mod search_split_iterators {
1473 use super::*;
1474 use crate::{MatcherType, RangeMatches, RangeRMatches};
1475
1476 #[test]
1477 fn test_matches() {
1478 let haystack = b"hello world hello universe";
1479 let needle = b"hello";
1480 let matches: Vec<_> = haystack.sz_matches(needle).collect();
1481 assert_eq!(matches, vec![b"hello", b"hello"]);
1482 }
1483
1484 #[test]
1485 fn test_rmatches() {
1486 let haystack = b"hello world hello universe";
1487 let needle = b"hello";
1488 let matches: Vec<_> = haystack.sz_rmatches(needle).collect();
1489 assert_eq!(matches, vec![b"hello", b"hello"]);
1490 }
1491
1492 #[test]
1493 fn test_splits() {
1494 let haystack = b"alpha,beta;gamma";
1495 let needle = b",";
1496 let splits: Vec<_> = haystack.sz_splits(needle).collect();
1497 assert_eq!(splits, vec![&b"alpha"[..], &b"beta;gamma"[..]]);
1498 }
1499
1500 #[test]
1501 fn test_rsplits() {
1502 let haystack = b"alpha,beta;gamma";
1503 let needle = b";";
1504 let splits: Vec<_> = haystack.sz_rsplits(needle).collect();
1505 assert_eq!(splits, vec![&b"gamma"[..], &b"alpha,beta"[..]]);
1506 }
1507
1508 #[test]
1509 fn test_splits_with_empty_parts() {
1510 let haystack = b"a,,b,";
1511 let needle = b",";
1512 let splits: Vec<_> = haystack.sz_splits(needle).collect();
1513 assert_eq!(splits, vec![b"a", &b""[..], b"b", &b""[..]]);
1514 }
1515
1516 #[test]
1517 fn test_matches_with_overlaps() {
1518 let haystack = b"aaaa";
1519 let needle = b"aa";
1520 let matches: Vec<_> = haystack.sz_matches(needle).collect();
1521 assert_eq!(matches, vec![b"aa", b"aa", b"aa"]);
1522 }
1523
1524 #[test]
1525 fn test_splits_with_utf8() {
1526 let haystack = "こんにちは,世界".as_bytes();
1527 let needle = b",";
1528 let splits: Vec<_> = haystack.sz_splits(needle).collect();
1529 assert_eq!(splits, vec!["こんにちは".as_bytes(), "世界".as_bytes()]);
1530 }
1531
1532 #[test]
1533 fn test_find_first_of() {
1534 let haystack = b"hello world";
1535 let needles = b"or";
1536 let matches: Vec<_> = haystack.sz_find_first_of(needles).collect();
1537 assert_eq!(matches, vec![b"o", b"o", b"r"]);
1538 }
1539
1540 #[test]
1541 fn test_find_last_of() {
1542 let haystack = b"hello world";
1543 let needles = b"or";
1544 let matches: Vec<_> = haystack.sz_find_last_of(needles).collect();
1545 assert_eq!(matches, vec![b"r", b"o", b"o"]);
1546 }
1547
1548 #[test]
1549 fn test_find_first_not_of() {
1550 let haystack = b"aabbbcccd";
1551 let needles = b"ab";
1552 let matches: Vec<_> = haystack.sz_find_first_not_of(needles).collect();
1553 assert_eq!(matches, vec![b"c", b"c", b"c", b"d"]);
1554 }
1555
1556 #[test]
1557 fn test_find_last_not_of() {
1558 let haystack = b"aabbbcccd";
1559 let needles = b"cd";
1560 let matches: Vec<_> = haystack.sz_find_last_not_of(needles).collect();
1561 assert_eq!(matches, vec![b"b", b"b", b"b", b"a", b"a"]);
1562 }
1563
1564 #[test]
1565 fn test_find_first_of_empty_needles() {
1566 let haystack = b"hello world";
1567 let needles = b"";
1568 let matches: Vec<_> = haystack.sz_find_first_of(needles).collect();
1569 assert_eq!(matches, Vec::<&[u8]>::new());
1570 }
1571
1572 #[test]
1573 fn test_find_last_of_empty_haystack() {
1574 let haystack = b"";
1575 let needles = b"abc";
1576 let matches: Vec<_> = haystack.sz_find_last_of(needles).collect();
1577 assert_eq!(matches, Vec::<&[u8]>::new());
1578 }
1579
1580 #[test]
1581 fn test_find_first_not_of_all_matching() {
1582 let haystack = b"aaabbbccc";
1583 let needles = b"abc";
1584 let matches: Vec<_> = haystack.sz_find_first_not_of(needles).collect();
1585 assert_eq!(matches, Vec::<&[u8]>::new());
1586 }
1587
1588 #[test]
1589 fn test_find_last_not_of_all_not_matching() {
1590 let haystack = b"hello world";
1591 let needles = b"xyz";
1592 let matches: Vec<_> = haystack.sz_find_last_not_of(needles).collect();
1593 assert_eq!(
1594 matches,
1595 vec![b"d", b"l", b"r", b"o", b"w", b" ", b"o", b"l", b"l", b"e", b"h"]
1596 );
1597 }
1598
1599 #[test]
1600 fn test_range_matches_overlapping() {
1601 let haystack = b"aaaa";
1602 let matcher = MatcherType::Find(b"aa");
1603 let matches: Vec<_> = RangeMatches::new(haystack, matcher, true).collect();
1604 assert_eq!(matches, vec![&b"aa"[..], &b"aa"[..], &b"aa"[..]]);
1605 }
1606
1607 #[test]
1608 fn test_range_matches_non_overlapping() {
1609 let haystack = b"aaaa";
1610 let matcher = MatcherType::Find(b"aa");
1611 let matches: Vec<_> = RangeMatches::new(haystack, matcher, false).collect();
1612 assert_eq!(matches, vec![&b"aa"[..], &b"aa"[..]]);
1613 }
1614
1615 #[test]
1616 fn test_range_rmatches_overlapping() {
1617 let haystack = b"aaaa";
1618 let matcher = MatcherType::RFind(b"aa");
1619 let matches: Vec<_> = RangeRMatches::new(haystack, matcher, true).collect();
1620 assert_eq!(matches, vec![&b"aa"[..], &b"aa"[..], &b"aa"[..]]);
1621 }
1622
1623 #[test]
1624 fn test_range_rmatches_non_overlapping() {
1625 let haystack = b"aaaa";
1626 let matcher = MatcherType::RFind(b"aa");
1627 let matches: Vec<_> = RangeRMatches::new(haystack, matcher, false).collect();
1628 assert_eq!(matches, vec![&b"aa"[..], &b"aa"[..]]);
1629 }
1630 }
1631}